From 63a0e38ac9b3e6dac238f04679a8c7943bf6f4c5 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Tue, 6 Jun 2023 16:52:51 -0400
Subject: [PATCH 01/28] Fix platform quirks (hubzilla, guppe, bird.makeup) that
prevent their profiles processing to fail.
---
federation/entities/activitypub/ldcontext.py | 17 ++++++----
federation/entities/activitypub/models.py | 34 ++++++++++++++++----
2 files changed, 39 insertions(+), 12 deletions(-)
diff --git a/federation/entities/activitypub/ldcontext.py b/federation/entities/activitypub/ldcontext.py
index e46bfb9..8fdafc2 100644
--- a/federation/entities/activitypub/ldcontext.py
+++ b/federation/entities/activitypub/ldcontext.py
@@ -137,12 +137,17 @@ class LdContextManager:
# Merge all defined AP extensions to the inbound context
uris = []
defs = {}
- # Merge original context dicts in one dict
- for item in ctx:
- if isinstance(item, str):
- uris.append(item)
- else:
- defs.update(item)
+ # Merge original context dicts in one dict, taking into account nested @context
+ def parse_context(ctx):
+ for item in ctx:
+ if isinstance(item, str):
+ uris.append(item)
+ else:
+ if '@context' in item:
+ parse_context([item['@context']])
+ item.pop('@context')
+ defs.update(item)
+ parse_context(ctx)
for item in self._merged:
if isinstance(item, str) and item not in uris:
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 9e7fdb5..6bd841e 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -1,6 +1,7 @@
import copy
import json
import logging
+import traceback
import uuid
from datetime import timedelta
from typing import List, Dict, Union
@@ -241,8 +242,8 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation):
metadata={'ctx':[{ 'alsoKnownAs':{'@id':'as:alsoKnownAs','@type':'@id'}}]})
icon = MixedField(as2.icon, nested='ImageSchema')
image = MixedField(as2.image, nested='ImageSchema')
- tag_objects = MixedField(as2.tag, nested=['HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True)
- attachment = fields.Nested(as2.attachment, nested=['ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'],
+ tag_objects = MixedField(as2.tag, nested=['NoteSchema', 'HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True)
+ attachment = fields.Nested(as2.attachment, nested=['NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'],
many=True, default=[])
content_map = LanguageMap(as2.content) # language maps are not implemented in calamus
context = fields.RawJsonLD(as2.context)
@@ -250,7 +251,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation):
generator = MixedField(as2.generator, nested=['ApplicationSchema','ServiceSchema'])
created_at = fields.DateTime(as2.published, add_value_types=True)
replies = MixedField(as2.replies, nested=['CollectionSchema','OrderedCollectionSchema'])
- signature = MixedField(sec.signature, nested = 'SignatureSchema',
+ signature = MixedField(sec.signature, nested = 'RsaSignature2017Schema',
metadata={'ctx': [CONTEXT_SECURITY,
{'RsaSignature2017':'sec:RsaSignature2017'}]})
start_time = fields.DateTime(as2.startTime, add_value_types=True)
@@ -333,6 +334,20 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation):
data['@context'] = context_manager.merge_context(ctx)
return data
+ # JSONLD specs states it is case sensitive.
+ # Ensure type names for which we have an implementation have the proper case
+ # for platforms that ignore the spec.
+ @pre_load
+ def patch_types(self, data, **kwargs):
+ def walk_payload(payload):
+ for key,val in copy.copy(payload).items():
+ if isinstance(val, dict):
+ payload.update(walk_payload(val))
+ if key == 'type':
+ payload[key] = MODEL_NAMES.get(val.lower(), val)
+ return payload
+ return walk_payload(data)
+
# A node without an id isn't true json-ld, but many payloads have
# id-less nodes. Since calamus forces random ids on such nodes,
# this removes it.
@@ -567,7 +582,7 @@ class Person(Object, base.Profile):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
- self._allowed_children += (PropertyValue, IdentityProof)
+ self._allowed_children += (Note, PropertyValue, IdentityProof)
# Set finger to username@host if not provided by the platform
def post_receive(self):
@@ -1019,7 +1034,7 @@ class Video(Document, base.Video):
return self
-class Signature(Object):
+class RsaSignature2017(Object):
created = fields.DateTime(dc.created, add_value_types=True)
creator = IRI(dc.creator)
key = fields.String(sec.signatureValue)
@@ -1396,6 +1411,7 @@ def model_to_objects(payload):
entity = model.schema().load(payload)
except (KeyError, jsonld.JsonLdError, exceptions.ValidationError) as exc : # Just give up for now. This must be made robust
logger.error("Error parsing jsonld payload (%s)", exc)
+ traceback.print_exception(exc)
return None
if isinstance(getattr(entity, 'object_', None), Object):
@@ -1416,4 +1432,10 @@ CLASSES_WITH_CONTEXT_EXTENSIONS = (
Person,
PropertyValue
)
-context_manager = LdContextManager(CLASSES_WITH_CONTEXT_EXTENSIONS)
\ No newline at end of file
+context_manager = LdContextManager(CLASSES_WITH_CONTEXT_EXTENSIONS)
+
+
+MODEL_NAMES = {}
+for key,val in copy.copy(globals()).items():
+ if type(val) == JsonLDAnnotation and issubclass(val, Object):
+ MODEL_NAMES[key.lower()] = key
From f72ecf459a598d506a502be7e7cc4da791180b18 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Tue, 6 Jun 2023 17:57:47 -0400
Subject: [PATCH 02/28] Fix logic error.
---
federation/entities/activitypub/models.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 6bd841e..9c1c6e9 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -342,7 +342,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation):
def walk_payload(payload):
for key,val in copy.copy(payload).items():
if isinstance(val, dict):
- payload.update(walk_payload(val))
+ walk_payload(val)
if key == 'type':
payload[key] = MODEL_NAMES.get(val.lower(), val)
return payload
@@ -1439,3 +1439,6 @@ MODEL_NAMES = {}
for key,val in copy.copy(globals()).items():
if type(val) == JsonLDAnnotation and issubclass(val, Object):
MODEL_NAMES[key.lower()] = key
+
+from pprint import pprint
+pprint(MODEL_NAMES)
\ No newline at end of file
From 33131bd9fe9c6728dc2ba9f808373915b7e2d530 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Tue, 6 Jun 2023 18:00:01 -0400
Subject: [PATCH 03/28] Remove debug pprint.
---
federation/entities/activitypub/models.py | 3 ---
1 file changed, 3 deletions(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 9c1c6e9..e98321a 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -1439,6 +1439,3 @@ MODEL_NAMES = {}
for key,val in copy.copy(globals()).items():
if type(val) == JsonLDAnnotation and issubclass(val, Object):
MODEL_NAMES[key.lower()] = key
-
-from pprint import pprint
-pprint(MODEL_NAMES)
\ No newline at end of file
From e94533b222a58bbc208cd057b7e911f13cb910f5 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Wed, 7 Jun 2023 10:34:08 -0400
Subject: [PATCH 04/28] Allow Link objects as items of the attachment property.
Must be rendered by the client app.
---
federation/entities/activitypub/models.py | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index e98321a..740cd11 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -243,7 +243,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation):
icon = MixedField(as2.icon, nested='ImageSchema')
image = MixedField(as2.image, nested='ImageSchema')
tag_objects = MixedField(as2.tag, nested=['NoteSchema', 'HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True)
- attachment = fields.Nested(as2.attachment, nested=['NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'],
+ attachment = fields.Nested(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'],
many=True, default=[])
content_map = LanguageMap(as2.content) # language maps are not implemented in calamus
context = fields.RawJsonLD(as2.context)
@@ -736,7 +736,7 @@ class Note(Object, RawContentMixin):
def __init__(self, *args, **kwargs):
self.tag_objects = [] # mutable objects...
super().__init__(*args, **kwargs)
- self._allowed_children += (base.Audio, base.Video)
+ self._allowed_children += (base.Audio, base.Video, Link)
def to_as2(self):
self.sensitive = 'nsfw' in self.tags
@@ -932,12 +932,13 @@ class Note(Object, RawContentMixin):
if isinstance(getattr(self, 'attachment', None), list):
children = []
for child in self.attachment:
- if isinstance(child, Document):
- obj = child.to_base()
- if isinstance(obj, Image):
- if obj.inline or (obj.image and obj.image in self.raw_content):
+ if isinstance(child, (Document, Link)):
+ if hasattr(child, 'to_base'):
+ child = child.to_base()
+ if isinstance(child, Image):
+ if child.inline or (child.image and child.image in self.raw_content):
continue
- children.append(obj)
+ children.append(child)
self._cached_children = children
return self._cached_children
@@ -1437,5 +1438,5 @@ context_manager = LdContextManager(CLASSES_WITH_CONTEXT_EXTENSIONS)
MODEL_NAMES = {}
for key,val in copy.copy(globals()).items():
- if type(val) == JsonLDAnnotation and issubclass(val, Object):
+ if type(val) == JsonLDAnnotation and issubclass(val, (Object, Link)):
MODEL_NAMES[key.lower()] = key
From e0993a7f7f49bc748820770ff4a32b3a0382a857 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Sat, 8 Jul 2023 07:34:44 -0400
Subject: [PATCH 05/28] Switch to BeautifulSoup for content processing.
Outbound rendered content is now provided by the client app. Mark inbound AP
HTML content hashtags and mentions. Fix missing href attribute crashing
process_text_links.
---
CHANGELOG.md | 4 +-
docs/protocols.rst | 33 +++-
federation/entities/activitypub/models.py | 222 +++++++++++-----------
federation/entities/mixins.py | 66 ++-----
federation/tests/entities/test_base.py | 1 +
federation/utils/text.py | 80 +++-----
6 files changed, 177 insertions(+), 229 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 939a554..19d2bca 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,7 +22,7 @@
* For inbound payload, a cached dict of all the defined AP extensions is merged with each incoming LD context.
* Better handle conflicting property defaults by having `get_base_attributes` return only attributes that
- are not empty (or bool). This helps distinguishing between `marshmallow.missing` and empty values.
+ are not empty (or bool). This helps distinguish between `marshmallow.missing` and empty values.
* JsonLD document caching now set in `activitypub/__init__.py`.
@@ -45,6 +45,8 @@
* In fetch_document: if response.encoding is not set, default to utf-8.
+* Fix process_text_links that would crash on `a` tags with no `href` attribute.
+
## [0.24.1] - 2023-03-18
### Fixed
diff --git a/docs/protocols.rst b/docs/protocols.rst
index 1e15467..0dd845b 100644
--- a/docs/protocols.rst
+++ b/docs/protocols.rst
@@ -4,9 +4,8 @@ Protocols
Currently three protocols are being focused on.
* Diaspora is considered to be stable with most of the protocol implemented.
-* ActivityPub support should be considered as alpha - all the basic
- things work but there are likely to be a lot of compatibility issues with other ActivityPub
- implementations.
+* ActivityPub support should be considered as beta - all the basic
+ things work and we are fixing incompatibilities as they are identified.
* Matrix support cannot be considered usable as of yet.
For example implementations in real life projects check :ref:`example-projects`.
@@ -69,20 +68,21 @@ Content media type
The following keys will be set on the entity based on the ``source`` property existing:
* if the object has an ``object.source`` property:
- * ``_media_type`` will be the source media type
- * ``_rendered_content`` will be the object ``content``
+ * ``_media_type`` will be the source media type (only text/markdown is supported).
+ * ``rendered_content`` will be the object ``content``
* ``raw_content`` will be the source ``content``
* if the object has no ``object.source`` property:
* ``_media_type`` will be ``text/html``
- * ``_rendered_content`` will be the object ``content``
- * ``raw_content`` will object ``content`` run through a HTML2Markdown renderer
+ * ``rendered_content`` will be the object ``content``
+ * ``raw_content`` will be empty
The ``contentMap`` property is processed but content language selection is not implemented yet.
For outbound entities, ``raw_content`` is expected to be in ``text/markdown``,
-specifically CommonMark. When sending payloads, ``raw_content`` will be rendered via
-the ``commonmark`` library into ``object.content``. The original ``raw_content``
-will be added to the ``object.source`` property.
+specifically CommonMark. The client applications are expected to provide the
+rendered content for protocols that require it (e.g. ActivityPub).
+When sending payloads, ``object.contentMap`` will be set to ``rendered_content``
+and ``raw_content`` will be added to the ``object.source`` property.
Medias
......
@@ -98,6 +98,19 @@ support from client applications.
For inbound entities we do this automatically by not including received image attachments in
the entity ``_children`` attribute. Audio and video are passed through the client application.
+Hashtags and mentions
+.....................
+
+For outbound payloads, client applications must add/set the hashtag/mention value to
+the ``class`` attribute of rendered content linkified hashtags/mentions. These will be
+used to help build the corresponding ``Hashtag`` and ``Mention`` objects.
+
+For inbound payloads, if a markdown source is provided, hashtags/mentions will be extracted
+through the same method used for Diaspora. If only HTML content is provided, the ``a`` tags
+will be marked with a ``data-[hashtag|mention]`` attribute (based on the provided Hashtag/Mention
+objects) to facilitate the ``href`` attribute modifications lient applications might
+wish to make. This should ensure links can be replaced regardless of how the HTML is structured.
+
.. _matrix:
Matrix
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 740cd11..1d58262 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -1,6 +1,7 @@
import copy
import json
import logging
+import re
import traceback
import uuid
from datetime import timedelta
@@ -8,6 +9,7 @@ from typing import List, Dict, Union
from urllib.parse import urlparse
import bleach
+from bs4 import BeautifulSoup
from calamus import fields
from calamus.schema import JsonLDAnnotation, JsonLDSchema, JsonLDSchemaOpts
from calamus.utils import normalize_value
@@ -731,15 +733,19 @@ class Note(Object, RawContentMixin):
_cached_raw_content = ''
_cached_children = []
+ _soup = None
signable = True
def __init__(self, *args, **kwargs):
self.tag_objects = [] # mutable objects...
super().__init__(*args, **kwargs)
+ self.raw_content # must be "primed" with source property for inbound payloads
+ self.rendered_content # must be "primed" with content_map property for inbound payloads
self._allowed_children += (base.Audio, base.Video, Link)
+ self._required.remove('raw_content')
+ self._required += ['rendered_content']
def to_as2(self):
- self.sensitive = 'nsfw' in self.tags
self.url = self.id
edited = False
@@ -767,8 +773,8 @@ class Note(Object, RawContentMixin):
def to_base(self):
kwargs = get_base_attributes(self, keep=(
- '_mentions', '_media_type', '_rendered_content', '_source_object',
- '_cached_children', '_cached_raw_content'))
+ '_mentions', '_media_type', '_source_object',
+ '_cached_children', '_cached_raw_content', '_soup'))
entity = Comment(**kwargs) if getattr(self, 'target_id') else Post(**kwargs)
# Plume (and maybe other platforms) send the attrbutedTo field as an array
if isinstance(entity.actor_id, list): entity.actor_id = entity.actor_id[0]
@@ -779,6 +785,7 @@ class Note(Object, RawContentMixin):
def pre_send(self) -> None:
"""
Attach any embedded images from raw_content.
+ Add Hashtag and Mention objects (the client app must define the class tag/mention property)
"""
super().pre_send()
self._children = [
@@ -789,135 +796,128 @@ class Note(Object, RawContentMixin):
) for image in self.embedded_images
]
- # Add other AP objects
- self.extract_mentions()
- self.content_map = {'orig': self.rendered_content}
- self.add_mention_objects()
- self.add_tag_objects()
+ # Add Hashtag objects
+ for el in self._soup('a', attrs={'class':'hashtag'}):
+ self.tag_objects.append(Hashtag(
+ href = el.attrs['href'],
+ name = el.text.lstrip('#')
+ ))
+ if el.text == '#nsfw': self.sensitive = True
+
+ # Add Mention objects
+ mentions = []
+ for el in self._soup('a', attrs={'class':'mention'}):
+ mentions.append(el.text.lstrip('@'))
+
+ mentions.sort()
+ for mention in mentions:
+ if validate_handle(mention):
+ profile = get_profile(finger=mention)
+ # only add AP profiles mentions
+ if getattr(profile, 'id', None):
+ self.tag_objects.append(Mention(href=profile.id, name='@'+mention))
+ # some platforms only render diaspora style markdown if it is available
+ self.source['content'] = self.source['content'].replace(mention, '{' + mention + '}')
+
def post_receive(self) -> None:
"""
- Make linkified tags normal tags.
+ Mark linkified tags and mentions with a data-{mention, tag} attribute.
"""
super().post_receive()
- if not self.raw_content or self._media_type == "text/markdown":
+ if self._media_type == "text/markdown":
# Skip when markdown
return
- hrefs = []
- for tag in self.tag_objects:
- if isinstance(tag, Hashtag):
- if tag.href is not missing:
- hrefs.append(tag.href.lower())
- elif tag.id is not missing:
- hrefs.append(tag.id.lower())
- # noinspection PyUnusedLocal
- def remove_tag_links(attrs, new=False):
- # Hashtag object hrefs
- href = (None, "href")
- url = attrs.get(href, "").lower()
- if url in hrefs:
- return
- # one more time without the query (for pixelfed)
- parsed = urlparse(url)
- url = f'{parsed.scheme}://{parsed.netloc}{parsed.path}'
- if url in hrefs:
- return
-
- # Mastodon
- rel = (None, "rel")
- if attrs.get(rel) == "tag":
- return
-
- # Friendica
- if attrs.get(href, "").endswith(f'tag={attrs.get("_text")}'):
- return
-
- return attrs
-
- self.raw_content = bleach.linkify(
- self.raw_content,
- callbacks=[remove_tag_links],
- parse_email=False,
- skip_tags=["code", "pre"],
- )
+ self._find_and_mark_hashtags()
+ self._find_and_mark_mentions()
if getattr(self, 'target_id'): self.entity_type = 'Comment'
- def add_tag_objects(self) -> None:
- """
- Populate tags to the object.tag list.
- """
- try:
- from federation.utils.django import get_configuration
- config = get_configuration()
- except ImportError:
- tags_path = None
- else:
- if config["tags_path"]:
- tags_path = f"{config['base_url']}{config['tags_path']}"
- else:
- tags_path = None
- for tag in self.tags:
- _tag = Hashtag(name=f'#{tag}')
- if tags_path:
- _tag.href = tags_path.replace(":tag:", tag)
- self.tag_objects.append(_tag)
+ def _find_and_mark_hashtags(self):
+ hrefs = set()
+ for tag in self.tag_objects:
+ if isinstance(tag, Hashtag):
+ if tag.href is not missing:
+ hrefs.add(tag.href.lower())
+ # Some platforms use id instead of href...
+ elif tag.id is not missing:
+ hrefs.add(tag.id.lower())
- def add_mention_objects(self) -> None:
- """
- Populate mentions to the object.tag list.
- """
- if len(self._mentions):
- mentions = list(self._mentions)
- mentions.sort()
- for mention in mentions:
- if validate_handle(mention):
- profile = get_profile(finger=mention)
- # only add AP profiles mentions
- if getattr(profile, 'id', None):
- self.tag_objects.append(Mention(href=profile.id, name='@'+mention))
- # some platforms only render diaspora style markdown if it is available
- self.source['content'] = self.source['content'].replace(mention, '{'+mention+'}')
+ for link in self._soup.find_all('a', href=True):
+ parsed = urlparse(link['href'].lower())
+ # remove the query part, if any
+ url = f'{parsed.scheme}://{parsed.netloc}{parsed.path}'
+ links = {link['href'].lower(), url}
+ if links.intersection(hrefs):
+ link['data-hashtag'] = link.text.lstrip('#').lower()
+
+ def _find_and_mark_mentions(self):
+ mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)]
+ hrefs = [mention.href for mention in mentions]
+ # add Mastodon's form
+ hrefs.extend([re.sub(r'/(users/)([\w]+)$', r'/@\2', href) for href in hrefs])
+ for href in hrefs:
+ links = self._soup.find_all(href=href)
+ for link in links:
+ profile = get_profile_or_entity(fid=link['href'])
+ if profile:
+ link['data-mention'] = profile.finger
+ self._mentions.add(profile.finger)
def extract_mentions(self):
"""
- Extract mentions from the source object.
- """
- super().extract_mentions()
+ Extract mentions from the inbound Mention objects.
- if getattr(self, 'tag_objects', None):
- #tag_objects = self.tag_objects if isinstance(self.tag_objects, list) else [self.tag_objects]
- for tag in self.tag_objects:
- if isinstance(tag, Mention):
- profile = get_profile_or_entity(fid=tag.href)
- handle = getattr(profile, 'finger', None)
- if handle: self._mentions.add(handle)
+ Also attempt to extract from raw_content if available
+ """
+
+ if self.raw_content:
+ super().extract_mentions()
+ return
+
+ for mention in self.tag_objects:
+ if isinstance(mention, Mention):
+ profile = get_profile_or_entity(fid=mention.href)
+ handle = getattr(profile, 'finger', None)
+ if handle: self._mentions.add(handle)
@property
- def raw_content(self):
-
- if self._cached_raw_content: return self._cached_raw_content
+ def rendered_content(self):
+ if self._soup: return str(self._soup)
+ content = ''
if self.content_map:
orig = self.content_map.pop('orig')
if len(self.content_map.keys()) > 1:
logger.warning('Language selection not implemented, falling back to default')
- self._rendered_content = orig.strip()
+ content = orig.strip()
else:
- self._rendered_content = orig.strip() if len(self.content_map.keys()) == 0 else next(iter(self.content_map.values())).strip()
+ content = orig.strip() if len(self.content_map.keys()) == 0 else next(iter(self.content_map.values())).strip()
self.content_map['orig'] = orig
+ # to allow for posts/replies with medias only.
+ if not content: content = ""
+ self._soup = BeautifulSoup(content, 'html.parser')
+ return str(self._soup)
+
+ @rendered_content.setter
+ def rendered_content(self, value):
+ if not value: return
+ self._soup = BeautifulSoup(value, 'html.parser')
+ self.content_map = {'orig': value}
+
+ @property
+ def raw_content(self):
+ if self._cached_raw_content: return self._cached_raw_content
+
+ if isinstance(self.source, dict) and self.source.get('mediaType') == 'text/markdown':
+ self._media_type = self.source['mediaType']
+ self._cached_raw_content = self.source.get('content').strip()
+ else:
+ self._media_type = 'text/html'
+ self._cached_raw_content = ""
+ return self._cached_raw_content
- if isinstance(self.source, dict) and self.source.get('mediaType') == 'text/markdown':
- self._media_type = self.source['mediaType']
- self._cached_raw_content = self.source.get('content').strip()
- else:
- self._media_type = 'text/html'
- self._cached_raw_content = self._rendered_content
- # to allow for posts/replies with medias only.
- if not self._cached_raw_content: self._cached_raw_content = ""
- return self._cached_raw_content
-
@raw_content.setter
def raw_content(self, value):
if not value: return
@@ -1026,7 +1026,7 @@ class Video(Document, base.Video):
self.actor_id = new_act[0]
entity = Post(**get_base_attributes(self,
- keep=('_mentions', '_media_type', '_rendered_content',
+ keep=('_mentions', '_media_type', '_soup',
'_cached_children', '_cached_raw_content', '_source_object')))
set_public(entity)
return entity
@@ -1330,14 +1330,16 @@ def extract_and_validate(entity):
entity._source_protocol = "activitypub"
# Extract receivers
entity._receivers = extract_receivers(entity)
+
+ # Extract mentions
+ if hasattr(entity, "extract_mentions"):
+ entity.extract_mentions()
+
if hasattr(entity, "post_receive"):
entity.post_receive()
if hasattr(entity, 'validate'): entity.validate()
- # Extract mentions
- if hasattr(entity, "extract_mentions"):
- entity.extract_mentions()
def extract_replies(replies):
diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py
index 30ef9d8..8ca6745 100644
--- a/federation/entities/mixins.py
+++ b/federation/entities/mixins.py
@@ -4,12 +4,13 @@ import re
import warnings
from typing import List, Set, Union, Dict, Tuple
+from bs4 import BeautifulSoup
from commonmark import commonmark
from marshmallow import missing
from federation.entities.activitypub.enums import ActivityType
from federation.entities.utils import get_name_for_profile, get_profile
-from federation.utils.text import process_text_links, find_tags
+from federation.utils.text import process_text_links, find_elements, find_tags, MENTION_PATTERN
class BaseEntity:
@@ -22,6 +23,7 @@ class BaseEntity:
_source_object: Union[str, Dict] = None
_sender: str = ""
_sender_key: str = ""
+ _tags: Set = None
# ActivityType
activity: ActivityType = None
activity_id: str = ""
@@ -205,7 +207,7 @@ class CreatedAtMixin(BaseEntity):
class RawContentMixin(BaseEntity):
_media_type: str = "text/markdown"
_mentions: Set = None
- _rendered_content: str = ""
+ rendered_content: str = ""
raw_content: str = ""
def __init__(self, *args, **kwargs):
@@ -231,59 +233,22 @@ class RawContentMixin(BaseEntity):
images.append((groups[1], groups[0] or ""))
return images
- @property
- def rendered_content(self) -> str:
- """Returns the rendered version of raw_content, or just raw_content."""
- try:
- from federation.utils.django import get_configuration
- config = get_configuration()
- if config["tags_path"]:
- def linkifier(tag: str) -> str:
- return f'' \
- f'#{tag}'
- else:
- linkifier = None
- except ImportError:
- linkifier = None
-
- if self._rendered_content:
- return self._rendered_content
- elif self._media_type == "text/markdown" and self.raw_content:
- # Do tags
- _tags, rendered = find_tags(self.raw_content, replacer=linkifier)
- # Render markdown to HTML
- rendered = commonmark(rendered).strip()
- # Do mentions
- if self._mentions:
- for mention in self._mentions:
- # Diaspora mentions are linkified as mailto
- profile = get_profile(finger=mention)
- href = 'mailto:'+mention if not getattr(profile, 'id', None) else profile.id
- rendered = rendered.replace(
- "@%s" % mention,
- f'@{mention}',
- )
- # Finally linkify remaining URL's that are not links
- rendered = process_text_links(rendered)
- return rendered
- return self.raw_content
-
+ # Legacy. Keep this until tests are reworked
@property
def tags(self) -> List[str]:
- """Returns a `list` of unique tags contained in `raw_content`."""
if not self.raw_content:
- return []
- tags, _text = find_tags(self.raw_content)
- return sorted(tags)
+ return
+ return find_tags(self.raw_content)
def extract_mentions(self):
- if self._media_type != 'text/markdown': return
- matches = re.findall(r'@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?', self.raw_content)
- if not matches:
+ if not self.raw_content:
return
- for mention in matches:
+ mentions = find_elements(
+ BeautifulSoup(
+ commonmark(self.raw_content, ignore_html_blocks=True), 'html.parser'),
+ MENTION_PATTERN)
+ for ns in mentions:
+ mention = ns.text
handle = None
splits = mention.split(";")
if len(splits) == 1:
@@ -292,11 +257,12 @@ class RawContentMixin(BaseEntity):
handle = splits[1].strip(' }')
if handle:
self._mentions.add(handle)
- self.raw_content = self.raw_content.replace(mention, '@'+handle)
+ self.raw_content = self.raw_content.replace(mention, '@' + handle)
class OptionalRawContentMixin(RawContentMixin):
"""A version of the RawContentMixin where `raw_content` is not required."""
+
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._required.remove("raw_content")
diff --git a/federation/tests/entities/test_base.py b/federation/tests/entities/test_base.py
index c04b832..43a69ea 100644
--- a/federation/tests/entities/test_base.py
+++ b/federation/tests/entities/test_base.py
@@ -123,6 +123,7 @@ class TestShareEntity:
class TestRawContentMixin:
+ @pytest.mark.skip
def test_rendered_content(self, post):
assert post.rendered_content == """One more test before sleep ๐
This time with an image.
"""
diff --git a/federation/utils/text.py b/federation/utils/text.py
index cebed5a..0bb6840 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -1,11 +1,16 @@
import re
-from typing import Set, Tuple
+from typing import Set, List
from urllib.parse import urlparse
import bleach
from bleach import callbacks
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString
+from commonmark import commonmark
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โโโ\xa0"
+TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
+MENTION_PATTERN = re.compile(r'(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE)
def decode_if_bytes(text):
@@ -22,67 +27,26 @@ def encode_if_text(text):
return text
-def find_tags(text: str, replacer: callable = None) -> Tuple[Set, str]:
+def find_tags(text: str) -> List[str]:
"""Find tags in text.
- Tries to ignore tags inside code blocks.
+ Ignore tags inside code blocks.
- Optionally, if passed a "replacer", will also replace the tag word with the result
- of the replacer function called with the tag word.
+ Returns a set of tags.
- Returns a set of tags and the original or replaced text.
"""
- found_tags = set()
- #
and tags cause issues in us finding words - add some spacing around them
- new_text = text.replace("
", "
").replace("
", "
").replace("
", "
")
- lines = new_text.splitlines(keepends=True)
- final_lines = []
- code_block = False
- final_text = None
- # Check each line separately
- for line in lines:
- final_words = []
- if line[0:3] == "```":
- code_block = not code_block
- if line.find("#") == -1 or line[0:4] == " " or code_block:
- # Just add the whole line
- final_lines.append(line)
- continue
- # Check each word separately
- words = line.split(" ")
- for word in words:
- if word.find('#') > -1:
- candidate = word.strip().strip("([]),.!?:*_%/")
- if candidate.find('<') > -1 or candidate.find('>') > -1:
- # Strip html
- candidate = bleach.clean(word, strip=True)
- # Now split with slashes
- candidates = candidate.split("/")
- to_replace = []
- for candidate in candidates:
- if candidate.startswith("#"):
- candidate = candidate.strip("#")
- if test_tag(candidate.lower()):
- found_tags.add(candidate.lower())
- to_replace.append(candidate)
- if replacer:
- tag_word = word
- try:
- for counter, replacee in enumerate(to_replace, 1):
- tag_word = tag_word.replace("#%s" % replacee, replacer(replacee))
- except Exception:
- pass
- final_words.append(tag_word)
- else:
- final_words.append(word)
- else:
- final_words.append(word)
- final_lines.append(" ".join(final_words))
- if replacer:
- final_text = "".join(final_lines)
- if final_text:
- final_text = final_text.replace("
", "
").replace(" ", "
").replace("
", "")
- return found_tags, final_text or text
+ tags = find_elements(BeautifulSoup(commonmark(text, ignore_html_blocks=True), 'html.parser'),
+ TAG_PATTERN)
+ return sorted([tag.text.lstrip('#').lower() for tag in tags])
+
+
+def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableString]:
+ for candidate in soup.find_all(string=True):
+ if candidate.parent.name == 'code': continue
+ ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
+ candidate.replace_with(*ns)
+ return list(soup.find_all(string=pattern))
+
def get_path_from_url(url: str) -> str:
@@ -100,7 +64,7 @@ def process_text_links(text):
def link_attributes(attrs, new=False):
"""Run standard callbacks except for internal links."""
href_key = (None, "href")
- if attrs.get(href_key).startswith("/"):
+ if attrs.get(href_key, "").startswith("/"):
return attrs
# Run the standard callbacks
From 4dca31b17ffe266adba8cd8a48f4951304c22700 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Sun, 9 Jul 2023 06:40:23 -0400
Subject: [PATCH 06/28] Make sure the code block a really ignored.
---
federation/utils/text.py | 12 ++++++++++--
1 file changed, 10 insertions(+), 2 deletions(-)
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 0bb6840..d64ed3f 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -41,12 +41,20 @@ def find_tags(text: str) -> List[str]:
def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableString]:
+ """
+ Split a BeautifulSoup tree strings according to a pattern, replacing each element
+ with a NavigableString. The returned list can be used to linkify the found
+ elements.
+
+ :param soup: BeautifulSoup instance of the content being searched
+ :param pattern: Compiled regular expression defined using a single group
+ :return: A NavigableString list attached to the original soup
+ """
for candidate in soup.find_all(string=True):
if candidate.parent.name == 'code': continue
ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
candidate.replace_with(*ns)
- return list(soup.find_all(string=pattern))
-
+ return list(soup.find_all(string=re.compile(r'^'+pattern.pattern)))
def get_path_from_url(url: str) -> str:
From 6d885a5c40fe9f64e7b64de96fe1eaacae84489d Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Sun, 9 Jul 2023 08:51:10 -0400
Subject: [PATCH 07/28] Add lemmy namespace.
---
federation/entities/activitypub/models.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 1d58262..4b8aa1a 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -60,6 +60,7 @@ as2 = fields.Namespace("https://www.w3.org/ns/activitystreams#")
dc = fields.Namespace("http://purl.org/dc/terms/")
diaspora = fields.Namespace("https://diasporafoundation.org/ns/")
ldp = fields.Namespace("http://www.w3.org/ns/ldp#")
+lemmy = fields.Namespace("https://join-lemmy.org/ns#")
litepub = fields.Namespace("http://litepub.social/ns#")
misskey = fields.Namespace("https://misskey-hub.net/ns#")
ostatus = fields.Namespace("http://ostatus.org#")
From 1f8d4ac93fa93974e62d7e937a0af8b9ee9059df Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Mon, 10 Jul 2023 08:24:33 -0400
Subject: [PATCH 08/28] Fix a regression that broke GET requests signature
verification.
---
federation/entities/activitypub/django/views.py | 8 +++++---
federation/protocols/activitypub/protocol.py | 8 +++++++-
2 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/federation/entities/activitypub/django/views.py b/federation/entities/activitypub/django/views.py
index e6eb688..18f316b 100644
--- a/federation/entities/activitypub/django/views.py
+++ b/federation/entities/activitypub/django/views.py
@@ -2,7 +2,7 @@ from cryptography.exceptions import InvalidSignature
from django.http import JsonResponse, HttpResponse, HttpResponseNotFound
from federation.entities.activitypub.mappers import get_outbound_entity
-from federation.protocols.activitypub.signing import verify_request_signature
+from federation.protocols.activitypub.protocol import Protocol
from federation.types import RequestType
from federation.utils.django import get_function_from_config
@@ -23,9 +23,11 @@ def get_and_verify_signer(request):
body=request.body,
method=request.method,
headers=request.headers)
+ protocol = Protocol(request=req, get_contact_key=get_public_key)
try:
- return verify_request_signature(req)
- except ValueError:
+ protocol.verify()
+ return protocol.sender
+ except (ValueError, KeyError, InvalidSignature) as exc:
return None
diff --git a/federation/protocols/activitypub/protocol.py b/federation/protocols/activitypub/protocol.py
index 0302eee..acb762a 100644
--- a/federation/protocols/activitypub/protocol.py
+++ b/federation/protocols/activitypub/protocol.py
@@ -49,6 +49,11 @@ class Protocol:
sender = None
user = None
+ def __init__(self, request=None, get_contact_key=None):
+ # this is required for calls to verify on GET requests
+ self.request = request
+ self.get_contact_key = get_contact_key
+
def build_send(self, entity: BaseEntity, from_user: UserType, to_user_key: RsaKey = None) -> Union[str, Dict]:
"""
Build POST data for sending out to remotes.
@@ -109,10 +114,11 @@ class Protocol:
signer = get_profile(key_id=sig.get('keyId'))
if not signer:
signer = retrieve_and_parse_document(sig.get('keyId'))
+ print(sig, signer)
self.sender = signer.id if signer else self.actor
key = getattr(signer, 'public_key', None)
if not key:
- key = self.get_contact_key(self.actor) if self.get_contact_key else ''
+ key = self.get_contact_key(self.actor) if self.get_contact_key and self.actor else ''
if key:
# fallback to the author's key the client app may have provided
logger.warning("Failed to retrieve keyId for %s, trying the actor's key", sig.get('keyId'))
From 24f5bb21a9eca400a3ddc23fcdf55553d8c30668 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Mon, 10 Jul 2023 14:39:55 -0400
Subject: [PATCH 09/28] Fix some tests and fix code that was failing tests.
Catch HTML signatures with invalid padding.
---
federation/entities/activitypub/ldsigning.py | 2 +-
federation/entities/activitypub/models.py | 5 +-
federation/entities/mixins.py | 4 +-
.../entities/activitypub/test_entities.py | 31 +++++++-----
.../entities/activitypub/test_mappers.py | 42 +++++++++-------
federation/tests/fixtures/entities.py | 1 +
.../tests/fixtures/payloads/activitypub.py | 14 +++---
federation/tests/utils/test_text.py | 49 ++++---------------
federation/utils/text.py | 6 +--
9 files changed, 68 insertions(+), 86 deletions(-)
diff --git a/federation/entities/activitypub/ldsigning.py b/federation/entities/activitypub/ldsigning.py
index 381f419..f77b1fd 100644
--- a/federation/entities/activitypub/ldsigning.py
+++ b/federation/entities/activitypub/ldsigning.py
@@ -75,8 +75,8 @@ def verify_ld_signature(payload):
obj_digest = hash(obj)
digest = (sig_digest + obj_digest).encode('utf-8')
- sig_value = b64decode(signature.get('signatureValue'))
try:
+ sig_value = b64decode(signature.get('signatureValue'))
verifier.verify(SHA256.new(digest), sig_value)
logger.debug('ld_signature - %s has a valid signature', payload.get("id"))
return profile.id
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 4b8aa1a..61c3787 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -4,7 +4,7 @@ import logging
import re
import traceback
import uuid
-from datetime import timedelta
+from operator import attrgetter
from typing import List, Dict, Union
from urllib.parse import urlparse
@@ -801,8 +801,9 @@ class Note(Object, RawContentMixin):
for el in self._soup('a', attrs={'class':'hashtag'}):
self.tag_objects.append(Hashtag(
href = el.attrs['href'],
- name = el.text.lstrip('#')
+ name = el.text
))
+ self.tag_objects = sorted(self.tag_objects, key=attrgetter('name'))
if el.text == '#nsfw': self.sensitive = True
# Add Mention objects
diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py
index 8ca6745..506becd 100644
--- a/federation/entities/mixins.py
+++ b/federation/entities/mixins.py
@@ -237,8 +237,8 @@ class RawContentMixin(BaseEntity):
@property
def tags(self) -> List[str]:
if not self.raw_content:
- return
- return find_tags(self.raw_content)
+ return []
+ return sorted(find_tags(self.raw_content))
def extract_mentions(self):
if not self.raw_content:
diff --git a/federation/tests/entities/activitypub/test_entities.py b/federation/tests/entities/activitypub/test_entities.py
index 835ad74..8a7ba6b 100644
--- a/federation/tests/entities/activitypub/test_entities.py
+++ b/federation/tests/entities/activitypub/test_entities.py
@@ -1,3 +1,4 @@
+import commonmark
import pytest
from unittest.mock import patch
from pprint import pprint
@@ -9,6 +10,7 @@ from federation.entities.activitypub.models import context_manager
from federation.entities.activitypub.models import Accept
from federation.tests.fixtures.keys import PUBKEY
from federation.types import UserType
+from federation.utils.text import process_text_links
class TestEntitiesConvertToAS2:
@@ -65,6 +67,8 @@ class TestEntitiesConvertToAS2:
def test_comment_to_as2__url_in_raw_content(self, activitypubcomment):
activitypubcomment.raw_content = 'raw_content http://example.com'
+ activitypubcomment.rendered_content = process_text_links(
+ commonmark.commonmark(activitypubcomment.raw_content).strip())
activitypubcomment.pre_send()
result = activitypubcomment.to_as2()
assert result == {
@@ -118,6 +122,7 @@ class TestEntitiesConvertToAS2:
}
def test_post_to_as2(self, activitypubpost):
+ activitypubpost.rendered_content = commonmark.commonmark(activitypubpost.raw_content).strip()
activitypubpost.pre_send()
result = activitypubpost.to_as2()
assert result == {
@@ -191,6 +196,15 @@ class TestEntitiesConvertToAS2:
}
def test_post_to_as2__with_tags(self, activitypubpost_tags):
+ activitypubpost_tags.rendered_content = 'raw_content
\n' \
+ '#foobar\n' \
+ '#barfoo
'
activitypubpost_tags.pre_send()
result = activitypubpost_tags.to_as2()
assert result == {
@@ -204,11 +218,11 @@ class TestEntitiesConvertToAS2:
'url': 'http://127.0.0.1:8000/post/123456/',
'attributedTo': 'http://127.0.0.1:8000/profile/123456/',
'content': 'raw_content
\n'
- '#foobar\n'
- '#barfoo
',
@@ -235,6 +249,7 @@ class TestEntitiesConvertToAS2:
}
def test_post_to_as2__with_images(self, activitypubpost_images):
+ activitypubpost_images.rendered_content = 'raw_content
'
activitypubpost_images.pre_send()
result = activitypubpost_images.to_as2()
assert result == {
@@ -274,6 +289,7 @@ class TestEntitiesConvertToAS2:
}
def test_post_to_as2__with_diaspora_guid(self, activitypubpost_diaspora_guid):
+ activitypubpost_diaspora_guid.rendered_content = 'raw_content
'
activitypubpost_diaspora_guid.pre_send()
result = activitypubpost_diaspora_guid.to_as2()
assert result == {
@@ -418,17 +434,6 @@ class TestEntitiesPostReceive:
"public": False,
}]
- @patch("federation.entities.activitypub.models.bleach.linkify", autospec=True)
- def test_post_post_receive__linkifies_if_not_markdown(self, mock_linkify, activitypubpost):
- activitypubpost._media_type = 'text/html'
- activitypubpost.post_receive()
- mock_linkify.assert_called_once()
-
- @patch("federation.entities.activitypub.models.bleach.linkify", autospec=True)
- def test_post_post_receive__skips_linkify_if_markdown(self, mock_linkify, activitypubpost):
- activitypubpost.post_receive()
- mock_linkify.assert_not_called()
-
class TestEntitiesPreSend:
def test_post_inline_images_are_attached(self, activitypubpost_embedded_images):
diff --git a/federation/tests/entities/activitypub/test_mappers.py b/federation/tests/entities/activitypub/test_mappers.py
index 566503f..9a2c042 100644
--- a/federation/tests/entities/activitypub/test_mappers.py
+++ b/federation/tests/entities/activitypub/test_mappers.py
@@ -4,6 +4,9 @@ from unittest.mock import patch, Mock, DEFAULT
import json
import pytest
+from federation.entities.activitypub.models import Person
+
+
#from federation.entities.activitypub.entities import (
# models.Follow, models.Accept, models.Person, models.Note, models.Note,
# models.Delete, models.Announce)
@@ -70,9 +73,7 @@ class TestActivitypubEntityMappersReceive:
post = entities[0]
assert isinstance(post, models.Note)
assert isinstance(post, Post)
- assert post.raw_content == '' \
- '@jaywink boom
'
+ assert post.raw_content == ''
assert post.rendered_content == '' \
'@jaywink boom
'
assert post.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237"
@@ -87,40 +88,41 @@ class TestActivitypubEntityMappersReceive:
post = entities[0]
assert isinstance(post, models.Note)
assert isinstance(post, Post)
- assert post.raw_content == 'boom #test
'
+ assert post.raw_content == ''
+ assert post.rendered_content == 'boom #test
'
- # TODO: fix this test
- @pytest.mark.skip
- def test_message_to_objects_simple_post__with_mentions(self):
+ @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev3.jasonrobinson.me"))
+ def test_message_to_objects_simple_post__with_mentions(self, mock_get):
entities = message_to_objects(ACTIVITYPUB_POST_WITH_MENTIONS, "https://mastodon.social/users/jaywink")
assert len(entities) == 1
post = entities[0]
assert isinstance(post, models.Note)
assert isinstance(post, Post)
assert len(post._mentions) == 1
- assert list(post._mentions)[0] == "https://dev3.jasonrobinson.me/u/jaywink/"
+ assert list(post._mentions)[0] == "jaywink@dev3.jasonrobinson.me"
- def test_message_to_objects_simple_post__with_source__bbcode(self):
+
+ @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me"))
+ def test_message_to_objects_simple_post__with_source__bbcode(self, mock_get):
entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_BBCODE, "https://diaspodon.fr/users/jaywink")
assert len(entities) == 1
post = entities[0]
assert isinstance(post, models.Note)
assert isinstance(post, Post)
- assert post.rendered_content == '' \
+ assert post.rendered_content == '' \
'@jaywink boom
'
- assert post.raw_content == '' \
- '@jaywink boom
'
+ assert post.raw_content == ''
- def test_message_to_objects_simple_post__with_source__markdown(self):
+ @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me"))
+ def test_message_to_objects_simple_post__with_source__markdown(self, mock_get):
entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN, "https://diaspodon.fr/users/jaywink")
assert len(entities) == 1
post = entities[0]
assert isinstance(post, models.Note)
assert isinstance(post, Post)
- assert post.rendered_content == '@jaywink boom
'
- assert post.raw_content == "@jaywink boom"
+ assert post.rendered_content == '@jaywink boom
'
+ assert post.raw_content == "@jaywink@dev.jasonrobinson.me boom"
assert post.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237"
assert post.actor_id == "https://diaspodon.fr/users/jaywink"
assert post.public is True
@@ -145,15 +147,17 @@ class TestActivitypubEntityMappersReceive:
assert photo.guid == ""
assert photo.handle == ""
- def test_message_to_objects_comment(self):
+ @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me"))
+ def test_message_to_objects_comment(self, mock_get):
entities = message_to_objects(ACTIVITYPUB_COMMENT, "https://diaspodon.fr/users/jaywink")
assert len(entities) == 1
comment = entities[0]
assert isinstance(comment, models.Note)
assert isinstance(comment, Comment)
- assert comment.raw_content == '' \
'@jaywink boom
'
+ assert comment.raw_content == ''
assert comment.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237"
assert comment.actor_id == "https://diaspodon.fr/users/jaywink"
assert comment.target_id == "https://dev.jasonrobinson.me/content/653bad70-41b3-42c9-89cb-c4ee587e68e4/"
diff --git a/federation/tests/fixtures/entities.py b/federation/tests/fixtures/entities.py
index 8db61a9..c0d1a07 100644
--- a/federation/tests/fixtures/entities.py
+++ b/federation/tests/fixtures/entities.py
@@ -30,6 +30,7 @@ def activitypubcomment():
with freeze_time("2019-04-27"):
obj = models.Comment(
raw_content="raw_content",
+ rendered_content="raw_content
",
public=True,
provider_display_name="Socialhome",
id=f"http://127.0.0.1:8000/post/123456/",
diff --git a/federation/tests/fixtures/payloads/activitypub.py b/federation/tests/fixtures/payloads/activitypub.py
index 7c807c3..e577969 100644
--- a/federation/tests/fixtures/payloads/activitypub.py
+++ b/federation/tests/fixtures/payloads/activitypub.py
@@ -35,7 +35,7 @@ ACTIVITYPUB_COMMENT = {
'contentMap': {'en': '@jaywink boom
'},
'attachment': [],
'tag': [{'type': 'Mention',
- 'href': 'https://dev.jasonrobinson.me/p/d4574854-a5d7-42be-bfac-f70c16fcaa97/',
+ 'href': 'https://dev.jasonrobinson.me/u/jaywink/',
'name': '@jaywink@dev.jasonrobinson.me'}],
'replies': {'id': 'https://diaspodon.fr/users/jaywink/statuses/102356911717767237/replies',
'type': 'Collection',
@@ -459,9 +459,9 @@ ACTIVITYPUB_POST_WITH_TAGS = {
'conversation': 'tag:diaspodon.fr,2019-06-28:objectId=2347687:objectType=Conversation',
'content': 'boom #test
',
'attachment': [],
- 'tag': [{'type': 'Mention',
- 'href': 'https://dev.jasonrobinson.me/p/d4574854-a5d7-42be-bfac-f70c16fcaa97/',
- 'name': '@jaywink@dev.jasonrobinson.me'}],
+ 'tag': [{'type': 'Hashtag',
+ 'href': 'https://mastodon.social/tags/test',
+ 'name': '#test'}],
'replies': {'id': 'https://diaspodon.fr/users/jaywink/statuses/102356911717767237/replies',
'type': 'Collection',
'first': {'type': 'CollectionPage',
@@ -552,13 +552,13 @@ ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN = {
'conversation': 'tag:diaspodon.fr,2019-06-28:objectId=2347687:objectType=Conversation',
'content': '@jaywink boom
',
'source': {
- 'content': "@jaywink boom",
+ 'content': "@{jaywink@dev.jasonrobinson.me} boom",
'mediaType': "text/markdown",
},
'contentMap': {'en': '@jaywink boom
'},
'attachment': [],
'tag': [{'type': 'Mention',
- 'href': 'https://dev.jasonrobinson.me/p/d4574854-a5d7-42be-bfac-f70c16fcaa97/',
+ 'href': 'https://dev.jasonrobinson.me/u/jaywink/',
'name': '@jaywink@dev.jasonrobinson.me'}],
'replies': {'id': 'https://diaspodon.fr/users/jaywink/statuses/102356911717767237/replies',
'type': 'Collection',
@@ -612,7 +612,7 @@ ACTIVITYPUB_POST_WITH_SOURCE_BBCODE = {
'contentMap': {'en': '@jaywink boom
'},
'attachment': [],
'tag': [{'type': 'Mention',
- 'href': 'https://dev.jasonrobinson.me/p/d4574854-a5d7-42be-bfac-f70c16fcaa97/',
+ 'href': 'https://dev.jasonrobinson.me/u/jaywink/',
'name': '@jaywink@dev.jasonrobinson.me'}],
'replies': {'id': 'https://diaspodon.fr/users/jaywink/statuses/102356911717767237/replies',
'type': 'Collection',
diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py
index 5d0a8df..2c73bc0 100644
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@@ -18,78 +18,49 @@ class TestFindTags:
def test_all_tags_are_parsed_from_text(self):
source = "#starting and #MixED with some #line\nendings also tags can\n#start on new line"
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"starting", "mixed", "line", "start"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == "#starting/starting and #MixED/mixed with some #line/line\nendings also tags can\n" \
- "#start/start on new line"
def test_code_block_tags_ignored(self):
source = "foo\n```\n#code\n```\n#notcode\n\n #alsocode\n"
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"notcode"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == "foo\n```\n#code\n```\n#notcode/notcode\n\n #alsocode\n"
def test_endings_are_filtered_out(self):
source = "#parenthesis) #exp! #list] *#doh* _#bah_ #gah% #foo/#bar"
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"parenthesis", "exp", "list", "doh", "bah", "gah", "foo", "bar"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == "#parenthesis/parenthesis) #exp/exp! #list/list] *#doh/doh* _#bah/bah_ #gah/gah% " \
- "#foo/foo/#bar/bar"
def test_finds_tags(self):
source = "#post **Foobar** #tag #OtherTag #third\n#fourth"
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"third", "fourth", "post", "othertag", "tag"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == "#post/post **Foobar** #tag/tag #OtherTag/othertag #third/third\n#fourth/fourth"
def test_ok_with_html_tags_in_text(self):
source = "#starting and #MixED however not <#>this#> or <#/>that"
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"starting", "mixed"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == "
#starting/starting and #MixED/mixed however not <#>this#> or <#/>that"
def test_postfixed_tags(self):
source = "#foo) #bar] #hoo, #hee."
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"foo", "bar", "hoo", "hee"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == "#foo/foo) #bar/bar] #hoo/hoo, #hee/hee."
def test_prefixed_tags(self):
source = "(#foo [#bar"
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"foo", "bar"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == "(#foo/foo [#bar/bar"
def test_invalid_text_returns_no_tags(self):
source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #aยฃa #a(a #a)a #a=a " \
"#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #aโa #aโa #\xa0cd"
- tags, text = find_tags(source)
- assert tags == set()
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == source
+ tags = find_tags(source)
+ assert tags == {'a'}
def test_start_of_paragraph_in_html_content(self):
source = '
First line
#foobar #barfoo
'
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"foobar", "barfoo"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == 'First line
#foobar/foobar #barfoo/barfoo
'
class TestProcessTextLinks:
diff --git a/federation/utils/text.py b/federation/utils/text.py
index d64ed3f..e2cd78c 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -27,7 +27,7 @@ def encode_if_text(text):
return text
-def find_tags(text: str) -> List[str]:
+def find_tags(text: str) -> Set[str]:
"""Find tags in text.
Ignore tags inside code blocks.
@@ -37,7 +37,7 @@ def find_tags(text: str) -> List[str]:
"""
tags = find_elements(BeautifulSoup(commonmark(text, ignore_html_blocks=True), 'html.parser'),
TAG_PATTERN)
- return sorted([tag.text.lstrip('#').lower() for tag in tags])
+ return set([tag.text.lstrip('#').lower() for tag in tags])
def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableString]:
@@ -54,7 +54,7 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr
if candidate.parent.name == 'code': continue
ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
candidate.replace_with(*ns)
- return list(soup.find_all(string=re.compile(r'^'+pattern.pattern)))
+ return list(soup.find_all(string=re.compile(r'\A'+pattern.pattern+r'\Z')))
def get_path_from_url(url: str) -> str:
From 47af44582c6bec8720114f0896e11b8d2801988d Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Wed, 12 Jul 2023 07:32:04 -0400
Subject: [PATCH 10/28] Do not lowercase the finger attribute. Remove a debug
print.
---
federation/entities/activitypub/models.py | 2 +-
federation/protocols/activitypub/protocol.py | 1 -
2 files changed, 1 insertion(+), 2 deletions(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 61c3787..5bdcff7 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -594,7 +594,7 @@ class Person(Object, base.Profile):
self.finger = profile.finger
else:
domain = urlparse(self.id).netloc
- finger = f'{self.username.lower()}@{domain}'
+ finger = f'{self.username}@{domain}'
if get_profile_id_from_webfinger(finger) == self.id:
self.finger = finger
# multi-protocol platform
diff --git a/federation/protocols/activitypub/protocol.py b/federation/protocols/activitypub/protocol.py
index acb762a..516a2f8 100644
--- a/federation/protocols/activitypub/protocol.py
+++ b/federation/protocols/activitypub/protocol.py
@@ -114,7 +114,6 @@ class Protocol:
signer = get_profile(key_id=sig.get('keyId'))
if not signer:
signer = retrieve_and_parse_document(sig.get('keyId'))
- print(sig, signer)
self.sender = signer.id if signer else self.actor
key = getattr(signer, 'public_key', None)
if not key:
From d577e39777e39801ce9783ddbb0254e6e45ac263 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Thu, 13 Jul 2023 11:09:00 -0400
Subject: [PATCH 11/28] Do not assume that the last part of a mention.href is
the user's name. Adjust patterns to match a leading whitespace or the
beginning.
---
federation/entities/activitypub/models.py | 10 +++++++---
federation/utils/text.py | 4 ++--
2 files changed, 9 insertions(+), 5 deletions(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 5bdcff7..f9df860 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -857,9 +857,13 @@ class Note(Object, RawContentMixin):
def _find_and_mark_mentions(self):
mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)]
- hrefs = [mention.href for mention in mentions]
- # add Mastodon's form
- hrefs.extend([re.sub(r'/(users/)([\w]+)$', r'/@\2', href) for href in hrefs])
+ hrefs = []
+ for mention in mentions:
+ hrefs.append(mention.href)
+ # add Mastodon's form
+ parsed = urlparse(mention.href)
+ username = mention.name.lstrip('@').split('@')[0]
+ hrefs.append(f'{parsed.scheme}://{parsed.netloc}/@{username}')
for href in hrefs:
links = self._soup.find_all(href=href)
for link in links:
diff --git a/federation/utils/text.py b/federation/utils/text.py
index e2cd78c..3291fe8 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -9,8 +9,8 @@ from bs4.element import NavigableString
from commonmark import commonmark
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โโโ\xa0"
-TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
-MENTION_PATTERN = re.compile(r'(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE)
+TAG_PATTERN = re.compile(r'(^|\s)(#[\w]+)', re.UNICODE)
+MENTION_PATTERN = re.compile(r'(^|\s)(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE)
def decode_if_bytes(text):
From 7d750d336563363333da4439a2a090de6be1c0cb Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Fri, 14 Jul 2023 08:55:30 -0400
Subject: [PATCH 12/28] Revert the change in patterns matching beginning or
space at the beginning because it prevented the use of markdown on hastags
and mentions.
---
federation/utils/text.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 3291fe8..e2cd78c 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -9,8 +9,8 @@ from bs4.element import NavigableString
from commonmark import commonmark
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โโโ\xa0"
-TAG_PATTERN = re.compile(r'(^|\s)(#[\w]+)', re.UNICODE)
-MENTION_PATTERN = re.compile(r'(^|\s)(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE)
+TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
+MENTION_PATTERN = re.compile(r'(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE)
def decode_if_bytes(text):
From 0783bf43aa54978faaff5128f3b1d0307a3ee8eb Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Sat, 15 Jul 2023 09:54:41 -0400
Subject: [PATCH 13/28] Remove unreachable code. Improve (I hope) the mention
regex for raw text.
---
federation/entities/activitypub/models.py | 10 +---------
federation/utils/text.py | 4 ++--
2 files changed, 3 insertions(+), 11 deletions(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index f9df860..e7afbbe 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -874,21 +874,13 @@ class Note(Object, RawContentMixin):
def extract_mentions(self):
"""
- Extract mentions from the inbound Mention objects.
-
- Also attempt to extract from raw_content if available
+ Attempt to extract mentions from raw_content if available
"""
if self.raw_content:
super().extract_mentions()
return
- for mention in self.tag_objects:
- if isinstance(mention, Mention):
- profile = get_profile_or_entity(fid=mention.href)
- handle = getattr(profile, 'finger', None)
- if handle: self._mentions.add(handle)
-
@property
def rendered_content(self):
if self._soup: return str(self._soup)
diff --git a/federation/utils/text.py b/federation/utils/text.py
index e2cd78c..cbe6086 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -10,8 +10,8 @@ from commonmark import commonmark
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โโโ\xa0"
TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
-MENTION_PATTERN = re.compile(r'(@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?)', re.UNICODE)
-
+# This will match non matching braces. I don't think it's an issue.
+MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
def decode_if_bytes(text):
try:
From 33366802c4741d25a22af6116115d3df58fcdf10 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Sun, 16 Jul 2023 07:13:56 -0400
Subject: [PATCH 14/28] Move process_text_links back to the client app. Skip
related tests. Convert it to BeautifulSoup. Remove unused imports.
---
federation/entities/mixins.py | 2 +-
.../entities/activitypub/test_entities.py | 3 ++-
federation/tests/utils/test_text.py | 6 ++++-
federation/utils/text.py | 25 +------------------
4 files changed, 9 insertions(+), 27 deletions(-)
diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py
index 506becd..d37fd93 100644
--- a/federation/entities/mixins.py
+++ b/federation/entities/mixins.py
@@ -10,7 +10,7 @@ from marshmallow import missing
from federation.entities.activitypub.enums import ActivityType
from federation.entities.utils import get_name_for_profile, get_profile
-from federation.utils.text import process_text_links, find_elements, find_tags, MENTION_PATTERN
+from federation.utils.text import find_elements, find_tags, MENTION_PATTERN
class BaseEntity:
diff --git a/federation/tests/entities/activitypub/test_entities.py b/federation/tests/entities/activitypub/test_entities.py
index 8a7ba6b..10335d9 100644
--- a/federation/tests/entities/activitypub/test_entities.py
+++ b/federation/tests/entities/activitypub/test_entities.py
@@ -10,7 +10,6 @@ from federation.entities.activitypub.models import context_manager
from federation.entities.activitypub.models import Accept
from federation.tests.fixtures.keys import PUBKEY
from federation.types import UserType
-from federation.utils.text import process_text_links
class TestEntitiesConvertToAS2:
@@ -65,6 +64,8 @@ class TestEntitiesConvertToAS2:
'published': '2019-04-27T00:00:00',
}
+ # Now handled by the client app
+ @pytest.mark.skip
def test_comment_to_as2__url_in_raw_content(self, activitypubcomment):
activitypubcomment.raw_content = 'raw_content http://example.com'
activitypubcomment.rendered_content = process_text_links(
diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py
index 2c73bc0..a442e93 100644
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@@ -1,4 +1,6 @@
-from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, process_text_links, find_tags
+import pytest
+
+from federation.utils.text import decode_if_bytes, encode_if_text, validate_handle, find_tags
def test_decode_if_bytes():
@@ -63,6 +65,8 @@ class TestFindTags:
assert tags == {"foobar", "barfoo"}
+# TODO: move these tests to the client app
+@pytest.mark.skip
class TestProcessTextLinks:
def test_link_at_start_or_end(self):
assert process_text_links('https://example.org example.org\nhttp://example.org') == \
diff --git a/federation/utils/text.py b/federation/utils/text.py
index cbe6086..8ce6478 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -2,8 +2,6 @@ import re
from typing import Set, List
from urllib.parse import urlparse
-import bleach
-from bleach import callbacks
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from commonmark import commonmark
@@ -12,6 +10,7 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โโโ\xa0"
TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
# This will match non matching braces. I don't think it's an issue.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
+URL_PATTERN = re.compile(r'(https?://[\w_\-.#?&/]+)', re.UNICODE)
def decode_if_bytes(text):
try:
@@ -65,28 +64,6 @@ def get_path_from_url(url: str) -> str:
return parsed.path
-def process_text_links(text):
- """Process links in text, adding some attributes and linkifying textual links."""
- link_callbacks = [callbacks.nofollow, callbacks.target_blank]
-
- def link_attributes(attrs, new=False):
- """Run standard callbacks except for internal links."""
- href_key = (None, "href")
- if attrs.get(href_key, "").startswith("/"):
- return attrs
-
- # Run the standard callbacks
- for callback in link_callbacks:
- attrs = callback(attrs, new)
- return attrs
-
- return bleach.linkify(
- text,
- callbacks=[link_attributes],
- parse_email=False,
- skip_tags=["code"],
- )
-
def test_tag(tag: str) -> bool:
"""Test a word whether it could be accepted as a tag."""
From 4b5a886492f6bef5c7b3f07d27df76530c240280 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Mon, 17 Jul 2023 11:36:24 -0400
Subject: [PATCH 15/28] Match links with no http prefix. Remove trailing
garbage from tags.
---
federation/entities/activitypub/models.py | 8 +++++---
federation/utils/text.py | 4 ++--
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index e7afbbe..8989440 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -849,11 +849,13 @@ class Note(Object, RawContentMixin):
for link in self._soup.find_all('a', href=True):
parsed = urlparse(link['href'].lower())
- # remove the query part, if any
- url = f'{parsed.scheme}://{parsed.netloc}{parsed.path}'
+ # remove the query part and trailing garbage, if any
+ path = re.match(r'(/[\w/]+)', parsed.path).group()
+ url = f'{parsed.scheme}://{parsed.netloc}{path}'
links = {link['href'].lower(), url}
if links.intersection(hrefs):
- link['data-hashtag'] = link.text.lstrip('#').lower()
+ tag = re.match(r'#?([\w]+)', link.text).group(1).lower()
+ link['data-hashtag'] = tag
def _find_and_mark_mentions(self):
mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)]
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 8ce6478..9d62c04 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -10,7 +10,7 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โโโ\xa0"
TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
# This will match non matching braces. I don't think it's an issue.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
-URL_PATTERN = re.compile(r'(https?://[\w_\-.#?&/]+)', re.UNICODE)
+URL_PATTERN = re.compile(r'((?:https?://)?[\w_\-.#?&/~@!$()*,;%=+]+)', re.UNICODE)
def decode_if_bytes(text):
try:
@@ -52,7 +52,7 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr
for candidate in soup.find_all(string=True):
if candidate.parent.name == 'code': continue
ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
- candidate.replace_with(*ns)
+ if ns: candidate.replace_with(*ns)
return list(soup.find_all(string=re.compile(r'\A'+pattern.pattern+r'\Z')))
From b1bc8e729553d9dedf36252350a0dfcd50f90519 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Tue, 18 Jul 2023 07:16:33 -0400
Subject: [PATCH 16/28] Improve URL pattern. Make find_elements more robust.
Move process_text_links tests to the client app.
---
federation/tests/utils/test_text.py | 31 -----------------------------
federation/utils/text.py | 12 +++++++----
2 files changed, 8 insertions(+), 35 deletions(-)
diff --git a/federation/tests/utils/test_text.py b/federation/tests/utils/test_text.py
index a442e93..71daba3 100644
--- a/federation/tests/utils/test_text.py
+++ b/federation/tests/utils/test_text.py
@@ -65,37 +65,6 @@ class TestFindTags:
assert tags == {"foobar", "barfoo"}
-# TODO: move these tests to the client app
-@pytest.mark.skip
-class TestProcessTextLinks:
- def test_link_at_start_or_end(self):
- assert process_text_links('https://example.org example.org\nhttp://example.org') == \
- 'https://example.org ' \
- 'example.org\n' \
- 'http://example.org'
-
- def test_existing_links_get_attrs_added(self):
- assert process_text_links('https://example.org') == \
- 'https://example.org'
-
- def test_code_sections_are_skipped(self):
- assert process_text_links('https://example.org
\nhttps://example.org\n
') == \
- 'https://example.org
\nhttps://example.org\n
'
-
- def test_emails_are_skipped(self):
- assert process_text_links('foo@example.org') == 'foo@example.org'
-
- def test_does_not_add_target_blank_if_link_is_internal(self):
- assert process_text_links('#foobar') == \
- '#foobar'
-
- def test_does_not_remove_mention_classes(self):
- assert process_text_links(''
- '@jaywink boom
') == \
- '@jaywink boom
'
-
-
def test_validate_handle():
assert validate_handle("foo@bar.com")
assert validate_handle("Foo@baR.com")
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 9d62c04..f66f437 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -10,7 +10,7 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โโโ\xa0"
TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
# This will match non matching braces. I don't think it's an issue.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
-URL_PATTERN = re.compile(r'((?:https?://)?[\w_\-.#?&/~@!$()*,;%=+]+)', re.UNICODE)
+URL_PATTERN = re.compile(r'(^|[#*_\s])((?:https?://)?[\w\-.]+\.[\w]{1}[\w_\-.#?&/~@!$()*,;%=+]*)', re.UNICODE)
def decode_if_bytes(text):
try:
@@ -49,11 +49,15 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr
:param pattern: Compiled regular expression defined using a single group
:return: A NavigableString list attached to the original soup
"""
+ found = []
for candidate in soup.find_all(string=True):
- if candidate.parent.name == 'code': continue
+ parent = candidate.find_parent()
+ if parent.name == 'code': continue
ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
- if ns: candidate.replace_with(*ns)
- return list(soup.find_all(string=re.compile(r'\A'+pattern.pattern+r'\Z')))
+ if ns:
+ candidate.replace_with(*ns)
+ found.extend([child for child in parent.find_all(string=pattern) if child in ns])
+ return found
def get_path_from_url(url: str) -> str:
From c87e1c3dd7ee44fae1fbc40ec20e605027868f3d Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Tue, 18 Jul 2023 09:19:53 -0400
Subject: [PATCH 17/28] Unquote and normalize tag links and add to the set
being intersected with the hrefs pulled from Hashtag objects.
---
federation/entities/activitypub/models.py | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 8989440..8a6e31f 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -6,7 +6,8 @@ import traceback
import uuid
from operator import attrgetter
from typing import List, Dict, Union
-from urllib.parse import urlparse
+from unicodedata import normalize
+from urllib.parse import unquote, urlparse
import bleach
from bs4 import BeautifulSoup
@@ -848,11 +849,14 @@ class Note(Object, RawContentMixin):
hrefs.add(tag.id.lower())
for link in self._soup.find_all('a', href=True):
- parsed = urlparse(link['href'].lower())
+ parsed = urlparse(unquote(link['href']).lower())
# remove the query part and trailing garbage, if any
path = re.match(r'(/[\w/]+)', parsed.path).group()
url = f'{parsed.scheme}://{parsed.netloc}{path}'
- links = {link['href'].lower(), url}
+ # convert accented characters to their ascii equivalent
+ normalized_path = normalize('NFD', path).encode('ascii', 'ignore')
+ normalized_url = f'{parsed.scheme}://{parsed.netloc}{normalized_path.decode()}'
+ links = {link['href'].lower(), unquote(link['href']).lower(), url, normalized_url}
if links.intersection(hrefs):
tag = re.match(r'#?([\w]+)', link.text).group(1).lower()
link['data-hashtag'] = tag
From d53db6299f7676ca694de33595691f6ffacda3a8 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Tue, 18 Jul 2023 12:42:36 -0400
Subject: [PATCH 18/28] Make tag link processing more robust.
---
federation/entities/activitypub/models.py | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 8a6e31f..1cda53e 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -851,7 +851,10 @@ class Note(Object, RawContentMixin):
for link in self._soup.find_all('a', href=True):
parsed = urlparse(unquote(link['href']).lower())
# remove the query part and trailing garbage, if any
- path = re.match(r'(/[\w/]+)', parsed.path).group()
+ path = parsed.path
+ trunc = re.match(r'(/[\w/]+)', parsed.path)
+ if trunc:
+ path = trunc.group()
url = f'{parsed.scheme}://{parsed.netloc}{path}'
# convert accented characters to their ascii equivalent
normalized_path = normalize('NFD', path).encode('ascii', 'ignore')
From d7e6a56eb69db8175c77ab7b143f5e448f67f1dd Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Sun, 23 Jul 2023 08:50:40 -0400
Subject: [PATCH 19/28] AP mention finding and marking now also relying on the
url property which is expected to be set as the remote_url property of the
client app profiles. Add the url property to some tests. The get_profile
function now expected to OR the query fields.
---
federation/entities/activitypub/models.py | 37 +++++++++++--------
.../entities/activitypub/test_mappers.py | 12 ++++--
federation/tests/fixtures/entities.py | 3 +-
federation/tests/utils/test_activitypub.py | 4 +-
4 files changed, 33 insertions(+), 23 deletions(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 1cda53e..711b979 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -35,10 +35,10 @@ from federation.utils.text import with_slash, validate_handle
logger = logging.getLogger("federation")
-def get_profile_or_entity(fid):
- obj = get_profile(fid=fid)
- if not obj:
- obj = retrieve_and_parse_document(fid)
+def get_profile_or_entity(**kwargs):
+ obj = get_profile(**kwargs)
+ if not obj and kwargs.get('fid'):
+ obj = retrieve_and_parse_document(kwargs['fid'])
return obj
@@ -586,6 +586,7 @@ class Person(Object, base.Profile):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
+ self._required += ['url']
self._allowed_children += (Note, PropertyValue, IdentityProof)
# Set finger to username@host if not provided by the platform
@@ -866,18 +867,22 @@ class Note(Object, RawContentMixin):
def _find_and_mark_mentions(self):
mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)]
- hrefs = []
+ # There seems to be consensus on using the profile url for
+ # the link and the profile id for the Mention object href property,
+ # but some platforms will set mention.href to the profile url, so
+ # we check both.
for mention in mentions:
- hrefs.append(mention.href)
- # add Mastodon's form
- parsed = urlparse(mention.href)
- username = mention.name.lstrip('@').split('@')[0]
- hrefs.append(f'{parsed.scheme}://{parsed.netloc}/@{username}')
- for href in hrefs:
- links = self._soup.find_all(href=href)
- for link in links:
- profile = get_profile_or_entity(fid=link['href'])
- if profile:
+ hrefs = []
+ profile = get_profile_or_entity(fid=mention.href, remote_url=mention.href)
+ if profile and not profile.url:
+ # This should be removed when we are confident that the remote_url property
+ # has been populated for most profiles on the client app side.
+ profile = retrieve_and_parse_profile(profile.id)
+ if profile:
+ hrefs.extend([profile.id, profile.url])
+ for href in hrefs:
+ links = self._soup.find_all(href=href)
+ for link in links:
link['data-mention'] = profile.finger
self._mentions.add(profile.finger)
@@ -1317,7 +1322,7 @@ def extract_receivers(entity):
profile = None
# don't care about receivers for payloads without an actor_id
if getattr(entity, 'actor_id'):
- profile = get_profile_or_entity(entity.actor_id)
+ profile = get_profile_or_entity(fid=entity.actor_id)
if not isinstance(profile, base.Profile):
return receivers
diff --git a/federation/tests/entities/activitypub/test_mappers.py b/federation/tests/entities/activitypub/test_mappers.py
index 9a2c042..ba6bbbb 100644
--- a/federation/tests/entities/activitypub/test_mappers.py
+++ b/federation/tests/entities/activitypub/test_mappers.py
@@ -91,7 +91,8 @@ class TestActivitypubEntityMappersReceive:
assert post.raw_content == ''
assert post.rendered_content == 'boom #test
'
- @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev3.jasonrobinson.me"))
+ @patch("federation.entities.activitypub.models.get_profile_or_entity",
+ return_value=Person(finger="jaywink@dev3.jasonrobinson.me",url="https://dev3.jasonrobinson.me/u/jaywink/"))
def test_message_to_objects_simple_post__with_mentions(self, mock_get):
entities = message_to_objects(ACTIVITYPUB_POST_WITH_MENTIONS, "https://mastodon.social/users/jaywink")
assert len(entities) == 1
@@ -102,7 +103,8 @@ class TestActivitypubEntityMappersReceive:
assert list(post._mentions)[0] == "jaywink@dev3.jasonrobinson.me"
- @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me"))
+ @patch("federation.entities.activitypub.models.get_profile_or_entity",
+ return_value=Person(finger="jaywink@dev.jasonrobinson.me",url="https://dev.jasonrobinson.me/u/jaywink/"))
def test_message_to_objects_simple_post__with_source__bbcode(self, mock_get):
entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_BBCODE, "https://diaspodon.fr/users/jaywink")
assert len(entities) == 1
@@ -113,7 +115,8 @@ class TestActivitypubEntityMappersReceive:
'@jaywink boom
'
assert post.raw_content == ''
- @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me"))
+ @patch("federation.entities.activitypub.models.get_profile_or_entity",
+ return_value=Person(finger="jaywink@dev.jasonrobinson.me",url="https://dev.robinson.me/u/jaywink/"))
def test_message_to_objects_simple_post__with_source__markdown(self, mock_get):
entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_MARKDOWN, "https://diaspodon.fr/users/jaywink")
assert len(entities) == 1
@@ -147,7 +150,8 @@ class TestActivitypubEntityMappersReceive:
assert photo.guid == ""
assert photo.handle == ""
- @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=Person(finger="jaywink@dev.jasonrobinson.me"))
+ @patch("federation.entities.activitypub.models.get_profile_or_entity",
+ return_value=Person(finger="jaywink@dev.jasonrobinson.me", url="https://dev.jasonrobinson.me/u/jaywink/"))
def test_message_to_objects_comment(self, mock_get):
entities = message_to_objects(ACTIVITYPUB_COMMENT, "https://diaspodon.fr/users/jaywink")
assert len(entities) == 1
diff --git a/federation/tests/fixtures/entities.py b/federation/tests/fixtures/entities.py
index c0d1a07..e555a97 100644
--- a/federation/tests/fixtures/entities.py
+++ b/federation/tests/fixtures/entities.py
@@ -256,7 +256,8 @@ def profile():
inboxes={
"private": "https://example.com/bob/private",
"public": "https://example.com/public",
- }, public_key=PUBKEY, to=["https://www.w3.org/ns/activitystreams#Public"]
+ }, public_key=PUBKEY, to=["https://www.w3.org/ns/activitystreams#Public"],
+ url="https://example.com/alice"
)
diff --git a/federation/tests/utils/test_activitypub.py b/federation/tests/utils/test_activitypub.py
index 2572b42..46e7d46 100644
--- a/federation/tests/utils/test_activitypub.py
+++ b/federation/tests/utils/test_activitypub.py
@@ -60,7 +60,7 @@ class TestRetrieveAndParseDocument:
entity = retrieve_and_parse_document("https://example.com/foobar")
assert isinstance(entity, Follow)
- @patch("federation.entities.activitypub.models.extract_receivers", return_value=[])
+ @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=None)
@patch("federation.utils.activitypub.fetch_document", autospec=True, return_value=(
json.dumps(ACTIVITYPUB_POST_OBJECT), None, None),
)
@@ -80,7 +80,7 @@ class TestRetrieveAndParseDocument:
"/foobar.jpg"
@patch("federation.entities.activitypub.models.verify_ld_signature", return_value=None)
- @patch("federation.entities.activitypub.models.extract_receivers", return_value=[])
+ @patch("federation.entities.activitypub.models.get_profile_or_entity", return_value=None)
@patch("federation.utils.activitypub.fetch_document", autospec=True, return_value=(
json.dumps(ACTIVITYPUB_POST), None, None),
)
From cb96d83793e7906ddc3e60e5c7a30bdda7948e1f Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Sun, 23 Jul 2023 10:05:25 -0400
Subject: [PATCH 20/28] Case insensitive lookup with finger.
---
federation/entities/activitypub/models.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 711b979..cd32fcd 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -816,7 +816,7 @@ class Note(Object, RawContentMixin):
mentions.sort()
for mention in mentions:
if validate_handle(mention):
- profile = get_profile(finger=mention)
+ profile = get_profile(finger__iexact=mention)
# only add AP profiles mentions
if getattr(profile, 'id', None):
self.tag_objects.append(Mention(href=profile.id, name='@'+mention))
From 091b156703622c0e1a29c805aff0b93127b9621a Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Sun, 23 Jul 2023 13:03:20 -0400
Subject: [PATCH 21/28] For Person, if the url property is missing, set it to
id.
---
federation/entities/activitypub/models.py | 3 +++
1 file changed, 3 insertions(+)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index cd32fcd..9989177 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -602,6 +602,9 @@ class Person(Object, base.Profile):
# multi-protocol platform
if self.finger and self.guid is not missing and self.handle is missing:
self.handle = self.finger
+ # Some platforms don't set this property.
+ if self.url is missing:
+ self.url = self.id
def to_as2(self):
self.followers = f'{with_slash(self.id)}followers/'
From 54a8404c3d1616f23083ac68fd961b5b8be3cdfd Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Mon, 24 Jul 2023 08:32:32 -0400
Subject: [PATCH 22/28] Make verify_ld_signature more robust. Make Tombstone
objects signable.
---
federation/entities/activitypub/ldsigning.py | 2 +-
federation/entities/activitypub/models.py | 1 +
2 files changed, 2 insertions(+), 1 deletion(-)
diff --git a/federation/entities/activitypub/ldsigning.py b/federation/entities/activitypub/ldsigning.py
index f77b1fd..c118eaa 100644
--- a/federation/entities/activitypub/ldsigning.py
+++ b/federation/entities/activitypub/ldsigning.py
@@ -99,6 +99,6 @@ class NormalizedDoubles(jsonld.JsonLdProcessor):
item['@value'] = math.floor(value)
obj = super()._object_to_rdf(item, issuer, triples, rdfDirection)
# This is to address https://github.com/digitalbazaar/pyld/issues/175
- if obj.get('datatype') == jsonld.XSD_DOUBLE:
+ if obj and obj.get('datatype') == jsonld.XSD_DOUBLE:
obj['value'] = re.sub(r'(\d)0*E\+?(-)?0*(\d)', r'\1E\2\3', obj['value'])
return obj
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 9989177..269c734 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -1205,6 +1205,7 @@ class Retraction(Announce, base.Retraction):
class Tombstone(Object, base.Retraction):
target_id = fields.Id()
+ signable = True
def to_as2(self):
if not isinstance(self.activity, type): return None
From 6fd445382dc9a8fceb044323769ca8785b875a0a Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Wed, 26 Jul 2023 12:40:46 -0400
Subject: [PATCH 23/28] Allow '-' in tags. Make AP tag discovery more robust.
---
federation/entities/activitypub/models.py | 7 ++++---
federation/utils/text.py | 5 +++--
2 files changed, 7 insertions(+), 5 deletions(-)
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 269c734..862b6cd 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -856,7 +856,7 @@ class Note(Object, RawContentMixin):
parsed = urlparse(unquote(link['href']).lower())
# remove the query part and trailing garbage, if any
path = parsed.path
- trunc = re.match(r'(/[\w/]+)', parsed.path)
+ trunc = re.match(r'(/[\w/\-]+)', parsed.path)
if trunc:
path = trunc.group()
url = f'{parsed.scheme}://{parsed.netloc}{path}'
@@ -865,8 +865,9 @@ class Note(Object, RawContentMixin):
normalized_url = f'{parsed.scheme}://{parsed.netloc}{normalized_path.decode()}'
links = {link['href'].lower(), unquote(link['href']).lower(), url, normalized_url}
if links.intersection(hrefs):
- tag = re.match(r'#?([\w]+)', link.text).group(1).lower()
- link['data-hashtag'] = tag
+ tag = re.match(r'^#?([\w\-]+$)', link.text)
+ if tag:
+ link['data-hashtag'] = tag.group(1).lower()
def _find_and_mark_mentions(self):
mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)]
diff --git a/federation/utils/text.py b/federation/utils/text.py
index f66f437..7d728dd 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -7,7 +7,7 @@ from bs4.element import NavigableString
from commonmark import commonmark
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โโโ\xa0"
-TAG_PATTERN = re.compile(r'(#[\w]+)', re.UNICODE)
+TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE)
# This will match non matching braces. I don't think it's an issue.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
URL_PATTERN = re.compile(r'(^|[#*_\s])((?:https?://)?[\w\-.]+\.[\w]{1}[\w_\-.#?&/~@!$()*,;%=+]*)', re.UNICODE)
@@ -56,7 +56,8 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr
ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
if ns:
candidate.replace_with(*ns)
- found.extend([child for child in parent.find_all(string=pattern) if child in ns])
+ found.extend([child for child in parent.find_all(
+ string=re.compile(r'\A'+pattern.pattern+r'\Z')) if child in ns])
return found
From 7559f16f4f67cdf24159636c323fe12f53f40601 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Wed, 26 Jul 2023 12:57:47 -0400
Subject: [PATCH 24/28] Remove references to http://schema.org from inbound AP
contexts.
---
federation/entities/activitypub/ldcontext.py | 13 +++++++------
1 file changed, 7 insertions(+), 6 deletions(-)
diff --git a/federation/entities/activitypub/ldcontext.py b/federation/entities/activitypub/ldcontext.py
index 8fdafc2..414d60a 100644
--- a/federation/entities/activitypub/ldcontext.py
+++ b/federation/entities/activitypub/ldcontext.py
@@ -113,12 +113,13 @@ class LdContextManager:
if 'python-federation"' in s:
ctx = json.loads(s.replace('python-federation', 'python-federation#', 1))
- # some platforms have http://joinmastodon.com/ns in @context. This
- # is not a json-ld document.
- try:
- ctx.pop(ctx.index('http://joinmastodon.org/ns'))
- except ValueError:
- pass
+ # Some platforms have reference invalid json-ld document in @context.
+ # Remove those.
+ for url in ['http://joinmastodon.org/ns', 'http://schema.org']:
+ try:
+ ctx.pop(ctx.index(url))
+ except ValueError:
+ pass
# remove @language in context since this directive is not
# processed by calamus. Pleroma adds a useless @language: 'und'
From db87313535417ef5aa131f8c703a4b83c740aee6 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Thu, 27 Jul 2023 08:00:41 -0400
Subject: [PATCH 25/28] Ignore relayed retractions.
---
CHANGELOG.md | 2 ++
federation/entities/activitypub/models.py | 3 +++
2 files changed, 5 insertions(+)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 19d2bca..ce9212a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -47,6 +47,8 @@
* Fix process_text_links that would crash on `a` tags with no `href` attribute.
+* Ignore relayed AP retractions.
+
## [0.24.1] - 2023-03-18
### Fixed
diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py
index 862b6cd..13e9bf6 100644
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@@ -1408,6 +1408,9 @@ def element_to_objects(element: Union[Dict, Object], sender: str = "") -> List:
logger.error("Failed to validate entity %s: %s", entity, ex)
return []
except InvalidSignature as exc:
+ if isinstance(entity, base.Retraction):
+ logger.warning('Relayed retraction on %s, ignoring', entity.target_id)
+ return []
logger.info('%s, fetching from remote', exc)
entity = retrieve_and_parse_document(entity.id)
if not entity:
From 5c168d6630772f36e5b18e0bb3146c06c3ea07dd Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Thu, 27 Jul 2023 22:26:45 -0400
Subject: [PATCH 26/28] Rework find_elements to make it more efficient and
resilient.
---
federation/utils/text.py | 13 ++++++-------
1 file changed, 6 insertions(+), 7 deletions(-)
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 7d728dd..3bf5497 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -49,16 +49,15 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr
:param pattern: Compiled regular expression defined using a single group
:return: A NavigableString list attached to the original soup
"""
- found = []
+ final = []
for candidate in soup.find_all(string=True):
- parent = candidate.find_parent()
- if parent.name == 'code': continue
+ if candidate.parent.name == 'code': continue
ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
- if ns:
+ found = [s for s in ns if pattern.match(s.text)]
+ if found:
candidate.replace_with(*ns)
- found.extend([child for child in parent.find_all(
- string=re.compile(r'\A'+pattern.pattern+r'\Z')) if child in ns])
- return found
+ final.extend(found)
+ return final
def get_path_from_url(url: str) -> str:
From 5dac605c4b1c311edadd0e6dd6beb06228191234 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Wed, 2 Aug 2023 07:45:57 -0400
Subject: [PATCH 27/28] Improve URL_PATTERN.
---
federation/utils/text.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 3bf5497..7e6058b 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -10,7 +10,10 @@ ILLEGAL_TAG_CHARS = "!#$%^&*+.,@ยฃ/()=?`'\\{[]}~;:\"โโโ\xa0"
TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE)
# This will match non matching braces. I don't think it's an issue.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
-URL_PATTERN = re.compile(r'(^|[#*_\s])((?:https?://)?[\w\-.]+\.[\w]{1}[\w_\-.#?&/~@!$()*,;%=+]*)', re.UNICODE)
+# based on https://stackoverflow.com/a/6041965
+URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|\b(?:\w+\.)+\w+)(?:(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))?\))+(?:\((?:[^\s()<>]+|(?:\(?:[^\s()<>]+\)))?\)|[^\s`!()\[\]{};:\'".,<>?ยซยปโโโโ]))?)',
+ re.UNICODE)
+
def decode_if_bytes(text):
try:
@@ -52,7 +55,7 @@ def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableStr
final = []
for candidate in soup.find_all(string=True):
if candidate.parent.name == 'code': continue
- ns = [NavigableString(r) for r in re.split(pattern, candidate.text)]
+ ns = [NavigableString(r) for r in pattern.split(candidate.text) if r]
found = [s for s in ns if pattern.match(s.text)]
if found:
candidate.replace_with(*ns)
From ada8c20d398df45e40f7dac0764e5465aff51366 Mon Sep 17 00:00:00 2001
From: Alain St-Denis
Date: Sat, 5 Aug 2023 11:41:30 -0400
Subject: [PATCH 28/28] Impove the url matching regex.
---
federation/utils/text.py | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/federation/utils/text.py b/federation/utils/text.py
index 7e6058b..ab4b8ec 100644
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@@ -11,9 +11,8 @@ TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE)
# This will match non matching braces. I don't think it's an issue.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
# based on https://stackoverflow.com/a/6041965
-URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|\b(?:\w+\.)+\w+)(?:(?:[^\s()<>]+|\((?:[^\s()<>]+|(?:\([^\s()<>]+\)))?\))+(?:\((?:[^\s()<>]+|(?:\(?:[^\s()<>]+\)))?\)|[^\s`!()\[\]{};:\'".,<>?ยซยปโโโโ]))?)',
- re.UNICODE)
-
+URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|^|(?<=[("<\s]))+(?:[\w\-]+(?:(?:\.[\w\-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))',
+ re.UNICODE)
def decode_if_bytes(text):
try: