diff --git a/CHANGELOG.md b/CHANGELOG.md index 939a554..ce9212a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ * For inbound payload, a cached dict of all the defined AP extensions is merged with each incoming LD context. * Better handle conflicting property defaults by having `get_base_attributes` return only attributes that - are not empty (or bool). This helps distinguishing between `marshmallow.missing` and empty values. + are not empty (or bool). This helps distinguish between `marshmallow.missing` and empty values. * JsonLD document caching now set in `activitypub/__init__.py`. @@ -45,6 +45,10 @@ * In fetch_document: if response.encoding is not set, default to utf-8. +* Fix process_text_links that would crash on `a` tags with no `href` attribute. + +* Ignore relayed AP retractions. + ## [0.24.1] - 2023-03-18 ### Fixed diff --git a/docs/protocols.rst b/docs/protocols.rst index 1e15467..0dd845b 100644 --- a/docs/protocols.rst +++ b/docs/protocols.rst @@ -4,9 +4,8 @@ Protocols Currently three protocols are being focused on. * Diaspora is considered to be stable with most of the protocol implemented. -* ActivityPub support should be considered as alpha - all the basic - things work but there are likely to be a lot of compatibility issues with other ActivityPub - implementations. +* ActivityPub support should be considered as beta - all the basic + things work and we are fixing incompatibilities as they are identified. * Matrix support cannot be considered usable as of yet. For example implementations in real life projects check :ref:`example-projects`. @@ -69,20 +68,21 @@ Content media type The following keys will be set on the entity based on the ``source`` property existing: * if the object has an ``object.source`` property: - * ``_media_type`` will be the source media type - * ``_rendered_content`` will be the object ``content`` + * ``_media_type`` will be the source media type (only text/markdown is supported). 
+ * ``rendered_content`` will be the object ``content`` * ``raw_content`` will be the source ``content`` * if the object has no ``object.source`` property: * ``_media_type`` will be ``text/html`` - * ``_rendered_content`` will be the object ``content`` - * ``raw_content`` will object ``content`` run through a HTML2Markdown renderer + * ``rendered_content`` will be the object ``content`` + * ``raw_content`` will be empty The ``contentMap`` property is processed but content language selection is not implemented yet. For outbound entities, ``raw_content`` is expected to be in ``text/markdown``, -specifically CommonMark. When sending payloads, ``raw_content`` will be rendered via -the ``commonmark`` library into ``object.content``. The original ``raw_content`` -will be added to the ``object.source`` property. +specifically CommonMark. The client applications are expected to provide the +rendered content for protocols that require it (e.g. ActivityPub). +When sending payloads, ``object.contentMap`` will be set to ``rendered_content`` +and ``raw_content`` will be added to the ``object.source`` property. Medias ...... @@ -98,6 +98,19 @@ support from client applications. For inbound entities we do this automatically by not including received image attachments in the entity ``_children`` attribute. Audio and video are passed through the client application. +Hashtags and mentions +..................... + +For outbound payloads, client applications must add/set the hashtag/mention value to +the ``class`` attribute of rendered content linkified hashtags/mentions. These will be +used to help build the corresponding ``Hashtag`` and ``Mention`` objects. + +For inbound payloads, if a markdown source is provided, hashtags/mentions will be extracted +through the same method used for Diaspora. 
If only HTML content is provided, the ``a`` tags +will be marked with a ``data-[hashtag|mention]`` attribute (based on the provided Hashtag/Mention +objects) to facilitate the ``href`` attribute modifications client applications might +wish to make. This should ensure links can be replaced regardless of how the HTML is structured. + .. _matrix: Matrix diff --git a/federation/entities/activitypub/django/views.py b/federation/entities/activitypub/django/views.py index e6eb688..18f316b 100644 --- a/federation/entities/activitypub/django/views.py +++ b/federation/entities/activitypub/django/views.py @@ -2,7 +2,7 @@ from cryptography.exceptions import InvalidSignature from django.http import JsonResponse, HttpResponse, HttpResponseNotFound from federation.entities.activitypub.mappers import get_outbound_entity -from federation.protocols.activitypub.signing import verify_request_signature +from federation.protocols.activitypub.protocol import Protocol from federation.types import RequestType from federation.utils.django import get_function_from_config @@ -23,9 +23,11 @@ def get_and_verify_signer(request): body=request.body, method=request.method, headers=request.headers) + protocol = Protocol(request=req, get_contact_key=get_public_key) try: - return verify_request_signature(req) - except ValueError: + protocol.verify() + return protocol.sender + except (ValueError, KeyError, InvalidSignature) as exc: return None diff --git a/federation/entities/activitypub/ldcontext.py b/federation/entities/activitypub/ldcontext.py index e46bfb9..414d60a 100644 --- a/federation/entities/activitypub/ldcontext.py +++ b/federation/entities/activitypub/ldcontext.py @@ -113,12 +113,13 @@ class LdContextManager: if 'python-federation"' in s: ctx = json.loads(s.replace('python-federation', 'python-federation#', 1)) - # some platforms have http://joinmastodon.com/ns in @context. This - # is not a json-ld document. 
- try: - ctx.pop(ctx.index('http://joinmastodon.org/ns')) - except ValueError: - pass + # Some platforms reference invalid json-ld documents in @context. + # Remove those. + for url in ['http://joinmastodon.org/ns', 'http://schema.org']: + try: + ctx.pop(ctx.index(url)) + except ValueError: + pass # remove @language in context since this directive is not # processed by calamus. Pleroma adds a useless @language: 'und' @@ -137,12 +138,17 @@ class LdContextManager: # Merge all defined AP extensions to the inbound context uris = [] defs = {} - # Merge original context dicts in one dict - for item in ctx: - if isinstance(item, str): - uris.append(item) - else: - defs.update(item) + # Merge original context dicts in one dict, taking into account nested @context + def parse_context(ctx): + for item in ctx: + if isinstance(item, str): + uris.append(item) + else: + if '@context' in item: + parse_context([item['@context']]) + item.pop('@context') + defs.update(item) + parse_context(ctx) for item in self._merged: if isinstance(item, str) and item not in uris: diff --git a/federation/entities/activitypub/ldsigning.py b/federation/entities/activitypub/ldsigning.py index 381f419..c118eaa 100644 --- a/federation/entities/activitypub/ldsigning.py +++ b/federation/entities/activitypub/ldsigning.py @@ -75,8 +75,8 @@ def verify_ld_signature(payload): obj_digest = hash(obj) digest = (sig_digest + obj_digest).encode('utf-8') - sig_value = b64decode(signature.get('signatureValue')) try: + sig_value = b64decode(signature.get('signatureValue')) verifier.verify(SHA256.new(digest), sig_value) logger.debug('ld_signature - %s has a valid signature', payload.get("id")) return profile.id @@ -99,6 +99,6 @@ class NormalizedDoubles(jsonld.JsonLdProcessor): item['@value'] = math.floor(value) obj = super()._object_to_rdf(item, issuer, triples, rdfDirection) # This is to address https://github.com/digitalbazaar/pyld/issues/175 - if obj.get('datatype') == jsonld.XSD_DOUBLE: + if obj and 
obj.get('datatype') == jsonld.XSD_DOUBLE: obj['value'] = re.sub(r'(\d)0*E\+?(-)?0*(\d)', r'\1E\2\3', obj['value']) return obj diff --git a/federation/entities/activitypub/models.py b/federation/entities/activitypub/models.py index 9e7fdb5..13e9bf6 100644 --- a/federation/entities/activitypub/models.py +++ b/federation/entities/activitypub/models.py @@ -1,12 +1,16 @@ import copy import json import logging +import re +import traceback import uuid -from datetime import timedelta +from operator import attrgetter from typing import List, Dict, Union -from urllib.parse import urlparse +from unicodedata import normalize +from urllib.parse import unquote, urlparse import bleach +from bs4 import BeautifulSoup from calamus import fields from calamus.schema import JsonLDAnnotation, JsonLDSchema, JsonLDSchemaOpts from calamus.utils import normalize_value @@ -31,10 +35,10 @@ from federation.utils.text import with_slash, validate_handle logger = logging.getLogger("federation") -def get_profile_or_entity(fid): - obj = get_profile(fid=fid) - if not obj: - obj = retrieve_and_parse_document(fid) +def get_profile_or_entity(**kwargs): + obj = get_profile(**kwargs) + if not obj and kwargs.get('fid'): + obj = retrieve_and_parse_document(kwargs['fid']) return obj @@ -57,6 +61,7 @@ as2 = fields.Namespace("https://www.w3.org/ns/activitystreams#") dc = fields.Namespace("http://purl.org/dc/terms/") diaspora = fields.Namespace("https://diasporafoundation.org/ns/") ldp = fields.Namespace("http://www.w3.org/ns/ldp#") +lemmy = fields.Namespace("https://join-lemmy.org/ns#") litepub = fields.Namespace("http://litepub.social/ns#") misskey = fields.Namespace("https://misskey-hub.net/ns#") ostatus = fields.Namespace("http://ostatus.org#") @@ -241,8 +246,8 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation): metadata={'ctx':[{ 'alsoKnownAs':{'@id':'as:alsoKnownAs','@type':'@id'}}]}) icon = MixedField(as2.icon, nested='ImageSchema') image = MixedField(as2.image, nested='ImageSchema') - tag_objects 
= MixedField(as2.tag, nested=['HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True) - attachment = fields.Nested(as2.attachment, nested=['ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'], + tag_objects = MixedField(as2.tag, nested=['NoteSchema', 'HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True) + attachment = fields.Nested(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'], many=True, default=[]) content_map = LanguageMap(as2.content) # language maps are not implemented in calamus context = fields.RawJsonLD(as2.context) @@ -250,7 +255,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation): generator = MixedField(as2.generator, nested=['ApplicationSchema','ServiceSchema']) created_at = fields.DateTime(as2.published, add_value_types=True) replies = MixedField(as2.replies, nested=['CollectionSchema','OrderedCollectionSchema']) - signature = MixedField(sec.signature, nested = 'SignatureSchema', + signature = MixedField(sec.signature, nested = 'RsaSignature2017Schema', metadata={'ctx': [CONTEXT_SECURITY, {'RsaSignature2017':'sec:RsaSignature2017'}]}) start_time = fields.DateTime(as2.startTime, add_value_types=True) @@ -333,6 +338,20 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation): data['@context'] = context_manager.merge_context(ctx) return data + # JSONLD specs states it is case sensitive. + # Ensure type names for which we have an implementation have the proper case + # for platforms that ignore the spec. 
+ @pre_load + def patch_types(self, data, **kwargs): + def walk_payload(payload): + for key,val in copy.copy(payload).items(): + if isinstance(val, dict): + walk_payload(val) + if key == 'type': + payload[key] = MODEL_NAMES.get(val.lower(), val) + return payload + return walk_payload(data) + # A node without an id isn't true json-ld, but many payloads have # id-less nodes. Since calamus forces random ids on such nodes, # this removes it. @@ -567,7 +586,8 @@ class Person(Object, base.Profile): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) - self._allowed_children += (PropertyValue, IdentityProof) + self._required += ['url'] + self._allowed_children += (Note, PropertyValue, IdentityProof) # Set finger to username@host if not provided by the platform def post_receive(self): @@ -576,12 +596,15 @@ class Person(Object, base.Profile): self.finger = profile.finger else: domain = urlparse(self.id).netloc - finger = f'{self.username.lower()}@{domain}' + finger = f'{self.username}@{domain}' if get_profile_id_from_webfinger(finger) == self.id: self.finger = finger # multi-protocol platform if self.finger and self.guid is not missing and self.handle is missing: self.handle = self.finger + # Some platforms don't set this property. + if self.url is missing: + self.url = self.id def to_as2(self): self.followers = f'{with_slash(self.id)}followers/' @@ -716,15 +739,19 @@ class Note(Object, RawContentMixin): _cached_raw_content = '' _cached_children = [] + _soup = None signable = True def __init__(self, *args, **kwargs): self.tag_objects = [] # mutable objects... 
super().__init__(*args, **kwargs) - self._allowed_children += (base.Audio, base.Video) + self.raw_content # must be "primed" with source property for inbound payloads + self.rendered_content # must be "primed" with content_map property for inbound payloads + self._allowed_children += (base.Audio, base.Video, Link) + self._required.remove('raw_content') + self._required += ['rendered_content'] def to_as2(self): - self.sensitive = 'nsfw' in self.tags self.url = self.id edited = False @@ -752,8 +779,8 @@ class Note(Object, RawContentMixin): def to_base(self): kwargs = get_base_attributes(self, keep=( - '_mentions', '_media_type', '_rendered_content', '_source_object', - '_cached_children', '_cached_raw_content')) + '_mentions', '_media_type', '_source_object', + '_cached_children', '_cached_raw_content', '_soup')) entity = Comment(**kwargs) if getattr(self, 'target_id') else Post(**kwargs) # Plume (and maybe other platforms) send the attrbutedTo field as an array if isinstance(entity.actor_id, list): entity.actor_id = entity.actor_id[0] @@ -764,6 +791,7 @@ class Note(Object, RawContentMixin): def pre_send(self) -> None: """ Attach any embedded images from raw_content. 
+ Add Hashtag and Mention objects (the client app must define the class tag/mention property) """ super().pre_send() self._children = [ @@ -774,135 +802,138 @@ class Note(Object, RawContentMixin): ) for image in self.embedded_images ] - # Add other AP objects - self.extract_mentions() - self.content_map = {'orig': self.rendered_content} - self.add_mention_objects() - self.add_tag_objects() + # Add Hashtag objects + for el in self._soup('a', attrs={'class':'hashtag'}): + self.tag_objects.append(Hashtag( + href = el.attrs['href'], + name = el.text + )) + self.tag_objects = sorted(self.tag_objects, key=attrgetter('name')) + if el.text == '#nsfw': self.sensitive = True + + # Add Mention objects + mentions = [] + for el in self._soup('a', attrs={'class':'mention'}): + mentions.append(el.text.lstrip('@')) + + mentions.sort() + for mention in mentions: + if validate_handle(mention): + profile = get_profile(finger__iexact=mention) + # only add AP profiles mentions + if getattr(profile, 'id', None): + self.tag_objects.append(Mention(href=profile.id, name='@'+mention)) + # some platforms only render diaspora style markdown if it is available + self.source['content'] = self.source['content'].replace(mention, '{' + mention + '}') + def post_receive(self) -> None: """ - Make linkified tags normal tags. + Mark linkified tags and mentions with a data-{mention, tag} attribute. 
""" super().post_receive() - if not self.raw_content or self._media_type == "text/markdown": + if self._media_type == "text/markdown": # Skip when markdown return - hrefs = [] - for tag in self.tag_objects: - if isinstance(tag, Hashtag): - if tag.href is not missing: - hrefs.append(tag.href.lower()) - elif tag.id is not missing: - hrefs.append(tag.id.lower()) - # noinspection PyUnusedLocal - def remove_tag_links(attrs, new=False): - # Hashtag object hrefs - href = (None, "href") - url = attrs.get(href, "").lower() - if url in hrefs: - return - # one more time without the query (for pixelfed) - parsed = urlparse(url) - url = f'{parsed.scheme}://{parsed.netloc}{parsed.path}' - if url in hrefs: - return - - # Mastodon - rel = (None, "rel") - if attrs.get(rel) == "tag": - return - - # Friendica - if attrs.get(href, "").endswith(f'tag={attrs.get("_text")}'): - return - - return attrs - - self.raw_content = bleach.linkify( - self.raw_content, - callbacks=[remove_tag_links], - parse_email=False, - skip_tags=["code", "pre"], - ) + self._find_and_mark_hashtags() + self._find_and_mark_mentions() if getattr(self, 'target_id'): self.entity_type = 'Comment' - def add_tag_objects(self) -> None: - """ - Populate tags to the object.tag list. - """ - try: - from federation.utils.django import get_configuration - config = get_configuration() - except ImportError: - tags_path = None - else: - if config["tags_path"]: - tags_path = f"{config['base_url']}{config['tags_path']}" - else: - tags_path = None - for tag in self.tags: - _tag = Hashtag(name=f'#{tag}') - if tags_path: - _tag.href = tags_path.replace(":tag:", tag) - self.tag_objects.append(_tag) + def _find_and_mark_hashtags(self): + hrefs = set() + for tag in self.tag_objects: + if isinstance(tag, Hashtag): + if tag.href is not missing: + hrefs.add(tag.href.lower()) + # Some platforms use id instead of href... 
+ elif tag.id is not missing: + hrefs.add(tag.id.lower()) - def add_mention_objects(self) -> None: - """ - Populate mentions to the object.tag list. - """ - if len(self._mentions): - mentions = list(self._mentions) - mentions.sort() - for mention in mentions: - if validate_handle(mention): - profile = get_profile(finger=mention) - # only add AP profiles mentions - if getattr(profile, 'id', None): - self.tag_objects.append(Mention(href=profile.id, name='@'+mention)) - # some platforms only render diaspora style markdown if it is available - self.source['content'] = self.source['content'].replace(mention, '{'+mention+'}') + for link in self._soup.find_all('a', href=True): + parsed = urlparse(unquote(link['href']).lower()) + # remove the query part and trailing garbage, if any + path = parsed.path + trunc = re.match(r'(/[\w/\-]+)', parsed.path) + if trunc: + path = trunc.group() + url = f'{parsed.scheme}://{parsed.netloc}{path}' + # convert accented characters to their ascii equivalent + normalized_path = normalize('NFD', path).encode('ascii', 'ignore') + normalized_url = f'{parsed.scheme}://{parsed.netloc}{normalized_path.decode()}' + links = {link['href'].lower(), unquote(link['href']).lower(), url, normalized_url} + if links.intersection(hrefs): + tag = re.match(r'^#?([\w\-]+$)', link.text) + if tag: + link['data-hashtag'] = tag.group(1).lower() + + def _find_and_mark_mentions(self): + mentions = [mention for mention in self.tag_objects if isinstance(mention, Mention)] + # There seems to be consensus on using the profile url for + # the link and the profile id for the Mention object href property, + # but some platforms will set mention.href to the profile url, so + # we check both. 
+ for mention in mentions: + hrefs = [] + profile = get_profile_or_entity(fid=mention.href, remote_url=mention.href) + if profile and not profile.url: + # This should be removed when we are confident that the remote_url property + # has been populated for most profiles on the client app side. + profile = retrieve_and_parse_profile(profile.id) + if profile: + hrefs.extend([profile.id, profile.url]) + for href in hrefs: + links = self._soup.find_all(href=href) + for link in links: + link['data-mention'] = profile.finger + self._mentions.add(profile.finger) def extract_mentions(self): """ - Extract mentions from the source object. + Attempt to extract mentions from raw_content if available """ - super().extract_mentions() - if getattr(self, 'tag_objects', None): - #tag_objects = self.tag_objects if isinstance(self.tag_objects, list) else [self.tag_objects] - for tag in self.tag_objects: - if isinstance(tag, Mention): - profile = get_profile_or_entity(fid=tag.href) - handle = getattr(profile, 'finger', None) - if handle: self._mentions.add(handle) + if self.raw_content: + super().extract_mentions() + return @property - def raw_content(self): - - if self._cached_raw_content: return self._cached_raw_content + def rendered_content(self): + if self._soup: return str(self._soup) + content = '' if self.content_map: orig = self.content_map.pop('orig') if len(self.content_map.keys()) > 1: logger.warning('Language selection not implemented, falling back to default') - self._rendered_content = orig.strip() + content = orig.strip() else: - self._rendered_content = orig.strip() if len(self.content_map.keys()) == 0 else next(iter(self.content_map.values())).strip() + content = orig.strip() if len(self.content_map.keys()) == 0 else next(iter(self.content_map.values())).strip() self.content_map['orig'] = orig + # to allow for posts/replies with medias only. + if not content: content = "
" + self._soup = BeautifulSoup(content, 'html.parser') + return str(self._soup) + + @rendered_content.setter + def rendered_content(self, value): + if not value: return + self._soup = BeautifulSoup(value, 'html.parser') + self.content_map = {'orig': value} + + @property + def raw_content(self): + if self._cached_raw_content: return self._cached_raw_content + + if isinstance(self.source, dict) and self.source.get('mediaType') == 'text/markdown': + self._media_type = self.source['mediaType'] + self._cached_raw_content = self.source.get('content').strip() + else: + self._media_type = 'text/html' + self._cached_raw_content = "" + return self._cached_raw_content - if isinstance(self.source, dict) and self.source.get('mediaType') == 'text/markdown': - self._media_type = self.source['mediaType'] - self._cached_raw_content = self.source.get('content').strip() - else: - self._media_type = 'text/html' - self._cached_raw_content = self._rendered_content - # to allow for posts/replies with medias only. 
- if not self._cached_raw_content: self._cached_raw_content = "" - return self._cached_raw_content - @raw_content.setter def raw_content(self, value): if not value: return @@ -917,12 +948,13 @@ class Note(Object, RawContentMixin): if isinstance(getattr(self, 'attachment', None), list): children = [] for child in self.attachment: - if isinstance(child, Document): - obj = child.to_base() - if isinstance(obj, Image): - if obj.inline or (obj.image and obj.image in self.raw_content): + if isinstance(child, (Document, Link)): + if hasattr(child, 'to_base'): + child = child.to_base() + if isinstance(child, Image): + if child.inline or (child.image and child.image in self.raw_content): continue - children.append(obj) + children.append(child) self._cached_children = children return self._cached_children @@ -1010,7 +1042,7 @@ class Video(Document, base.Video): self.actor_id = new_act[0] entity = Post(**get_base_attributes(self, - keep=('_mentions', '_media_type', '_rendered_content', + keep=('_mentions', '_media_type', '_soup', '_cached_children', '_cached_raw_content', '_source_object'))) set_public(entity) return entity @@ -1019,7 +1051,7 @@ class Video(Document, base.Video): return self -class Signature(Object): +class RsaSignature2017(Object): created = fields.DateTime(dc.created, add_value_types=True) creator = IRI(dc.creator) key = fields.String(sec.signatureValue) @@ -1174,6 +1206,7 @@ class Retraction(Announce, base.Retraction): class Tombstone(Object, base.Retraction): target_id = fields.Id() + signable = True def to_as2(self): if not isinstance(self.activity, type): return None @@ -1294,7 +1327,7 @@ def extract_receivers(entity): profile = None # don't care about receivers for payloads without an actor_id if getattr(entity, 'actor_id'): - profile = get_profile_or_entity(entity.actor_id) + profile = get_profile_or_entity(fid=entity.actor_id) if not isinstance(profile, base.Profile): return receivers @@ -1314,14 +1347,16 @@ def extract_and_validate(entity): 
entity._source_protocol = "activitypub" # Extract receivers entity._receivers = extract_receivers(entity) + + # Extract mentions + if hasattr(entity, "extract_mentions"): + entity.extract_mentions() + if hasattr(entity, "post_receive"): entity.post_receive() if hasattr(entity, 'validate'): entity.validate() - # Extract mentions - if hasattr(entity, "extract_mentions"): - entity.extract_mentions() def extract_replies(replies): @@ -1373,6 +1408,9 @@ def element_to_objects(element: Union[Dict, Object], sender: str = "") -> List: logger.error("Failed to validate entity %s: %s", entity, ex) return [] except InvalidSignature as exc: + if isinstance(entity, base.Retraction): + logger.warning('Relayed retraction on %s, ignoring', entity.target_id) + return [] logger.info('%s, fetching from remote', exc) entity = retrieve_and_parse_document(entity.id) if not entity: @@ -1396,6 +1434,7 @@ def model_to_objects(payload): entity = model.schema().load(payload) except (KeyError, jsonld.JsonLdError, exceptions.ValidationError) as exc : # Just give up for now. 
This must be made robust logger.error("Error parsing jsonld payload (%s)", exc) + traceback.print_exception(exc) return None if isinstance(getattr(entity, 'object_', None), Object): @@ -1416,4 +1455,10 @@ CLASSES_WITH_CONTEXT_EXTENSIONS = ( Person, PropertyValue ) -context_manager = LdContextManager(CLASSES_WITH_CONTEXT_EXTENSIONS) \ No newline at end of file +context_manager = LdContextManager(CLASSES_WITH_CONTEXT_EXTENSIONS) + + +MODEL_NAMES = {} +for key,val in copy.copy(globals()).items(): + if type(val) == JsonLDAnnotation and issubclass(val, (Object, Link)): + MODEL_NAMES[key.lower()] = key diff --git a/federation/entities/mixins.py b/federation/entities/mixins.py index 30ef9d8..d37fd93 100644 --- a/federation/entities/mixins.py +++ b/federation/entities/mixins.py @@ -4,12 +4,13 @@ import re import warnings from typing import List, Set, Union, Dict, Tuple +from bs4 import BeautifulSoup from commonmark import commonmark from marshmallow import missing from federation.entities.activitypub.enums import ActivityType from federation.entities.utils import get_name_for_profile, get_profile -from federation.utils.text import process_text_links, find_tags +from federation.utils.text import find_elements, find_tags, MENTION_PATTERN class BaseEntity: @@ -22,6 +23,7 @@ class BaseEntity: _source_object: Union[str, Dict] = None _sender: str = "" _sender_key: str = "" + _tags: Set = None # ActivityType activity: ActivityType = None activity_id: str = "" @@ -205,7 +207,7 @@ class CreatedAtMixin(BaseEntity): class RawContentMixin(BaseEntity): _media_type: str = "text/markdown" _mentions: Set = None - _rendered_content: str = "" + rendered_content: str = "" raw_content: str = "" def __init__(self, *args, **kwargs): @@ -231,59 +233,22 @@ class RawContentMixin(BaseEntity): images.append((groups[1], groups[0] or "")) return images - @property - def rendered_content(self) -> str: - """Returns the rendered version of raw_content, or just raw_content.""" - try: - from 
federation.utils.django import get_configuration - config = get_configuration() - if config["tags_path"]: - def linkifier(tag: str) -> str: - return f'' \ - f'#{tag}' - else: - linkifier = None - except ImportError: - linkifier = None - - if self._rendered_content: - return self._rendered_content - elif self._media_type == "text/markdown" and self.raw_content: - # Do tags - _tags, rendered = find_tags(self.raw_content, replacer=linkifier) - # Render markdown to HTML - rendered = commonmark(rendered).strip() - # Do mentions - if self._mentions: - for mention in self._mentions: - # Diaspora mentions are linkified as mailto - profile = get_profile(finger=mention) - href = 'mailto:'+mention if not getattr(profile, 'id', None) else profile.id - rendered = rendered.replace( - "@%s" % mention, - f'@{mention}', - ) - # Finally linkify remaining URL's that are not links - rendered = process_text_links(rendered) - return rendered - return self.raw_content - + # Legacy. Keep this until tests are reworked @property def tags(self) -> List[str]: - """Returns a `list` of unique tags contained in `raw_content`.""" if not self.raw_content: return [] - tags, _text = find_tags(self.raw_content) - return sorted(tags) + return sorted(find_tags(self.raw_content)) def extract_mentions(self): - if self._media_type != 'text/markdown': return - matches = re.findall(r'@{?[\S ]?[^{}@]+[@;]?\s*[\w\-./@]+[\w/]+}?', self.raw_content) - if not matches: + if not self.raw_content: return - for mention in matches: + mentions = find_elements( + BeautifulSoup( + commonmark(self.raw_content, ignore_html_blocks=True), 'html.parser'), + MENTION_PATTERN) + for ns in mentions: + mention = ns.text handle = None splits = mention.split(";") if len(splits) == 1: @@ -292,11 +257,12 @@ class RawContentMixin(BaseEntity): handle = splits[1].strip(' }') if handle: self._mentions.add(handle) - self.raw_content = self.raw_content.replace(mention, '@'+handle) + self.raw_content = self.raw_content.replace(mention, '@' 
+ handle) class OptionalRawContentMixin(RawContentMixin): """A version of the RawContentMixin where `raw_content` is not required.""" + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self._required.remove("raw_content") diff --git a/federation/protocols/activitypub/protocol.py b/federation/protocols/activitypub/protocol.py index 0302eee..516a2f8 100644 --- a/federation/protocols/activitypub/protocol.py +++ b/federation/protocols/activitypub/protocol.py @@ -49,6 +49,11 @@ class Protocol: sender = None user = None + def __init__(self, request=None, get_contact_key=None): + # this is required for calls to verify on GET requests + self.request = request + self.get_contact_key = get_contact_key + def build_send(self, entity: BaseEntity, from_user: UserType, to_user_key: RsaKey = None) -> Union[str, Dict]: """ Build POST data for sending out to remotes. @@ -112,7 +117,7 @@ class Protocol: self.sender = signer.id if signer else self.actor key = getattr(signer, 'public_key', None) if not key: - key = self.get_contact_key(self.actor) if self.get_contact_key else '' + key = self.get_contact_key(self.actor) if self.get_contact_key and self.actor else '' if key: # fallback to the author's key the client app may have provided logger.warning("Failed to retrieve keyId for %s, trying the actor's key", sig.get('keyId')) diff --git a/federation/tests/entities/activitypub/test_entities.py b/federation/tests/entities/activitypub/test_entities.py index 835ad74..10335d9 100644 --- a/federation/tests/entities/activitypub/test_entities.py +++ b/federation/tests/entities/activitypub/test_entities.py @@ -1,3 +1,4 @@ +import commonmark import pytest from unittest.mock import patch from pprint import pprint @@ -63,8 +64,12 @@ class TestEntitiesConvertToAS2: 'published': '2019-04-27T00:00:00', } + # Now handled by the client app + @pytest.mark.skip def test_comment_to_as2__url_in_raw_content(self, activitypubcomment): activitypubcomment.raw_content = 'raw_content 
http://example.com' + activitypubcomment.rendered_content = process_text_links( + commonmark.commonmark(activitypubcomment.raw_content).strip()) activitypubcomment.pre_send() result = activitypubcomment.to_as2() assert result == { @@ -118,6 +123,7 @@ class TestEntitiesConvertToAS2: } def test_post_to_as2(self, activitypubpost): + activitypubpost.rendered_content = commonmark.commonmark(activitypubpost.raw_content).strip() activitypubpost.pre_send() result = activitypubpost.to_as2() assert result == { @@ -191,6 +197,15 @@ class TestEntitiesConvertToAS2: } def test_post_to_as2__with_tags(self, activitypubpost_tags): + activitypubpost_tags.rendered_content = 'raw_content
' activitypubpost_images.pre_send() result = activitypubpost_images.to_as2() assert result == { @@ -274,6 +290,7 @@ class TestEntitiesConvertToAS2: } def test_post_to_as2__with_diaspora_guid(self, activitypubpost_diaspora_guid): + activitypubpost_diaspora_guid.rendered_content = 'raw_content
' activitypubpost_diaspora_guid.pre_send() result = activitypubpost_diaspora_guid.to_as2() assert result == { @@ -418,17 +435,6 @@ class TestEntitiesPostReceive: "public": False, }] - @patch("federation.entities.activitypub.models.bleach.linkify", autospec=True) - def test_post_post_receive__linkifies_if_not_markdown(self, mock_linkify, activitypubpost): - activitypubpost._media_type = 'text/html' - activitypubpost.post_receive() - mock_linkify.assert_called_once() - - @patch("federation.entities.activitypub.models.bleach.linkify", autospec=True) - def test_post_post_receive__skips_linkify_if_markdown(self, mock_linkify, activitypubpost): - activitypubpost.post_receive() - mock_linkify.assert_not_called() - class TestEntitiesPreSend: def test_post_inline_images_are_attached(self, activitypubpost_embedded_images): diff --git a/federation/tests/entities/activitypub/test_mappers.py b/federation/tests/entities/activitypub/test_mappers.py index 566503f..ba6bbbb 100644 --- a/federation/tests/entities/activitypub/test_mappers.py +++ b/federation/tests/entities/activitypub/test_mappers.py @@ -4,6 +4,9 @@ from unittest.mock import patch, Mock, DEFAULT import json import pytest +from federation.entities.activitypub.models import Person + + #from federation.entities.activitypub.entities import ( # models.Follow, models.Accept, models.Person, models.Note, models.Note, # models.Delete, models.Announce) @@ -70,9 +73,7 @@ class TestActivitypubEntityMappersReceive: post = entities[0] assert isinstance(post, models.Note) assert isinstance(post, Post) - assert post.raw_content == '' \ - '@jaywink boom
' + assert post.raw_content == '' assert post.rendered_content == '' \ '@jaywink boom
' assert post.id == "https://diaspodon.fr/users/jaywink/statuses/102356911717767237" @@ -87,40 +88,44 @@ class TestActivitypubEntityMappersReceive: post = entities[0] assert isinstance(post, models.Note) assert isinstance(post, Post) - assert post.raw_content == 'boom #test
' + assert post.raw_content == '' + assert post.rendered_content == 'boom #test
' - # TODO: fix this test - @pytest.mark.skip - def test_message_to_objects_simple_post__with_mentions(self): + @patch("federation.entities.activitypub.models.get_profile_or_entity", + return_value=Person(finger="jaywink@dev3.jasonrobinson.me",url="https://dev3.jasonrobinson.me/u/jaywink/")) + def test_message_to_objects_simple_post__with_mentions(self, mock_get): entities = message_to_objects(ACTIVITYPUB_POST_WITH_MENTIONS, "https://mastodon.social/users/jaywink") assert len(entities) == 1 post = entities[0] assert isinstance(post, models.Note) assert isinstance(post, Post) assert len(post._mentions) == 1 - assert list(post._mentions)[0] == "https://dev3.jasonrobinson.me/u/jaywink/" + assert list(post._mentions)[0] == "jaywink@dev3.jasonrobinson.me" - def test_message_to_objects_simple_post__with_source__bbcode(self): + + @patch("federation.entities.activitypub.models.get_profile_or_entity", + return_value=Person(finger="jaywink@dev.jasonrobinson.me",url="https://dev.jasonrobinson.me/u/jaywink/")) + def test_message_to_objects_simple_post__with_source__bbcode(self, mock_get): entities = message_to_objects(ACTIVITYPUB_POST_WITH_SOURCE_BBCODE, "https://diaspodon.fr/users/jaywink") assert len(entities) == 1 post = entities[0] assert isinstance(post, models.Note) assert isinstance(post, Post) - assert post.rendered_content == '' \
+ assert post.rendered_content == ' ' \
'@jaywink boom ' \
- '@jaywink boom @jaywink boom @jaywink boom One more test before sleep 😅 This time with an image. raw_content @jaywink boom boom #test @jaywink boom @jaywink boom @jaywink boom #starting and #MixED however not <#>this#> or <#/>that"
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"starting", "mixed"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == " #starting/starting and #MixED/mixed however not <#>this#> or <#/>that"
def test_postfixed_tags(self):
source = "#foo) #bar] #hoo, #hee."
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"foo", "bar", "hoo", "hee"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == "#foo/foo) #bar/bar] #hoo/hoo, #hee/hee."
def test_prefixed_tags(self):
source = "(#foo [#bar"
- tags, text = find_tags(source)
+ tags = find_tags(source)
assert tags == {"foo", "bar"}
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == "(#foo/foo [#bar/bar"
def test_invalid_text_returns_no_tags(self):
source = "#a!a #a#a #a$a #a%a #a^a #a&a #a*a #a+a #a.a #a,a #a@a #a£a #a(a #a)a #a=a " \
"#a?a #a`a #a'a #a\\a #a{a #a[a #a]a #a}a #a~a #a;a #a:a #a\"a #a’a #a”a #\xa0cd"
- tags, text = find_tags(source)
- assert tags == set()
- assert text == source
- tags, text = find_tags(source, replacer=self._replacer)
- assert text == source
+ tags = find_tags(source)
+ assert tags == {'a'}
def test_start_of_paragraph_in_html_content(self):
source = ' First line #foobar #barfoo First line #foobar/foobar #barfoo/barfoo '
- '@jaywink boom @jaywink boom tags cause issues in us finding words - add some spacing around them
- new_text = text.replace(" ", " ").replace("https://example.org
\nhttps://example.org\n
') == \
- 'https://example.org
\nhttps://example.org\n
'
-
- def test_emails_are_skipped(self):
- assert process_text_links('foo@example.org') == 'foo@example.org'
-
- def test_does_not_add_target_blank_if_link_is_internal(self):
- assert process_text_links('#foobar') == \
- '#foobar'
-
- def test_does_not_remove_mention_classes(self):
- assert process_text_links('
and
", "
").replace("
", "
").replace("
", "") - return found_tags, final_text or text + tags = find_elements(BeautifulSoup(commonmark(text, ignore_html_blocks=True), 'html.parser'), + TAG_PATTERN) + return set([tag.text.lstrip('#').lower() for tag in tags]) + + +def find_elements(soup: BeautifulSoup, pattern: re.Pattern) -> List[NavigableString]: + """ + Split a BeautifulSoup tree strings according to a pattern, replacing each element + with a NavigableString. The returned list can be used to linkify the found + elements. + + :param soup: BeautifulSoup instance of the content being searched + :param pattern: Compiled regular expression defined using a single group + :return: A NavigableString list attached to the original soup + """ + final = [] + for candidate in soup.find_all(string=True): + if candidate.parent.name == 'code': continue + ns = [NavigableString(r) for r in pattern.split(candidate.text) if r] + found = [s for s in ns if pattern.match(s.text)] + if found: + candidate.replace_with(*ns) + final.extend(found) + return final def get_path_from_url(url: str) -> str: @@ -93,28 +70,6 @@ def get_path_from_url(url: str) -> str: return parsed.path -def process_text_links(text): - """Process links in text, adding some attributes and linkifying textual links.""" - link_callbacks = [callbacks.nofollow, callbacks.target_blank] - - def link_attributes(attrs, new=False): - """Run standard callbacks except for internal links.""" - href_key = (None, "href") - if attrs.get(href_key).startswith("/"): - return attrs - - # Run the standard callbacks - for callback in link_callbacks: - attrs = callback(attrs, new) - return attrs - - return bleach.linkify( - text, - callbacks=[link_attributes], - parse_email=False, - skip_tags=["code"], - ) - def test_tag(tag: str) -> bool: """Test a word whether it could be accepted as a tag."""