Merge branch 'todos-and-issues' into 'master'

Fixes addressing various manually tracked content issues. See merge request jaywink/federation!183
2024-02-29 00:58:03 +00:00 · 2024-02-29 00:58:03 +00:00 · 1f15583aad
commit 1f15583aad
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,41 @@
 # Changelog
 ## Unreleased
 ### Changed
 * This is actually both a change and a fix. AP Image objects do not define properties matching the 
  HTML img tag alt and title properties. Image.name is used to render both alt and title, which IMHO is
  wrong. With this change, markdown images defining the title property will be recognized instead of
  being thrown away (the fix) and the title property, if defined, will have precedence over the
  alt property as the Image.name value (the change). Before this change, the client app would properly
  render the img tag from the markdown source (with distinct alt and title properties), but the Image
  object would not federate and hence not be displayed on other platforms (namely Mastodon).
 ### Fixed
 * Note._find_and_mark_mentions: When an AP Mention object href can't be found in the rendered content,
  try the name property.
 * Ignore media objects that don't define a media type.
 * Prevent rendered content image duplication when an image is both in the AP payload rendered content
  and defined as an attachment that doesn't set the inlineImage property.
 * Instead of discarding the whole AP payload when encountering an undefined or unlisted AP object,
  log a warning and keep going. Ensure None is returned when a nested field only contains an undefined
  object.
 * Accept the application/ld+json type for webfinger AP links.
 * Mark an AP mention only if profile.finger is defined.
 * Handle escape sequences for inbound markdown mentions.
 * Extend the Unicode character range allowed in markdown mentions.
 * Discard illegal characters from tag text. Previously, this was done only on tag links.
 ## [0.25.1] - 2024-02-18
 ### Fixed
--- a/federation/entities/activitypub/models.py
+++ b/federation/entities/activitypub/models.py
@ -209,11 +209,16 @@ class MixedField(fields.Nested):
        ret = []
        for item in value:
            if item.get('@type'):
-                res = super()._deserialize(item, attr, data, **kwargs)
+                try:
                    res = super()._deserialize(item, attr, data, **kwargs)
                except KeyError as ex:
                    logger.warning("nested field: undefined JSON-LD type %s", ex)
                    continue
                ret.append(res if not isinstance(res, list) else res[0])
            else:
                ret.append(self.iri._deserialize(item, attr, data, **kwargs))
        if not ret: ret.append(None)
        return ret if len(ret) > 1 or self.many else ret[0]
@ -247,7 +252,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation):
    icon = MixedField(as2.icon, nested='ImageSchema')
    image = MixedField(as2.image, nested='ImageSchema')
    tag_objects = MixedField(as2.tag, nested=['NoteSchema', 'HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True)
-    attachment = fields.Nested(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'],
+    attachment = MixedField(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'],
                               many=True, default=[])
    content_map = LanguageMap(as2.content)  # language maps are not implemented in calamus
    context = fields.RawJsonLD(as2.context)
@ -421,6 +426,8 @@ class Document(Object):
    url = MixedField(as2.url, nested='LinkSchema')
    def to_base(self):
        if self.media_type is missing:
            return self
        self.__dict__.update({'schema': True})
        if self.media_type.startswith('image'):
            return Image(**get_base_attributes(self))
@ -866,7 +873,7 @@ class Note(Object, RawContentMixin):
            normalized_url = f'{parsed.scheme}://{parsed.netloc}{normalized_path.decode()}'
            links = {link['href'].lower(), unquote(link['href']).lower(), url, normalized_url}
            if links.intersection(hrefs):
-                tag = re.match(r'^#?([\w\-]+$)', link.text)
+                tag = re.match(r'^#?([\w\-]+)', link.text)
                if tag:
                    link['data-hashtag'] = tag.group(1).lower()
@ -879,17 +886,28 @@ class Note(Object, RawContentMixin):
        for mention in mentions:
            hrefs = []
            profile = get_profile_or_entity(fid=mention.href, remote_url=mention.href)
-            if profile and not profile.url:
+            if profile and not (profile.url and profile.finger):
-                # This should be removed when we are confident that the remote_url property
+                # This should be removed when we are confident that the remote_url and
-                # has been populated for most profiles on the client app side.
+                # finger properties have been populated for most profiles on the client app side.
                profile = retrieve_and_parse_profile(profile.id)
-            if profile:
+            if profile and profile.finger:
                hrefs.extend([profile.id, profile.url])
            else:
                continue
            for href in hrefs:
                links = self._soup.find_all(href=href)
                for link in links:
                    link['data-mention'] = profile.finger
                    self._mentions.add(profile.finger)
            if profile.finger not in self._mentions:
                # can't find some mentions using their href property value
                # try with the name property
                matches = self._soup.find_all(string=mention.name)
                for match in matches:
                    link = match.find_parent('a')
                    if link:
                        link['data-mention'] = profile.finger
                        self._mentions.add(profile.finger)
    def extract_mentions(self):
        """
@ -953,7 +971,7 @@ class Note(Object, RawContentMixin):
                    if hasattr(child, 'to_base'):
                        child = child.to_base()
                    if isinstance(child, Image):
-                        if child.inline or (child.image and child.image in self.raw_content):
+                        if child.inline or self._soup.find('img', src=child.url):
                            continue
                    children.append(child)
            self._cached_children = children
--- a/federation/entities/mixins.py
+++ b/federation/entities/mixins.py
@ -6,6 +6,7 @@ from typing import List, Set, Union, Dict, Tuple
 from bs4 import BeautifulSoup
 from commonmark import commonmark
 from markdownify import markdownify
 from marshmallow import missing
 from federation.entities.activitypub.enums import ActivityType
@ -224,13 +225,17 @@ class RawContentMixin(BaseEntity):
        Returns a Tuple of (url, filename).
        """
        images = []
-        if self._media_type != "text/markdown" or self.raw_content is None:
+        if hasattr(self, '_soup'):
-            return images
+            for img in self._soup.find_all('img', src=re.compile(r'^http')):
-        regex = r"!\[([\w\s\-\']*)\]\((https?://[\w\d\-\./]+\.[\w]*((?<=jpg)|(?<=gif)|(?<=png)|(?<=jpeg)))\)"
+                images.append((img['src'], img.get('title', '') or img.get('alt', '')))
-        matches = re.finditer(regex, self.raw_content, re.MULTILINE | re.IGNORECASE)
+        else:
-        for match in matches:
+            if self._media_type != "text/markdown" or self.raw_content is None:
-            groups = match.groups()
+                return images
-            images.append((groups[1], groups[0] or ""))
+            regex = r"!\[([\w\s\-\']*)\]\((https?://[\w\d\-\./]+\.[\w]*((?<=jpg)|(?<=gif)|(?<=png)|(?<=jpeg)))\)"
            matches = re.finditer(regex, self.raw_content, re.MULTILINE | re.IGNORECASE)
            for match in matches:
                groups = match.groups()
                images.append((groups[1], groups[0] or ""))
        return images
    # Legacy. Keep this until tests are reworked
@ -258,6 +263,9 @@ class RawContentMixin(BaseEntity):
            if handle:
                self._mentions.add(handle)
                self.raw_content = self.raw_content.replace(mention, '@' + handle)
                # markdownify the extracted mention in case some characters are escaped in
                # raw_content
                self.raw_content = self.raw_content.replace(markdownify(mention), '@' + handle)
 class OptionalRawContentMixin(RawContentMixin):
--- a/federation/tests/fixtures/entities.py
+++ b/federation/tests/fixtures/entities.py
@ -1,5 +1,6 @@
 import pytest
 # noinspection PyPackageRequirements
 from commonmark import commonmark
 from freezegun import freeze_time
 from unittest.mock import patch
@ -152,8 +153,7 @@ def activitypubpost_tags():
@pytest.fixture
 def activitypubpost_embedded_images():
    with freeze_time("2019-04-27"):
-        obj = models.Post(
+        raw_content="""
            raw_content="""
 #Cycling #lauttasaari #sea #sun
@ -166,7 +166,10 @@ def activitypubpost_embedded_images():
 [foo](https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a9025414710.jpg)
 #only a link, not embedded
 https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a9025414711.jpg
-""",
+"""
        obj = models.Post(
            raw_content=raw_content,
            rendered_content=commonmark(raw_content, ignore_html_blocks=True),
            public=True,
            provider_display_name="Socialhome",
            id=f"http://127.0.0.1:8000/post/123456/",
--- a/federation/utils/activitypub.py
+++ b/federation/utils/activitypub.py
@ -1,5 +1,6 @@
 import json
 import logging
 import re
 from typing import Optional, Any
 from urllib.parse import urlparse
@ -16,6 +17,7 @@ except Exception as exc:
    federation_user = None
    logger.warning("django is required for get requests signing: %s", exc)
 type_path = re.compile(r'^application/(activity|ld)\+json')
 def get_profile_id_from_webfinger(handle: str) -> Optional[str]:
    """
@ -30,7 +32,7 @@ def get_profile_id_from_webfinger(handle: str) -> Optional[str]:
    except json.JSONDecodeError:
        return
    for link in doc.get("links", []):
-        if link.get("rel") == "self" and link.get("type") == "application/activity+json":
+        if link.get("rel") == "self" and type_path.match(link.get("type")):
            return link["href"]
    logger.debug("get_profile_id_from_webfinger: found webfinger but it has no as2 self href")
--- a/federation/utils/text.py
+++ b/federation/utils/text.py
@ -8,8 +8,8 @@ from commonmark import commonmark
 ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
 TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE)
-# This will match non matching braces. I don't think it's an issue.
+# This will match non-matching braces. I don't think it's an issue.
-MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
+MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u0250-\U0001f64f]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
 # based on https://stackoverflow.com/a/6041965
 URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|^|(?<=[("<\s]))+(?:[\w\-]+(?:(?:\.[\w\-]+)+))'
                         r'[\w.,;:@?!$()*^=%&/~+\-#]*(?<![:;,).>"]))',
--- a/setup.py
+++ b/setup.py
@ -37,6 +37,7 @@ setup(
        "lxml>=3.4.0",
        "iteration_utilities",
        "jsonschema>=2.0.0",
        "markdownify",
        "pycryptodome>=3.4.10",
        "python-dateutil>=2.4.0",
        "python-httpsig-socialhome",