Merge branch 'todos-and-issues' into 'master'

Fixes addressing various manually tracked content issues.

See merge request jaywink/federation!183
master
Alain St-Denis 2024-02-29 00:58:03 +00:00
commit 1f15583aad
7 zmienionych plików z 89 dodań i 21 usunięć

Wyświetl plik

@ -1,5 +1,41 @@
# Changelog
## Unreleased
### Changed
* This is both a change and a fix. AP Image objects do not define properties matching the
alt and title attributes of the HTML img tag. Image.name is used to render both alt and title, which
IMHO is wrong. With this change, markdown images defining a title will be recognized instead of
being thrown away (the fix) and the title, if defined, will take precedence over the
alt text as the Image.name value (the change). Before this change, the client app would properly
render the img tag from the markdown source (with distinct alt and title attributes), but the Image
object would not federate and hence would not be displayed on other platforms (namely Mastodon).
### Fixed
* Note._find_and_mark_mentions: When an AP Mention object href can't be found in the rendered content,
try the name property.
* Ignore media objects that don't define a media type.
* Prevent rendered content image duplication when an image is both in the AP payload rendered content
and defined as an attachment that doesn't set the inlineImage property.
* Instead of discarding the whole AP payload when encountering an undefined or unlisted AP object,
log a warning and keep going. Ensure None is returned when a nested field only contains an undefined
object.
* Accept the application/ld+json type for webfinger AP links.
* Mark an AP mention only if profile.finger is defined.
* Handle escape sequences for inbound markdown mentions.
* Extend the Unicode character range allowed in markdown mentions.
* Discard illegal characters from tag text. Previously, this was done only on tag links.
## [0.25.1] - 2024-02-18
### Fixed

Wyświetl plik

@ -209,11 +209,16 @@ class MixedField(fields.Nested):
ret = []
for item in value:
if item.get('@type'):
res = super()._deserialize(item, attr, data, **kwargs)
try:
res = super()._deserialize(item, attr, data, **kwargs)
except KeyError as ex:
logger.warning("nested field: undefined JSON-LD type %s", ex)
continue
ret.append(res if not isinstance(res, list) else res[0])
else:
ret.append(self.iri._deserialize(item, attr, data, **kwargs))
if not ret: ret.append(None)
return ret if len(ret) > 1 or self.many else ret[0]
@ -247,7 +252,7 @@ class Object(BaseEntity, metaclass=JsonLDAnnotation):
icon = MixedField(as2.icon, nested='ImageSchema')
image = MixedField(as2.image, nested='ImageSchema')
tag_objects = MixedField(as2.tag, nested=['NoteSchema', 'HashtagSchema','MentionSchema','PropertyValueSchema','EmojiSchema'], many=True)
attachment = fields.Nested(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'],
attachment = MixedField(as2.attachment, nested=['LinkSchema', 'NoteSchema', 'ImageSchema', 'AudioSchema', 'DocumentSchema','PropertyValueSchema','IdentityProofSchema'],
many=True, default=[])
content_map = LanguageMap(as2.content) # language maps are not implemented in calamus
context = fields.RawJsonLD(as2.context)
@ -421,6 +426,8 @@ class Document(Object):
url = MixedField(as2.url, nested='LinkSchema')
def to_base(self):
if self.media_type is missing:
return self
self.__dict__.update({'schema': True})
if self.media_type.startswith('image'):
return Image(**get_base_attributes(self))
@ -866,7 +873,7 @@ class Note(Object, RawContentMixin):
normalized_url = f'{parsed.scheme}://{parsed.netloc}{normalized_path.decode()}'
links = {link['href'].lower(), unquote(link['href']).lower(), url, normalized_url}
if links.intersection(hrefs):
tag = re.match(r'^#?([\w\-]+$)', link.text)
tag = re.match(r'^#?([\w\-]+)', link.text)
if tag:
link['data-hashtag'] = tag.group(1).lower()
@ -879,17 +886,28 @@ class Note(Object, RawContentMixin):
for mention in mentions:
hrefs = []
profile = get_profile_or_entity(fid=mention.href, remote_url=mention.href)
if profile and not profile.url:
# This should be removed when we are confident that the remote_url property
# has been populated for most profiles on the client app side.
if profile and not (profile.url and profile.finger):
# This should be removed when we are confident that the remote_url and
# finger properties have been populated for most profiles on the client app side.
profile = retrieve_and_parse_profile(profile.id)
if profile:
if profile and profile.finger:
hrefs.extend([profile.id, profile.url])
else:
continue
for href in hrefs:
links = self._soup.find_all(href=href)
for link in links:
link['data-mention'] = profile.finger
self._mentions.add(profile.finger)
if profile.finger not in self._mentions:
# can't find some mentions using their href property value
# try with the name property
matches = self._soup.find_all(string=mention.name)
for match in matches:
link = match.find_parent('a')
if link:
link['data-mention'] = profile.finger
self._mentions.add(profile.finger)
def extract_mentions(self):
"""
@ -953,7 +971,7 @@ class Note(Object, RawContentMixin):
if hasattr(child, 'to_base'):
child = child.to_base()
if isinstance(child, Image):
if child.inline or (child.image and child.image in self.raw_content):
if child.inline or self._soup.find('img', src=child.url):
continue
children.append(child)
self._cached_children = children

Wyświetl plik

@ -6,6 +6,7 @@ from typing import List, Set, Union, Dict, Tuple
from bs4 import BeautifulSoup
from commonmark import commonmark
from markdownify import markdownify
from marshmallow import missing
from federation.entities.activitypub.enums import ActivityType
@ -224,13 +225,17 @@ class RawContentMixin(BaseEntity):
Returns a Tuple of (url, filename).
"""
images = []
if self._media_type != "text/markdown" or self.raw_content is None:
return images
regex = r"!\[([\w\s\-\']*)\]\((https?://[\w\d\-\./]+\.[\w]*((?<=jpg)|(?<=gif)|(?<=png)|(?<=jpeg)))\)"
matches = re.finditer(regex, self.raw_content, re.MULTILINE | re.IGNORECASE)
for match in matches:
groups = match.groups()
images.append((groups[1], groups[0] or ""))
if hasattr(self, '_soup'):
for img in self._soup.find_all('img', src=re.compile(r'^http')):
images.append((img['src'], img.get('title', '') or img.get('alt', '')))
else:
if self._media_type != "text/markdown" or self.raw_content is None:
return images
regex = r"!\[([\w\s\-\']*)\]\((https?://[\w\d\-\./]+\.[\w]*((?<=jpg)|(?<=gif)|(?<=png)|(?<=jpeg)))\)"
matches = re.finditer(regex, self.raw_content, re.MULTILINE | re.IGNORECASE)
for match in matches:
groups = match.groups()
images.append((groups[1], groups[0] or ""))
return images
# Legacy. Keep this until tests are reworked
@ -258,6 +263,9 @@ class RawContentMixin(BaseEntity):
if handle:
self._mentions.add(handle)
self.raw_content = self.raw_content.replace(mention, '@' + handle)
# Also replace the markdownified form of the extracted mention, in case some
# characters are escaped in raw_content
self.raw_content = self.raw_content.replace(markdownify(mention), '@' + handle)
class OptionalRawContentMixin(RawContentMixin):

Wyświetl plik

@ -1,5 +1,6 @@
import pytest
# noinspection PyPackageRequirements
from commonmark import commonmark
from freezegun import freeze_time
from unittest.mock import patch
@ -152,8 +153,7 @@ def activitypubpost_tags():
@pytest.fixture
def activitypubpost_embedded_images():
with freeze_time("2019-04-27"):
obj = models.Post(
raw_content="""
raw_content="""
#Cycling #lauttasaari #sea #sun
@ -166,7 +166,10 @@ def activitypubpost_embedded_images():
[foo](https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a9025414710.jpg)
#only a link, not embedded
https://jasonrobinson.me/media/uploads/2019/07/16/daa24d89-cedf-4fc7-bad8-74a9025414711.jpg
""",
"""
obj = models.Post(
raw_content=raw_content,
rendered_content=commonmark(raw_content, ignore_html_blocks=True),
public=True,
provider_display_name="Socialhome",
id=f"http://127.0.0.1:8000/post/123456/",

Wyświetl plik

@ -1,5 +1,6 @@
import json
import logging
import re
from typing import Optional, Any
from urllib.parse import urlparse
@ -16,6 +17,7 @@ except Exception as exc:
federation_user = None
logger.warning("django is required for get requests signing: %s", exc)
type_path = re.compile(r'^application/(activity|ld)\+json')
def get_profile_id_from_webfinger(handle: str) -> Optional[str]:
"""
@ -30,7 +32,7 @@ def get_profile_id_from_webfinger(handle: str) -> Optional[str]:
except json.JSONDecodeError:
return
for link in doc.get("links", []):
if link.get("rel") == "self" and link.get("type") == "application/activity+json":
if link.get("rel") == "self" and type_path.match(link.get("type")):
return link["href"]
logger.debug("get_profile_id_from_webfinger: found webfinger but it has no as2 self href")

Wyświetl plik

@ -8,8 +8,8 @@ from commonmark import commonmark
ILLEGAL_TAG_CHARS = "!#$%^&*+.,@£/()=?`'\\{[]}~;:\"’”—\xa0"
TAG_PATTERN = re.compile(r'(#[\w\-]+)([)\]_!?*%/.,;\s]+\s*|\Z)', re.UNICODE)
# This will match non matching braces. I don't think it's an issue.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u263a-\U0001f645]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
# This will match non-matching braces. I don't think it's an issue.
MENTION_PATTERN = re.compile(r'(@\{?(?:[\w\-. \u0250-\U0001f64f]*; *)?[\w]+@[\w\-.]+\.[\w]+}?)', re.UNICODE)
# based on https://stackoverflow.com/a/6041965
URL_PATTERN = re.compile(r'((?:(?:https?|ftp)://|^|(?<=[("<\s]))+(?:[\w\-]+(?:(?:\.[\w\-]+)+))'
r'[\w.,;:@?!$()*^=%&/~+\-#]*(?<![:;,).>"]))',

Wyświetl plik

@ -37,6 +37,7 @@ setup(
"lxml>=3.4.0",
"iteration_utilities",
"jsonschema>=2.0.0",
"markdownify",
"pycryptodome>=3.4.10",
"python-dateutil>=2.4.0",
"python-httpsig-socialhome",