diff --git a/backend/lib/backend/crawler/crawlers/mastodon.ex b/backend/lib/backend/crawler/crawlers/mastodon.ex index 2109339..a884ac3 100644 --- a/backend/lib/backend/crawler/crawlers/mastodon.ex +++ b/backend/lib/backend/crawler/crawlers/mastodon.ex @@ -14,6 +14,9 @@ defmodule Backend.Crawler.Crawlers.Mastodon do # We might already know that this is a Pleroma instance from nodeinfo if result != nil do cond do + # for pleroma and smithereen, the instance_type will get overwritten + # with the correct value -- but we still want to return true here + # since they are compatible with the mastodon API Map.get(result, :instance_type) == :pleroma -> true Map.get(result, :instance_type) == :smithereen -> true Map.get(result, :instance_type) == :mastodon -> true @@ -97,16 +100,7 @@ defmodule Backend.Crawler.Crawlers.Mastodon do interactions \\ %{}, statuses_seen \\ 0 ) do - # If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the - # most recent status we have. - min_timestamp = - if statuses_seen == 0 do - get_last_crawl_timestamp(domain) - else - min_timestamp - end - - endpoint = "https://#{domain}/api/v1/timelines/public?local=true" + endpoint = "https://#{domain}/api/v1/timelines/public?local=true&limit=40" endpoint = if max_id do @@ -117,7 +111,26 @@ defmodule Backend.Crawler.Crawlers.Mastodon do Logger.debug("Crawling #{endpoint}") - statuses = http_client().get_and_decode!(endpoint) + case http_client().get_and_decode(endpoint) do + {:ok, statuses} -> + handle_statuses(statuses, domain, min_timestamp, interactions, statuses_seen) + + # if there's an error (e.g. because the timeline prevents unauthenticated access) + # then stop here + {:error, _} -> + {interactions, statuses_seen} + end + end + + defp handle_statuses(statuses, domain, min_timestamp, interactions, statuses_seen) do + # If `statuses_seen == 0`, it's the first call of this function, which means we want to query the database for the + # most recent status we have. + min_timestamp = + if statuses_seen == 0 do + get_last_crawl_timestamp(domain) + else + min_timestamp + end filtered_statuses = statuses diff --git a/backend/test/backend/crawler/crawlers/mastodon_test.exs b/backend/test/backend/crawler/crawlers/mastodon_test.exs new file mode 100644 index 0000000..faf0721 --- /dev/null +++ b/backend/test/backend/crawler/crawlers/mastodon_test.exs @@ -0,0 +1,117 @@ +defmodule Backend.Crawler.Crawlers.MastodonTest do + use Backend.DataCase + + alias Backend.Crawler.Crawlers.Mastodon + alias Backend.Crawler.ApiCrawler + alias Backend.HttpBehaviour + import Mox + + setup :verify_on_exit! + + describe "is_instance_type?/2" do + test "returns true for pleroma and smithereen" do + assert Mastodon.is_instance_type?("example.com", %{instance_type: :pleroma}) + assert Mastodon.is_instance_type?("example.com", %{instance_type: :smithereen}) + end + + test "returns true for mastodon instance" do + expect(HttpMock, :get_and_decode, fn "https://example.com/api/v1/instance" -> + {:ok, TestHelpers.load_json("mastodon/instance.json")} + end) + + assert Mastodon.is_instance_type?("example.com", nil) + end + end + + describe "crawl/2" do + test "does nothing for small instances" do + expect(HttpMock, :get_and_decode!, fn "https://example.com/api/v1/instance" -> + TestHelpers.load_json("mastodon/instance.json") + |> Map.merge(%{"stats" => %{"user_count" => 1}}) + end) + + result = Mastodon.crawl("example.com", ApiCrawler.get_default()) + + assert result == + ApiCrawler.get_default() |> Map.merge(%{instance_type: :mastodon, user_count: 1}) + end + + test "crawls large instance" do + expect(HttpMock, :get_and_decode!, fn "https://example.com/api/v1/instance" -> + TestHelpers.load_json("mastodon/instance.json") + end) + + expect(HttpMock, :get_and_decode, fn "https://example.com/api/v1/instance/peers" -> + {:ok, TestHelpers.load_json("mastodon/peers.json")} + end) + + expect( + HttpMock, + :get_and_decode, + fn "https://example.com/api/v1/timelines/public?local=true&limit=40" -> + {:ok, TestHelpers.load_json("mastodon/timeline.json")} + end + ) + + expect( + HttpMock, + :get_and_decode, + 4, + fn "https://example.com/api/v1/timelines/public?local=true&limit=40&max_id=123" -> + {:ok, TestHelpers.load_json("mastodon/timeline.json")} + end + ) + + result = Mastodon.crawl("example.com", ApiCrawler.get_default()) + + assert result == %{ + description: "long description", + federation_restrictions: [], + instance_type: :mastodon, + interactions: %{}, + peers: ["other.com"], + user_count: 100, + status_count: 100, + statuses_seen: 5, + version: "1.2.3" + } + end + + test "handles timelines that require auth" do + expect(HttpMock, :get_and_decode!, fn "https://example.com/api/v1/instance" -> + TestHelpers.load_json("mastodon/instance.json") + end) + + expect(HttpMock, :get_and_decode, fn "https://example.com/api/v1/instance/peers" -> + {:ok, TestHelpers.load_json("mastodon/peers.json")} + end) + + expect( + HttpMock, + :get_and_decode, + fn "https://example.com/api/v1/timelines/public?local=true&limit=40" -> + {:error, + %HttpBehaviour.Error{ + message: "HTTP request failed with status code 422", + status_code: 422, + body: "{\"error\":\"This method requires an authenticated user\"}" + }} + end + ) + + result = Mastodon.crawl("example.com", ApiCrawler.get_default()) + + assert result == %{ + description: "long description", + federation_restrictions: [], + instance_type: :mastodon, + interactions: %{}, + peers: ["other.com"], + user_count: 100, + status_count: 100, + statuses_seen: 0, + version: "1.2.3" + } + end + end +end diff --git a/backend/test/support/data/json/mastodon/instance.json b/backend/test/support/data/json/mastodon/instance.json new file mode 100644 index 0000000..cdf71df --- /dev/null +++ b/backend/test/support/data/json/mastodon/instance.json @@ -0,0 +1,137 @@ +{ + "uri": "mastodon.social", + "title": "Mastodon", + "short_description": "short description", + "description": "long description", + "email": "staff@mastodon.social", + "version": "1.2.3", + "urls": { + "streaming_api": "wss://streaming.mastodon.social" + }, + "stats": { + "user_count": 100, + "status_count": 100, + "domain_count": 55958 + }, + "thumbnail": "https://files.mastodon.social/site_uploads/files/000/000/001/@1x/57c12f441d083cde.png", + "languages": ["en"], + "registrations": true, + "approval_required": false, + "invites_enabled": true, + "configuration": { + "accounts": { + "max_featured_tags": 10 + }, + "statuses": { + "max_characters": 500, + "max_media_attachments": 4, + "characters_reserved_per_url": 23 + }, + "media_attachments": { + "supported_mime_types": [ + "image/jpeg", + "image/png", + "image/gif", + "image/heic", + "image/heif", + "image/webp", + "image/avif", + "video/webm", + "video/mp4", + "video/quicktime", + "video/ogg", + "audio/wave", + "audio/wav", + "audio/x-wav", + "audio/x-pn-wave", + "audio/vnd.wave", + "audio/ogg", + "audio/vorbis", + "audio/mpeg", + "audio/mp3", + "audio/webm", + "audio/flac", + "audio/aac", + "audio/m4a", + "audio/x-m4a", + "audio/mp4", + "audio/3gpp", + "video/x-ms-asf" + ], + "image_size_limit": 16777216, + "image_matrix_limit": 33177600, + "video_size_limit": 103809024, + "video_frame_rate_limit": 120, + "video_matrix_limit": 8294400 + }, + "polls": { + "max_options": 4, + "max_characters_per_option": 50, + "min_expiration": 300, + "max_expiration": 2629746 + } + }, + "contact_account": { + "id": "13179", + "username": "Mastodon", + "acct": "Mastodon", + "display_name": "Mastodon", + "locked": false, + "bot": false, + "discoverable": true, + "group": false, + "created_at": "2016-11-23T00:00:00.000Z", + "note": "

Official account of the Mastodon project. News, releases, announcements! Learn more on our website!

", + "url": "https://mastodon.social/@Mastodon", + "avatar": "https://files.mastodon.social/accounts/avatars/000/013/179/original/b4ceb19c9c54ec7e.png", + "avatar_static": "https://files.mastodon.social/accounts/avatars/000/013/179/original/b4ceb19c9c54ec7e.png", + "header": "https://files.mastodon.social/accounts/headers/000/013/179/original/878f382e7dd9fb84.png", + "header_static": "https://files.mastodon.social/accounts/headers/000/013/179/original/878f382e7dd9fb84.png", + "followers_count": 778859, + "following_count": 8, + "statuses_count": 237, + "last_status_at": "2023-05-13", + "noindex": false, + "emojis": [], + "roles": [], + "fields": [ + { + "name": "Homepage", + "value": "https://joinmastodon.org", + "verified_at": "2018-10-31T04:11:00.076+00:00" + }, + { + "name": "Patreon", + "value": "https://patreon.com/mastodon", + "verified_at": null + }, + { + "name": "GitHub", + "value": "https://github.com/mastodon", + "verified_at": null + } + ] + }, + "rules": [ + { + "id": "1", + "text": "Sexually explicit or violent media must be marked as sensitive when posting" + }, + { + "id": "2", + "text": "No racism, sexism, homophobia, transphobia, xenophobia, or casteism" + }, + { + "id": "3", + "text": "No incitement of violence or promotion of violent ideologies" + }, + { + "id": "4", + "text": "No harassment, dogpiling or doxxing of other users" + }, + { + "id": "7", + "text": "Do not share intentionally false or misleading information" + } + ] +} diff --git a/backend/test/support/data/json/mastodon/peers.json b/backend/test/support/data/json/mastodon/peers.json new file mode 100644 index 0000000..af04900 --- /dev/null +++ b/backend/test/support/data/json/mastodon/peers.json @@ -0,0 +1 @@ +["other.com"] diff --git a/backend/test/support/data/json/mastodon/timeline.json b/backend/test/support/data/json/mastodon/timeline.json new file mode 100644 index 0000000..5a1e0e3 --- /dev/null +++ b/backend/test/support/data/json/mastodon/timeline.json @@ -0,0 +1,55 @@ +[ + { + "id": "123", + "created_at": "2023-06-10T18:59:36.207Z", + "in_reply_to_id": null, + "in_reply_to_account_id": null, + "sensitive": false, + "spoiler_text": "", + "visibility": "public", + "language": "de", + "uri": "https://mastodon.social/users/someuser/statuses/110521455489577427", + "url": "https://mastodon.social/@someuser/110521455489577427", + "replies_count": 0, + "reblogs_count": 0, + "favourites_count": 0, + "edited_at": null, + "content": "

New post

", + "reblog": null, + "application": { + "name": "IFTTT", + "website": "https://www.ifttt.com" + }, + "account": { + "id": "108265572384945996", + "username": "someuser", + "acct": "someuser", + "display_name": "Some User", + "locked": false, + "bot": false, + "discoverable": true, + "group": false, + "created_at": "2022-05-08T00:00:00.000Z", + "note": "

My account

", + "url": "https://mastodon.social/@someuser", + "avatar": "https://example.com/picture.jpg", + "avatar_static": "https://example.com/picture.jpg", + "header": "https://example.com/picture.jpg", + "header_static": "https://example.com/picture.jpg", + "followers_count": 7, + "following_count": 73, + "statuses_count": 256, + "last_status_at": "2023-06-10", + "noindex": false, + "emojis": [], + "roles": [], + "fields": [] + }, + "media_attachments": [], + "mentions": [], + "tags": [], + "emojis": [], + "card": {}, + "poll": null + } +]