Added robots.txt handling

main
Štěpán Škorpil 2022-11-22 16:37:11 +01:00
rodzic 8d0452d16b
commit 704c7c066e
24 zmienionych plików z 125 dodań i 64 usunięć

Wyświetl plik

@ -26,6 +26,7 @@
"geoip-lite": "^1.4.6",
"npmlog": "^6.0.0",
"rimraf": "^3.0.2",
"robots-parser": "^3.0.0",
"striptags": "^3.2.0",
"typescript-collections": "^1.3.3",
"zod": "^3.19.1"

Wyświetl plik

@ -1,16 +1,18 @@
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
import { retrieveWellKnown } from './retrieveWellKnown'
import { retrieveNodeInfo, NodeInfo } from './retrieveNodeInfo'
import { NoSupportedLinkError } from './NoSupportedLinkError'
export const retrieveDomainNodeInfo = async (
domain: string
domain: string,
robotsTxt: RobotsTxt
): Promise<NodeInfo> => {
const wellKnown = await retrieveWellKnown(domain)
const wellKnown = await retrieveWellKnown(domain, robotsTxt)
const link = wellKnown.links.find(
(link) => link.rel === 'http://nodeinfo.diaspora.software/ns/schema/2.0'
)
if (typeof link === 'undefined') {
throw new NoSupportedLinkError(domain)
}
return await retrieveNodeInfo(link.href)
return await retrieveNodeInfo(link.href, robotsTxt)
}

Wyświetl plik

@ -1,7 +1,7 @@
import axios from 'axios'
import { z } from 'zod'
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
const schema = z.object({
name: z.string().optional(),
@ -27,9 +27,9 @@ const schema = z.object({
export type NodeInfo = z.infer<typeof schema>
export const retrieveNodeInfo = async (url: string): Promise<NodeInfo> => {
export const retrieveNodeInfo = async (url: string, robotsTxt: RobotsTxt): Promise<NodeInfo> => {
console.info('Retrieving node info', { url })
const nodeInfoResponse = await axios.get(url, {
const nodeInfoResponse = await robotsTxt.getIfAllowed(url, {
timeout: getDefaultTimeoutMilliseconds()
})
assertSuccessJsonResponse(nodeInfoResponse)

Wyświetl plik

@ -1,7 +1,7 @@
import axios from 'axios'
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
const wellKnownSchema = z.object({
links: z.array(
@ -14,10 +14,10 @@ const wellKnownSchema = z.object({
export type WellKnown = z.infer<typeof wellKnownSchema>
export const retrieveWellKnown = async (domain: string): Promise<WellKnown> => {
export const retrieveWellKnown = async (domain: string, robotsTxt: RobotsTxt): Promise<WellKnown> => {
console.info('Retrieving well known', { domain })
const wellKnownUrl = `https://${domain}/.well-known/nodeinfo`
const wellKnownResponse = await axios.get(wellKnownUrl, {
const wellKnownResponse = await robotsTxt.getIfAllowed(wellKnownUrl, {
timeout: getDefaultTimeoutMilliseconds(),
maxContentLength: 5000
})

Wyświetl plik

@ -1,6 +1,8 @@
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
import { FeedData } from './FeedData'
export type FeedProviderMethod = (
domain: string,
page: number
page: number,
robotsTxt: RobotsTxt
) => Promise<FeedData[]>

Wyświetl plik

@ -1,4 +1,3 @@
import axios from 'axios'
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
@ -53,9 +52,10 @@ const replaceEmojis = (text: string, emojis: Emoji[]): string => {
export const retrieveLocalPublicUsersPage: FeedProviderMethod = async (
domain,
page
page,
robotsTxt
): Promise<FeedData[]> => {
const response = await axios.get('https://' + domain + '/api/v1/directory', {
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/directory`, {
params: {
limit,
offset: page * limit,

Wyświetl plik

@ -1,4 +1,3 @@
import axios from 'axios'
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
@ -7,12 +6,12 @@ import { NoMoreNodesError } from '../NoMoreNodesError'
const schema = z.array(z.string())
export const retrievePeers: NodeProviderMethod = async (domain, page) => {
export const retrievePeers: NodeProviderMethod = async (domain, page, robotsTxt) => {
if (page !== 0) {
throw new NoMoreNodesError('peer')
}
const response = await axios.get(
'https://' + domain + '/api/v1/instance/peers',
const response = await robotsTxt.getIfAllowed(
`https://${domain}/api/v1/instance/peers`,
{
timeout: getDefaultTimeoutMilliseconds()
}

Wyświetl plik

@ -1,4 +1,3 @@
import axios from 'axios'
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
@ -15,10 +14,11 @@ const schema = z.array(
export const retrieveInstancesPage: NodeProviderMethod = async (
domain,
page
page,
robotsTxt
) => {
const response = await axios.post(
'https://' + domain + '/api/federation/instances',
const response = await robotsTxt.postIfAllowed(
`https://${domain}/api/federation/instances`,
{
host: null,
blocked: null,

Wyświetl plik

@ -1,4 +1,3 @@
import axios from 'axios'
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
@ -68,10 +67,11 @@ const parseDescription = (description: string | null): string => {
export const retrieveUsersPage: FeedProviderMethod = async (
domain,
page
page,
robotsTxt
): Promise<FeedData[]> => {
const response = await axios.post(
'https://' + domain + '/api/users',
const response = await robotsTxt.postIfAllowed(
`https://${domain}/api/users`,
{
state: 'all',
origin: 'local',

Wyświetl plik

@ -1,4 +1,7 @@
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
export type NodeProviderMethod = (
domain: string,
page: number
page: number,
robotsTxt: RobotsTxt
) => Promise<string[]>

Wyświetl plik

@ -1,5 +1,4 @@
import { FeedData } from '../FeedData'
import axios from 'axios'
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { avatarSchema } from './Avatar'
@ -29,8 +28,8 @@ const schema = z.object({
)
})
export const retrieveAccounts: FeedProviderMethod = async (domain, page) => {
const response = await axios.get(`https://${domain}/api/v1/accounts`, {
export const retrieveAccounts: FeedProviderMethod = async (domain, page, robotsTxt) => {
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/accounts`, {
params: {
count: limit,
sort: 'createdAt',

Wyświetl plik

@ -1,4 +1,3 @@
import axios from 'axios'
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
@ -21,8 +20,8 @@ const schema = z.object({
)
})
export const retrieveFollowers: NodeProviderMethod = async (domain, page) => {
const response = await axios.get(
export const retrieveFollowers: NodeProviderMethod = async (domain, page, robotsTxt) => {
const response = await robotsTxt.getIfAllowed(
`https://${domain}/api/v1/server/followers`,
{
params: {

Wyświetl plik

@ -1,5 +1,4 @@
import { FeedData } from '../FeedData'
import axios from 'axios'
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { FieldData } from '../FieldData'
@ -38,9 +37,10 @@ const schema = z.object({
export const retrieveVideoChannels: FeedProviderMethod = async (
domain,
page
page,
robotsTxt
) => {
const response = await axios.get(`https://${domain}/api/v1/video-channels`, {
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/video-channels`, {
params: {
count: limit,
sort: 'createdAt',

Wyświetl plik

@ -0,0 +1,7 @@
import { AxiosRequestConfig, AxiosResponse } from 'axios'
/**
 * Robots.txt-aware HTTP access for one crawled domain.
 *
 * The generic parameters of the request methods mirror those of axios'
 * `get`/`post` so the methods can stand in for direct axios calls.
 */
export default interface RobotsTxt {
  /** Reports whether the crawler may request the given URL. */
  isAllowed: (url: string) => boolean

  /**
   * Issues a GET request for `url`.
   * The implementation built by `fetchRobotsTxt` rejects with
   * `RobotsTxtError` instead of sending the request when `isAllowed(url)`
   * is false.
   */
  getIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(
    url: string,
    config?: AxiosRequestConfig<D>
  ) => Promise<R>

  /**
   * Issues a POST request for `url` with the optional body `data`.
   * The implementation built by `fetchRobotsTxt` rejects with
   * `RobotsTxtError` instead of sending the request when `isAllowed(url)`
   * is false.
   */
  postIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(
    url: string,
    data?: D,
    config?: AxiosRequestConfig<D>
  ) => Promise<R>
}

Wyświetl plik

@ -0,0 +1,7 @@
/**
 * Raised when a request is refused because robots.txt disallows the URL.
 */
export class RobotsTxtError extends Error {
  /** URL of the request that was blocked. */
  public readonly url: string

  /**
   * @param url - URL of the request that was blocked.
   */
  public constructor (url: string) {
    super('Request was blocked by robots.txt')
    // The inherited name is 'Error'; set it so logs and toString() identify
    // this failure without needing an instanceof check.
    this.name = 'RobotsTxtError'
    this.url = url
  }
}

Wyświetl plik

@ -0,0 +1,41 @@
import axios, { AxiosRequestConfig, AxiosResponse } from 'axios'
import robotsParser from 'robots-parser'
import RobotsTxt from './RobotsTxt.js'
import { RobotsTxtError } from './RobotsTxtError.js'
// Identity sent to remote servers and matched against robots.txt rules.
const userAgent = 'FediCrawl/1.0'

/**
 * Downloads and parses `https://<domain>/robots.txt` and returns request
 * helpers that honour its rules for the crawler's user agent.
 *
 * A failed download is logged and treated as an empty robots.txt, so this
 * function does not reject because the file is missing or unreachable.
 *
 * @param domain - Host name of the node, without scheme or path.
 * @returns Helpers whose `getIfAllowed`/`postIfAllowed` reject with
 *   `RobotsTxtError` when the URL is disallowed and otherwise forward the
 *   request to axios with the crawler's `User-Agent` header.
 */
export default async function fetchRobotsTxt (domain: string): Promise<RobotsTxt> {
  console.info('Fetching robots.txt', { domain })
  const url = `https://${domain}/robots.txt`
  let content = ''
  try {
    // Identify as the same agent the rules are evaluated for, and ask for
    // the body as text so axios does not try to parse it as JSON.
    const robotsTxt = await axios.get(url, {
      headers: { 'User-Agent': userAgent },
      responseType: 'text'
    })
    // Only a string body can be parsed; anything else counts as no rules.
    if (typeof robotsTxt.data === 'string') {
      content = robotsTxt.data
    }
  } catch (error) {
    console.info('Robots.txt not found', { error, url })
  }
  const robots = robotsParser(url, content)
  // `?? true`: a nullish answer from the parser is treated as allowed.
  const isAllowed = (url: string): boolean => robots.isAllowed(url, userAgent) ?? true

  // Merge headers instead of letting `...config` replace the whole object:
  // a caller passing its own `headers` would otherwise drop the User-Agent.
  // Caller-supplied headers still win on a key conflict.
  const withUserAgent = <D>(config?: AxiosRequestConfig<D>): AxiosRequestConfig<D> => ({
    ...config,
    headers: { 'User-Agent': userAgent, ...config?.headers }
  })

  return {
    isAllowed,
    getIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: string, config?: AxiosRequestConfig<D>): Promise<R> => {
      if (!isAllowed(url)) {
        throw new RobotsTxtError(url)
      }
      return await axios.get<T, R, D>(url, withUserAgent(config))
    },
    postIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: string, data?: D, config?: AxiosRequestConfig<D>): Promise<R> => {
      if (!isAllowed(url)) {
        throw new RobotsTxtError(url)
      }
      return await axios.post<T, R, D>(url, data, withUserAgent(config))
    }
  }
}

Wyświetl plik

@ -1,3 +1,4 @@
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
import { refreshFeedsOnPage } from './refreshFeedsOnPage'
import { FeedProvider } from '../../Fediverse/Providers/FeedProvider'
import Node from '../../Storage/Definitions/Node'
@ -6,7 +7,8 @@ import { ElasticClient } from '../../Storage/ElasticClient'
export const refreshFeeds = async (
elastic: ElasticClient,
provider: FeedProvider,
node: Node
node: Node,
robotsTxt: RobotsTxt
): Promise<void> => {
try {
// noinspection InfiniteLoopJS
@ -16,7 +18,7 @@ export const refreshFeeds = async (
provider: provider.getKey(),
page
})
await refreshFeedsOnPage(elastic, provider, node, page)
await refreshFeedsOnPage(elastic, provider, node, page, robotsTxt)
}
} catch (error) {
console.info('Feed search finished', {

Wyświetl plik

@ -1,3 +1,4 @@
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
import { refreshOrAddFeed } from './refreshOrAddFeed'
import { FeedProvider } from '../../Fediverse/Providers/FeedProvider'
import Node from '../../Storage/Definitions/Node'
@ -8,9 +9,10 @@ export const refreshFeedsOnPage = async (
elastic: ElasticClient,
provider: FeedProvider,
node: Node,
page: number
page: number,
robotsTxt: RobotsTxt
): Promise<Feed[]> => {
const feedData = await provider.retrieveFeeds(node.domain, page)
const feedData = await provider.retrieveFeeds(node.domain, page, robotsTxt)
console.info('Retrieved feeds', {
count: feedData.length,
domain: node.domain,

Wyświetl plik

@ -1,15 +1,17 @@
import { retrieveDomainNodeInfo } from '../../Fediverse/NodeInfo/retrieveDomainNodeInfo'
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
import { updateNodeInfo } from '../../Storage/Nodes/updateNodeInfo'
import Node from '../../Storage/Definitions/Node'
import { ElasticClient } from '../../Storage/ElasticClient'
export const refreshNodeInfo = async (
elastic: ElasticClient,
node: Node
node: Node,
robotsTxt: RobotsTxt
): Promise<Node> => {
console.info('Updating info of node', { nodeDomain: node.domain })
try {
const nodeInfo = await retrieveDomainNodeInfo(node.domain)
const nodeInfo = await retrieveDomainNodeInfo(node.domain, robotsTxt)
return await updateNodeInfo(elastic, node, nodeInfo)
} catch (error) {
console.warn('Failed to update node info', error)

Wyświetl plik

@ -1,4 +1,5 @@
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
import { findNewNodesOnPage } from './findNewNodesOnPage'
import Node from '../../Storage/Definitions/Node'
import { ElasticClient } from '../../Storage/ElasticClient'
@ -6,7 +7,8 @@ import { ElasticClient } from '../../Storage/ElasticClient'
export const findNewNodes = async (
elastic: ElasticClient,
provider: NodeProvider,
node: Node
node: Node,
robotsTxt: RobotsTxt
): Promise<void> => {
try {
// noinspection InfiniteLoopJS
@ -15,7 +17,7 @@ export const findNewNodes = async (
domain: node.domain,
provider: provider.getKey()
})
await findNewNodesOnPage(elastic, provider, node, page)
await findNewNodesOnPage(elastic, provider, node, page, robotsTxt)
}
} catch (error) {
console.info('Node search finished', {

Wyświetl plik

@ -1,3 +1,4 @@
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
import { createMissingNodes } from '../../Storage/Nodes/createMissingNodes'
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
import Node from '../../Storage/Definitions/Node'
@ -8,9 +9,10 @@ export const findNewNodesOnPage = async (
elastic: ElasticClient,
provider: NodeProvider,
node: Node,
page: number
page: number,
robotsTxt: RobotsTxt
): Promise<number> => {
let domains = await provider.retrieveNodes(node.domain, page)
let domains = await provider.retrieveNodes(node.domain, page, robotsTxt)
domains = domains.filter(isDomainNotBanned)
console.log('Found nodes', {
count: domains.length,

Wyświetl plik

@ -1,3 +1,4 @@
import fetchRobotsTxt from '../Fediverse/RobotsTxt/fetchRobotsTxt.js'
import { fetchNodeToProcess } from '../Storage/Nodes/fetchNodeToProcess'
import { ProviderRegistry } from '../Fediverse/Providers/ProviderRegistry'
import { setNodeRefreshed } from '../Storage/Nodes/setNodeRefreshed'
@ -21,7 +22,8 @@ export const processNextNode = async (
node = await setNodeRefreshAttempted(elastic, node)
node = await refreshNodeIps(elastic, node)
node = await refreshNodeInfo(elastic, node)
const robotsTxt = await fetchRobotsTxt(node.domain)
node = await refreshNodeInfo(elastic, node, robotsTxt)
const softwareName = node.softwareName ?? ''
if (!providerRegistry.containsKey(softwareName)) {
@ -41,7 +43,7 @@ export const processNextNode = async (
domain: node.domain,
provider: nodeProvider.getKey()
})
return await findNewNodes(elastic, nodeProvider, node)
return await findNewNodes(elastic, nodeProvider, node, robotsTxt)
})
)
@ -51,7 +53,7 @@ export const processNextNode = async (
domain: node.domain,
provider: feedProvider.getKey()
})
return await refreshFeeds(elastic, feedProvider, node)
return await refreshFeeds(elastic, feedProvider, node, robotsTxt)
})
)

Wyświetl plik

@ -8,22 +8,6 @@ const assertNodeIndex = async (elastic: ElasticClient): Promise<void> => {
id: 'node',
description: 'Default node pipeline',
processors: [
{
geoip: {
ignore_missing: true,
field: 'serverIps',
properties: [
'location',
'continent_name',
'country_name',
'country_iso_code',
'region_iso_code',
'region_name',
'city_name'
],
target_field: 'geoip'
}
},
{
grok: {
ignore_missing: true,

Wyświetl plik

@ -3071,6 +3071,11 @@ rimraf@^3.0.2:
dependencies:
glob "^7.1.3"
robots-parser@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/robots-parser/-/robots-parser-3.0.0.tgz#66af89306302ecd004455f2f24298310d0966631"
integrity sha512-6xkze3WRdneibICBAzMKcXyTKQw5shA3GbwoEJy7RSvxpZNGF0GMuYKE1T0VMP4fwx/fQs0n0mtriOqRtk5L1w==
run-parallel@^1.1.9:
version "1.2.0"
resolved "https://registry.yarnpkg.com/run-parallel/-/run-parallel-1.2.0.tgz#66d1368da7bdf921eb9d95bd1a9229e7f21a43ee"