Added config for seed domain timeout

main
Štěpán Škorpil 2022-11-29 14:11:27 +01:00
rodzic 6a11d19781
commit 1cbd5df5ae
20 zmienionych plików z 86 dodań i 62 usunięć

Wyświetl plik

@ -7,6 +7,7 @@ ENV ELASTIC_URL='http://elastic:9200' \
REFRESH_HOURS='120' \
WAIT_FOR_JOB_MINUTES='60' \
DEFAULT_TIMEOUT_MILLISECONDS='10000' \
SEED_TIMEOUT_MILLISECONDS=${DEFAULT_TIMEOUT_MILLISECONDS} \
BANNED_DOMAINS='' \
MAX_CRAWLING_DEPTH='' \
CRAWLING_VERSION='0' \

Wyświetl plik

@ -24,20 +24,21 @@ Data providers for more apps will be probably added soon (Pull requests are welc
Configuration is done using environmental variables:
| Variable | Description | Default value / Example value |
|--------------------------------|--------------------------------------------------------------------------------------------------|-------------------------------|
| `ELASTIC_URL` | Url address of ElasticSearch server | `http://elastic:9200` |
| `ELASTIC_USER` | Username for ElasticSearch server | `elastic` |
| `ELASTIC_PASSWORD` | Password for ElasticSearch server | empty |
| `SEED_NODE_DOMAIN` | Domain of the first node to search users and other nodes on | `mastodon.social` |
| `REATTEMPT_MINUTES` | _Optional_, How many minutes should be waited for next node refresh attempt if the refresh fails | `60 ` |
| `REFRESH_HOURS` | _Optional_, How often (in hours) should be node info refreshed | `120` |
| `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
| `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh | `10000` |
| `BANNED_DOMAINS` | _Optional_, Domains not to index (even with subdomains) | _empty_ |
| `CRAWLING_VERSION` | _Optional_, Increasing this number can enforce recrawling of the whole index | 0 |
| `MAX_CRAWLING_DEPTH` | _Optional_, Limits how far is fediverse indexed from seed nodes | _empty_ |
| `TZ` | _Optional_, Timezone | `UTC` |
| Variable | Description | Default value / Example value |
|--------------------------------|-----------------------------------------------------------------------------------------------------|-------------------------------------------|
| `ELASTIC_URL` | Url address of ElasticSearch server | `http://elastic:9200` |
| `ELASTIC_USER` | Username for ElasticSearch server | `elastic` |
| `ELASTIC_PASSWORD` | Password for ElasticSearch server | empty |
| `SEED_NODE_DOMAIN` | Comma-separated domains of the seed nodes to start searching users and other nodes on | `mastodon.social,mastodon.online` |
| `REATTEMPT_MINUTES` | _Optional_, How many minutes should be waited for next node refresh attempt if the refresh fails | `60 ` |
| `REFRESH_HOURS` | _Optional_, How often (in hours) should be node info refreshed | `120` |
| `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
| `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh | `10000` |
| `SEED_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh of seed domains | _value of `DEFAULT_TIMEOUT_MILLISECONDS`_ |
| `BANNED_DOMAINS` | _Optional_, Domains not to index (even with subdomains) | _empty_ |
| `CRAWLING_VERSION` | _Optional_, Increasing this number can enforce recrawling of the whole index | 0 |
| `MAX_CRAWLING_DEPTH` | _Optional_, Limits how far is fediverse indexed from seed nodes | _empty_ |
| `TZ` | _Optional_, Timezone | `UTC` |
## Deploy
The app is designed to run in a Docker container and to be deployed using docker-compose.
More info can be found in the [FediSearch example docker-compose](https://github.com/Stopka/fedisearch-compose) project.

Wyświetl plik

@ -14,5 +14,5 @@ export const retrieveDomainNodeInfo = async (
if (typeof link === 'undefined') {
throw new NoSupportedLinkError(domain)
}
return await retrieveNodeInfo(link.href, robotsTxt)
return await retrieveNodeInfo(new URL(link.href), robotsTxt)
}

Wyświetl plik

@ -1,6 +1,6 @@
import { z } from 'zod'
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
import getTimeoutMilliseconds from '../getTimeoutMilliseconds.js'
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
const schema = z.object({
@ -27,10 +27,10 @@ const schema = z.object({
export type NodeInfo = z.infer<typeof schema>
export const retrieveNodeInfo = async (url: string, robotsTxt: RobotsTxt): Promise<NodeInfo> => {
export const retrieveNodeInfo = async (url: URL, robotsTxt: RobotsTxt): Promise<NodeInfo> => {
console.info('Retrieving node info', { url })
const nodeInfoResponse = await robotsTxt.getIfAllowed(url, {
timeout: getDefaultTimeoutMilliseconds()
timeout: getTimeoutMilliseconds(url.hostname)
})
assertSuccessJsonResponse(nodeInfoResponse)
return schema.parse(nodeInfoResponse.data)

Wyświetl plik

@ -1,6 +1,6 @@
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
import getTimeoutMilliseconds from '../getTimeoutMilliseconds.js'
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
const wellKnownSchema = z.object({
@ -16,9 +16,9 @@ export type WellKnown = z.infer<typeof wellKnownSchema>
export const retrieveWellKnown = async (domain: string, robotsTxt: RobotsTxt): Promise<WellKnown> => {
console.info('Retrieving well known', { domain })
const wellKnownUrl = `https://${domain}/.well-known/nodeinfo`
const wellKnownUrl = new URL(`https://${domain}/.well-known/nodeinfo`)
const wellKnownResponse = await robotsTxt.getIfAllowed(wellKnownUrl, {
timeout: getDefaultTimeoutMilliseconds(),
timeout: getTimeoutMilliseconds(domain),
maxContentLength: 5000
})
assertSuccessJsonResponse(wellKnownResponse)

Wyświetl plik

@ -1,6 +1,6 @@
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
import { FeedProviderMethod } from '../FeedProviderMethod'
import { NoMoreFeedsError } from '../NoMoreFeedsError'
import { FeedData } from '../FeedData'
@ -56,13 +56,13 @@ export const retrieveLocalPublicUsersPage: FeedProviderMethod = async (
page,
robotsTxt
): Promise<FeedData[]> => {
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/directory`, {
const response = await robotsTxt.getIfAllowed(new URL(`https://${domain}/api/v1/directory`), {
params: {
limit,
offset: page * limit,
local: true
},
timeout: getDefaultTimeoutMilliseconds()
timeout: getTimeoutMilliseconds(domain)
})
assertSuccessJsonResponse(response)
const responseData = schema.parse(response.data)

Wyświetl plik

@ -1,6 +1,6 @@
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
import { NodeProviderMethod } from '../NodeProviderMethod'
import { NoMoreNodesError } from '../NoMoreNodesError'
@ -11,9 +11,9 @@ export const retrievePeers: NodeProviderMethod = async (domain, page, robotsTxt)
throw new NoMoreNodesError('peer')
}
const response = await robotsTxt.getIfAllowed(
`https://${domain}/api/v1/instance/peers`,
new URL(`https://${domain}/api/v1/instance/peers`),
{
timeout: getDefaultTimeoutMilliseconds()
timeout: getTimeoutMilliseconds(domain)
}
)
assertSuccessJsonResponse(response)

Wyświetl plik

@ -1,6 +1,6 @@
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
import { NodeProviderMethod } from '../NodeProviderMethod'
import { NoMoreNodesError } from '../NoMoreNodesError'
@ -18,7 +18,7 @@ export const retrieveInstancesPage: NodeProviderMethod = async (
robotsTxt
) => {
const response = await robotsTxt.postIfAllowed(
`https://${domain}/api/federation/instances`,
new URL(`https://${domain}/api/federation/instances`),
{
host: null,
blocked: null,
@ -32,7 +32,7 @@ export const retrieveInstancesPage: NodeProviderMethod = async (
sort: '+id'
},
{
timeout: getDefaultTimeoutMilliseconds()
timeout: getTimeoutMilliseconds(domain)
}
)
assertSuccessJsonResponse(response)

Wyświetl plik

@ -1,6 +1,6 @@
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
import { NoMoreFeedsError } from '../NoMoreFeedsError'
import { FeedProviderMethod } from '../FeedProviderMethod'
import { FeedData } from '../FeedData'
@ -71,7 +71,7 @@ export const retrieveUsersPage: FeedProviderMethod = async (
robotsTxt
): Promise<FeedData[]> => {
const response = await robotsTxt.postIfAllowed(
`https://${domain}/api/users`,
new URL(`https://${domain}/api/users`),
{
state: 'all',
origin: 'local',
@ -80,7 +80,7 @@ export const retrieveUsersPage: FeedProviderMethod = async (
offset: limit * page
},
{
timeout: getDefaultTimeoutMilliseconds()
timeout: getTimeoutMilliseconds(domain)
}
)
assertSuccessJsonResponse(response)

Wyświetl plik

@ -1,9 +1,9 @@
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
import { FeedData } from '../FeedData'
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { avatarSchema } from './Avatar'
import { parseAvatarUrl } from './parseAvatarUrl'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
import { parseDescription } from './parseDescription'
import { NoMoreFeedsError } from '../NoMoreFeedsError'
import { FeedProviderMethod } from '../FeedProviderMethod'
@ -29,13 +29,13 @@ const schema = z.object({
})
export const retrieveAccounts: FeedProviderMethod = async (domain, page, robotsTxt) => {
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/accounts`, {
const response = await robotsTxt.getIfAllowed(new URL(`https://${domain}/api/v1/accounts`), {
params: {
count: limit,
sort: 'createdAt',
start: page * limit
},
timeout: getDefaultTimeoutMilliseconds()
timeout: getTimeoutMilliseconds(domain)
})
assertSuccessJsonResponse(response)
const responseData = schema.parse(response.data)

Wyświetl plik

@ -1,6 +1,6 @@
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
import { NodeProviderMethod } from '../NodeProviderMethod'
import { NoMoreNodesError } from '../NoMoreNodesError'
@ -22,14 +22,14 @@ const schema = z.object({
export const retrieveFollowers: NodeProviderMethod = async (domain, page, robotsTxt) => {
const response = await robotsTxt.getIfAllowed(
`https://${domain}/api/v1/server/followers`,
new URL(`https://${domain}/api/v1/server/followers`),
{
params: {
count: limit,
sort: 'createdAt',
start: page * limit
},
timeout: getDefaultTimeoutMilliseconds()
timeout: getTimeoutMilliseconds(domain)
}
)
assertSuccessJsonResponse(response)

Wyświetl plik

@ -1,10 +1,10 @@
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
import { FeedData } from '../FeedData'
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
import { z } from 'zod'
import { FieldData } from '../FieldData'
import { avatarSchema } from './Avatar'
import { parseAvatarUrl } from './parseAvatarUrl'
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
import { parseDescription } from './parseDescription'
import { FeedProviderMethod } from '../FeedProviderMethod'
import { NoMoreFeedsError } from '../NoMoreFeedsError'
@ -40,13 +40,13 @@ export const retrieveVideoChannels: FeedProviderMethod = async (
page,
robotsTxt
) => {
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/video-channels`, {
const response = await robotsTxt.getIfAllowed(new URL(`https://${domain}/api/v1/video-channels`), {
params: {
count: limit,
sort: 'createdAt',
start: page * limit
},
timeout: getDefaultTimeoutMilliseconds()
timeout: getTimeoutMilliseconds(domain)
})
assertSuccessJsonResponse(response)
const responseData = schema.parse(response.data)

Wyświetl plik

@ -1,7 +1,7 @@
import { AxiosRequestConfig, AxiosResponse } from 'axios'
export default interface RobotsTxt {
isAllowed: (url: string) => boolean
getIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(url: string, config?: AxiosRequestConfig<D>) => Promise<R>
postIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(url: string, data?: D, config?: AxiosRequestConfig<D>) => Promise<R>
isAllowed: (url: URL) => boolean
getIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(url: URL, config?: AxiosRequestConfig<D>) => Promise<R>
postIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(url: URL, data?: D, config?: AxiosRequestConfig<D>) => Promise<R>
}

Wyświetl plik

@ -1,7 +1,7 @@
export class RobotsTxtError extends Error {
public readonly url
public constructor (url: string) {
public readonly url: string
public constructor (url: URL) {
super('Request was blocked by robots.txt')
this.url = url
this.url = url.toString()
}
}

Wyświetl plik

@ -1,6 +1,6 @@
import axios, { AxiosRequestConfig, AxiosResponse } from 'axios'
import robotsParser from 'robots-parser'
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds.js'
import getTimeoutMilliseconds from '../getTimeoutMilliseconds.js'
import RobotsTxt from './RobotsTxt.js'
import { RobotsTxtError } from './RobotsTxtError.js'
@ -8,35 +8,35 @@ const userAgent = 'FediCrawl/1.0'
export default async function fetchRobotsTxt (domain: string): Promise<RobotsTxt> {
console.info('Fetching robots.txt', { domain })
const url = `https://${domain}/robots.txt`
const url = new URL(`https://${domain}/robots.txt`)
let content = ''
try {
const robotsTxt = await axios.get(url, {
const robotsTxt = await axios.get(url.toString(), {
headers: { 'User-Agent': userAgent },
timeout: getDefaultTimeoutMilliseconds()
timeout: getTimeoutMilliseconds(domain)
})
content = String(robotsTxt.data)
} catch (error) {
console.info('Robots.txt not found', { error, url })
}
const robots = robotsParser(url, content)
const isAllowed = (url: string): boolean => robots.isAllowed(url, userAgent) ?? true
const robots = robotsParser(url.toString(), content)
const isAllowed = (url: URL): boolean => robots.isAllowed(url.toString(), userAgent) ?? true
return {
isAllowed,
getIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: string, config?: AxiosRequestConfig<D>): Promise<R> => {
getIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: URL, config?: AxiosRequestConfig<D>): Promise<R> => {
if (!isAllowed(url)) {
throw new RobotsTxtError(url)
}
return await axios.get(url, {
return await axios.get(url.toString(), {
headers: { 'User-Agent': userAgent },
...config
})
},
postIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: string, data?: D, config?: AxiosRequestConfig<D>): Promise<R> => {
postIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: URL, data?: D, config?: AxiosRequestConfig<D>): Promise<R> => {
if (!isAllowed(url)) {
throw new RobotsTxtError(url)
}
return await axios.post(url, data, {
return await axios.post(url.toString(), data, {
headers: { 'User-Agent': userAgent },
...config
})

Wyświetl plik

@ -0,0 +1,5 @@
import { getDefaultTimeoutMilliseconds } from './getDefaultTimeoutMilliseconds.js'
/**
 * Resolves the HTTP timeout (in milliseconds) used for requests to seed domains.
 *
 * The value is read from the `SEED_TIMEOUT_MILLISECONDS` environment variable.
 * When the variable is unset, blank, or does not start with an integer, the
 * result of `getDefaultTimeoutMilliseconds()` is returned instead.
 *
 * @returns the timeout in milliseconds; never `NaN` as long as the default is valid
 */
export const getSeedTimeoutMilliseconds = (): number => {
  const configured = process.env.SEED_TIMEOUT_MILLISECONDS?.trim() ?? ''
  // Explicit radix so the value is always read as a decimal number.
  const parsed = parseInt(configured, 10)
  // A variable that is present but blank (e.g. an unresolved `${...}` substitution
  // in a Dockerfile or compose file) is not nullish, so `??` alone would let
  // parseInt('') === NaN through as the timeout. Fall back to the default instead.
  return Number.isNaN(parsed) ? getDefaultTimeoutMilliseconds() : parsed
}

Wyświetl plik

@ -0,0 +1,9 @@
import isSeedDomain from '../Jobs/Seed/isSeedDomain.js'
import { getDefaultTimeoutMilliseconds } from './getDefaultTimeoutMilliseconds.js'
import { getSeedTimeoutMilliseconds } from './getSeedTimeoutMilliseconds.js'
/**
 * Picks the HTTP timeout (in milliseconds) for requests to the given domain.
 *
 * Domains recognised by `isSeedDomain` get the seed timeout; every other
 * domain gets the default timeout.
 *
 * @param domain domain name the request is going to be sent to
 * @returns the timeout in milliseconds
 */
export default function getTimeoutMilliseconds (domain: string): number {
  if (isSeedDomain(domain)) {
    return getSeedTimeoutMilliseconds()
  }
  return getDefaultTimeoutMilliseconds()
}

Wyświetl plik

@ -0,0 +1,5 @@
/**
 * Returns the list of seed domains the crawler starts from.
 *
 * The list is read from the comma-separated `SEED_NODE_DOMAIN` environment
 * variable; when the variable is unset, `mastodon.social,mastodon.online`
 * is used.
 *
 * @returns seed domain names, trimmed and without empty entries
 */
export default function getSeedDomains (): string[] {
  const configured = process.env.SEED_NODE_DOMAIN ?? 'mastodon.social,mastodon.online'
  return configured
    .split(',')
    // Tolerate "a, b" style lists: surrounding whitespace is not part of a domain.
    .map(domain => domain.trim())
    // A trailing/double comma or an empty variable must not yield an empty domain.
    .filter(domain => domain !== '')
}

Wyświetl plik

@ -0,0 +1,5 @@
import getSeedDomains from './getSeedDomains.js'
/**
 * Tells whether the given domain is one of the configured seed domains.
 *
 * The comparison ignores letter case and surrounding whitespace: host names
 * are case-insensitive, and callers may pass `URL.hostname`, which is already
 * lower-cased, while the configured seed list is taken verbatim from the
 * environment.
 *
 * @param domain domain name to look up
 * @returns `true` when the domain matches an entry of `getSeedDomains()`
 */
export default function isSeedDomain (domain: string): boolean {
  const wanted = domain.trim().toLowerCase()
  return getSeedDomains().some(seed => seed.trim().toLowerCase() === wanted)
}

Wyświetl plik

@ -1,6 +1,7 @@
import providerRegistry from './Fediverse/Providers'
import { addNodeSeed } from './Jobs/Seed/addNodeSeed'
import { processNextNode } from './Jobs/processNextNode'
import getSeedDomains from './Jobs/Seed/getSeedDomains.js'
import assertNodeIndex from './Storage/Nodes/assertNodeIndex'
import assertFeedIndex from './Storage/Feeds/assertFeedIndex'
import elasticClient from './Storage/ElasticClient'
@ -34,10 +35,7 @@ const app = async (): Promise<void> => {
await assertNodeIndex(elasticClient)
await assertFeedIndex(elasticClient)
await deleteDomains(elasticClient, getBannedDomains())
const seedDomains = (process.env.SEED_NODE_DOMAIN ?? 'mastodon.social').split(
','
)
await addNodeSeed(elasticClient, seedDomains)
await addNodeSeed(elasticClient, getSeedDomains())
await loop()
}