kopia lustrzana https://github.com/Stopka/fedicrawl
Added config for seed domain timeout
rodzic
6a11d19781
commit
1cbd5df5ae
|
@ -7,6 +7,7 @@ ENV ELASTIC_URL='http://elastic:9200' \
|
||||||
REFRESH_HOURS='120' \
|
REFRESH_HOURS='120' \
|
||||||
WAIT_FOR_JOB_MINUTES='60' \
|
WAIT_FOR_JOB_MINUTES='60' \
|
||||||
DEFAULT_TIMEOUT_MILLISECONDS='10000' \
|
DEFAULT_TIMEOUT_MILLISECONDS='10000' \
|
||||||
|
SEED_TIMEOUT_MILLISECONDS=${DEFAULT_TIMEOUT_MILLISECONDS} \
|
||||||
BANNED_DOMAINS='' \
|
BANNED_DOMAINS='' \
|
||||||
MAX_CRAWLING_DEPTH='' \
|
MAX_CRAWLING_DEPTH='' \
|
||||||
CRAWLING_VERSION='0' \
|
CRAWLING_VERSION='0' \
|
||||||
|
|
29
README.md
29
README.md
|
@ -24,20 +24,21 @@ Data providers for more apps will be probably added soon (Pull requests are welc
|
||||||
|
|
||||||
Configuration is done using environmental variables:
|
Configuration is done using environmental variables:
|
||||||
|
|
||||||
| Variable | Description | Default value / Example value |
|
| Variable | Description | Default value / Example value |
|
||||||
|--------------------------------|--------------------------------------------------------------------------------------------------|-------------------------------|
|
|--------------------------------|-----------------------------------------------------------------------------------------------------|-------------------------------------------|
|
||||||
| `ELASTIC_URL` | Url address of ElasticSearch server | `http://elastic:9200` |
|
| `ELASTIC_URL` | Url address of ElasticSearch server | `http://elastic:9200` |
|
||||||
| `ELASTIC_USER` | Username for ElasticSearch server | `elastic` |
|
| `ELASTIC_USER` | Username for ElasticSearch server | `elastic` |
|
||||||
| `ELASTIC_PASSWORD` | Password for ElasticSearch server | empty |
|
| `ELASTIC_PASSWORD` | Password for ElasticSearch server | empty |
|
||||||
| `SEED_NODE_DOMAIN` | Domain of the first node to search users and other nodes on | `mastodon.social` |
|
| `SEED_NODE_DOMAIN` | Domain of the first node to search users and other nodes on | `mastodon.social,mastodon.online` |
|
||||||
| `REATTEMPT_MINUTES` | _Optional_, How many minutes should be waited for next node refresh attempt if the refresh fails | `60 ` |
|
| `REATTEMPT_MINUTES` | _Optional_, How many minutes should be waited for next node refresh attempt if the refresh fails | `60 ` |
|
||||||
| `REFRESH_HOURS` | _Optional_, How often (in hours) should be node info refreshed | `120` |
|
| `REFRESH_HOURS` | _Optional_, How often (in hours) should be node info refreshed | `120` |
|
||||||
| `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
|
| `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
|
||||||
| `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh | `10000` |
|
| `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh | `10000` |
|
||||||
| `BANNED_DOMAINS` | _Optional_, Domains not to index (even with subdomains) | _empty_ |
|
| `SEED_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh of seed domains | _value of `DEFAULT_TIMEOUT_MILLISECONDS`_ |
|
||||||
| `CRAWLING_VERSION` | _Optional_, Increasing this number can enforce recrawling of the whole index | 0 |
|
| `BANNED_DOMAINS` | _Optional_, Domains not to index (even with subdomains) | _empty_ |
|
||||||
| `MAX_CRAWLING_DEPTH` | _Optional_, Limits how far is fediverse indexed from seed nodes | _empty_ |
|
| `CRAWLING_VERSION` | _Optional_, Increasing this number can enforce recrawling of the whole index | 0 |
|
||||||
| `TZ` | _Optional_, Timezone | `UTC` |
|
| `MAX_CRAWLING_DEPTH` | _Optional_, Limits how far is fediverse indexed from seed nodes | _empty_ |
|
||||||
|
| `TZ` | _Optional_, Timezone | `UTC` |
|
||||||
## Deploy
|
## Deploy
|
||||||
App is designed to be run in docker container and deployed using docker-compose.
|
App is designed to be run in docker container and deployed using docker-compose.
|
||||||
More info can be found in [FediSearch example docker-compose](https://github.com/Stopka/fedisearch-compose) project
|
More info can be found in [FediSearch example docker-compose](https://github.com/Stopka/fedisearch-compose) project
|
||||||
|
|
|
@ -14,5 +14,5 @@ export const retrieveDomainNodeInfo = async (
|
||||||
if (typeof link === 'undefined') {
|
if (typeof link === 'undefined') {
|
||||||
throw new NoSupportedLinkError(domain)
|
throw new NoSupportedLinkError(domain)
|
||||||
}
|
}
|
||||||
return await retrieveNodeInfo(link.href, robotsTxt)
|
return await retrieveNodeInfo(new URL(link.href), robotsTxt)
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
import getTimeoutMilliseconds from '../getTimeoutMilliseconds.js'
|
||||||
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||||
|
|
||||||
const schema = z.object({
|
const schema = z.object({
|
||||||
|
@ -27,10 +27,10 @@ const schema = z.object({
|
||||||
|
|
||||||
export type NodeInfo = z.infer<typeof schema>
|
export type NodeInfo = z.infer<typeof schema>
|
||||||
|
|
||||||
export const retrieveNodeInfo = async (url: string, robotsTxt: RobotsTxt): Promise<NodeInfo> => {
|
export const retrieveNodeInfo = async (url: URL, robotsTxt: RobotsTxt): Promise<NodeInfo> => {
|
||||||
console.info('Retrieving node info', { url })
|
console.info('Retrieving node info', { url })
|
||||||
const nodeInfoResponse = await robotsTxt.getIfAllowed(url, {
|
const nodeInfoResponse = await robotsTxt.getIfAllowed(url, {
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getTimeoutMilliseconds(url.hostname)
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(nodeInfoResponse)
|
assertSuccessJsonResponse(nodeInfoResponse)
|
||||||
return schema.parse(nodeInfoResponse.data)
|
return schema.parse(nodeInfoResponse.data)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds'
|
import getTimeoutMilliseconds from '../getTimeoutMilliseconds.js'
|
||||||
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
import RobotsTxt from '../RobotsTxt/RobotsTxt.js'
|
||||||
|
|
||||||
const wellKnownSchema = z.object({
|
const wellKnownSchema = z.object({
|
||||||
|
@ -16,9 +16,9 @@ export type WellKnown = z.infer<typeof wellKnownSchema>
|
||||||
|
|
||||||
export const retrieveWellKnown = async (domain: string, robotsTxt: RobotsTxt): Promise<WellKnown> => {
|
export const retrieveWellKnown = async (domain: string, robotsTxt: RobotsTxt): Promise<WellKnown> => {
|
||||||
console.info('Retrieving well known', { domain })
|
console.info('Retrieving well known', { domain })
|
||||||
const wellKnownUrl = `https://${domain}/.well-known/nodeinfo`
|
const wellKnownUrl = new URL(`https://${domain}/.well-known/nodeinfo`)
|
||||||
const wellKnownResponse = await robotsTxt.getIfAllowed(wellKnownUrl, {
|
const wellKnownResponse = await robotsTxt.getIfAllowed(wellKnownUrl, {
|
||||||
timeout: getDefaultTimeoutMilliseconds(),
|
timeout: getTimeoutMilliseconds(domain),
|
||||||
maxContentLength: 5000
|
maxContentLength: 5000
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(wellKnownResponse)
|
assertSuccessJsonResponse(wellKnownResponse)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
|
||||||
import { FeedProviderMethod } from '../FeedProviderMethod'
|
import { FeedProviderMethod } from '../FeedProviderMethod'
|
||||||
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
||||||
import { FeedData } from '../FeedData'
|
import { FeedData } from '../FeedData'
|
||||||
|
@ -56,13 +56,13 @@ export const retrieveLocalPublicUsersPage: FeedProviderMethod = async (
|
||||||
page,
|
page,
|
||||||
robotsTxt
|
robotsTxt
|
||||||
): Promise<FeedData[]> => {
|
): Promise<FeedData[]> => {
|
||||||
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/directory`, {
|
const response = await robotsTxt.getIfAllowed(new URL(`https://${domain}/api/v1/directory`), {
|
||||||
params: {
|
params: {
|
||||||
limit,
|
limit,
|
||||||
offset: page * limit,
|
offset: page * limit,
|
||||||
local: true
|
local: true
|
||||||
},
|
},
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getTimeoutMilliseconds(domain)
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
const responseData = schema.parse(response.data)
|
const responseData = schema.parse(response.data)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
|
||||||
import { NodeProviderMethod } from '../NodeProviderMethod'
|
import { NodeProviderMethod } from '../NodeProviderMethod'
|
||||||
import { NoMoreNodesError } from '../NoMoreNodesError'
|
import { NoMoreNodesError } from '../NoMoreNodesError'
|
||||||
|
|
||||||
|
@ -11,9 +11,9 @@ export const retrievePeers: NodeProviderMethod = async (domain, page, robotsTxt)
|
||||||
throw new NoMoreNodesError('peer')
|
throw new NoMoreNodesError('peer')
|
||||||
}
|
}
|
||||||
const response = await robotsTxt.getIfAllowed(
|
const response = await robotsTxt.getIfAllowed(
|
||||||
`https://${domain}/api/v1/instance/peers`,
|
new URL(`https://${domain}/api/v1/instance/peers`),
|
||||||
{
|
{
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getTimeoutMilliseconds(domain)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
|
||||||
import { NodeProviderMethod } from '../NodeProviderMethod'
|
import { NodeProviderMethod } from '../NodeProviderMethod'
|
||||||
import { NoMoreNodesError } from '../NoMoreNodesError'
|
import { NoMoreNodesError } from '../NoMoreNodesError'
|
||||||
|
|
||||||
|
@ -18,7 +18,7 @@ export const retrieveInstancesPage: NodeProviderMethod = async (
|
||||||
robotsTxt
|
robotsTxt
|
||||||
) => {
|
) => {
|
||||||
const response = await robotsTxt.postIfAllowed(
|
const response = await robotsTxt.postIfAllowed(
|
||||||
`https://${domain}/api/federation/instances`,
|
new URL(`https://${domain}/api/federation/instances`),
|
||||||
{
|
{
|
||||||
host: null,
|
host: null,
|
||||||
blocked: null,
|
blocked: null,
|
||||||
|
@ -32,7 +32,7 @@ export const retrieveInstancesPage: NodeProviderMethod = async (
|
||||||
sort: '+id'
|
sort: '+id'
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getTimeoutMilliseconds(domain)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
|
||||||
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
||||||
import { FeedProviderMethod } from '../FeedProviderMethod'
|
import { FeedProviderMethod } from '../FeedProviderMethod'
|
||||||
import { FeedData } from '../FeedData'
|
import { FeedData } from '../FeedData'
|
||||||
|
@ -71,7 +71,7 @@ export const retrieveUsersPage: FeedProviderMethod = async (
|
||||||
robotsTxt
|
robotsTxt
|
||||||
): Promise<FeedData[]> => {
|
): Promise<FeedData[]> => {
|
||||||
const response = await robotsTxt.postIfAllowed(
|
const response = await robotsTxt.postIfAllowed(
|
||||||
`https://${domain}/api/users`,
|
new URL(`https://${domain}/api/users`),
|
||||||
{
|
{
|
||||||
state: 'all',
|
state: 'all',
|
||||||
origin: 'local',
|
origin: 'local',
|
||||||
|
@ -80,7 +80,7 @@ export const retrieveUsersPage: FeedProviderMethod = async (
|
||||||
offset: limit * page
|
offset: limit * page
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getTimeoutMilliseconds(domain)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
|
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
|
||||||
import { FeedData } from '../FeedData'
|
import { FeedData } from '../FeedData'
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { avatarSchema } from './Avatar'
|
import { avatarSchema } from './Avatar'
|
||||||
import { parseAvatarUrl } from './parseAvatarUrl'
|
import { parseAvatarUrl } from './parseAvatarUrl'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
|
||||||
import { parseDescription } from './parseDescription'
|
import { parseDescription } from './parseDescription'
|
||||||
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
||||||
import { FeedProviderMethod } from '../FeedProviderMethod'
|
import { FeedProviderMethod } from '../FeedProviderMethod'
|
||||||
|
@ -29,13 +29,13 @@ const schema = z.object({
|
||||||
})
|
})
|
||||||
|
|
||||||
export const retrieveAccounts: FeedProviderMethod = async (domain, page, robotsTxt) => {
|
export const retrieveAccounts: FeedProviderMethod = async (domain, page, robotsTxt) => {
|
||||||
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/accounts`, {
|
const response = await robotsTxt.getIfAllowed(new URL(`https://${domain}/api/v1/accounts`), {
|
||||||
params: {
|
params: {
|
||||||
count: limit,
|
count: limit,
|
||||||
sort: 'createdAt',
|
sort: 'createdAt',
|
||||||
start: page * limit
|
start: page * limit
|
||||||
},
|
},
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getTimeoutMilliseconds(domain)
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
const responseData = schema.parse(response.data)
|
const responseData = schema.parse(response.data)
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
|
||||||
import { NodeProviderMethod } from '../NodeProviderMethod'
|
import { NodeProviderMethod } from '../NodeProviderMethod'
|
||||||
import { NoMoreNodesError } from '../NoMoreNodesError'
|
import { NoMoreNodesError } from '../NoMoreNodesError'
|
||||||
|
|
||||||
|
@ -22,14 +22,14 @@ const schema = z.object({
|
||||||
|
|
||||||
export const retrieveFollowers: NodeProviderMethod = async (domain, page, robotsTxt) => {
|
export const retrieveFollowers: NodeProviderMethod = async (domain, page, robotsTxt) => {
|
||||||
const response = await robotsTxt.getIfAllowed(
|
const response = await robotsTxt.getIfAllowed(
|
||||||
`https://${domain}/api/v1/server/followers`,
|
new URL(`https://${domain}/api/v1/server/followers`),
|
||||||
{
|
{
|
||||||
params: {
|
params: {
|
||||||
count: limit,
|
count: limit,
|
||||||
sort: 'createdAt',
|
sort: 'createdAt',
|
||||||
start: page * limit
|
start: page * limit
|
||||||
},
|
},
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getTimeoutMilliseconds(domain)
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
|
import getTimeoutMilliseconds from '../../getTimeoutMilliseconds.js'
|
||||||
import { FeedData } from '../FeedData'
|
import { FeedData } from '../FeedData'
|
||||||
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
import { assertSuccessJsonResponse } from '../../assertSuccessJsonResponse'
|
||||||
import { z } from 'zod'
|
import { z } from 'zod'
|
||||||
import { FieldData } from '../FieldData'
|
import { FieldData } from '../FieldData'
|
||||||
import { avatarSchema } from './Avatar'
|
import { avatarSchema } from './Avatar'
|
||||||
import { parseAvatarUrl } from './parseAvatarUrl'
|
import { parseAvatarUrl } from './parseAvatarUrl'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../../getDefaultTimeoutMilliseconds'
|
|
||||||
import { parseDescription } from './parseDescription'
|
import { parseDescription } from './parseDescription'
|
||||||
import { FeedProviderMethod } from '../FeedProviderMethod'
|
import { FeedProviderMethod } from '../FeedProviderMethod'
|
||||||
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
import { NoMoreFeedsError } from '../NoMoreFeedsError'
|
||||||
|
@ -40,13 +40,13 @@ export const retrieveVideoChannels: FeedProviderMethod = async (
|
||||||
page,
|
page,
|
||||||
robotsTxt
|
robotsTxt
|
||||||
) => {
|
) => {
|
||||||
const response = await robotsTxt.getIfAllowed(`https://${domain}/api/v1/video-channels`, {
|
const response = await robotsTxt.getIfAllowed(new URL(`https://${domain}/api/v1/video-channels`), {
|
||||||
params: {
|
params: {
|
||||||
count: limit,
|
count: limit,
|
||||||
sort: 'createdAt',
|
sort: 'createdAt',
|
||||||
start: page * limit
|
start: page * limit
|
||||||
},
|
},
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getTimeoutMilliseconds(domain)
|
||||||
})
|
})
|
||||||
assertSuccessJsonResponse(response)
|
assertSuccessJsonResponse(response)
|
||||||
const responseData = schema.parse(response.data)
|
const responseData = schema.parse(response.data)
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
import { AxiosRequestConfig, AxiosResponse } from 'axios'
|
import { AxiosRequestConfig, AxiosResponse } from 'axios'
|
||||||
|
|
||||||
export default interface RobotsTxt {
|
export default interface RobotsTxt {
|
||||||
isAllowed: (url: string) => boolean
|
isAllowed: (url: URL) => boolean
|
||||||
getIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(url: string, config?: AxiosRequestConfig<D>) => Promise<R>
|
getIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(url: URL, config?: AxiosRequestConfig<D>) => Promise<R>
|
||||||
postIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(url: string, data?: D, config?: AxiosRequestConfig<D>) => Promise<R>
|
postIfAllowed: <T = any, R = AxiosResponse<T>, D = any>(url: URL, data?: D, config?: AxiosRequestConfig<D>) => Promise<R>
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
export class RobotsTxtError extends Error {
|
export class RobotsTxtError extends Error {
|
||||||
public readonly url
|
public readonly url: string
|
||||||
public constructor (url: string) {
|
public constructor (url: URL) {
|
||||||
super('Request was blocked by robots.txt')
|
super('Request was blocked by robots.txt')
|
||||||
this.url = url
|
this.url = url.toString()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
import axios, { AxiosRequestConfig, AxiosResponse } from 'axios'
|
import axios, { AxiosRequestConfig, AxiosResponse } from 'axios'
|
||||||
import robotsParser from 'robots-parser'
|
import robotsParser from 'robots-parser'
|
||||||
import { getDefaultTimeoutMilliseconds } from '../getDefaultTimeoutMilliseconds.js'
|
import getTimeoutMilliseconds from '../getTimeoutMilliseconds.js'
|
||||||
import RobotsTxt from './RobotsTxt.js'
|
import RobotsTxt from './RobotsTxt.js'
|
||||||
import { RobotsTxtError } from './RobotsTxtError.js'
|
import { RobotsTxtError } from './RobotsTxtError.js'
|
||||||
|
|
||||||
|
@ -8,35 +8,35 @@ const userAgent = 'FediCrawl/1.0'
|
||||||
|
|
||||||
export default async function fetchRobotsTxt (domain: string): Promise<RobotsTxt> {
|
export default async function fetchRobotsTxt (domain: string): Promise<RobotsTxt> {
|
||||||
console.info('Fetching robots.txt', { domain })
|
console.info('Fetching robots.txt', { domain })
|
||||||
const url = `https://${domain}/robots.txt`
|
const url = new URL(`https://${domain}/robots.txt`)
|
||||||
let content = ''
|
let content = ''
|
||||||
try {
|
try {
|
||||||
const robotsTxt = await axios.get(url, {
|
const robotsTxt = await axios.get(url.toString(), {
|
||||||
headers: { 'User-Agent': userAgent },
|
headers: { 'User-Agent': userAgent },
|
||||||
timeout: getDefaultTimeoutMilliseconds()
|
timeout: getTimeoutMilliseconds(domain)
|
||||||
})
|
})
|
||||||
content = String(robotsTxt.data)
|
content = String(robotsTxt.data)
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.info('Robots.txt not found', { error, url })
|
console.info('Robots.txt not found', { error, url })
|
||||||
}
|
}
|
||||||
const robots = robotsParser(url, content)
|
const robots = robotsParser(url.toString(), content)
|
||||||
const isAllowed = (url: string): boolean => robots.isAllowed(url, userAgent) ?? true
|
const isAllowed = (url: URL): boolean => robots.isAllowed(url.toString(), userAgent) ?? true
|
||||||
return {
|
return {
|
||||||
isAllowed,
|
isAllowed,
|
||||||
getIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: string, config?: AxiosRequestConfig<D>): Promise<R> => {
|
getIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: URL, config?: AxiosRequestConfig<D>): Promise<R> => {
|
||||||
if (!isAllowed(url)) {
|
if (!isAllowed(url)) {
|
||||||
throw new RobotsTxtError(url)
|
throw new RobotsTxtError(url)
|
||||||
}
|
}
|
||||||
return await axios.get(url, {
|
return await axios.get(url.toString(), {
|
||||||
headers: { 'User-Agent': userAgent },
|
headers: { 'User-Agent': userAgent },
|
||||||
...config
|
...config
|
||||||
})
|
})
|
||||||
},
|
},
|
||||||
postIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: string, data?: D, config?: AxiosRequestConfig<D>): Promise<R> => {
|
postIfAllowed: async <T = any, R = AxiosResponse<T>, D = any>(url: URL, data?: D, config?: AxiosRequestConfig<D>): Promise<R> => {
|
||||||
if (!isAllowed(url)) {
|
if (!isAllowed(url)) {
|
||||||
throw new RobotsTxtError(url)
|
throw new RobotsTxtError(url)
|
||||||
}
|
}
|
||||||
return await axios.post(url, data, {
|
return await axios.post(url.toString(), data, {
|
||||||
headers: { 'User-Agent': userAgent },
|
headers: { 'User-Agent': userAgent },
|
||||||
...config
|
...config
|
||||||
})
|
})
|
||||||
|
|
|
@ -0,0 +1,5 @@
|
||||||
|
import { getDefaultTimeoutMilliseconds } from './getDefaultTimeoutMilliseconds.js'
|
||||||
|
|
||||||
|
/**
 * Resolves the HTTP timeout, in milliseconds, used for requests to seed domains.
 *
 * The value is read from the `SEED_TIMEOUT_MILLISECONDS` environment variable
 * and parsed as a base-10 integer. If the variable is unset, blank, or does not
 * start with a number, the default timeout is returned instead.
 *
 * @returns the seed timeout in milliseconds, never `NaN`
 */
export const getSeedTimeoutMilliseconds = (): number => {
  // A blank string is not nullish, so `??` alone would pass it to parseInt,
  // which yields NaN; the NaN check below covers unset, blank and garbage values.
  const configured = parseInt(process.env.SEED_TIMEOUT_MILLISECONDS ?? '', 10)
  return Number.isNaN(configured) ? getDefaultTimeoutMilliseconds() : configured
}
|
|
@ -0,0 +1,9 @@
|
||||||
|
import isSeedDomain from '../Jobs/Seed/isSeedDomain.js'
|
||||||
|
import { getDefaultTimeoutMilliseconds } from './getDefaultTimeoutMilliseconds.js'
|
||||||
|
import { getSeedTimeoutMilliseconds } from './getSeedTimeoutMilliseconds.js'
|
||||||
|
|
||||||
|
/**
 * Picks the HTTP timeout to use for requests to the given domain.
 *
 * @param domain domain the request is going to be sent to
 * @returns the seed timeout when `isSeedDomain(domain)` is true,
 *          otherwise the default timeout (both in milliseconds)
 */
export default function getTimeoutMilliseconds (domain: string): number {
  if (isSeedDomain(domain)) {
    return getSeedTimeoutMilliseconds()
  }
  return getDefaultTimeoutMilliseconds()
}
|
|
@ -0,0 +1,5 @@
|
||||||
|
/**
 * Returns the list of seed domains the crawler starts from.
 *
 * The list is read from the comma-separated `SEED_NODE_DOMAIN` environment
 * variable. Whitespace around each entry is removed and empty entries
 * (for example from a trailing comma) are dropped. If the variable is unset,
 * or contains no usable entry, the built-in default seeds are returned.
 *
 * @returns non-empty array of seed domain names
 */
export default function getSeedDomains (): string[] {
  const defaultSeeds = ['mastodon.social', 'mastodon.online']
  const configured = (process.env.SEED_NODE_DOMAIN ?? '')
    .split(',')
    .map(entry => entry.trim())
    // Keep only real entries so callers never receive '' as a domain.
    .filter(entry => entry !== '')
  return configured.length > 0 ? configured : defaultSeeds
}
|
|
@ -0,0 +1,5 @@
|
||||||
|
import getSeedDomains from './getSeedDomains.js'
|
||||||
|
|
||||||
|
/**
 * Tells whether the given domain is one of the configured seed domains.
 *
 * Domain names are compared case-insensitively and with surrounding
 * whitespace ignored, so a seed configured as `Example.ORG` still matches a
 * lower-cased hostname such as the one produced by `URL.hostname`.
 *
 * @param domain domain name to look up
 * @returns `true` when the domain matches an entry of `getSeedDomains()`
 */
export default function isSeedDomain (domain: string): boolean {
  const normalize = (value: string): string => value.trim().toLowerCase()
  const wanted = normalize(domain)
  return getSeedDomains().some(seed => normalize(seed) === wanted)
}
|
|
@ -1,6 +1,7 @@
|
||||||
import providerRegistry from './Fediverse/Providers'
|
import providerRegistry from './Fediverse/Providers'
|
||||||
import { addNodeSeed } from './Jobs/Seed/addNodeSeed'
|
import { addNodeSeed } from './Jobs/Seed/addNodeSeed'
|
||||||
import { processNextNode } from './Jobs/processNextNode'
|
import { processNextNode } from './Jobs/processNextNode'
|
||||||
|
import getSeedDomains from './Jobs/Seed/getSeedDomains.js'
|
||||||
import assertNodeIndex from './Storage/Nodes/assertNodeIndex'
|
import assertNodeIndex from './Storage/Nodes/assertNodeIndex'
|
||||||
import assertFeedIndex from './Storage/Feeds/assertFeedIndex'
|
import assertFeedIndex from './Storage/Feeds/assertFeedIndex'
|
||||||
import elasticClient from './Storage/ElasticClient'
|
import elasticClient from './Storage/ElasticClient'
|
||||||
|
@ -34,10 +35,7 @@ const app = async (): Promise<void> => {
|
||||||
await assertNodeIndex(elasticClient)
|
await assertNodeIndex(elasticClient)
|
||||||
await assertFeedIndex(elasticClient)
|
await assertFeedIndex(elasticClient)
|
||||||
await deleteDomains(elasticClient, getBannedDomains())
|
await deleteDomains(elasticClient, getBannedDomains())
|
||||||
const seedDomains = (process.env.SEED_NODE_DOMAIN ?? 'mastodon.social').split(
|
await addNodeSeed(elasticClient, getSeedDomains())
|
||||||
','
|
|
||||||
)
|
|
||||||
await addNodeSeed(elasticClient, seedDomains)
|
|
||||||
await loop()
|
await loop()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Ładowanie…
Reference in New Issue