Added crawling depth limit

main
Štěpán Škorpil 2022-11-24 00:39:17 +01:00
rodzic 0723c2508d
commit 6a11d19781
11 zmienionych plików z 43 dodań i 5 usunięć

Wyświetl plik

@ -8,6 +8,8 @@ ENV ELASTIC_URL='http://elastic:9200' \
WAIT_FOR_JOB_MINUTES='60' \
DEFAULT_TIMEOUT_MILLISECONDS='10000' \
BANNED_DOMAINS='' \
MAX_CRAWLING_DEPTH='' \
CRAWLING_VERSION='0' \
TZ='UTC'
FROM prebuild AS build
WORKDIR /srv

Wyświetl plik

@ -35,6 +35,8 @@ Configuration is done using environmental variables:
| `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
| `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh | `10000` |
| `BANNED_DOMAINS` | _Optional_, Domains not to index (even with subdomains) | _empty_ |
| `CRAWLING_VERSION` | _Optional_, Increasing this number forces recrawling of the whole index | 0 |
| `MAX_CRAWLING_DEPTH` | _Optional_, Limits how far the fediverse is indexed from the seed nodes | _empty_ |
| `TZ` | _Optional_, Timezone | `UTC` |
## Deploy
App is designed to be run in docker container and deployed using docker-compose.

Wyświetl plik

@ -0,0 +1,5 @@
// Base crawling version built into the app; bump this constant in code when
// the crawl format changes and the whole index must be recrawled.
const CRAWLING_VERSION = 0

/**
 * Returns the effective crawling version: the built-in base version plus the
 * optional CRAWLING_VERSION environment variable. Increasing either value
 * forces already-indexed nodes to be recrawled.
 *
 * A missing or malformed environment value contributes 0 — previously a
 * non-numeric value produced NaN, which would have been stored as the
 * node's crawlingVersion.
 */
export default function getCrawlingVersion (): number {
  // Radix 10: the env value is expected to be a plain decimal integer.
  const envVersion = Number.parseInt(process.env.CRAWLING_VERSION ?? '0', 10)
  return CRAWLING_VERSION + (Number.isNaN(envVersion) ? 0 : envVersion)
}

Wyświetl plik

@ -0,0 +1,10 @@
/**
 * Reads the optional MAX_CRAWLING_DEPTH environment variable, which limits
 * how many hops away from the seed nodes the crawler will discover new nodes.
 *
 * @returns the configured non-negative depth, or undefined (meaning: no
 * depth limit) when the variable is unset, empty, negative, or not a valid
 * decimal integer.
 */
export default function getMaxCrawlingDepth (): number | undefined {
  const raw = process.env.MAX_CRAWLING_DEPTH
  if (raw === undefined || raw === '') {
    return undefined
  }
  // Radix 10 so a value such as '0x10' is not silently parsed as hex.
  const depth = Number.parseInt(raw, 10)
  // NaN fails this comparison, so malformed values also disable the limit.
  if (depth >= 0) {
    return depth
  }
  return undefined
}

Wyświetl plik

@ -1,3 +1,4 @@
import getMaxCrawlingDepth from '../../Fediverse/getMaxCrawlingDepth.js'
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
import { findNewNodesOnPage } from './findNewNodesOnPage'
@ -10,6 +11,13 @@ export const findNewNodes = async (
node: Node,
robotsTxt: RobotsTxt
): Promise<void> => {
const maxCrawlingDepth = getMaxCrawlingDepth()
if (maxCrawlingDepth !== undefined && node.crawlingDepth >= maxCrawlingDepth) {
console.info('Skipping finding nodes, max crawling depth reached', {
maxCrawlingDepth
})
return
}
try {
// noinspection InfiniteLoopJS
for (let page = 0; true; page++) {

Wyświetl plik

@ -20,5 +20,5 @@ export const findNewNodesOnPage = async (
provider: provider.getKey(),
page
})
return await createMissingNodes(elastic, domains, node.domain)
return await createMissingNodes(elastic, domains, node.domain, node.crawlingDepth + 1)
}

Wyświetl plik

@ -6,6 +6,6 @@ export const addNodeSeed = async (
domains: string[]
): Promise<boolean> => {
console.info('Trying to add seed domain nodes', { domains })
const result = await createMissingNodes(elastic, domains, undefined)
const result = await createMissingNodes(elastic, domains, undefined, 0)
return result > 0
}

Wyświetl plik

@ -20,6 +20,8 @@ interface Node {
discoveredByDomain?: string
accountFeedCount?: number
channelFeedCount?: number
crawlingDepth: number
crawlingVersion: number
}
export default Node

Wyświetl plik

@ -69,9 +69,12 @@ const assertNodeIndex = async (elastic: ElasticClient): Promise<void> => {
}
},
accountFeedCount: { type: 'integer' },
channelFeedCount: { type: 'integer' }
channelFeedCount: { type: 'integer' },
crawlingDepth: { type: 'integer' },
crawlingVersion: { type: 'integer' }
}
})
await elastic.indices.refresh({ index: nodeIndex })
}

Wyświetl plik

@ -1,9 +1,11 @@
import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
import { ElasticClient } from '../ElasticClient'
import nodeIndex from '../Definitions/nodeIndex'
export const createMissingNodes = async (
elastic: ElasticClient,
domains: string[],
discoveredByDomain: string | undefined
discoveredByDomain: string | undefined,
crawlingDepth: number
): Promise<number> => {
const response = await elastic.bulk({
index: nodeIndex,
@ -14,6 +16,8 @@ export const createMissingNodes = async (
{
domain,
discoveredByDomain,
crawlingDepth,
crawlingVersion: getCrawlingVersion(),
foundAt: new Date().getTime()
}
])

Wyświetl plik

@ -1,3 +1,4 @@
import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
import { ElasticClient } from '../ElasticClient'
import nodeIndex from '../Definitions/nodeIndex'
import Node from '../Definitions/Node'
@ -14,7 +15,8 @@ export const setNodeRefreshed = async (
index: nodeIndex,
id: node.domain,
doc: {
refreshedAt: date.getTime()
refreshedAt: date.getTime(),
crawlingVersion: getCrawlingVersion()
}
})
return assertDefined(