fedicrawl/application/src/Jobs/Nodes/findNewNodesOnPage.ts

25 wiersze
903 B
TypeScript
Czysty Zwykły widok Historia

2022-11-22 15:37:11 +00:00
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
2021-12-23 14:14:06 +00:00
import { createMissingNodes } from '../../Storage/Nodes/createMissingNodes'
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
2022-09-14 19:16:04 +00:00
import Node from '../../Storage/Definitions/Node'
import { ElasticClient } from '../../Storage/ElasticClient'
import isDomainNotBanned from '../../Storage/Nodes/isDomainNotBanned'
2021-12-23 14:14:06 +00:00
/**
 * Processes one page of domains that `provider` reports for `node`.
 *
 * The retrieved domains are filtered through `isDomainNotBanned`, the
 * surviving count is logged, and the remaining domains are handed to
 * `createMissingNodes`.
 *
 * @param elastic - Client passed through to `createMissingNodes`.
 * @param provider - Source queried via `retrieveNodes`; `getKey()` is used only for the log entry.
 * @param node - Node whose `domain` is queried and whose `crawlingDepth` seeds the depth of the new nodes.
 * @param page - Page identifier forwarded unchanged to the provider.
 * @param robotsTxt - Forwarded unchanged to the provider (presumably so it can honour robots.txt rules; confirm in the provider).
 * @returns The value `createMissingNodes` resolves to (presumably the number of nodes created; confirm in `createMissingNodes`).
 */
export const findNewNodesOnPage = async (
  elastic: ElasticClient,
  provider: NodeProvider,
  node: Node,
  page: number,
  robotsTxt: RobotsTxt
): Promise<number> => {
  const retrieved = await provider.retrieveNodes(node.domain, page, robotsTxt)
  const domains = retrieved.filter(isDomainNotBanned)
  console.log('Found nodes', {
    count: domains.length,
    domain: node.domain,
    provider: provider.getKey(),
    page
  })
  // Domains discovered here are registered one level deeper than the node that listed them.
  return await createMissingNodes(elastic, domains, node.domain, node.crawlingDepth + 1)
}