2022-11-22 15:37:11 +00:00
|
|
|
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
2021-12-23 14:14:06 +00:00
|
|
|
import { createMissingNodes } from '../../Storage/Nodes/createMissingNodes'
|
|
|
|
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
|
2022-09-14 19:16:04 +00:00
|
|
|
import Node from '../../Storage/Definitions/Node'
|
|
|
|
import { ElasticClient } from '../../Storage/ElasticClient'
|
|
|
|
import isDomainNotBanned from '../../Storage/Nodes/isDomainNotBanned'
|
2021-12-23 14:14:06 +00:00
|
|
|
|
|
|
|
/**
 * Looks up the domains that `provider` reports for one page of `node`'s
 * listing and hands the ones that pass `isDomainNotBanned` to
 * `createMissingNodes`.
 *
 * @param elastic - Client forwarded unchanged to `createMissingNodes`.
 * @param provider - Provider whose `retrieveNodes` is queried and whose key is logged.
 * @param node - Node whose `domain` is queried; its `crawlingDepth + 1` is
 *   passed on to `createMissingNodes`.
 * @param page - Page number forwarded to `provider.retrieveNodes`.
 * @param robotsTxt - Robots.txt object forwarded to `provider.retrieveNodes`.
 * @returns The number resolved by `createMissingNodes`
 *   (presumably the count of newly created nodes — confirm against that helper).
 */
export const findNewNodesOnPage = async (
  elastic: ElasticClient,
  provider: NodeProvider,
  node: Node,
  page: number,
  robotsTxt: RobotsTxt
): Promise<number> => {
  const sourceDomain = node.domain
  const retrieved = await provider.retrieveNodes(sourceDomain, page, robotsTxt)
  // Keep only the domains the ban check lets through.
  const allowed = retrieved.filter(isDomainNotBanned)

  console.log('Found nodes', {
    count: allowed.length,
    domain: node.domain,
    provider: provider.getKey(),
    page: page
  })

  // Domains found from this node are registered one crawl level deeper than it.
  const nextDepth = node.crawlingDepth + 1
  const result = await createMissingNodes(elastic, allowed, node.domain, nextDepth)
  return result
}
|