kopia lustrzana https://github.com/Stopka/fedicrawl
Added domain validation
rodzic
1cbd5df5ae
commit
425abd5af0
|
@ -4,6 +4,7 @@ import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
|
|||
import Node from '../../Storage/Definitions/Node'
|
||||
import { ElasticClient } from '../../Storage/ElasticClient'
|
||||
import isDomainNotBanned from '../../Storage/Nodes/isDomainNotBanned'
|
||||
import isDomainValid from '../../Storage/Nodes/isDomainValid.js'
|
||||
|
||||
export const findNewNodesOnPage = async (
|
||||
elastic: ElasticClient,
|
||||
|
@ -13,7 +14,9 @@ export const findNewNodesOnPage = async (
|
|||
robotsTxt: RobotsTxt
|
||||
): Promise<number> => {
|
||||
let domains = await provider.retrieveNodes(node.domain, page, robotsTxt)
|
||||
domains = domains.filter(isDomainNotBanned)
|
||||
domains = domains.filter(
|
||||
(domain: string): boolean => isDomainValid(domain) && isDomainNotBanned(domain)
|
||||
)
|
||||
console.log('Found nodes', {
|
||||
count: domains.length,
|
||||
domain: node.domain,
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
import fetchRobotsTxt from '../Fediverse/RobotsTxt/fetchRobotsTxt.js'
|
||||
import { fetchNodeToProcess } from '../Storage/Nodes/fetchNodeToProcess'
|
||||
import { ProviderRegistry } from '../Fediverse/Providers/ProviderRegistry'
|
||||
import isDomainValid from '../Storage/Nodes/isDomainValid.js'
|
||||
import { setNodeRefreshed } from '../Storage/Nodes/setNodeRefreshed'
|
||||
import batchPromises from '../Utils/batchPromises.js'
|
||||
import { refreshNodeInfo } from './NodeInfo/refreshNodeInfo'
|
||||
|
@ -13,6 +14,7 @@ import { deleteOldFeeds } from '../Storage/Feeds/deleteOldFeeds'
|
|||
import refreshNodeIps from './Dns/refreshNodeIps'
|
||||
import { ElasticClient } from '../Storage/ElasticClient'
|
||||
import updateNodeFeedStats from './Nodes/updateNodeFeedStats'
|
||||
import deleteDomains from './Seed/deleteBannedNodes.js'
|
||||
|
||||
export const processNextNode = async (
|
||||
elastic: ElasticClient,
|
||||
|
@ -21,7 +23,12 @@ export const processNextNode = async (
|
|||
console.info('#############################################')
|
||||
let node = await fetchNodeToProcess(elastic)
|
||||
node = await setNodeRefreshAttempted(elastic, node)
|
||||
|
||||
// TODO remove check later
|
||||
if (!isDomainValid(node.domain)) {
|
||||
console.info('Node domain is invalid, deleting node', { domain: node.domain })
|
||||
await deleteDomains(elastic, [node.domain])
|
||||
return
|
||||
}
|
||||
node = await refreshNodeIps(elastic, node)
|
||||
const robotsTxt = await fetchRobotsTxt(node.domain)
|
||||
node = await refreshNodeInfo(elastic, node, robotsTxt)
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
/**
 * Checks whether a crawled domain string is usable as the host part of an
 * HTTPS URL.
 *
 * The value is spliced into `https://${domain}/` and parsed. It is accepted
 * only when parsing succeeds AND the result carries nothing but a host (with
 * an optional port): no userinfo, no path, no query and no fragment. Parsing
 * alone is not enough, because strings such as `example.com/path` or
 * `user@example.com` still form a syntactically valid URL.
 *
 * @param domain - Candidate domain name, optionally followed by `:port`.
 * @returns `true` when the value is a bare host, `false` otherwise. Every
 * rejection is logged via `console.info`.
 */
export default function isDomainValid (domain: string): boolean {
  const reject = (): boolean => {
    console.info('Domain is invalid', { domain })
    return false
  }

  // The URL parser silently strips tabs and newlines, so a value containing
  // them would otherwise pass as a different host than the one stored.
  if (/\s/.test(domain)) {
    return reject()
  }

  let url: URL
  try {
    url = new URL(`https://${domain}/`)
  } catch (e) {
    return reject()
  }

  // Anything besides the host (and port) means the input carried URL
  // delimiters such as '/', '?', '#' or '@' and is not a plain domain.
  const isBareHost =
    url.username === '' &&
    url.password === '' &&
    url.pathname === '/' &&
    url.search === '' &&
    url.hash === ''

  return isBareHost ? true : reject()
}
|
Ładowanie…
Reference in New Issue