kopia lustrzana https://github.com/Stopka/fedicrawl
Added domain validation
rodzic
1cbd5df5ae
commit
425abd5af0
|
@ -4,6 +4,7 @@ import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
|
||||||
import Node from '../../Storage/Definitions/Node'
|
import Node from '../../Storage/Definitions/Node'
|
||||||
import { ElasticClient } from '../../Storage/ElasticClient'
|
import { ElasticClient } from '../../Storage/ElasticClient'
|
||||||
import isDomainNotBanned from '../../Storage/Nodes/isDomainNotBanned'
|
import isDomainNotBanned from '../../Storage/Nodes/isDomainNotBanned'
|
||||||
|
import isDomainValid from '../../Storage/Nodes/isDomainValid.js'
|
||||||
|
|
||||||
export const findNewNodesOnPage = async (
|
export const findNewNodesOnPage = async (
|
||||||
elastic: ElasticClient,
|
elastic: ElasticClient,
|
||||||
|
@ -13,7 +14,9 @@ export const findNewNodesOnPage = async (
|
||||||
robotsTxt: RobotsTxt
|
robotsTxt: RobotsTxt
|
||||||
): Promise<number> => {
|
): Promise<number> => {
|
||||||
let domains = await provider.retrieveNodes(node.domain, page, robotsTxt)
|
let domains = await provider.retrieveNodes(node.domain, page, robotsTxt)
|
||||||
domains = domains.filter(isDomainNotBanned)
|
domains = domains.filter(
|
||||||
|
(domain: string): boolean => isDomainValid(domain) && isDomainNotBanned(domain)
|
||||||
|
)
|
||||||
console.log('Found nodes', {
|
console.log('Found nodes', {
|
||||||
count: domains.length,
|
count: domains.length,
|
||||||
domain: node.domain,
|
domain: node.domain,
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
import fetchRobotsTxt from '../Fediverse/RobotsTxt/fetchRobotsTxt.js'
|
import fetchRobotsTxt from '../Fediverse/RobotsTxt/fetchRobotsTxt.js'
|
||||||
import { fetchNodeToProcess } from '../Storage/Nodes/fetchNodeToProcess'
|
import { fetchNodeToProcess } from '../Storage/Nodes/fetchNodeToProcess'
|
||||||
import { ProviderRegistry } from '../Fediverse/Providers/ProviderRegistry'
|
import { ProviderRegistry } from '../Fediverse/Providers/ProviderRegistry'
|
||||||
|
import isDomainValid from '../Storage/Nodes/isDomainValid.js'
|
||||||
import { setNodeRefreshed } from '../Storage/Nodes/setNodeRefreshed'
|
import { setNodeRefreshed } from '../Storage/Nodes/setNodeRefreshed'
|
||||||
import batchPromises from '../Utils/batchPromises.js'
|
import batchPromises from '../Utils/batchPromises.js'
|
||||||
import { refreshNodeInfo } from './NodeInfo/refreshNodeInfo'
|
import { refreshNodeInfo } from './NodeInfo/refreshNodeInfo'
|
||||||
|
@ -13,6 +14,7 @@ import { deleteOldFeeds } from '../Storage/Feeds/deleteOldFeeds'
|
||||||
import refreshNodeIps from './Dns/refreshNodeIps'
|
import refreshNodeIps from './Dns/refreshNodeIps'
|
||||||
import { ElasticClient } from '../Storage/ElasticClient'
|
import { ElasticClient } from '../Storage/ElasticClient'
|
||||||
import updateNodeFeedStats from './Nodes/updateNodeFeedStats'
|
import updateNodeFeedStats from './Nodes/updateNodeFeedStats'
|
||||||
|
import deleteDomains from './Seed/deleteBannedNodes.js'
|
||||||
|
|
||||||
export const processNextNode = async (
|
export const processNextNode = async (
|
||||||
elastic: ElasticClient,
|
elastic: ElasticClient,
|
||||||
|
@ -21,7 +23,12 @@ export const processNextNode = async (
|
||||||
console.info('#############################################')
|
console.info('#############################################')
|
||||||
let node = await fetchNodeToProcess(elastic)
|
let node = await fetchNodeToProcess(elastic)
|
||||||
node = await setNodeRefreshAttempted(elastic, node)
|
node = await setNodeRefreshAttempted(elastic, node)
|
||||||
|
// TODO remove check later
|
||||||
|
if (!isDomainValid(node.domain)) {
|
||||||
|
console.info('Node domain is invalid, deleting node', { domain: node.domain })
|
||||||
|
await deleteDomains(elastic, [node.domain])
|
||||||
|
return
|
||||||
|
}
|
||||||
node = await refreshNodeIps(elastic, node)
|
node = await refreshNodeIps(elastic, node)
|
||||||
const robotsTxt = await fetchRobotsTxt(node.domain)
|
const robotsTxt = await fetchRobotsTxt(node.domain)
|
||||||
node = await refreshNodeInfo(elastic, node, robotsTxt)
|
node = await refreshNodeInfo(elastic, node, robotsTxt)
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
/**
 * Checks whether `domain` is a bare host name that can be used to build
 * `https://<domain>/` URLs.
 *
 * Parsing alone is not enough: the URL parser accepts strings such as
 * `example.com/path`, `user@example.com` or `example.com:8080` by moving the
 * extra text into other URL components, and it silently drops tab/newline
 * characters and the default port. Those inputs are rejected here.
 *
 * @param domain - Candidate domain, without scheme, port, or path.
 * @returns `true` when the value parses as a URL made of nothing but a host;
 * `false` otherwise (the rejection is logged).
 */
export default function isDomainValid (domain: string): boolean {
  let url: URL
  try {
    url = new URL(`https://${domain}/`)
  } catch {
    console.info('Domain is invalid', { domain })
    return false
  }

  // Any credentials, non-default port, path, query, or fragment smuggled in
  // through `domain` makes the serialized URL differ from scheme + host + '/'.
  const isHostOnly = url.href === `https://${url.hostname}/`
  // The parser strips these characters before parsing, so they never show up
  // in the parsed components and have to be checked on the raw input.
  const hasStrippedWhitespace = /[\t\n\r]/.test(domain)
  // An empty or default (443) port is dropped during parsing as well.
  const hasTrailingPort = /:\d*$/.test(domain)

  if (!isHostOnly || hasStrippedWhitespace || hasTrailingPort) {
    console.info('Domain is invalid', { domain })
    return false
  }
  return true
}
|
Ładowanie…
Reference in New Issue