kopia lustrzana https://github.com/Stopka/fedicrawl
Added crawling depth limit
rodzic
0723c2508d
commit
6a11d19781
|
@ -8,6 +8,8 @@ ENV ELASTIC_URL='http://elastic:9200' \
|
|||
WAIT_FOR_JOB_MINUTES='60' \
|
||||
DEFAULT_TIMEOUT_MILLISECONDS='10000' \
|
||||
BANNED_DOMAINS='' \
|
||||
MAX_CRAWLING_DEPTH='' \
|
||||
CRAWLING_VERSION='0' \
|
||||
TZ='UTC'
|
||||
FROM prebuild AS build
|
||||
WORKDIR /srv
|
||||
|
|
|
@ -35,6 +35,8 @@ Configuration is done using environmental variables:
|
|||
| `WAIT_FOR_JOB_MINUTES` | _Optional_, How many minutes should the thread sleep if there are no nodes to refresh | `60` |
|
||||
| `DEFAULT_TIMEOUT_MILLISECONDS` | _Optional_, How many milliseconds should http wait for node api response on refresh | `10000` |
|
||||
| `BANNED_DOMAINS` | _Optional_, Domains not to index (even with subdomains) | _empty_ |
|
||||
| `CRAWLING_VERSION` | _Optional_, Increasing this number forces recrawling of the whole index | 0 |
|
||||
| `MAX_CRAWLING_DEPTH` | _Optional_, Limits how far the fediverse is indexed from the seed nodes | _empty_ |
|
||||
| `TZ` | _Optional_, Timezone | `UTC` |
|
||||
## Deploy
|
||||
App is designed to be run in docker container and deployed using docker-compose.
|
||||
|
|
|
@ -0,0 +1,5 @@
|
|||
const CRAWLING_VERSION = 0
|
||||
|
||||
export default function getCrawlingVersion (): number {
|
||||
return CRAWLING_VERSION + parseInt(process.env.CRAWLING_VERSION ?? '0')
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
export default function getMaxCrawlingDepth (): number | undefined {
|
||||
if (process.env.MAX_CRAWLING_DEPTH === undefined || process.env.MAX_CRAWLING_DEPTH === '') {
|
||||
return undefined
|
||||
}
|
||||
const depth = parseInt(process.env.MAX_CRAWLING_DEPTH)
|
||||
if (depth >= 0) {
|
||||
return depth
|
||||
}
|
||||
return undefined
|
||||
}
|
|
@ -1,3 +1,4 @@
|
|||
import getMaxCrawlingDepth from '../../Fediverse/getMaxCrawlingDepth.js'
|
||||
import { NodeProvider } from '../../Fediverse/Providers/NodeProvider'
|
||||
import RobotsTxt from '../../Fediverse/RobotsTxt/RobotsTxt.js'
|
||||
import { findNewNodesOnPage } from './findNewNodesOnPage'
|
||||
|
@ -10,6 +11,13 @@ export const findNewNodes = async (
|
|||
node: Node,
|
||||
robotsTxt: RobotsTxt
|
||||
): Promise<void> => {
|
||||
const maxCrawlingDepth = getMaxCrawlingDepth()
|
||||
if (maxCrawlingDepth !== undefined && node.crawlingDepth >= maxCrawlingDepth) {
|
||||
console.info('Skipping finding nodes, max crawling depth reached', {
|
||||
maxCrawlingDepth
|
||||
})
|
||||
return
|
||||
}
|
||||
try {
|
||||
// noinspection InfiniteLoopJS
|
||||
for (let page = 0; true; page++) {
|
||||
|
|
|
@ -20,5 +20,5 @@ export const findNewNodesOnPage = async (
|
|||
provider: provider.getKey(),
|
||||
page
|
||||
})
|
||||
return await createMissingNodes(elastic, domains, node.domain)
|
||||
return await createMissingNodes(elastic, domains, node.domain, node.crawlingDepth + 1)
|
||||
}
|
||||
|
|
|
@ -6,6 +6,6 @@ export const addNodeSeed = async (
|
|||
domains: string[]
|
||||
): Promise<boolean> => {
|
||||
console.info('Trying to add seed domain nodes', { domains })
|
||||
const result = await createMissingNodes(elastic, domains, undefined)
|
||||
const result = await createMissingNodes(elastic, domains, undefined, 0)
|
||||
return result > 0
|
||||
}
|
||||
|
|
|
@ -20,6 +20,8 @@ interface Node {
|
|||
discoveredByDomain?: string
|
||||
accountFeedCount?: number
|
||||
channelFeedCount?: number
|
||||
crawlingDepth: number
|
||||
crawlingVersion: number
|
||||
}
|
||||
|
||||
export default Node
|
||||
|
|
|
@ -69,9 +69,12 @@ const assertNodeIndex = async (elastic: ElasticClient): Promise<void> => {
|
|||
}
|
||||
},
|
||||
accountFeedCount: { type: 'integer' },
|
||||
channelFeedCount: { type: 'integer' }
|
||||
channelFeedCount: { type: 'integer' },
|
||||
crawlingDepth: { type: 'integer' },
|
||||
crawlingVersion: { type: 'integer' }
|
||||
}
|
||||
})
|
||||
|
||||
await elastic.indices.refresh({ index: nodeIndex })
|
||||
}
|
||||
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
|
||||
import { ElasticClient } from '../ElasticClient'
|
||||
import nodeIndex from '../Definitions/nodeIndex'
|
||||
export const createMissingNodes = async (
|
||||
elastic: ElasticClient,
|
||||
domains: string[],
|
||||
discoveredByDomain: string | undefined
|
||||
discoveredByDomain: string | undefined,
|
||||
crawlingDepth: number
|
||||
): Promise<number> => {
|
||||
const response = await elastic.bulk({
|
||||
index: nodeIndex,
|
||||
|
@ -14,6 +16,8 @@ export const createMissingNodes = async (
|
|||
{
|
||||
domain,
|
||||
discoveredByDomain,
|
||||
crawlingDepth,
|
||||
crawlingVersion: getCrawlingVersion(),
|
||||
foundAt: new Date().getTime()
|
||||
}
|
||||
])
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
import getCrawlingVersion from '../../Fediverse/getCrawlingVersion.js'
|
||||
import { ElasticClient } from '../ElasticClient'
|
||||
import nodeIndex from '../Definitions/nodeIndex'
|
||||
import Node from '../Definitions/Node'
|
||||
|
@ -14,7 +15,8 @@ export const setNodeRefreshed = async (
|
|||
index: nodeIndex,
|
||||
id: node.domain,
|
||||
doc: {
|
||||
refreshedAt: date.getTime()
|
||||
refreshedAt: date.getTime(),
|
||||
crawlingVersion: getCrawlingVersion()
|
||||
}
|
||||
})
|
||||
return assertDefined(
|
||||
|
|
Ładowanie…
Reference in New Issue