From 3a3248a636d9479f7ba67a213ca81e89d3221bbd Mon Sep 17 00:00:00 2001 From: Dan Groshev Date: Mon, 29 Jan 2024 09:47:50 +0000 Subject: [PATCH] Introduce a Cloudflare health worker (#2499) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR introduces a new Cloudflare worker for health checks. At the moment the worker only translates Updown webhooks into Discord webhooks. In the future we can teach this worker to check more things. ### Change Type - [x] `internal` — Any other changes that don't affect the published package --------- Co-authored-by: Steve Ruiz --- .github/workflows/deploy.yml | 2 + apps/health-worker/.gitignore | 2 + apps/health-worker/README.md | 3 + apps/health-worker/package.json | 21 ++++ apps/health-worker/src/discord.ts | 96 ++++++++++++++++ apps/health-worker/src/index.ts | 69 +++++++++++ apps/health-worker/src/updown_types.ts | 152 +++++++++++++++++++++++++ apps/health-worker/tsconfig.json | 11 ++ apps/health-worker/wrangler.toml | 3 + scripts/deploy.ts | 56 +++++++-- yarn.lock | 20 ++++ 11 files changed, 428 insertions(+), 7 deletions(-) create mode 100644 apps/health-worker/.gitignore create mode 100644 apps/health-worker/README.md create mode 100644 apps/health-worker/package.json create mode 100644 apps/health-worker/src/discord.ts create mode 100644 apps/health-worker/src/index.ts create mode 100644 apps/health-worker/src/updown_types.ts create mode 100644 apps/health-worker/tsconfig.json create mode 100644 apps/health-worker/wrangler.toml diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index f61f0c319..e5b475a67 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -63,6 +63,8 @@ jobs: CLOUDFLARE_API_TOKEN: ${{ secrets.CLOUDFLARE_API_TOKEN }} CLOUDFLARE_ACCOUNT_ID: ${{ secrets.CLOUDFLARE_ACCOUNT_ID }} DISCORD_DEPLOY_WEBHOOK_URL: ${{ secrets.DISCORD_DEPLOY_WEBHOOK_URL }} + DISCORD_HEALTH_WEBHOOK_URL: ${{ secrets.DISCORD_HEALTH_WEBHOOK_URL }} 
+ HEALTH_WORKER_UPDOWN_WEBHOOK_PATH: ${{ secrets.HEALTH_WORKER_UPDOWN_WEBHOOK_PATH }} GC_MAPS_API_KEY: ${{ secrets.GC_MAPS_API_KEY }} WORKER_SENTRY_DSN: ${{ secrets.WORKER_SENTRY_DSN }} SENTRY_AUTH_TOKEN: ${{ secrets.SENTRY_AUTH_TOKEN }} diff --git a/apps/health-worker/.gitignore b/apps/health-worker/.gitignore new file mode 100644 index 000000000..b3e55cbea --- /dev/null +++ b/apps/health-worker/.gitignore @@ -0,0 +1,2 @@ +build +.wrangler \ No newline at end of file diff --git a/apps/health-worker/README.md b/apps/health-worker/README.md new file mode 100644 index 000000000..b292dc138 --- /dev/null +++ b/apps/health-worker/README.md @@ -0,0 +1,3 @@ +# Health Worker + +Accepts webhooks from [Updown](https://updown.io/), sends them to our Discord. \ No newline at end of file diff --git a/apps/health-worker/package.json b/apps/health-worker/package.json new file mode 100644 index 000000000..93504dc0d --- /dev/null +++ b/apps/health-worker/package.json @@ -0,0 +1,21 @@ +{ + "name": "health-worker", + "version": "1.0.0", + "private": true, + "scripts": { + "deploy": "wrangler deploy", + "dev": "wrangler dev", + "start": "wrangler dev", + "lint": "yarn run -T tsx ../../scripts/lint.ts" + }, + "dependencies": { + "@tldraw/utils": "workspace:*" + }, + "devDependencies": { + "@cloudflare/workers-types": "^4.20230821.0", + "@types/node": "^18.7.3", + "discord-api-types": "^0.37.67", + "typescript": "^5.2.2", + "wrangler": "3.16.0" + } +} diff --git a/apps/health-worker/src/discord.ts b/apps/health-worker/src/discord.ts new file mode 100644 index 000000000..e27f54c49 --- /dev/null +++ b/apps/health-worker/src/discord.ts @@ -0,0 +1,96 @@ +import { exhaustiveSwitchError } from '@tldraw/utils' +import { type APIEmbed } from 'discord-api-types/v10' +import { Event as UpdownEvent } from './updown_types' + +// discord wants decimal colours +const GREEN = 4243543 +const RED = 14692657 +const ORANGE = 16213767 + +// docs: https://birdie0.github.io/discord-webhooks-guide/index.html 
+export type DiscordPayload = {
+	username: string
+	content: string
+	embeds: APIEmbed[]
+}
+// picks the alert colour, title and description for an event; null means no alert is sent
+function formatUpdownEvent(event: UpdownEvent): {
+	colour: number
+	title: string
+	description: string
+} | null {
+	switch (event.event) {
+		case 'check.down':
+			return {
+				colour: RED,
+				title: `Check DOWN at <${event.check.url}>`,
+				description: `<${event.check.url}> is down: "${event.downtime.error}"\n\nNext check in ${event.check.period} seconds`,
+			}
+		case 'check.still_down':
+			return null
+		case 'check.up': {
+			return {
+				colour: GREEN,
+				title: `Check UP at <${event.check.url}>`,
+				description: `<${event.check.url}> is up\n\nIt was down for ${event.downtime.duration} seconds`,
+			}
+		}
+		case 'check.ssl_invalid': {
+			return {
+				colour: RED,
+				title: `SSL INVALID at <${event.check.url}>`,
+				description: `SSL certificate at <${event.check.url}> is invalid: "${event.ssl.error}"`,
+			}
+		}
+		case 'check.ssl_valid': {
+			return {
+				colour: GREEN,
+				title: `SSL VALID at <${event.check.url}>`,
+				description: `SSL certificate at <${event.check.url}> is now valid`,
+			}
+		}
+		case 'check.ssl_expiration': {
+			return {
+				colour: ORANGE,
+				title: `SSL EXPIRATION at <${event.check.url}>`,
+				description: `SSL certificate at <${event.check.url}> will expire in ${event.ssl.days_before_expiration} days`,
+			}
+		}
+
+		case 'check.ssl_renewed': {
+			return {
+				colour: GREEN,
+				title: `SSL RENEWED at <${event.check.url}>`,
+				description: `SSL certificate at <${event.check.url}> was renewed`,
+			}
+		}
+		case 'check.performance_drop':
+			return {
+				colour: ORANGE,
+				title: `PERFORMANCE DROP at <${event.check.url}>`,
+				description: `Performance drop at <${event.check.url}>, apdex dropped ${event.apdex_dropped}`,
+			}
+
+		default:
+			exhaustiveSwitchError(event, 'event')
+	}
+}
+// converts an Updown event into a Discord webhook payload, or null for events that are skipped
+export function updownToDiscord(event: UpdownEvent): DiscordPayload | null {
+	const formatted = formatUpdownEvent(event)
+	if (!formatted) return null
+
+	const { colour, title, description } = formatted
+
+	return {
+		username: 'Health Worker',
+		content: `Updown: ${title}`,
+		embeds: [
+			{
+				color: colour,
+				description: description,
+				timestamp: event.time,
+			},
+		],
+	}
+}
diff --git a/apps/health-worker/src/index.ts b/apps/health-worker/src/index.ts
new file mode 100644
index 000000000..37829d237
--- /dev/null
+++ b/apps/health-worker/src/index.ts
@@ -0,0 +1,90 @@
+import { DiscordPayload, updownToDiscord } from './discord'
+import { Event as UpdownEvent } from './updown_types'
+
+interface Env {
+	DISCORD_HEALTH_WEBHOOK_URL: string | undefined
+	// it needs to be passed in because it's effectively a secret, unless we want everyone to be able
+	// to stress us out with spurious discord alerts
+	HEALTH_WORKER_UPDOWN_WEBHOOK_PATH: string | undefined
+}
+
+/** POSTs `discord` as a JSON body to the Discord webhook at `url` and returns Discord's response. */
+async function sendDiscordWebhook(url: string, discord: DiscordPayload): Promise<Response> {
+	return fetch(url, {
+		method: 'POST',
+		headers: {
+			'Content-Type': 'application/json',
+		},
+		body: JSON.stringify(discord),
+	})
+}
+
+/**
+ * Forwards the events of one Updown webhook delivery to Discord, in order, one message per event.
+ *
+ * Responds 400 if the body is not a JSON array. Otherwise responds 200, unless Discord rejects a
+ * message: then forwarding stops and Discord's status code is passed back to the caller.
+ */
+async function handleUpdown(request: Request, discordUrl: string): Promise<Response> {
+	let body: unknown
+	try {
+		body = await request.json()
+	} catch {
+		return new Response('Bad Request', { status: 400 })
+	}
+	if (!Array.isArray(body)) {
+		return new Response('Bad Request', { status: 400 })
+	}
+	// only the outer array is checked; elements are trusted to follow Updown's documented shapes
+	const updownEvents = body as Array<UpdownEvent>
+
+	let status = 200
+	for (const e of updownEvents) {
+		const discordPayload = updownToDiscord(e)
+		if (!discordPayload) {
+			continue
+		}
+		const discordResult = await sendDiscordWebhook(discordUrl, discordPayload)
+
+		if (!discordResult.ok) {
+			console.error(`Discord error ${discordResult.status}: ${discordResult.statusText}`)
+			status = discordResult.status
+			break
+		}
+	}
+
+	return new Response(null, { status })
+}
+
+/**
+ * Worker entry point: requests whose path equals HEALTH_WORKER_UPDOWN_WEBHOOK_PATH are handled as
+ * Updown webhooks, everything else gets a 404. Responds 500 if either env var is missing.
+ */
+const handler: ExportedHandler<Env> = {
+	async fetch(request: Request, env: Env): Promise<Response> {
+		const discordUrl = env.DISCORD_HEALTH_WEBHOOK_URL
+		if (!discordUrl) {
+			console.error('missing DISCORD_HEALTH_WEBHOOK_URL')
+			return new Response('Internal error', { status: 500 })
+		}
+
+		const updownWebhookPath = env.HEALTH_WORKER_UPDOWN_WEBHOOK_PATH
+		if (!updownWebhookPath) {
+			console.error('missing HEALTH_WORKER_UPDOWN_WEBHOOK_PATH')
+			return new Response('Internal error', { status: 500 })
+		}
+
+		const url = new URL(request.url)
+
+		// timing safety COULD be an issue, but it seems that in practice it isn't:
+		// https://github.com/scriptin/node-timing-attack
+		// my own testing confirms those observations
+		if (url.pathname === updownWebhookPath) {
+			return handleUpdown(request, discordUrl)
+		}
+
+		return new Response('Not Found', { status: 404 })
+	},
+}
+
+export default handler
diff --git a/apps/health-worker/src/updown_types.ts b/apps/health-worker/src/updown_types.ts
new file mode 100644
index 000000000..e7f4ca7df
--- /dev/null
+++ b/apps/health-worker/src/updown_types.ts
@@ -0,0 +1,155 @@
+// docs: https://updown.io/api#webhooks
+
+export interface BaseCheck {
+	token: string
+	url: string
+	alias: null
+	last_status: number
+	uptime: number
+	period: number
+	apdex_t: number
+	string_match: string
+	enabled: boolean
+	published: boolean
+	disabled_locations: any[]
+	recipients: any[]
+	last_check_at: string
+	next_check_at: string
+	created_at: null
+	mute_until: null | string
+	favicon_url: string
+	custom_headers: CustomHeaders
+	http_verb: string
+	http_body: string
+}
+
+// a check that is down: it has a `down_since` timestamp and an `error`
+export interface FailingCheck extends BaseCheck {
+	down: true
+	down_since: string
+	up_since: null
+	error: string
+}
+
+// a check that is up: it has an `up_since` timestamp and no `error`
+export interface SucceedingCheck extends BaseCheck {
+	down: false
+	down_since: null
+	up_since: string
+	error: null
+}
+
+export interface BaseDowntime {
+	id: string
+	error: string
+	started_at: string
+	partial: unknown
+}
+
+export interface OngoingDowntime extends BaseDowntime {
+	ended_at: null
+	duration: null
+}
+
+export interface FinishedDowntime extends BaseDowntime {
+	ended_at: string
+	// seconds
+	duration: number
+}
+
+export type CustomHeaders = Record<string, string>
+
+export interface SslCert {
+	subject: string
+	issuer: string
+	from: string
+	to: string
+	algorithm: string
+}
+
+export interface EventDown {
+	event: 'check.down'
+	time: string
+	description: string
+	check: FailingCheck
+	downtime: OngoingDowntime
+}
+
+export interface EventStillDown {
+	event: 'check.still_down'
+	time: string
+	description: string
+	check: FailingCheck
+	downtime: OngoingDowntime
+}
+
+export interface EventUp {
+	event: 'check.up'
+	time: string
+	description: string
+	check: SucceedingCheck
+	downtime: FinishedDowntime
+}
+
+export interface EventSslInvalid {
+	event: 'check.ssl_invalid'
+	time: string
+	description: string
+	check: SucceedingCheck | FailingCheck
+	ssl: {
+		cert: SslCert
+		error: string
+	}
+}
+
+export interface EventSslValid {
+	event: 'check.ssl_valid'
+	time: string
+	description: string
+	check: SucceedingCheck | FailingCheck
+	ssl: {
+		cert: SslCert
+	}
+}
+
+export interface EventSslExpiration {
+	event: 'check.ssl_expiration'
+	time: string
+	description: string
+	check: SucceedingCheck | FailingCheck
+	ssl: {
+		cert: SslCert
+		days_before_expiration: number
+	}
+}
+
+export interface EventSslRenewed {
+	event: 'check.ssl_renewed'
+	time: string
+	description: string
+	check: SucceedingCheck | FailingCheck
+	ssl: {
+		new_cert: SslCert
+		old_cert: SslCert
+	}
+}
+
+export interface EventPerformanceDrop {
+	event: 'check.performance_drop'
+	time: string
+	description: string
+	check: SucceedingCheck | FailingCheck
+	apdex_dropped: string
+	last_metrics: Record<string, unknown>
+}
+
+// union of the webhook events declared above, discriminated by the `event` field
+export type Event =
+	| EventDown
+	| EventStillDown
+	| EventUp
+	| EventSslInvalid
+	| EventSslValid
+	| EventSslExpiration
+	| EventSslRenewed
+	| EventPerformanceDrop
diff --git a/apps/health-worker/tsconfig.json b/apps/health-worker/tsconfig.json
new file mode 100644
index 000000000..983cb56b9
--- /dev/null
+++ b/apps/health-worker/tsconfig.json
@@ -0,0 +1,11 @@
+{
+	"extends": "../../config/tsconfig.base.json",
+	"include": ["src"],
+	"exclude": ["node_modules", "dist", ".tsbuild*"],
+	"compilerOptions": {
+		"noEmit": true,
+		"emitDeclarationOnly": false,
+		"types": ["@cloudflare/workers-types", "@types/node"]
+	},
+	"references": []
+}
diff --git
a/apps/health-worker/wrangler.toml b/apps/health-worker/wrangler.toml
new file mode 100644
index 000000000..60ef7a7f6
--- /dev/null
+++ b/apps/health-worker/wrangler.toml
@@ -0,0 +1,3 @@
+name = "health-worker"
+main = "src/index.ts"
+compatibility_date = "2023-12-18"
\ No newline at end of file
diff --git a/scripts/deploy.ts b/scripts/deploy.ts
index 531c4dd22..d550ac988 100644
--- a/scripts/deploy.ts
+++ b/scripts/deploy.ts
@@ -12,6 +12,7 @@ import { makeEnv } from './lib/makeEnv'
 import { nicelog } from './lib/nicelog'
 
 const worker = path.relative(process.cwd(), path.resolve(__dirname, '../apps/dotcom-worker'))
+const healthWorker = path.relative(process.cwd(), path.resolve(__dirname, '../apps/health-worker'))
 const assetUpload = path.relative(
 	process.cwd(),
 	path.resolve(__dirname, '../apps/dotcom-asset-upload')
@@ -26,6 +27,8 @@ const env = makeEnv([
 	'CLOUDFLARE_ACCOUNT_ID',
 	'CLOUDFLARE_API_TOKEN',
 	'DISCORD_DEPLOY_WEBHOOK_URL',
+	'DISCORD_HEALTH_WEBHOOK_URL',
+	'HEALTH_WORKER_UPDOWN_WEBHOOK_PATH',
 	'GC_MAPS_API_KEY',
 	'RELEASE_COMMIT_HASH',
 	'SENTRY_AUTH_TOKEN',
@@ -73,7 +76,7 @@ async function main() {
 
 	await discordMessage(`--- **${env.TLDRAW_ENV} deploy pre-flight** ---`)
 
-	await discordStep('[1/6] setting up deploy', async () => {
+	await discordStep('[1/7] setting up deploy', async () => {
 		// make sure the tldraw .css files are built:
 		await exec('yarn', ['lazy', 'prebuild'])
 
@@ -83,15 +86,16 @@ async function main() {
 
 	// deploy pre-flight steps:
 	// 1. get the dotcom app ready to go (env vars and pre-build)
-	await discordStep('[2/6] building dotcom app', async () => {
+	await discordStep('[2/7] building dotcom app', async () => {
 		await createSentryRelease()
 		await prepareDotcomApp()
 		await uploadSourceMaps()
 		await coalesceWithPreviousAssets(`${dotcom}/.vercel/output/static/assets`)
 	})
 
-	await discordStep('[3/6] cloudflare deploy dry run', async () => {
+	await discordStep('[3/7] cloudflare deploy dry run', async () => {
 		await deployAssetUploadWorker({ dryRun: true })
+		await deployHealthWorker({ dryRun: true })
 		await deployTlsyncWorker({ dryRun: true })
 	})
 
@@ -100,16 +104,19 @@ async function main() {
 	await discordMessage(`--- **pre-flight complete, starting real deploy** ---`)
 
 	// 2. deploy the cloudflare workers:
-	await discordStep('[4/6] deploying asset uploader to cloudflare', async () => {
+	await discordStep('[4/7] deploying asset uploader to cloudflare', async () => {
 		await deployAssetUploadWorker({ dryRun: false })
 	})
-	await discordStep('[5/6] deploying multiplayer worker to cloudflare', async () => {
+	await discordStep('[5/7] deploying multiplayer worker to cloudflare', async () => {
 		await deployTlsyncWorker({ dryRun: false })
 	})
+	await discordStep('[6/7] deploying health worker to cloudflare', async () => {
+		await deployHealthWorker({ dryRun: false })
+	})
 
 	// 3. deploy the pre-build dotcom app:
 	const { deploymentUrl, inspectUrl } = await discordStep(
-		'[6/6] deploying dotcom app to vercel',
+		'[7/7] deploying dotcom app to vercel',
 		async () => {
 			return await deploySpa()
 		}
@@ -119,7 +126,7 @@ async function main() {
 
 	if (previewId) {
 		const aliasDomain = `${previewId}-preview-deploy.tldraw.com`
-		await discordStep('[7/6] aliasing preview deployment', async () => {
+		await discordStep('[8/7] aliasing preview deployment', async () => {
 			await vercelCli('alias', ['set', deploymentUrl, aliasDomain])
 		})
 
@@ -217,6 +224,46 @@ name = "${previewId}-tldraw-multiplayer"`
 	)
 }
 
+// set once the preview section has been appended, so the dry run and the real deploy don't both append it
+let didUpdateHealthWorker = false
+/**
+ * Deploys the health worker with wrangler, passing the Discord webhook URL and the Updown webhook
+ * path as `--var`s. For preview deploys, first appends an `[env.preview]` section to the worker's
+ * wrangler.toml so that the preview gets its own worker name.
+ */
+async function deployHealthWorker({ dryRun }: { dryRun: boolean }) {
+	if (previewId && !didUpdateHealthWorker) {
+		appendFileSync(
+			join(healthWorker, 'wrangler.toml'),
+			`
+[env.preview]
+name = "${previewId}-tldraw-health"`
+		)
+		didUpdateHealthWorker = true
+	}
+	await exec(
+		'yarn',
+		[
+			'wrangler',
+			'deploy',
+			dryRun ? '--dry-run' : null,
+			'--env',
+			env.TLDRAW_ENV,
+			'--var',
+			`DISCORD_HEALTH_WEBHOOK_URL:${env.DISCORD_HEALTH_WEBHOOK_URL}`,
+			'--var',
+			`HEALTH_WORKER_UPDOWN_WEBHOOK_PATH:${env.HEALTH_WORKER_UPDOWN_WEBHOOK_PATH}`,
+		],
+		{
+			pwd: healthWorker,
+			env: {
+				NODE_ENV: 'production',
+				// wrangler needs CI=1 set to prevent it from trying to do interactive prompts
+				CI: '1',
+			},
+		}
+	)
+}
+
 type ExecOpts = NonNullable<Parameters<typeof exec>[2]>
 async function vercelCli(command: string, args: string[], opts?: ExecOpts) {
 	return exec(
diff --git a/yarn.lock b/yarn.lock
index 1f6baa951..9b2bc4299 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -11324,6 +11324,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"discord-api-types@npm:^0.37.67":
+  version: 0.37.67
+  resolution: "discord-api-types@npm:0.37.67"
+  checksum: 5b474544a82148179e3d50f3092b9aa90b4142b470df397d3e6e211bd9c0b1f99724c618d7136fccdbe9cd931d8f48b1bdcd10599c7a08a92b76408c05829692
+  languageName: node
+  linkType: hard
+
 "doctrine@npm:^2.1.0":
   version: 2.1.0
   resolution: "doctrine@npm:2.1.0"
@@ -14567,6 +14574,19 @@ __metadata:
   languageName: node
   linkType: hard
 
+"health-worker@workspace:apps/health-worker":
+  version: 0.0.0-use.local
+  resolution: "health-worker@workspace:apps/health-worker"
+  dependencies:
+    "@cloudflare/workers-types": "npm:^4.20230821.0"
+    "@tldraw/utils": "workspace:*"
+    "@types/node": "npm:^18.7.3"
+    discord-api-types: "npm:^0.37.67"
+    typescript: "npm:^5.2.2"
+    wrangler: "npm:3.16.0"
+  languageName: unknown
+  linkType: soft
+
 "highlight.js@npm:~11.9.0":
   version: 11.9.0
   resolution: "highlight.js@npm:11.9.0"