Skip to content

Commit 27f1b7e

Browse files
authored
feat: metrics alerting support (#145)
1 parent 4c0617e commit 27f1b7e

File tree

4 files changed

+187
-5
lines changed

4 files changed

+187
-5
lines changed

.changeset/light-pumas-obey.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
'@hyperdx/api': minor
3+
---
4+
5+
feat: metrics alerting support

packages/api/src/clickhouse/index.ts

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -705,7 +705,7 @@ export const getMetricsChart = async ({
705705
aggFn: AggFn;
706706
dataType: MetricsDataType;
707707
endTime: number; // unix in ms,
708-
granularity: Granularity;
708+
granularity: Granularity | string;
709709
groupBy?: string;
710710
name: string;
711711
q: string;
@@ -868,7 +868,13 @@ ORDER BY _timestamp_sort_key ASC
868868
query,
869869
format: 'JSON',
870870
});
871-
const result = await rows.json<ResponseJSON<Record<string, unknown>>>();
871+
const result = await rows.json<
872+
ResponseJSON<{
873+
data: number;
874+
group: string;
875+
ts_bucket: number;
876+
}>
877+
>();
872878
logger.info({
873879
message: 'getMetricsChart',
874880
query,

packages/api/src/tasks/__tests__/checkAlerts.test.ts

Lines changed: 145 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ describe('checkAlerts', () => {
241241
);
242242
});
243243

244-
it('CHART alert', async () => {
244+
it('CHART alert (logs table series)', async () => {
245245
jest
246246
.spyOn(slack, 'postMessageToWebhook')
247247
.mockResolvedValueOnce(null as any);
@@ -388,5 +388,149 @@ describe('checkAlerts', () => {
388388
},
389389
);
390390
});
391+
392+
it('CHART alert (metrics table series)', async () => {
393+
jest
394+
.spyOn(slack, 'postMessageToWebhook')
395+
.mockResolvedValueOnce(null as any);
396+
jest
397+
.spyOn(clickhouse, 'getMetricsChart')
398+
.mockResolvedValueOnce({
399+
rows: 1,
400+
data: [
401+
{
402+
data: 11,
403+
group: 'HyperDX',
404+
ts_bucket: 1700172600,
405+
},
406+
],
407+
} as any)
408+
// no metrics found in the next window
409+
.mockResolvedValueOnce({
410+
rows: 0,
411+
data: [],
412+
} as any);
413+
414+
const team = await createTeam({ name: 'My Team' });
415+
const webhook = await new Webhook({
416+
team: team._id,
417+
service: 'slack',
418+
url: 'https://hooks.slack.com/services/123',
419+
name: 'My Webhook',
420+
}).save();
421+
const dashboard = await new Dashboard({
422+
name: 'My Dashboard',
423+
team: team._id,
424+
charts: [
425+
{
426+
id: '198hki',
427+
name: 'Redis Memory',
428+
x: 0,
429+
y: 0,
430+
w: 6,
431+
h: 3,
432+
series: [
433+
{
434+
table: 'metrics',
435+
type: 'time',
436+
aggFn: 'max',
437+
field: 'redis.memory.rss - Gauge',
438+
where: 'cloud.provider:"aws"',
439+
groupBy: ['host'],
440+
},
441+
],
442+
},
443+
{
444+
id: 'obil1',
445+
name: 'Min Duratioin',
446+
x: 6,
447+
y: 0,
448+
w: 6,
449+
h: 3,
450+
series: [
451+
{
452+
table: 'logs',
453+
type: 'time',
454+
aggFn: 'min',
455+
field: 'duration',
456+
where: '',
457+
groupBy: [],
458+
},
459+
],
460+
},
461+
],
462+
}).save();
463+
const alert = await createAlert({
464+
source: 'CHART',
465+
channel: {
466+
type: 'webhook',
467+
webhookId: webhook._id.toString(),
468+
},
469+
interval: '5m',
470+
type: 'presence',
471+
threshold: 10,
472+
dashboardId: dashboard._id.toString(),
473+
chartId: '198hki',
474+
});
475+
476+
const now = new Date('2023-11-16T22:12:00.000Z');
477+
478+
// should fetch 5m of metrics
479+
await processAlert(now, alert);
480+
// check alert history
481+
const alertHistories = await AlertHistory.find({
482+
alertId: alert._id,
483+
});
484+
expect(alertHistories.length).toBe(1);
485+
expect(alertHistories[0].counts).toBe(1);
486+
expect(alertHistories[0].createdAt).toEqual(
487+
new Date('2023-11-16T22:10:00.000Z'),
488+
);
489+
expect(alert.state).toBe('ALERT');
490+
491+
// skip since time diff is less than 1 window size
492+
const later = new Date('2023-11-16T22:14:00.000Z');
493+
await processAlert(later, alert);
494+
// alert should still be in alert state
495+
expect(alert.state).toBe('ALERT');
496+
497+
const nextWindow = new Date('2023-11-16T22:16:00.000Z');
498+
await processAlert(nextWindow, alert);
499+
// alert should be in ok state
500+
expect(alert.state).toBe('OK');
501+
502+
// check if getMetricsChart query + webhook were triggered
503+
expect(clickhouse.getMetricsChart).toHaveBeenNthCalledWith(1, {
504+
aggFn: 'max',
505+
dataType: 'Gauge',
506+
endTime: 1700172600000,
507+
granularity: '5 minute',
508+
groupBy: 'host',
509+
name: 'redis.memory.rss',
510+
q: 'cloud.provider:"aws"',
511+
startTime: 1700172300000,
512+
teamId: team._id.toString(),
513+
});
514+
expect(slack.postMessageToWebhook).toHaveBeenNthCalledWith(
515+
1,
516+
'https://hooks.slack.com/services/123',
517+
{
518+
text: 'Alert for "Redis Memory" in "My Dashboard" - 11 exceeds 10',
519+
blocks: [
520+
{
521+
text: {
522+
text: [
523+
`*<http://localhost:9090/dashboards/${dashboard._id}?from=1700170500000&granularity=5+minute&to=1700175000000 | Alert for "Redis Memory" in "My Dashboard">*`,
524+
'Group: "HyperDX"',
525+
'11 exceeds 10',
526+
].join('\n'),
527+
type: 'mrkdwn',
528+
},
529+
type: 'section',
530+
},
531+
],
532+
},
533+
);
534+
});
391535
});
392536
});

packages/api/src/tasks/checkAlerts.ts

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@ import { URLSearchParams } from 'url';
66
import * as fns from 'date-fns';
77
import * as fnsTz from 'date-fns-tz';
88
import ms from 'ms';
9+
import { isString } from 'lodash';
910
import { serializeError } from 'serialize-error';
11+
import { z } from 'zod';
1012

1113
import * as clickhouse from '@/clickhouse';
1214
import * as config from '@/config';
@@ -336,6 +338,7 @@ export const processAlert = async (now: Date, alert: AlertDocument) => {
336338
let checksData:
337339
| Awaited<ReturnType<typeof clickhouse.checkAlert>>
338340
| Awaited<ReturnType<typeof clickhouse.getLogsChart>>
341+
| Awaited<ReturnType<typeof clickhouse.getMetricsChart>>
339342
| null = null;
340343
let logView: Awaited<ReturnType<typeof getLogViewEnhanced>> | null = null;
341344
let targetDashboard: EnhancedDashboard | null = null;
@@ -412,8 +415,30 @@ export const processAlert = async (now: Date, alert: AlertDocument) => {
412415
tableVersion: dashboard.team.logStreamTableVersion,
413416
teamId: dashboard.team._id.toString(),
414417
});
418+
} else if (
419+
series.type === 'time' &&
420+
series.table === 'metrics' &&
421+
series.field
422+
) {
423+
targetDashboard = dashboard;
424+
const startTimeMs = fns.getTime(checkStartTime);
425+
const endTimeMs = fns.getTime(checkEndTime);
426+
const [metricName, rawMetricDataType] = series.field.split(' - ');
427+
const metricDataType = z
428+
.nativeEnum(clickhouse.MetricsDataType)
429+
.parse(rawMetricDataType);
430+
checksData = await clickhouse.getMetricsChart({
431+
aggFn: series.aggFn,
432+
dataType: metricDataType,
433+
endTime: endTimeMs,
434+
granularity: `${windowSizeInMins} minute`,
435+
groupBy: series.groupBy[0],
436+
name: metricName,
437+
q: series.where,
438+
startTime: startTimeMs,
439+
teamId: dashboard.team._id.toString(),
440+
});
415441
}
416-
// TODO: support metrics table
417442
}
418443

419444
logger.info({
@@ -439,7 +464,9 @@ export const processAlert = async (now: Date, alert: AlertDocument) => {
439464
let alertState = AlertState.OK;
440465
if (checksData?.rows && checksData?.rows > 0) {
441466
for (const checkData of checksData.data) {
442-
const totalCount = parseInt(checkData.data);
467+
const totalCount = isString(checkData.data)
468+
? parseInt(checkData.data)
469+
: checkData.data;
443470
if (doesExceedThreshold(alert, totalCount)) {
444471
alertState = AlertState.ALERT;
445472
logger.info({

0 commit comments

Comments
 (0)