New upstream version 8.1.0
This commit is contained in:
155
mon/scripts/grafana/alerts/CPU-alert-v1.json
Normal file
155
mon/scripts/grafana/alerts/CPU-alert-v1.json
Normal file
@@ -0,0 +1,155 @@
|
||||
{
|
||||
"id": 2,
|
||||
"uid": "cf53330f-49cf-4b1e-bb59-e4580d32e707",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "CPU Alert",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"host::tag"
|
||||
],
|
||||
"type": "tag"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"measurement": "cpu",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "auto",
|
||||
"query": "SELECT mean(\"usage_system\") FROM \"auto\".\"cpu\" WHERE $timeFilter GROUP BY \"host\"::tag",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"usage_system"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "mean"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "$B > 80",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "math"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-17T18:28:40+05:30",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "30m",
|
||||
"annotations": {
|
||||
"summary": "CPU usage is above thershold set",
|
||||
"description": "Please check host \"{{ $labels.host }}\" its cpu usage is above thershold"
|
||||
},
|
||||
"labels": {
|
||||
"cpu-severity": "{{if gt $values.B.Value 90.0}}critical{{else if gt $values.B.Value 80.0}}warning{{else}}info{{end}}"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
125
mon/scripts/grafana/alerts/CPU-alert-v2.json
Normal file
125
mon/scripts/grafana/alerts/CPU-alert-v2.json
Normal file
@@ -0,0 +1,125 @@
|
||||
{
|
||||
"id": 2,
|
||||
"uid": "c1ec4ef2-dae2-4c85-b478-8119bb4326e6",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "CPU Alert",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"query": "from(bucket: \"${BUCKET}\") \r\n|> range(start: v.timeRangeStart, stop: v.timeRangeStop) \r\n|> filter(fn: (r) => r[\"_measurement\"] == \"cpu\") \r\n|> filter(fn: (r) => r[\"_field\"] == \"usage_system\")\r\n|> filter(fn: (r) => r[\"cpu\"] == \"cpu-total\") \r\n|> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false) \r\n|> yield(name: \"mean\")",
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"B"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "$B > 80",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "math"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-17T12:42:56Z",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "30m",
|
||||
"annotations": {
|
||||
"description": "Please check host \"{{ $labels.host }}\" its cpu usage is above thershold",
|
||||
"summary": "CPU usage is above thershold set"
|
||||
},
|
||||
"labels": {
|
||||
"cpu-severity": "{{ if gt $values.B.Value 90.0 }}critical{{ else if gt $values.B.Value 80.0 }}warning{{ else }}info{{ end }}"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
156
mon/scripts/grafana/alerts/Disk-alert-v1.json
Normal file
156
mon/scripts/grafana/alerts/Disk-alert-v1.json
Normal file
@@ -0,0 +1,156 @@
|
||||
{
|
||||
"id": 3,
|
||||
"uid": "af36a69e-fd32-4ebc-94cd-474ea6c9edb2",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "Disk Alert",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"storageTargetID::tag"
|
||||
],
|
||||
"type": "tag"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"measurement": "storageTargets",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "auto",
|
||||
"query": "SELECT (diskSpaceFree / diskSpaceTotal) * 100 FROM \"auto\".\"storageTargets\" WHERE $timeFilter GROUP BY \"storageTargetID\"::tag",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"diskSpaceFree"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
" / "
|
||||
],
|
||||
"type": "math"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "$B < 30",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "math"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-10T16:06:31+05:30",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "10m",
|
||||
"annotations": {
|
||||
"description": "Please check, as the Disk Space for Storage Target ID '{{ $labels.storageTargetID }}' is only {{ humanize $values.B.Value }}%.",
|
||||
"summary": "BeeGFS Storage Target disk space is low"
|
||||
},
|
||||
"labels": {
|
||||
"disk-severity": "{{if lt $values.B.Value 20.0}}critical{{else if lt $values.B.Value 30.0}}warning{{else}}info{{end}}"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
125
mon/scripts/grafana/alerts/Disk-alert-v2.json
Normal file
125
mon/scripts/grafana/alerts/Disk-alert-v2.json
Normal file
@@ -0,0 +1,125 @@
|
||||
{
|
||||
"id": 3,
|
||||
"uid": "c0008edf-2473-47be-b0ff-ab50bad831c5",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "Disk Alert",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"query": "from(bucket: \"${BUCKET}\")\r\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\r\n |> filter(fn: (r) => r[\"_measurement\"] == \"storageTargets\")\r\n |> filter(fn: (r) => r._field == \"diskSpaceTotal\" or r._field == \"diskSpaceFree\")\r\n |> aggregateWindow(every: v.windowPeriod, fn: last, createEmpty: false)\r\n |> pivot(rowKey: [\"_time\"], columnKey: [\"_field\"], valueColumn: \"_value\")\r\n |> map(fn: (r) => ({ r with _value:(r.diskSpaceFree/ r.diskSpaceTotal) * 100.0 }))\r\n |> rename(columns: {_value: \"DiskFreePercent\"})\r\n |> drop(columns:[\"_start\",\"_stop\",\"_measurement\",\"diskSpaceTotal\",\"diskSpaceFree\"])\r\n",
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"B"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "$B < 30",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "math"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-16T18:16:45Z",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "10m",
|
||||
"annotations": {
|
||||
"description": "Please check, as the Disk Space for Storage Target ID '{{ $labels.storageTargetID }}' is only {{ humanize $values.B.Value }}%.",
|
||||
"summary": "BeeGFS Storage Target disk space is low"
|
||||
},
|
||||
"labels": {
|
||||
"disk-severity": "{{if lt $values.B.Value 20.0}}critical{{else if lt $values.B.Value 30.0}}warning{{else}}info{{end}}"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
156
mon/scripts/grafana/alerts/Inodes-alert-v1.json
Normal file
156
mon/scripts/grafana/alerts/Inodes-alert-v1.json
Normal file
@@ -0,0 +1,156 @@
|
||||
{
|
||||
"id": 4,
|
||||
"uid": "e2ad5c16-110f-43df-a784-829561fe3317",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "Inodes Alert ",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"storageTargetID::tag"
|
||||
],
|
||||
"type": "tag"
|
||||
}
|
||||
],
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"measurement": "storageTargets",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "auto",
|
||||
"query": "SELECT (inodesFree / inodesTotal) * 100 FROM \"auto\".\"storageTargets\" WHERE $timeFilter GROUP BY \"storageTargetID\"::tag",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"diskSpaceFree"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
" / "
|
||||
],
|
||||
"type": "math"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "$B < 20",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "math"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-10T16:06:31+05:30",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "10m",
|
||||
"annotations": {
|
||||
"description": "Please check, as the free inodes for Storage Target ID '{{ $labels.storageTargetID }}' are only at {{ humanize $values.B.Value }}%.",
|
||||
"summary": "BeeGFS Storage Target Inodes are below the threshold."
|
||||
},
|
||||
"labels": {
|
||||
"inodes": "free"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
125
mon/scripts/grafana/alerts/Inodes-alert-v2.json
Normal file
125
mon/scripts/grafana/alerts/Inodes-alert-v2.json
Normal file
@@ -0,0 +1,125 @@
|
||||
{
|
||||
"id": 4,
|
||||
"uid": "be096d59-9dc4-4821-9530-8447e7261d9c",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "Inodes Alert",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"query": "from(bucket: \"${BUCKET}\")\r\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\r\n |> filter(fn: (r) => r[\"_measurement\"] == \"storageTargets\")\r\n |> filter(fn: (r) => r._field == \"inodesTotal\" or r._field == \"inodesFree\")\r\n |> aggregateWindow(every: v.windowPeriod, fn: last, createEmpty: false)\r\n |> pivot(rowKey: [\"_time\"], columnKey: [\"_field\"], valueColumn: \"_value\")\r\n |> map(fn: (r) => ({ r with _value:(r.inodesFree/ r.inodesTotal) * 100.0 }))\r\n |> rename(columns: {_value: \"InodesFreePercent\"})\r\n |> drop(columns:[\"_start\",\"_stop\",\"_measurement\",\"inodesFree\",\"inodesTotal\"])",
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"B"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "$B < 20",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "math"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-16T18:16:45Z",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "10m",
|
||||
"annotations": {
|
||||
"description": "Please check, as the free inodes for Storage Target ID '{{ $labels.storageTargetID }}' are only at {{ humanize $values.B.Value }}%.",
|
||||
"summary": "BeeGFS Storage Target Inodes are below the threshold."
|
||||
},
|
||||
"labels": {
|
||||
"inodes": "free"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
151
mon/scripts/grafana/alerts/MetaQueuedrequest-alert-v1.json
Normal file
151
mon/scripts/grafana/alerts/MetaQueuedrequest-alert-v1.json
Normal file
@@ -0,0 +1,151 @@
|
||||
{
|
||||
"id": 5,
|
||||
"uid": "bc49ff76-3db9-4f8b-b88a-947c7717fc18",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "Meta Queued Request Alert",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"nodeID::tag"
|
||||
],
|
||||
"type": "tag"
|
||||
}
|
||||
],
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"measurement": "highResMeta",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "auto",
|
||||
"query": "SELECT max(\"queuedRequests\") FROM \"auto\".\"highResMeta\" WHERE $timeFilter GROUP BY \"nodeID\"::tag",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"queuedRequests"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "max"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"B"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
50
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"C"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "B",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-17T18:57:50+05:30",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "2m",
|
||||
"annotations": {
|
||||
"description": "Queued requests of BeeGFS meta server with nodeID - \"{{ $labels.nodeID }}\" is {{ $values.B }}",
|
||||
"summary": "Meta server queued requests is above threshold"
|
||||
},
|
||||
"labels": {
|
||||
"queued": "request"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
126
mon/scripts/grafana/alerts/MetaQueuedrequest-alert-v2.json
Normal file
126
mon/scripts/grafana/alerts/MetaQueuedrequest-alert-v2.json
Normal file
@@ -0,0 +1,126 @@
|
||||
{
|
||||
"id": 5,
|
||||
"uid": "a5a9072e-a8c2-46c1-b3a0-88608956e83e",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "Meta Queued Request Alert",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"query": "from(bucket: \"${BUCKET}\") \r\n|> range(start: v.timeRangeStart, stop:v.timeRangeStop) \r\n|> filter(fn: (r) => r._measurement == \"highResMeta\" and r._field == \"queuedRequests\") \r\n|> aggregateWindow(every: v.windowPeriod, fn: max, createEmpty: false) \r\n|> yield(name: \"max\")",
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"B"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
50
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"C"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "B",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-16T18:16:45Z",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "2m",
|
||||
"annotations": {
|
||||
"description": "Queued requests of BeeGFS meta server with nodeID - \"{{ $labels.nodeID }}\" is {{ $values.B }}",
|
||||
"summary": "Meta server queued requests is above threshold"
|
||||
},
|
||||
"labels": {
|
||||
"queued": "request"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
158
mon/scripts/grafana/alerts/Services-alert-v1.json
Normal file
158
mon/scripts/grafana/alerts/Services-alert-v1.json
Normal file
@@ -0,0 +1,158 @@
|
||||
{
|
||||
"id": 1,
|
||||
"uid": "d9a3e5ba-b5bc-4ede-989b-c605547eb2d",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "Services Alert",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"host"
|
||||
],
|
||||
"type": "tag"
|
||||
},
|
||||
{
|
||||
"params": [
|
||||
"systemd_unit"
|
||||
],
|
||||
"type": "tag"
|
||||
}
|
||||
],
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"measurement": "procstat_lookup",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "auto",
|
||||
"query": "SELECT last(\"running\") FROM \"auto\".\"procstat_lookup\" WHERE $timeFilter GROUP BY \"host\", \"systemd_unit\"",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"running"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "last"
|
||||
}
|
||||
]
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
0,
|
||||
0
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
1,
|
||||
0
|
||||
],
|
||||
"type": "lt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": []
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "avg"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"name": "Expression",
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "B",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-17T09:19:39+05:30",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "1m",
|
||||
"annotations": {
|
||||
"description": "BeeGFS Service \"{{ $labels.systemd_unit }}\" is Down , Please check host \"{{ $labels.host }}\"",
|
||||
"summary": "BeeGFS Service \"{{ $labels.systemd_unit }}\" is Down"
|
||||
},
|
||||
"labels": {
|
||||
"service_status": "down"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
126
mon/scripts/grafana/alerts/Services-alert-v2.json
Normal file
126
mon/scripts/grafana/alerts/Services-alert-v2.json
Normal file
@@ -0,0 +1,126 @@
|
||||
{
|
||||
"id": 1,
|
||||
"uid": "a96d9b2e-2a6b-4ab3-9858-200da324672f",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "Service Alert",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"query": "from(bucket: \"${BUCKET}\")\r\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\r\n |> filter(fn: (r) => r[\"_measurement\"] == \"procstat_lookup\")\r\n |> filter(fn: (r) => r._field == \"running\")\r\n |> group(columns: [\"host\", \"systemd_unit\"], mode: \"by\")\r\n |> aggregateWindow(every: v.windowPeriod, fn: last, createEmpty: false)\r\n |> keep(columns: [\"_time\", \"_value\", \"host\", \"systemd_unit\"])\r\n |> sort(columns: [\"_time\"])",
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"B"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
1
|
||||
],
|
||||
"type": "lt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"C"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "B",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-17T11:33:42Z",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "1m",
|
||||
"annotations": {
|
||||
"description": "BeeGFS Service \"{{ $labels.systemd_unit }}\" is Down , Please check host \"{{ $labels.host }}\"",
|
||||
"summary": "BeeGFS Service \"{{ $labels.systemd_unit }}\" is Down"
|
||||
},
|
||||
"labels": {
|
||||
"service_status": "down"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
151
mon/scripts/grafana/alerts/StorageQueuedrequest-alert-v1.json
Normal file
151
mon/scripts/grafana/alerts/StorageQueuedrequest-alert-v1.json
Normal file
@@ -0,0 +1,151 @@
|
||||
{
|
||||
"id": 6,
|
||||
"uid": "c81b9c61-d553-4240-aff1-e92627a40a11",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "Storage Queued Request Alert ",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"groupBy": [
|
||||
{
|
||||
"params": [
|
||||
"nodeID::tag"
|
||||
],
|
||||
"type": "tag"
|
||||
}
|
||||
],
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"measurement": "highResStorage",
|
||||
"orderByTime": "ASC",
|
||||
"policy": "auto",
|
||||
"query": "SELECT max(\"queuedRequests\") FROM \"auto\".\"highResStorage\" WHERE $timeFilter GROUP BY \"nodeID\"::tag",
|
||||
"rawQuery": true,
|
||||
"refId": "A",
|
||||
"resultFormat": "time_series",
|
||||
"select": [
|
||||
[
|
||||
{
|
||||
"params": [
|
||||
"queuedRequests"
|
||||
],
|
||||
"type": "field"
|
||||
},
|
||||
{
|
||||
"params": [],
|
||||
"type": "max"
|
||||
}
|
||||
]
|
||||
],
|
||||
"tags": []
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"B"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
50
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"C"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "B",
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-17T19:13:11+05:30",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "2m",
|
||||
"annotations": {
|
||||
"description": "Queued requests of BeeGFS Storage Server with nodeID - \"{{ $labels.nodeID }}\" is {{ $values.B }}",
|
||||
"summary": "Storage server queued requests is above threshold"
|
||||
},
|
||||
"labels": {
|
||||
"queued": "request"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
126
mon/scripts/grafana/alerts/StorageQueuedrequest-alert-v2.json
Normal file
126
mon/scripts/grafana/alerts/StorageQueuedrequest-alert-v2.json
Normal file
@@ -0,0 +1,126 @@
|
||||
{
|
||||
"id": 6,
|
||||
"uid": "e0a4e911-6602-4adc-993b-d65672e7f431",
|
||||
"orgID": 1,
|
||||
"folderUID": "beegfsalertfolder",
|
||||
"ruleGroup": "evaluate",
|
||||
"title": "Storage Queued Request Alert",
|
||||
"condition": "C",
|
||||
"data": [
|
||||
{
|
||||
"refId": "A",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "${DS_UID}",
|
||||
"model": {
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"query": "from(bucket: \"${BUCKET}\") \r\n|> range(start: v.timeRangeStart, stop:v.timeRangeStop) \r\n|> filter(fn: (r) => r._measurement == \"highResStorage\" and r._field == \"queuedRequests\") \r\n|> aggregateWindow(every: v.windowPeriod, fn: max, createEmpty: false) \r\n|> yield(name: \"max\")",
|
||||
"refId": "A"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"B"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "A",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"reducer": "last",
|
||||
"refId": "B",
|
||||
"type": "reduce"
|
||||
}
|
||||
},
|
||||
{
|
||||
"refId": "C",
|
||||
"queryType": "",
|
||||
"relativeTimeRange": {
|
||||
"from": 600,
|
||||
"to": 0
|
||||
},
|
||||
"datasourceUid": "__expr__",
|
||||
"model": {
|
||||
"conditions": [
|
||||
{
|
||||
"evaluator": {
|
||||
"params": [
|
||||
50
|
||||
],
|
||||
"type": "gt"
|
||||
},
|
||||
"operator": {
|
||||
"type": "and"
|
||||
},
|
||||
"query": {
|
||||
"params": [
|
||||
"C"
|
||||
]
|
||||
},
|
||||
"reducer": {
|
||||
"params": [],
|
||||
"type": "last"
|
||||
},
|
||||
"type": "query"
|
||||
}
|
||||
],
|
||||
"datasource": {
|
||||
"type": "__expr__",
|
||||
"uid": "__expr__"
|
||||
},
|
||||
"expression": "B",
|
||||
"hide": false,
|
||||
"intervalMs": 1000,
|
||||
"maxDataPoints": 43200,
|
||||
"refId": "C",
|
||||
"type": "threshold"
|
||||
}
|
||||
}
|
||||
],
|
||||
"updated": "2023-10-16T18:16:45Z",
|
||||
"noDataState": "OK",
|
||||
"execErrState": "Error",
|
||||
"for": "2m",
|
||||
"annotations": {
|
||||
"description": "Queued requests of BeeGFS storage server with nodeID - \"{{ $labels.nodeID }}\" is {{ $values.B }}",
|
||||
"summary": "Storage server queued requests is above threshold"
|
||||
},
|
||||
"labels": {
|
||||
"queued": "request"
|
||||
},
|
||||
"isPaused": true
|
||||
}
|
||||
121
mon/scripts/grafana/alerts/alert-dashboard.json
Normal file
121
mon/scripts/grafana/alerts/alert-dashboard.json
Normal file
@@ -0,0 +1,121 @@
|
||||
{
|
||||
"__inputs": [
|
||||
{
|
||||
"name": "DS_BEEGFS_MON_INFLUXDB",
|
||||
"label": "beegfs_mon_influxdb",
|
||||
"description": "",
|
||||
"type": "datasource",
|
||||
"pluginId": "influxdb",
|
||||
"pluginName": "InfluxDB"
|
||||
}
|
||||
],
|
||||
"__elements": {},
|
||||
"__requires": [
|
||||
{
|
||||
"type": "panel",
|
||||
"id": "alertlist",
|
||||
"name": "Alert list",
|
||||
"version": ""
|
||||
},
|
||||
{
|
||||
"type": "grafana",
|
||||
"id": "grafana",
|
||||
"name": "Grafana",
|
||||
"version": "10.1.4"
|
||||
},
|
||||
{
|
||||
"type": "datasource",
|
||||
"id": "influxdb",
|
||||
"name": "InfluxDB",
|
||||
"version": "1.0.0"
|
||||
}
|
||||
],
|
||||
"annotations": {
|
||||
"list": [
|
||||
{
|
||||
"builtIn": 1,
|
||||
"datasource": {
|
||||
"type": "grafana",
|
||||
"uid": "-- Grafana --"
|
||||
},
|
||||
"enable": true,
|
||||
"hide": true,
|
||||
"iconColor": "rgba(0, 211, 255, 1)",
|
||||
"name": "Annotations & Alerts",
|
||||
"type": "dashboard"
|
||||
}
|
||||
]
|
||||
},
|
||||
"description": "",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 0,
|
||||
"id": null,
|
||||
"links": [],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "${DS_BEEGFS_MON_INFLUXDB}"
|
||||
},
|
||||
"gridPos": {
|
||||
"h": 22,
|
||||
"w": 24,
|
||||
"x": 0,
|
||||
"y": 0
|
||||
},
|
||||
"id": 1,
|
||||
"options": {
|
||||
"alertInstanceLabelFilter": "",
|
||||
"alertName": "",
|
||||
"dashboardAlerts": false,
|
||||
"folder": {
|
||||
"title": "BeeGFS-Alert",
|
||||
"uid": "beegfsalertfolder"
|
||||
},
|
||||
"groupBy": [],
|
||||
"groupMode": "default",
|
||||
"maxItems": 20,
|
||||
"sortOrder": 1,
|
||||
"stateFilter": {
|
||||
"error": true,
|
||||
"firing": true,
|
||||
"noData": false,
|
||||
"normal": true,
|
||||
"pending": true
|
||||
},
|
||||
"viewMode": "list"
|
||||
},
|
||||
"pluginVersion": "10.1.4",
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "influxdb",
|
||||
"uid": "${DS_BEEGFS_MON_INFLUXDB}"
|
||||
},
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"title": "Alert List",
|
||||
"type": "alertlist"
|
||||
}
|
||||
],
|
||||
"refresh": "",
|
||||
"schemaVersion": 38,
|
||||
"style": "dark",
|
||||
"tags": [],
|
||||
"templating": {
|
||||
"list": []
|
||||
},
|
||||
"time": {
|
||||
"from": "now-5m",
|
||||
"to": "now"
|
||||
},
|
||||
"timepicker": {},
|
||||
"timezone": "",
|
||||
"title": "BeeGFS Alerts List",
|
||||
"uid": "c4a31d8f-4dc6-4023-bc7a-1b06167a6f74",
|
||||
"version": 1,
|
||||
"weekStart": ""
|
||||
}
|
||||
12
mon/scripts/grafana/alerts/contact-point.json
Normal file
12
mon/scripts/grafana/alerts/contact-point.json
Normal file
@@ -0,0 +1,12 @@
|
||||
{
|
||||
"uid": "d5c51f44-07d047ca-a580-5a66f643e",
|
||||
"name": "beegfs-email",
|
||||
"type": "email",
|
||||
"settings": {
|
||||
"addresses": "beegfsalert@example.com",
|
||||
"message": "{{ template \"beegfs.message\" . }}",
|
||||
"singleEmail": false,
|
||||
"subject": "{{ template \"beegfs.title\" . }}"
|
||||
},
|
||||
"disableResolveMessage": false
|
||||
}
|
||||
4
mon/scripts/grafana/alerts/email-template.json
Normal file
4
mon/scripts/grafana/alerts/email-template.json
Normal file
@@ -0,0 +1,4 @@
|
||||
{
|
||||
"name": "BeeGFS-Email-Template",
|
||||
"template": "{{ define \"alert_severity_prefix_emoji\" -}}\n\t{{- if eq .Status \"firing\" -}}\n\t\t🔥\n\t{{- else -}}\n\t\t✅\n\t{{- end -}}\n{{- end -}}\n\n{{ define \"beegfs_subject\" }}\n{{ template \"alert_severity_prefix_emoji\" . }}\n[{{ .Status | toUpper }}{{ if eq .Status \"firing\" }}:{{ .Alerts.Firing | len }}{{ if gt (.Alerts.Resolved | len) 0 }}, RESOLVED:{{ .Alerts.Resolved | len }}{{ end }}{{ end }} | {{ .CommonLabels.alertname -}}] \n{{ end }}\n\n{{ define \"beegfs_text_alert_list\" }}{{ range . }}\nSummary:\n\t{{ .Annotations.summary }}\n\t\nDescription:\n\t{{ .Annotations.description }}\n\nLabels:\n {{ range .Labels.SortedPairs -}}\n {{ .Name }} = {{ .Value }}\n\t{{ end }}\n{{ end }}\n{{ range . }}\n{{ if gt (len .SilenceURL) 0 }}Silence: {{ .SilenceURL }}\n{{ end }}\n{{ end }}\n{{ end }}\n\n\n{{ define \"beegfs.title\" }}{{ template \"beegfs_subject\" . }}{{ end }}\n\n{{ define \"beegfs.message\" }}{{ if gt (len .Alerts.Firing) 0 }}*Firing 🔥*\n{{ template \"beegfs_text_alert_list\" .Alerts.Firing }}{{ if gt (len .Alerts.Resolved) 0 }}\n\n{{ end }}{{ end }}\n\n{{ if gt (len .Alerts.Resolved) 0 }}*Resolved ✅*\n\nBelow alert is resolved:\n{{ template \"beegfs_text_alert_list\" .Alerts.Resolved }}\n{{ end }}\n{{ end }}"
|
||||
}
|
||||
97
mon/scripts/grafana/alerts/policies-telegraf.json
Normal file
97
mon/scripts/grafana/alerts/policies-telegraf.json
Normal file
@@ -0,0 +1,97 @@
|
||||
|
||||
{
|
||||
"receiver": "grafana-default-email",
|
||||
"routes": [
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"disk-severity",
|
||||
"=",
|
||||
"warning"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "6h"
|
||||
},
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"disk-severity",
|
||||
"=",
|
||||
"critical"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "6h"
|
||||
},
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"service_status",
|
||||
"=",
|
||||
"down"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "3m",
|
||||
"repeat_interval": "6h"
|
||||
},
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"inodes",
|
||||
"=",
|
||||
"free"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "6h"
|
||||
},
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"cpu-severity",
|
||||
"=",
|
||||
"critical"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "6h"
|
||||
},
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"cpu-severity",
|
||||
"=",
|
||||
"warning"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "6h"
|
||||
},
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"queued",
|
||||
"=",
|
||||
"request"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "6h"
|
||||
}
|
||||
]
|
||||
}
|
||||
58
mon/scripts/grafana/alerts/policies.json
Normal file
58
mon/scripts/grafana/alerts/policies.json
Normal file
@@ -0,0 +1,58 @@
|
||||
|
||||
{
|
||||
"receiver": "grafana-default-email",
|
||||
"routes": [
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"disk-severity",
|
||||
"=",
|
||||
"warning"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "6h"
|
||||
},
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"disk-severity",
|
||||
"=",
|
||||
"critical"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "6h"
|
||||
},
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"inodes",
|
||||
"=",
|
||||
"free"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "6h"
|
||||
},
|
||||
{
|
||||
"receiver": "beegfs-email",
|
||||
"object_matchers": [
|
||||
[
|
||||
"queued",
|
||||
"=",
|
||||
"request"
|
||||
]
|
||||
],
|
||||
"group_wait": "30s",
|
||||
"group_interval": "5m",
|
||||
"repeat_interval": "6h"
|
||||
}
|
||||
]
|
||||
}
|
||||
Reference in New Issue
Block a user