Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/stackhpc.yml
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ jobs:
env:
TESTUSER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}

# Records the current epoch time in milliseconds as the step output
# TIME_BEFORE_REBUILD (read later as steps.timestamp.outputs.TIME_BEFORE_REBUILD).
- name: Get timestamp in milliseconds before rebuild
  id: timestamp
  run: |
    # `date +%s%N` prints epoch seconds followed by nanoseconds; with 10-digit
    # epoch seconds the first 13 bytes are the millisecond timestamp.
    # The output file path is quoted so it is never word-split.
    echo "TIME_BEFORE_REBUILD=$(date +%s%N | cut -b1-13)" >> "$GITHUB_OUTPUT"

# - name: Build environment-specific compute image
# id: packer_build
# run: |
Expand Down Expand Up @@ -209,6 +214,14 @@ jobs:
. environments/.stackhpc/activate
ansible-playbook -vv ansible/ci/check_grafana.yml

# Runs the Loki check playbook against the window ending at the timestamp
# captured by the `timestamp` step before the rebuild.
- name: Check Loki Slurm logs persisted through rebuild
  run: |
    . venv/bin/activate
    . environments/.stackhpc/activate
    # check_loki.yml only consumes end_timestamp, so no test-user password is
    # passed on the command line.
    ansible-playbook -vv ansible/ci/check_loki.yml -e "end_timestamp=$END_TIMESTAMP"
  env:
    # Handed over via the environment instead of being interpolated into the
    # script text.
    END_TIMESTAMP: ${{ steps.timestamp.outputs.TIME_BEFORE_REBUILD }}

- name: Delete infrastructure
run: |
. venv/bin/activate
Expand Down
40 changes: 40 additions & 0 deletions ansible/ci/check_loki.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
---
# CI check: confirm that slurmd.service log lines written before a rebuild can
# still be queried from Loki through the Grafana datasource query API.
#
# Required extra var:
#   end_timestamp - end of the query window, in epoch milliseconds. The query
#                   covers the 20 minutes (1200000 ms) leading up to it.
- hosts: control
  gather_facts: false
  become: false
  tasks:
    # Fail early with a readable message instead of a templating error inside
    # the request body below.
    - name: Check end_timestamp was supplied
      ansible.builtin.assert:
        that:
          - end_timestamp is defined
          - end_timestamp | int > 0
        fail_msg: "Pass -e end_timestamp=<epoch milliseconds> to mark the end of the query window"

    # The datasource uid is needed to address Loki in the query request.
    - name: Get Loki datasource uid
      ansible.builtin.uri:
        url: http://{{ grafana_api_address }}:{{ grafana_port }}/api/datasources/name/Loki
        url_username: grafana
        url_password: "{{ vault_grafana_admin_password }}"
        follow_redirects: all
      register: uid_response

    - name: Get Loki logs from before rebuild
      ansible.builtin.uri:
        url: http://{{ grafana_api_address }}:{{ grafana_port }}/api/ds/query
        follow_redirects: all
        url_username: grafana
        url_password: "{{ vault_grafana_admin_password }}"
        method: POST
        body_format: json
        headers:
          Accept: application/json
          Content-Type: application/json
        # Queries from 20 mins before timestamp to timestamp
        body: |
          {
            "queries":[
              {
                "expr":"{unit=\"slurmd.service\"} |= ``",
                "datasource":{"uid":"{{ uid_response.json.uid }}"},
                "format":"time_series"
              }],
            "from":"{{ end_timestamp | int - 1200000 }}",
            "to":"{{ end_timestamp }}"
          }
      register: log_query_content

    # The first condition guards the frames[0] lookup in the second one, so an
    # empty result fails with fail_msg rather than an index error.
    - name: Check that logs exist
      ansible.builtin.assert:
        that:
          - log_query_content.json.results.A.frames | default([]) | length > 0
          - log_query_content.json.results.A.frames[0].data['values'][2] | length > 0
        fail_msg: "Loki returned no slurmd.service log lines for the 20 minutes before end_timestamp"
228 changes: 228 additions & 0 deletions ansible/roles/grafana-dashboards/files/loki-pod-logs-dashboard.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,228 @@
{
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": {
"type": "grafana",
"uid": "-- Grafana --"
},
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Search pod logs stored in Loki",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": 31,
"links": [],
"panels": [
{
"datasource": "Loki",
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "hidden",
"barAlignment": 0,
"barWidthFactor": 0.6,
"drawStyle": "bars",
"fillOpacity": 100,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "never",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 0
},
"id": 6,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom",
"showLegend": false
},
"tooltip": {
"mode": "multi",
"sort": "none"
}
},
"targets": [
{
"datasource": "Loki",
"expr": "sum(count_over_time({namespace=\"$namespace\", pod=~\"$pod\"} |~ \"$search\"[$__interval]))",
"refId": "A"
}
],
"type": "timeseries"
},
{
"datasource": "Loki",
"gridPos": {
"h": 25,
"w": 24,
"x": 0,
"y": 3
},
"id": 2,
"maxDataPoints": "",
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": true
},
"targets": [
{
"datasource": "Loki",
"expr": "{namespace=\"$namespace\", pod=~\"$pod\"} |~ \"$search\"",
"refId": "A"
}
],
"title": "Logs Panel",
"type": "logs"
}
],
"schemaVersion": 39,
"tags": [],
"templating": {
"list": [
{
"allValue": ".+",
"current": {
"selected": true,
"text": "ingress-nginx",
"value": "ingress-nginx"
},
"datasource": "Loki",
"definition": "label_values(namespace)",
"hide": 0,
"includeAll": false,
"multi": false,
"name": "namespace",
"options": [],
"query": "label_values(namespace)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".+",
"current": {
"selected": false,
"text": "All",
"value": "$__all"
},
"datasource": "Loki",
"definition": "label_values({namespace=~\"$namespace\"}, pod)",
"hide": 0,
"includeAll": true,
"multi": true,
"name": "pod",
"options": [],
"query": "label_values({namespace=~\"$namespace\"}, pod)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"current": {},
"hide": 0,
"name": "search",
"options": [],
"query": "",
"skipUrlSync": false,
"type": "textbox"
}
]
},
"time": {
"from": "now-6h",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
]
},
"timezone": "",
"title": "Loki / Pod Logs",
"uid": "209fd89b771c318dd442225414a50b59",
"version": 1,
"weekStart": ""
}
Loading
Loading