Skip to content

Commit ef6c3b6

Browse files
committed
Update to new backup logic and use new images
1 parent f60fc6a commit ef6c3b6

9 files changed

+308
-147
lines changed

backup_cronjob.yaml

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# CronJob that runs every 30 minutes and drives a backup on the masternode
# by calling its HTTP controller endpoints with curl.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: storage-backup
spec:
  schedule: "*/30 * * * *"
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 1
      template:
        spec:
          imagePullSecrets:
            - name: microstream-ocir-credentials
          restartPolicy: Never
          securityContext:
            runAsNonRoot: true
            fsGroupChangePolicy: OnRootMismatch
            fsGroup: 10000
            runAsUser: 10000
            runAsGroup: 10000
          containers:
            - name: backup
              image: curlimages/curl:8.11.1
              command: [sh, -c]
              resources:
                requests:
                  memory: 100M
                  cpu: 500m
                limits:
                  memory: 100M
                  cpu: 500m
              env:
                - name: MY_NAMESPACE
                  valueFrom:
                    fieldRef:
                      fieldPath: metadata.namespace
              args:
                - |
                  # CONSTANTS
                  # NOTE(review): token, cacert and ns are not referenced by any
                  # request below - confirm whether they are still needed.
                  token="$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)"
                  cacert="/var/run/secrets/kubernetes.io/serviceaccount/ca.crt"
                  backup_pod_url='http://masternode/microstream-cluster-controller'
                  ns="$MY_NAMESPACE"
                  # Maximum number of one-second polls before a wait loop gives up.
                  wait_loop_retries=7200

                  # Runs curl with a list of default options followed by the given arguments.
                  # Sets 'ret' to the response (stdout and stderr combined) and
                  # returns curl's exit status.
                  http() {
                    # "$@" (quoted) forwards every argument unchanged, even if it
                    # contains whitespace or glob characters.
                    ret=$(curl --fail --no-progress-meter "$@" 2>&1)
                  }

                  # Prints the message in $1 to stderr and exits the script with status 1.
                  fail() {
                    echo "$1" >&2
                    exit 1
                  }

                  # Makes the http request, or exits the script on failure.
                  # Sets 'ret' to the response. Must be called directly, not inside
                  # $(...): it prints nothing, and an exit in a subshell would not
                  # stop the script.
                  ensure_http() {
                    http "$@" || fail "Failed to call endpoint: $ret"
                  }

                  # Requests the update via POST, then polls once per second until
                  # the endpoint answers "true" or the retry budget is exhausted.
                  update_to_latest_microstream_offset() {
                    echo "Updating to latest offset"
                    ensure_http -X POST "$backup_pod_url/microstream-updates"

                    echo "Waiting for updates to finish..."
                    local i=0
                    until [ "$i" -gt "$wait_loop_retries" ]; do
                      ensure_http -H 'Content-Type:application/json' "$backup_pod_url/microstream-updates"
                      [ "$ret" = "true" ] && break
                      sleep 1
                      i=$((i + 1))
                    done
                    # i only exceeds the budget when the loop ended without a break.
                    if [ "$i" -gt "$wait_loop_retries" ]; then
                      fail "Timed out waiting for updates to finish"
                    fi
                    echo "...done!"
                  }

                  # Requests garbage collection via POST, then polls once per second
                  # until the endpoint answers "false" or the retry budget is exhausted.
                  collect_backup_pod_garbage() {
                    echo "Issuing garbage collection"
                    ensure_http -X POST "$backup_pod_url/microstream-gc"

                    echo "Waiting for garbage collection completion"
                    local i=0
                    until [ "$i" -gt "$wait_loop_retries" ]; do
                      ensure_http "$backup_pod_url/microstream-gc"
                      [ "$ret" = "false" ] && break
                      sleep 1
                      i=$((i + 1))
                    done
                    # i only exceeds the budget when the loop ended without a break.
                    if [ "$i" -gt "$wait_loop_retries" ]; then
                      fail "Timed out waiting for garbage collection to finish"
                    fi
                  }

                  # Requests a backup via POST, then polls once per second for as
                  # long as the endpoint answers "true". There is no retry limit here.
                  create_backup() {
                    echo "Creating backup"
                    ensure_http -X POST "$backup_pod_url/microstream-backup"

                    echo "Waiting for backup to finish"
                    while :; do
                      # Compare 'ret' directly; ensure_http writes nothing to stdout.
                      ensure_http "$backup_pod_url/microstream-backup"
                      [ "$ret" = "true" ] || break
                      sleep 1
                    done
                  }

                  # Issues the POST that resumes updates.
                  resume_updates() {
                    echo "Resuming updates"
                    ensure_http -X POST "$backup_pod_url/microstream-resume-updates"
                  }

                  # PROGRAM START
                  # NOTE(review): any failure exits before resume_updates runs -
                  # confirm the masternode recovers from that on its own.

                  update_to_latest_microstream_offset
                  collect_backup_pod_garbage
                  create_backup
                  resume_updates

                  echo "Done!"

kafka_statefulset.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ spec:
1717
imagePullSecrets: [ name: microstream-ocir-credentials ]
1818
containers:
1919
- name: kafka
20-
image: ocir.microstream.one/onprem/image/microstream-cluster-kafka:1.14.0-SNAPSHOT
20+
image: ocir.microstream.one/onprem/image/microstream-cluster-kafka:1.14.0
2121
ports:
2222
- name: plaintext
2323
containerPort: 9092

masternode_pod.yaml

Lines changed: 85 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ apiVersion: v1
22
kind: Pod
33
metadata:
44
name: masternode
5+
labels:
6+
microstream.one/cluster-component: masternode
57
spec:
68
imagePullSecrets: [ name: microstream-ocir-credentials ]
79
securityContext:
@@ -10,59 +12,98 @@ spec:
1012
fsGroup: 10000
1113
runAsUser: 10000
1214
runAsGroup: 10000
15+
resources:
16+
requests:
17+
memory: 2000M
18+
cpu: 1000m
19+
limits:
20+
memory: 2000M
21+
cpu: 1000m
1322
initContainers:
1423
- name: prepare-masternode
1524
image: curlimages/curl:8.11.1
25+
securityContext:
26+
allowPrivilegeEscalation: false
27+
capabilities:
28+
drop: [ all ]
29+
volumeMounts:
30+
- name: storage
31+
mountPath: /storage
32+
- name: app
33+
mountPath: /app
34+
resources:
35+
requests:
36+
memory: 2000M
37+
cpu: 1000m
38+
limits:
39+
memory: 2000M
40+
cpu: 1000m
1641
command:
1742
- sh
1843
- -ce
1944
- |
20-
# Wait for the user rest service project to exist
2145
# You can upload the jar like this:
22-
# `kubectl cp -c prepare-masternode /path/to/jar masternode:/storage/project/project.jar`
46+
# `kubectl cp -c prepare-masternode /path/to/jar masternode:/app/application.jar`
2347
# If you have a libs folder as well you can copy it with
24-
# `kubectl cp -c prepare-masternode /path/to/libs masternode:/storage/project`
48+
# `kubectl cp -c prepare-masternode /path/to/libs masternode:/app`
2549
# When you are done create the ready flag with
26-
# `kubectl exec -ti -c prepare-masternode pod/masternode -- touch /storage/project/ready`
27-
mkdir -p /storage/project
28-
echo "Waiting for user rest service jar (timeout=10min)..."
29-
i=0
30-
until [ -f /storage/project/ready ]; do
31-
sleep 1s
32-
# Fail if we time out
33-
if [ $i -gt 600 ]; then
34-
echo "Timed out waiting for /storage/project/ready to exist" >&2
35-
exit 1
36-
fi
37-
i=$((i+1))
38-
done
39-
echo "Success!"
50+
# `kubectl exec -ti -c prepare-masternode pod/masternode -- touch /app/ready`
4051
41-
# Check for kafka ready flag for 5 minutes
42-
echo "Waiting for kafka to be ready (timeout=5min)..."
43-
i=0
44-
until nc -z -w5 kafka 9092; do
45-
sleep 1s
46-
# Fail if we time out
47-
if [ $i -gt 300 ]; then
48-
echo "Timed out waiting for kafka to be ready" >&2
49-
exit 1
50-
fi
51-
i=$((i+1))
52-
done
53-
echo "Success!"
52+
# Wait for the user application to exist
53+
if [ ! -f /app/ready ]; then
54+
echo "Waiting for /app/ready flag..."
55+
while [ ! -f /app/ready ]; do sleep 1; done
56+
echo "...done!"
57+
fi
58+
59+
# Wait for kafka
60+
if ! nc -z -w5 kafka 9092; then
61+
echo "Waiting for kafka..."
62+
until nc -z -w5 kafka 9092; do sleep 1; done
63+
echo "...done!"
64+
fi
65+
containers:
66+
- name: masternode
67+
image: ocir.microstream.one/onprem/image/microstream-cluster-storage-node:1.14.0
68+
workingDir: /storage
69+
args: [ "/app/application.jar" ]
70+
ports:
71+
- name: http
72+
containerPort: 8080
73+
# Restart the pod if container is not responsive at all
74+
livenessProbe:
75+
periodSeconds: 10
76+
timeoutSeconds: 20
77+
failureThreshold: 2
78+
httpGet:
79+
path: /microstream-cluster-controller/microstream-health
80+
port: http
81+
# Remove the pod from being ready if we fail to check
82+
readinessProbe:
83+
periodSeconds: 10
84+
timeoutSeconds: 20
85+
failureThreshold: 1
86+
httpGet:
87+
path: /microstream-cluster-controller/microstream-health/ready
88+
port: http
5489
securityContext:
5590
allowPrivilegeEscalation: false
5691
capabilities:
5792
drop: [ all ]
5893
volumeMounts:
5994
- name: storage
6095
mountPath: /storage
61-
containers:
62-
- name: masternode
63-
image: ocir.microstream.one/onprem/image/microstream-cluster-storage-node:1.14.0-SNAPSHOT
64-
workingDir: /storage
65-
args: [ "/storage/project/project.jar" ]
96+
- name: backups
97+
mountPath: /backups
98+
- name: app
99+
mountPath: /app
100+
resources:
101+
requests:
102+
memory: 2000M
103+
cpu: 1000m
104+
limits:
105+
memory: 2000M
106+
cpu: 1000m
66107
env:
67108
- name: MSCNL_PROD_MODE
68109
value: "true"
@@ -81,40 +122,17 @@ spec:
81122
value: "true"
82123
- name: IS_BACKUP_NODE
83124
value: "true"
125+
- name: KEPT_BACKUPS_COUNT
126+
value: "3"
84127
- name: BACKUP_PROXY_SERVICE_URL
85128
value: external-resource-proxy
86-
ports:
87-
- name: http
88-
containerPort: 8080
89-
# Restart the pod if container is not responsive at all
90-
livenessProbe:
91-
timeoutSeconds: 5
92-
failureThreshold: 5
93-
httpGet:
94-
path: /microstream-cluster-controller/microstream-health
95-
port: http
96-
# Remove the pod from being ready if we fail to check
97-
readinessProbe:
98-
timeoutSeconds: 4
99-
failureThreshold: 3
100-
httpGet:
101-
path: /microstream-cluster-controller/microstream-health/ready
102-
port: http
103-
# Give the container ~50 seconds to fully start up
104-
startupProbe:
105-
timeoutSeconds: 5
106-
failureThreshold: 10
107-
httpGet:
108-
path: /microstream-cluster-controller/microstream-health
109-
port: http
110-
securityContext:
111-
allowPrivilegeEscalation: false
112-
capabilities:
113-
drop: [ all ]
114-
volumeMounts:
115-
- name: storage
116-
mountPath: /storage
117129
volumes:
118130
- name: storage
119131
persistentVolumeClaim:
120-
claimName: masternode-storage
132+
claimName: masternode-storage
133+
- name: backups
134+
persistentVolumeClaim:
135+
claimName: storage-backups
136+
- name: app
137+
persistentVolumeClaim:
138+
claimName: user-app

masternode_pvclaim.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,7 @@ kind: PersistentVolumeClaim
33
metadata:
44
name: masternode-storage
55
spec:
6-
# Needs to be read-write-many so every storage node can attach and clone the storage as a starting point
7-
accessModes: [ ReadWriteMany ]
6+
accessModes: [ ReadWriteOnce ]
87
resources:
98
requests:
109
storage: 20G

masternode_service.yaml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# Service "masternode": forwards port 80 to the port named "http" on pods
# labelled microstream.one/cluster-component=masternode.
apiVersion: v1
kind: Service
metadata:
  name: masternode
spec:
  selector:
    microstream.one/cluster-component: masternode
  ports:
    - name: http
      port: 80
      targetPort: http

storage_backups_pvclaim.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
# Claim for the 60G volume that holds the storage backups.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: storage-backups
spec:
  # ReadWriteMany is required: every storage node attaches this volume and
  # clones the backups as its starting point.
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 60G

0 commit comments

Comments
 (0)