Skip to content

Commit f67ad64

Browse files
authored
chore: ensure metrics are correctly exposed (#978)
* remove headless & metrics methods, use op-rs methods * add prometheus annotations to metrics service * adapted changelog * clippy * fix cargo doc * replace expects with errors
1 parent e370101 commit f67ad64

File tree

7 files changed

+246
-211
lines changed

7 files changed

+246
-211
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ All notable changes to this project will be documented in this file.
1212
- `EOS_DISABLED` (`--eos-disabled`) to disable the EoS checker completely.
1313
- Helm: Allow Pod `priorityClassName` to be configured ([#974]).
1414
- Add support for `3.9.4` ([#977]).
15+
- Add `prometheus.io/path|port|scheme` annotations to metrics service (for native metrics) ([#978]).
1516

1617
### Changed
1718

@@ -20,6 +21,7 @@ All notable changes to this project will be documented in this file.
2021
[#974]: https://github.com/stackabletech/zookeeper-operator/pull/974
2122
[#976]: https://github.com/stackabletech/zookeeper-operator/pull/976
2223
[#977]: https://github.com/stackabletech/zookeeper-operator/pull/977
24+
[#978]: https://github.com/stackabletech/zookeeper-operator/pull/978
2325

2426
## [25.7.0] - 2025-07-23
2527

rust/operator-binary/src/crd/mod.rs

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,6 @@ use strum::{Display, EnumIter, EnumString, IntoEnumIterator};
3636

3737
use crate::{
3838
crd::{affinity::get_affinity, v1alpha1::ZookeeperServerRoleConfig},
39-
discovery::build_role_group_headless_service_name,
4039
listener::role_listener_name,
4140
};
4241

@@ -626,9 +625,8 @@ impl v1alpha1::ZookeeperCluster {
626625
for i in 0..rolegroup.replicas.unwrap_or(1) {
627626
pod_refs.push(ZookeeperPodRef {
628627
namespace: ns.clone(),
629-
role_group_headless_service_name: build_role_group_headless_service_name(
630-
rolegroup_ref.object_name(),
631-
),
628+
role_group_headless_service_name: rolegroup_ref
629+
.rolegroup_headless_service_name(),
632630
pod_name: format!("{role_group}-{i}", role_group = rolegroup_ref.object_name()),
633631
zookeeper_myid: i + myid_offset,
634632
});

rust/operator-binary/src/discovery.rs

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -168,15 +168,3 @@ fn listener_addresses(
168168
false => Ok(address_port_pairs),
169169
}
170170
}
171-
172-
// TODO (@NickLarsenNZ): Implement this directly on RoleGroupRef, ie:
173-
// RoleGroupRef<K: Resource>::metrics_service_name(&self) to restrict what _name_ can be.
174-
pub fn build_role_group_headless_service_name(name: String) -> String {
175-
format!("{name}-headless")
176-
}
177-
178-
// TODO (@NickLarsenNZ): Implement this directly on RoleGroupRef, ie:
179-
// RoleGroupRef<K: Resource>::metrics_service_name(&self) to restrict what _name_ can be.
180-
pub fn build_role_group_metrics_service_name(name: String) -> String {
181-
format!("{name}-metrics")
182-
}

rust/operator-binary/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ mod discovery;
4141
mod listener;
4242
mod operations;
4343
mod product_logging;
44+
mod service;
4445
mod utils;
4546
mod zk_controller;
4647
mod znode_controller;

rust/operator-binary/src/product_logging.rs

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,21 +16,6 @@ use crate::crd::{
1616

1717
#[derive(Snafu, Debug)]
1818
pub enum Error {
19-
#[snafu(display("object has no namespace"))]
20-
ObjectHasNoNamespace,
21-
22-
#[snafu(display("failed to retrieve the ConfigMap {cm_name}"))]
23-
ConfigMapNotFound {
24-
source: stackable_operator::client::Error,
25-
cm_name: String,
26-
},
27-
28-
#[snafu(display("failed to retrieve the entry {entry} for ConfigMap {cm_name}"))]
29-
MissingConfigMapEntry {
30-
entry: &'static str,
31-
cm_name: String,
32-
},
33-
3419
#[snafu(display("crd validation failure"))]
3520
CrdValidationFailure { source: crate::crd::Error },
3621
}
Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
use std::{
2+
collections::{BTreeMap, HashMap},
3+
str::FromStr,
4+
};
5+
6+
use product_config::types::PropertyNameKind;
7+
use snafu::{OptionExt, ResultExt, Snafu};
8+
use stackable_operator::{
9+
builder::meta::ObjectMetaBuilder,
10+
commons::product_image_selection::ResolvedProductImage,
11+
k8s_openapi::api::core::v1::{Service, ServicePort, ServiceSpec},
12+
kvp::{Annotations, Labels},
13+
role_utils::RoleGroupRef,
14+
};
15+
16+
use crate::{
17+
crd::{
18+
APP_NAME, JMX_METRICS_PORT, JMX_METRICS_PORT_NAME, METRICS_PROVIDER_HTTP_PORT,
19+
METRICS_PROVIDER_HTTP_PORT_KEY, METRICS_PROVIDER_HTTP_PORT_NAME, ZOOKEEPER_ELECTION_PORT,
20+
ZOOKEEPER_ELECTION_PORT_NAME, ZOOKEEPER_LEADER_PORT, ZOOKEEPER_LEADER_PORT_NAME,
21+
ZOOKEEPER_PROPERTIES_FILE, v1alpha1,
22+
},
23+
utils::build_recommended_labels,
24+
zk_controller::ZK_CONTROLLER_NAME,
25+
};
26+
27+
#[derive(Snafu, Debug)]
28+
pub enum Error {
29+
#[snafu(display("object is missing metadata to build owner reference"))]
30+
ObjectMissingMetadataForOwnerRef {
31+
source: stackable_operator::builder::meta::Error,
32+
},
33+
34+
#[snafu(display("failed to build Metadata"))]
35+
BuildMetadata {
36+
source: stackable_operator::builder::meta::Error,
37+
},
38+
39+
#[snafu(display("failed to build Labels"))]
40+
BuildLabel {
41+
source: stackable_operator::kvp::LabelError,
42+
},
43+
44+
#[snafu(display("missing zookeeper properties file {ZOOKEEPER_PROPERTIES_FILE} in config"))]
45+
MissingPropertiesFile,
46+
47+
#[snafu(display("missing provider http port key {METRICS_PROVIDER_HTTP_PORT_KEY} in config"))]
48+
MissingProviderHttpPortKey,
49+
}
50+
51+
/// The rolegroup [`Service`] is a headless service that allows internal access to the instances of a certain rolegroup
52+
///
53+
/// This is mostly useful for internal communication between peers, or for clients that perform client-side load balancing.
54+
pub(crate) fn build_server_rolegroup_headless_service(
55+
zk: &v1alpha1::ZookeeperCluster,
56+
rolegroup: &RoleGroupRef<v1alpha1::ZookeeperCluster>,
57+
resolved_product_image: &ResolvedProductImage,
58+
) -> Result<Service, Error> {
59+
let metadata = ObjectMetaBuilder::new()
60+
.name_and_namespace(zk)
61+
.name(rolegroup.rolegroup_headless_service_name())
62+
.ownerreference_from_resource(zk, None, Some(true))
63+
.context(ObjectMissingMetadataForOwnerRefSnafu)?
64+
.with_recommended_labels(build_recommended_labels(
65+
zk,
66+
ZK_CONTROLLER_NAME,
67+
&resolved_product_image.app_version_label_value,
68+
&rolegroup.role,
69+
&rolegroup.role_group,
70+
))
71+
.context(BuildMetadataSnafu)?
72+
.build();
73+
74+
let service_selector_labels =
75+
Labels::role_group_selector(zk, APP_NAME, &rolegroup.role, &rolegroup.role_group)
76+
.context(BuildLabelSnafu)?;
77+
78+
let service_spec = ServiceSpec {
79+
// Internal communication does not need to be exposed
80+
type_: Some("ClusterIP".to_string()),
81+
cluster_ip: Some("None".to_string()),
82+
ports: Some(vec![
83+
ServicePort {
84+
name: Some(ZOOKEEPER_LEADER_PORT_NAME.to_string()),
85+
port: ZOOKEEPER_LEADER_PORT.into(),
86+
protocol: Some("TCP".to_string()),
87+
..ServicePort::default()
88+
},
89+
ServicePort {
90+
name: Some(ZOOKEEPER_ELECTION_PORT_NAME.to_string()),
91+
port: ZOOKEEPER_ELECTION_PORT.into(),
92+
protocol: Some("TCP".to_string()),
93+
..ServicePort::default()
94+
},
95+
]),
96+
selector: Some(service_selector_labels.into()),
97+
publish_not_ready_addresses: Some(true),
98+
..ServiceSpec::default()
99+
};
100+
101+
Ok(Service {
102+
metadata,
103+
spec: Some(service_spec),
104+
status: None,
105+
})
106+
}
107+
108+
/// The rolegroup [`Service`] for exposing metrics
109+
pub(crate) fn build_server_rolegroup_metrics_service(
110+
zk: &v1alpha1::ZookeeperCluster,
111+
rolegroup: &RoleGroupRef<v1alpha1::ZookeeperCluster>,
112+
resolved_product_image: &ResolvedProductImage,
113+
rolegroup_config: &HashMap<PropertyNameKind, BTreeMap<String, String>>,
114+
) -> Result<Service, Error> {
115+
let metrics_port = metrics_port_from_rolegroup_config(rolegroup_config)?;
116+
117+
let metadata = ObjectMetaBuilder::new()
118+
.name_and_namespace(zk)
119+
.name(rolegroup.rolegroup_metrics_service_name())
120+
.ownerreference_from_resource(zk, None, Some(true))
121+
.context(ObjectMissingMetadataForOwnerRefSnafu)?
122+
.with_recommended_labels(build_recommended_labels(
123+
zk,
124+
ZK_CONTROLLER_NAME,
125+
&resolved_product_image.app_version_label_value,
126+
&rolegroup.role,
127+
&rolegroup.role_group,
128+
))
129+
.context(BuildMetadataSnafu)?
130+
.with_labels(prometheus_labels())
131+
.with_annotations(prometheus_annotations(metrics_port))
132+
.build();
133+
134+
let service_selector_labels =
135+
Labels::role_group_selector(zk, APP_NAME, &rolegroup.role, &rolegroup.role_group)
136+
.context(BuildLabelSnafu)?;
137+
138+
let service_spec = ServiceSpec {
139+
// Internal communication does not need to be exposed
140+
type_: Some("ClusterIP".to_string()),
141+
cluster_ip: Some("None".to_string()),
142+
ports: Some(vec![
143+
// We keep this for legacy compatibility
144+
ServicePort {
145+
name: Some(JMX_METRICS_PORT_NAME.to_string()),
146+
port: JMX_METRICS_PORT.into(),
147+
protocol: Some("TCP".to_string()),
148+
..ServicePort::default()
149+
},
150+
ServicePort {
151+
name: Some(METRICS_PROVIDER_HTTP_PORT_NAME.to_string()),
152+
port: metrics_port.into(),
153+
protocol: Some("TCP".to_string()),
154+
..ServicePort::default()
155+
},
156+
]),
157+
selector: Some(service_selector_labels.into()),
158+
publish_not_ready_addresses: Some(true),
159+
..ServiceSpec::default()
160+
};
161+
162+
Ok(Service {
163+
metadata,
164+
spec: Some(service_spec),
165+
status: None,
166+
})
167+
}
168+
169+
pub(crate) fn metrics_port_from_rolegroup_config(
170+
rolegroup_config: &HashMap<PropertyNameKind, BTreeMap<String, String>>,
171+
) -> Result<u16, Error> {
172+
let metrics_port = rolegroup_config
173+
.get(&PropertyNameKind::File(
174+
ZOOKEEPER_PROPERTIES_FILE.to_string(),
175+
))
176+
.context(MissingPropertiesFileSnafu)?
177+
.get(METRICS_PROVIDER_HTTP_PORT_KEY)
178+
.context(MissingProviderHttpPortKeySnafu)?;
179+
180+
let port = match u16::from_str(metrics_port) {
181+
Ok(port) => port,
182+
Err(err) => {
183+
tracing::error!("{err}");
184+
tracing::info!("Defaulting to using {METRICS_PROVIDER_HTTP_PORT} as metrics port.");
185+
METRICS_PROVIDER_HTTP_PORT
186+
}
187+
};
188+
189+
Ok(port)
190+
}
191+
192+
/// Common labels for Prometheus
193+
fn prometheus_labels() -> Labels {
194+
Labels::try_from([("prometheus.io/scrape", "true")]).expect("should be a valid label")
195+
}
196+
197+
/// Common annotations for Prometheus
198+
///
199+
/// These annotations can be used in a ServiceMonitor.
200+
///
201+
/// see also <https://github.com/prometheus-community/helm-charts/blob/prometheus-27.32.0/charts/prometheus/values.yaml#L983-L1036>
202+
fn prometheus_annotations(metrics_port: u16) -> Annotations {
203+
Annotations::try_from([
204+
("prometheus.io/path".to_owned(), "/metrics".to_owned()),
205+
("prometheus.io/port".to_owned(), metrics_port.to_string()),
206+
("prometheus.io/scheme".to_owned(), "http".to_owned()),
207+
("prometheus.io/scrape".to_owned(), "true".to_owned()),
208+
])
209+
.expect("should be valid annotations")
210+
}

0 commit comments

Comments
 (0)