
Commit 56841a5

fix(backend): disable metrics for mutation locks
Granular locking increases the cardinality of lock names, which bloats the Prometheus metrics and leads to high memory usage. Disable metrics entirely for these locks, since they are of no use for the moment.

Signed-off-by: Fatih Acar <[email protected]>
1 parent: 9eefa2f
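
The memory issue comes from how the Prometheus client handles labels: every distinct label-value combination materializes its own child time series, which lives in the registry for the lifetime of the process. With granular locks, each lock name is a label value, so an unbounded set of names means unbounded series growth. A minimal sketch of the effect using prometheus_client, with illustrative metric and label names rather than Infrahub's actual ones:

from prometheus_client import Histogram

# Illustrative histogram with the same label shape as the lock metrics
# in backend/infrahub/lock.py (lock name + lock type).
LOCK_ACQUIRE_TIME = Histogram(
    "app_lock_acquire_seconds",  # hypothetical metric name
    "Time spent acquiring a lock",
    ["name", "type"],
)

# Each unique lock name creates a new child series that is never reclaimed,
# so per-object lock names grow the process's memory without bound.
for i in range(10_000):
    with LOCK_ACQUIRE_TIME.labels(f"global.object.node-{i}", "global").time():
        pass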

5 files changed (+35, -16 lines)


backend/infrahub/core/node/create.py

Lines changed: 1 addition & 1 deletion
@@ -212,7 +212,7 @@ async def _persist(current_db: InfrahubDatabase) -> Node:
     )

     obj: Node
-    async with InfrahubMultiLock(lock_registry=lock.registry, locks=lock_names):
+    async with InfrahubMultiLock(lock_registry=lock.registry, locks=lock_names, metrics=False):
         if db.is_transaction:
             obj = await _persist(db)
         else:

backend/infrahub/core/node/save.py

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ async def _persist() -> None:
         await node.save(db=db, fields=fields_to_save)

     if manage_lock:
-        async with InfrahubMultiLock(lock_registry=lock.registry, locks=locks):
+        async with InfrahubMultiLock(lock_registry=lock.registry, locks=locks, metrics=False):
             await _persist()
     else:
         await _persist()

backend/infrahub/graphql/mutations/ipam.py

Lines changed: 2 additions & 2 deletions
@@ -223,7 +223,7 @@ async def mutate_update(

         namespace_lock_names = cls._get_lock_names(namespace_id)
         async with InfrahubMultiLock(lock_registry=lock.registry, locks=namespace_lock_names):
-            async with InfrahubMultiLock(lock_registry=lock.registry, locks=lock_names):
+            async with InfrahubMultiLock(lock_registry=lock.registry, locks=lock_names, metrics=False):
                 async with db.start_transaction() as dbt:
                     reconciled_address = await cls._mutate_update_object_and_reconcile(
                         info=info,
@@ -400,7 +400,7 @@ async def mutate_update(

         namespace_lock_names = cls._get_lock_names(namespace_id)
         async with InfrahubMultiLock(lock_registry=lock.registry, locks=namespace_lock_names):
-            async with InfrahubMultiLock(lock_registry=lock.registry, locks=lock_names):
+            async with InfrahubMultiLock(lock_registry=lock.registry, locks=lock_names, metrics=False):
                 async with db.start_transaction() as dbt:
                     reconciled_prefix = await cls._mutate_update_object_and_reconcile(
                         info=info,
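
Note the pattern in these IPAM hunks: the outer namespace lock keeps the default metrics=True, since namespace names form a small, bounded set, while only the inner per-address and per-prefix lock names, whose cardinality is unbounded, opt out. A condensed sketch of that nesting (names as in the diff, bodies elided):

# Outer lock: bounded cardinality (one name per namespace), metrics stay on.
async with InfrahubMultiLock(lock_registry=lock.registry, locks=namespace_lock_names):
    # Inner locks: unbounded cardinality (one name per object), metrics off.
    async with InfrahubMultiLock(lock_registry=lock.registry, locks=lock_names, metrics=False):
        ...  # reconcile inside a database transaction, as above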

backend/infrahub/graphql/mutations/main.py

Lines changed: 1 addition & 1 deletion
@@ -248,7 +248,7 @@ async def _mutate(current_db: InfrahubDatabase) -> tuple[Node, Self]:
             result = await cls.mutate_update_to_graphql(db=current_db, info=info, obj=updated_obj)
             return updated_obj, result

-        async with InfrahubMultiLock(lock_registry=lock.registry, locks=lock_names):
+        async with InfrahubMultiLock(lock_registry=lock.registry, locks=lock_names, metrics=False):
            if db.is_transaction:
                return await _mutate(db)

backend/infrahub/lock.py

Lines changed: 30 additions & 11 deletions
@@ -50,9 +50,12 @@
 class InfrahubMultiLock:
     """Context manager to allow multiple locks to be reserved together"""

-    def __init__(self, lock_registry: InfrahubLockRegistry, locks: list[str] | None = None) -> None:
+    def __init__(
+        self, lock_registry: InfrahubLockRegistry, locks: list[str] | None = None, metrics: bool = True
+    ) -> None:
         self.registry = lock_registry
         self.locks = locks or []
+        self.metrics = metrics

     async def __aenter__(self):
         await self.acquire()
@@ -67,11 +70,11 @@ async def __aexit__(

     async def acquire(self) -> None:
         for lock in self.locks:
-            await self.registry.get(name=lock).acquire()
+            await self.registry.get(name=lock, metrics=self.metrics).acquire()

     async def release(self) -> None:
         for lock in reversed(self.locks):
-            await self.registry.get(name=lock).release()
+            await self.registry.get(name=lock, metrics=self.metrics).release()


 class NATSLock:
@@ -123,6 +126,7 @@ def __init__(
         connection: redis.Redis | InfrahubServices | None = None,
         local: bool | None = None,
         in_multi: bool = False,
+        metrics: bool = True,
     ) -> None:
         self.use_local: bool = local
         self.local: LocalLock = None
@@ -134,6 +138,7 @@ def __init__(
         self.acquire_time: int | None = None
         self.event = asyncio.Event()
         self._recursion_var: ContextVar[int | None] = ContextVar(f"infrahub_lock_recursion_{self.name}", default=None)
+        self.metrics = metrics

         if not self.connection or (self.use_local is None and name.startswith("local.")):
             self.use_local = True
@@ -162,11 +167,17 @@ async def acquire(self) -> None:
             self._recursion_var.set(depth + 1)
             return

-        with LOCK_ACQUIRE_TIME_METRICS.labels(self.name, self.lock_type).time():
-            if not self.use_local:
-                await self.remote.acquire(token=f"{current_timestamp()}::{WORKER_IDENTITY}")
-            else:
-                await self.local.acquire()
+        if self.metrics:
+            with LOCK_ACQUIRE_TIME_METRICS.labels(self.name, self.lock_type).time():
+                if not self.use_local:
+                    await self.remote.acquire(token=f"{current_timestamp()}::{WORKER_IDENTITY}")
+                else:
+                    await self.local.acquire()
+        elif not self.use_local:
+            await self.remote.acquire(token=f"{current_timestamp()}::{WORKER_IDENTITY}")
+        else:
+            await self.local.acquire()
+
         self.acquire_time = time.time_ns()
         self.event.clear()
         self._recursion_var.set(1)
@@ -182,7 +193,8 @@ async def release(self) -> None:

         if self.acquire_time is not None:
             duration_ns = time.time_ns() - self.acquire_time
-            LOCK_RESERVE_TIME_METRICS.labels(self.name, self.lock_type).observe(duration_ns / 1000000000)
+            if self.metrics:
+                LOCK_RESERVE_TIME_METRICS.labels(self.name, self.lock_type).observe(duration_ns / 1000000000)
         self.acquire_time = None

         if not self.use_local:
@@ -252,11 +264,18 @@ def get_existing(
         return self.locks[lock_name]

     def get(
-        self, name: str, namespace: str | None = None, local: bool | None = None, in_multi: bool = False
+        self,
+        name: str,
+        namespace: str | None = None,
+        local: bool | None = None,
+        in_multi: bool = False,
+        metrics: bool = True,
     ) -> InfrahubLock:
         lock_name = self._generate_name(name=name, namespace=namespace, local=local)
         if lock_name not in self.locks:
-            self.locks[lock_name] = InfrahubLock(name=lock_name, connection=self.connection, in_multi=in_multi)
+            self.locks[lock_name] = InfrahubLock(
+                name=lock_name, connection=self.connection, in_multi=in_multi, metrics=metrics
+            )
         return self.locks[lock_name]

     def local_schema_lock(self) -> LocalLock:
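
Since metrics defaults to True on InfrahubMultiLock, InfrahubLock, and InfrahubLockRegistry.get, all existing callers keep their current behavior; only the mutation call sites above opt out. A minimal usage sketch, assuming an initialized lock.registry; the function name and import paths are assumptions for illustration, not taken from the diff:

from infrahub import lock
from infrahub.lock import InfrahubMultiLock

async def mutate_with_granular_locks(lock_names: list[str]) -> None:
    # metrics=False flows through registry.get() into each InfrahubLock,
    # skipping both the acquire-time histogram and the reserve-time observation.
    async with InfrahubMultiLock(lock_registry=lock.registry, locks=lock_names, metrics=False):
        ...  # perform the mutation while every lock in lock_names is held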
