Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ dependencies = [
"uvicorn>=0.34.0", # server
"opentelemetry-sdk>=1.42.1", # server
"opentelemetry-exporter-otlp-proto-http>=1.30.0", # server
"opentelemetry-exporter-prometheus>=0.60b1", # server - Prometheus /metrics scrape endpoint; only pre-releases on PyPI
"opentelemetry-distro>=0.60b1", # optional CLI instrumentation; only pre-releases on PyPI (latest 0.60b1)
"aiosqlite>=0.21.0", # server - for metadata store
"asyncpg", # for metadata store
Expand Down
5 changes: 5 additions & 0 deletions scripts/integration-tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,11 @@ if [[ "$STACK_CONFIG" == *"server:"* && "$COLLECT_ONLY" == false ]]; then
export OTEL_BSP_EXPORT_TIMEOUT="2000"
export OTEL_METRIC_EXPORT_INTERVAL="200"

# Expose the Prometheus scrape endpoint at /v1/metrics so the metrics endpoint
# integration tests (tests/integration/inspect/test_metrics_endpoint.py) can scrape it.
# Exported so both the server process and the pytest process observe the flag.
export OGX_PROMETHEUS_ENABLED="1"

# remove "server:" from STACK_CONFIG
stack_config=$(echo "$STACK_CONFIG" | sed 's/^server://')
nohup ogx stack run $stack_config >server-main.log 2>&1 &
Expand Down
15 changes: 15 additions & 0 deletions src/ogx/core/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,21 @@ Routes are defined as native FastAPI routers. `fastapi_router_registry.py` auto-
- **`ClientVersionMiddleware`** (`server.py`): Rejects requests from clients with incompatible major.minor versions.
- **`ProviderDataMiddleware`** (`server.py`): Sets up request context for provider data propagation and test context.

### Metrics Export

OTel metrics can be exported two ways, independently and simultaneously:

- **OTLP push** — set `OTEL_EXPORTER_OTLP_ENDPOINT` to push metrics to an OTel Collector.
- **Prometheus scrape** — set `OGX_PROMETHEUS_ENABLED` (`1`/`true`/`yes`/`on`) to expose all
metrics at `GET /v1/metrics` in Prometheus exposition format, suitable for scrape-based
monitoring systems. The route is declared on the
Inspect API router (`ogx_api/inspect_api/fastapi_routes.py`) alongside `/v1/health`, opts
out of auth via `PUBLIC_ROUTE_KEY`, returns `404` when disabled, and is excluded from
`RequestMetricsMiddleware`.

Both readers are attached to a single global `MeterProvider` in
`ogx.telemetry.setup_telemetry()`.

### Response Handling

- Non-streaming responses return JSON via FastAPI's standard response handling.
Expand Down
2 changes: 1 addition & 1 deletion src/ogx/core/server/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
)

# Paths excluded from metrics collection
_EXCLUDED_PATHS = ("/docs", "/redoc", "/openapi.json", "/favicon.ico", "/static")
_EXCLUDED_PATHS = ("/docs", "/redoc", "/openapi.json", "/favicon.ico", "/static", "/v1/metrics")


class RouteInfo:
Expand Down
69 changes: 43 additions & 26 deletions src/ogx/telemetry/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@
"""OpenTelemetry initialization for ogx.

This module configures OpenTelemetry metrics export based on environment variables.
If OTEL_EXPORTER_OTLP_ENDPOINT is set, metrics will be exported to that endpoint.
Two export paths can be enabled independently and simultaneously:
- OTLP push: enabled when OTEL_EXPORTER_OTLP_ENDPOINT is set.
- Prometheus scrape: enabled when OGX_PROMETHEUS_ENABLED is truthy, exposing metrics
at the server's /v1/metrics endpoint in Prometheus exposition format.
"""

import os
Expand All @@ -17,51 +20,65 @@
logger = get_logger(__name__, category="telemetry")


def _is_prometheus_enabled() -> bool:
    """Report whether the OGX_PROMETHEUS_ENABLED environment flag is switched on.

    The value is compared case-insensitively, ignoring surrounding whitespace,
    against "1", "true", "yes" and "on". An unset variable or any other value
    counts as disabled.
    """
    raw_flag = os.environ.get("OGX_PROMETHEUS_ENABLED", "")
    normalized_flag = raw_flag.strip().lower()
    return normalized_flag in ("1", "true", "yes", "on")


def setup_telemetry() -> None:
"""Initialize OpenTelemetry metrics exporter if configured via environment.
"""Initialize OpenTelemetry metric readers based on environment configuration.

This function checks for OTEL_EXPORTER_OTLP_ENDPOINT and configures the
MeterProvider to export metrics to the specified endpoint.
Adds an OTLP push reader when OTEL_EXPORTER_OTLP_ENDPOINT is set and a Prometheus
scrape reader when OGX_PROMETHEUS_ENABLED is truthy. Both readers attach to a single
global MeterProvider, so the two export paths operate independently. If neither is
configured, no MeterProvider is installed and metrics are not exported.
"""
otlp_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT")
prometheus_enabled = _is_prometheus_enabled()

if not otlp_endpoint:
logger.debug("OTEL_EXPORTER_OTLP_ENDPOINT not set, metrics will not be exported")
if not otlp_endpoint and not prometheus_enabled:
logger.debug("No metrics exporter configured, metrics will not be exported")
return

try:
from opentelemetry import metrics
from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
OTLPMetricExporter,
)
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader
from opentelemetry.sdk.metrics.export import MetricReader
from opentelemetry.sdk.resources import Resource

# Get export interval from environment (default 200ms for tests, 60s otherwise)
export_interval_ms = int(os.environ.get("OTEL_METRIC_EXPORT_INTERVAL", "60000"))
export_interval_s = export_interval_ms / 1000.0
metric_readers: list[MetricReader] = []

if otlp_endpoint:
from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
OTLPMetricExporter,
)
from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader

# Create OTLP exporter
exporter = OTLPMetricExporter(endpoint=f"{otlp_endpoint}/v1/metrics")
# Get export interval in milliseconds from environment (defaults to 60s when OTEL_METRIC_EXPORT_INTERVAL is unset)
export_interval_ms = int(os.environ.get("OTEL_METRIC_EXPORT_INTERVAL", "60000"))

# Create metric reader with periodic export
reader = PeriodicExportingMetricReader(exporter, export_interval_millis=export_interval_ms)
exporter = OTLPMetricExporter(endpoint=f"{otlp_endpoint}/v1/metrics")
metric_readers.append(PeriodicExportingMetricReader(exporter, export_interval_millis=export_interval_ms))
logger.info(
"OpenTelemetry OTLP metrics exporter configured",
otlp_endpoint=otlp_endpoint,
export_interval_s=export_interval_ms / 1000.0,
)

if prometheus_enabled:
from opentelemetry.exporter.prometheus import PrometheusMetricReader

# Registers a collector on the default prometheus_client registry; the server's
# /v1/metrics endpoint serves it via prometheus_client.generate_latest().
metric_readers.append(PrometheusMetricReader())
logger.info("OpenTelemetry Prometheus metrics reader configured, metrics exposed at /metrics")

# Create resource with service name
service_name = os.environ.get("OTEL_SERVICE_NAME", "ogx")
resource = Resource(attributes={"service.name": service_name})

# Create and set global MeterProvider
provider = MeterProvider(resource=resource, metric_readers=[reader])
provider = MeterProvider(resource=resource, metric_readers=metric_readers)
metrics.set_meter_provider(provider)

logger.info(
"OpenTelemetry metrics exporter configured: (interval: s)",
otlp_endpoint=otlp_endpoint,
export_interval_s=export_interval_s,
)

except Exception as e:
logger.warning("Failed to configure OpenTelemetry metrics exporter", error=str(e))

Expand Down
30 changes: 29 additions & 1 deletion src/ogx_api/inspect_api/fastapi_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
FastAPI route decorators.
"""

import os
from typing import Annotated

from fastapi import APIRouter, Query
from fastapi import APIRouter, HTTPException, Query, Response

from ogx_api.router_utils import PUBLIC_ROUTE_KEY, standard_responses
from ogx_api.version import OGX_API_V1
Expand All @@ -25,6 +26,15 @@
VersionInfo,
)

# Mirrors the Prometheus gating in ogx.telemetry.setup_telemetry(): the scrape endpoint is
# served only when the same env var attaches a PrometheusMetricReader to the MeterProvider.
# ogx_api must not import ogx, so the check is duplicated here rather than shared.
_PROMETHEUS_ENABLED_ENV = "OGX_PROMETHEUS_ENABLED"


def _prometheus_enabled() -> bool:
    """Return True when the environment variable named by _PROMETHEUS_ENABLED_ENV is truthy.

    Accepted spellings, after trimming whitespace and lower-casing, are
    "1", "true", "yes" and "on". The variable is read on every call.
    """
    flag_text = os.environ.get(_PROMETHEUS_ENABLED_ENV, "")
    flag_text = flag_text.strip().lower()
    return flag_text in ("1", "true", "yes", "on")


def create_router(impl: Inspect) -> APIRouter:
"""Create a FastAPI router for the Inspect API."""
Expand Down Expand Up @@ -73,4 +83,22 @@ async def health() -> HealthInfo:
async def version() -> VersionInfo:
return await impl.version()

@router.get(
    "/metrics",
    summary="Get Prometheus metrics.",
    description="Expose OTel metrics in Prometheus exposition format for scrape-based "
    "monitoring systems. Returns 404 unless the Prometheus reader is enabled via the "
    "OGX_PROMETHEUS_ENABLED environment variable.",
    response_class=Response,
    include_in_schema=False,
    openapi_extra={PUBLIC_ROUTE_KEY: True},
)
async def metrics() -> Response:
    """Serve the current metrics in Prometheus exposition format.

    Raises:
        HTTPException: 404 when the Prometheus flag is not enabled.
    """
    if _prometheus_enabled():
        # Imported lazily so prometheus_client is only loaded when the endpoint is served.
        from prometheus_client import CONTENT_TYPE_LATEST, generate_latest

        exposition = generate_latest()
        return Response(content=exposition, media_type=CONTENT_TYPE_LATEST)

    raise HTTPException(status_code=404, detail="Prometheus metrics endpoint is not enabled")

return router
72 changes: 72 additions & 0 deletions tests/integration/inspect/test_metrics_endpoint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright (c) The OGX Contributors.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""Integration tests for the Prometheus /v1/metrics scrape endpoint.

These only run in server mode with the Prometheus reader enabled: the endpoint is
exposed over HTTP only when the server is started with OGX_PROMETHEUS_ENABLED, which
scripts/integration-tests.sh sets for native server-mode runs. The endpoint is scraped
with a raw HTTP client (not the typed SDK) because it returns Prometheus text rather
than JSON and is intentionally absent from the OpenAPI spec.
"""

import os

import httpx
import pytest

# Truthy spellings accepted for the flag: "1", "true", "yes", "on" (case-insensitive, trimmed).
_prometheus_flag = os.environ.get("OGX_PROMETHEUS_ENABLED", "")
_PROMETHEUS_ENABLED = _prometheus_flag.strip().lower() in ("1", "true", "yes", "on")
_SERVER_MODE = os.environ.get("OGX_TEST_STACK_CONFIG_TYPE") == "server"

# Skip the whole module unless the tests run against an HTTP server with the flag set.
pytestmark = pytest.mark.skipif(
    not (_SERVER_MODE and _PROMETHEUS_ENABLED),
    reason="The Prometheus /v1/metrics endpoint is only exposed by the HTTP server when OGX_PROMETHEUS_ENABLED is set",
)


def _server_base_url(ogx_client) -> str:
    """Derive the server root URL from the client's ``base_url``.

    Trailing slashes are stripped first, then a trailing ``/v1`` segment is
    removed if present, so callers can append full paths such as ``/v1/metrics``.
    """
    client_url = str(ogx_client.base_url)
    without_trailing_slash = client_url.rstrip("/")
    return without_trailing_slash.removesuffix("/v1")


def test_metrics_endpoint_exposes_prometheus_format(ogx_client):
    """Scraping /v1/metrics returns Prometheus text that includes OGX request metrics."""
    root_url = _server_base_url(ogx_client)

    # Hit a non-excluded endpoint a few times so request-level metrics get recorded.
    for _attempt in range(3):
        httpx.get(f"{root_url}/v1/health", timeout=30.0)

    response = httpx.get(f"{root_url}/v1/metrics", timeout=30.0)

    assert response.status_code == 200
    content_type = response.headers["content-type"]
    assert content_type.startswith("text/plain")

    exposition = response.text
    # The first two markers belong to the Prometheus exposition format itself; the last
    # two are the OGX request metrics recorded by RequestMetricsMiddleware, showing that
    # OTel metrics flow through the PrometheusMetricReader to the scrape endpoint.
    expected_markers = ("# HELP", "# TYPE", "ogx_requests_total", 'method="health"')
    for marker in expected_markers:
        assert marker in exposition


def test_metrics_endpoint_requires_no_auth(ogx_client):
    """A scrape sent without an Authorization header must still succeed."""
    metrics_url = f"{_server_base_url(ogx_client)}/v1/metrics"

    response = httpx.get(metrics_url, timeout=30.0)

    assert response.status_code == 200


def test_metrics_endpoint_is_not_self_counted(ogx_client):
    """Requests to /v1/metrics must not show up in the request metrics themselves."""
    metrics_url = f"{_server_base_url(ogx_client)}/v1/metrics"

    # First scrape: if scrapes were counted, this one would be visible in the next scrape.
    httpx.get(metrics_url, timeout=30.0)
    second_scrape = httpx.get(metrics_url, timeout=30.0)
    exposition = second_scrape.text

    assert 'method="metrics"' not in exposition
114 changes: 114 additions & 0 deletions tests/unit/telemetry/test_prometheus_metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Copyright (c) The OGX Contributors.
# All rights reserved.
#
# This source code is licensed under the terms described in the LICENSE file in
# the root directory of this source tree.

"""Unit tests for the Prometheus scrape endpoint and metric exposition."""

from unittest.mock import MagicMock

import pytest
from fastapi import FastAPI
from fastapi.testclient import TestClient
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource
from prometheus_client import CollectorRegistry, generate_latest

from ogx.core.server.metrics import _EXCLUDED_PATHS
from ogx.telemetry import _is_prometheus_enabled
from ogx_api.inspect_api.fastapi_routes import create_router


@pytest.fixture
def prometheus_meter_provider():
    """Yield ``(provider, registry)``: a MeterProvider whose PrometheusMetricReader uses its own registry.

    The dedicated CollectorRegistry keeps the test independent of the process-global
    prometheus_client registry and of OGX's global MeterProvider. The provider is shut
    down after the test finishes.
    """
    isolated_registry = CollectorRegistry()
    prometheus_reader = PrometheusMetricReader(registry=isolated_registry)
    test_resource = Resource(attributes={"service.name": "ogx-test"})
    meter_provider = MeterProvider(resource=test_resource, metric_readers=[prometheus_reader])
    yield meter_provider, isolated_registry
    meter_provider.shutdown()


@pytest.fixture
def inspect_client():
    """Build a TestClient around a bare FastAPI app that mounts the real Inspect router.

    The Inspect implementation is a MagicMock; the router itself owns the /v1/metrics route.
    """
    test_app = FastAPI()
    inspect_router = create_router(MagicMock())
    test_app.include_router(inspect_router)
    return TestClient(test_app)


class TestPrometheusEnabledFlag:
    """Checks how ``_is_prometheus_enabled`` interprets OGX_PROMETHEUS_ENABLED."""

    # Name of the environment variable under test.
    ENV_VAR = "OGX_PROMETHEUS_ENABLED"

    @pytest.mark.parametrize("value", ["1", "true", "TRUE", "Yes", "on"])
    def test_truthy_values(self, monkeypatch, value):
        """Recognised truthy spellings enable the flag regardless of letter case."""
        monkeypatch.setenv(self.ENV_VAR, value)
        enabled = _is_prometheus_enabled()
        assert enabled is True

    @pytest.mark.parametrize("value", ["", "0", "false", "no", "off", " "])
    def test_falsy_values(self, monkeypatch, value):
        """Empty, blank and explicit negative spellings leave the flag disabled."""
        monkeypatch.setenv(self.ENV_VAR, value)
        enabled = _is_prometheus_enabled()
        assert enabled is False

    def test_unset(self, monkeypatch):
        """With the variable absent from the environment the flag is disabled."""
        monkeypatch.delenv(self.ENV_VAR, raising=False)
        enabled = _is_prometheus_enabled()
        assert enabled is False


class TestPrometheusExposition:
    """Checks that OTel instruments are rendered through the Prometheus reader."""

    def test_metrics_exposed_in_prometheus_format(self, prometheus_meter_provider):
        """A counter recorded via the MeterProvider shows up in the registry's exposition text."""
        meter_provider, isolated_registry = prometheus_meter_provider

        test_meter = meter_provider.get_meter("ogx.test")
        request_counter = test_meter.create_counter(name="ogx_test_requests_total", unit="1")
        request_counter.add(3, {"api": "models", "status": "success"})

        rendered = generate_latest(isolated_registry).decode("utf-8")

        # Prometheus exposition format: the counter surfaces with a _total suffix,
        # carries its labels, and exposes the recorded value.
        expected_fragments = ("ogx_test_requests_total", 'api="models"', 'status="success"', "3.0")
        for fragment in expected_fragments:
            assert fragment in rendered


class TestMetricsEndpoint:
    """Checks the /v1/metrics route exposed by the Inspect router."""

    def test_endpoint_serves_prometheus_when_enabled(self, monkeypatch, inspect_client):
        """With the flag set, the endpoint answers 200 with a text/plain body."""
        monkeypatch.setenv("OGX_PROMETHEUS_ENABLED", "1")

        response = inspect_client.get("/v1/metrics")

        assert response.status_code == 200
        content_type = response.headers["content-type"]
        assert content_type.startswith("text/plain")

    def test_endpoint_returns_404_when_disabled(self, monkeypatch, inspect_client):
        """With the flag removed from the environment, the endpoint answers 404."""
        monkeypatch.delenv("OGX_PROMETHEUS_ENABLED", raising=False)

        response = inspect_client.get("/v1/metrics")

        assert response.status_code == 404

    def test_endpoint_is_public(self):
        """The /v1/metrics route must opt out of auth via PUBLIC_ROUTE_KEY so collectors
        can scrape it without credentials."""
        from fastapi.routing import APIRoute

        from ogx_api.router_utils import PUBLIC_ROUTE_KEY

        inspect_router = create_router(MagicMock())
        metrics_routes = []
        for route in inspect_router.routes:
            if isinstance(route, APIRoute) and route.path.endswith("/metrics"):
                metrics_routes.append(route)
        assert metrics_routes, "metrics route not registered"

        route_extras = metrics_routes[0].openapi_extra or {}
        assert route_extras.get(PUBLIC_ROUTE_KEY) is True


def test_metrics_path_excluded_from_request_metrics():
    """The scrape path is listed in _EXCLUDED_PATHS, so RequestMetricsMiddleware must not count it."""
    scrape_path = "/v1/metrics"
    assert any(excluded == scrape_path for excluded in _EXCLUDED_PATHS)
Loading
Loading