"""OpenTelemetry instrumentation for tracing, metrics, and logging.""" import logging from contextlib import contextmanager from typing import Any from opentelemetry import metrics, trace from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter from opentelemetry.exporter.prometheus import PrometheusMetricReader from opentelemetry.instrumentation.asyncpg import AsyncPGInstrumentor from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor from opentelemetry.instrumentation.logging import LoggingInstrumentor from opentelemetry.instrumentation.redis import RedisInstrumentor from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.sdk.resources import Resource from opentelemetry.sdk.trace import TracerProvider from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter from opentelemetry.semconv.resource import ResourceAttributes from prometheus_client import REGISTRY, start_http_server from app.config import settings logger = logging.getLogger(__name__) _tracer_provider: TracerProvider | None = None _meter_provider: MeterProvider | None = None # Custom metrics _request_counter = None _request_duration = None _active_requests = None _error_counter = None def setup_telemetry(app: Any) -> None: """ Initialize OpenTelemetry with tracing, metrics, and logging instrumentation. Configures: - OTLP exporter for traces (to Tempo/Jaeger) - Prometheus exporter for metrics (scraped by Prometheus) - Auto-instrumentation for FastAPI, asyncpg, httpx, redis - System metrics (CPU, memory, etc.) - Logging instrumentation for trace context injection """ global _tracer_provider, _meter_provider global _request_counter, _request_duration, _active_requests, _error_counter if not settings.otel_enabled: logger.info("OpenTelemetry disabled") return # Create resource with service info resource = Resource.create( { ResourceAttributes.SERVICE_NAME: settings.otel_service_name, ResourceAttributes.SERVICE_VERSION: "0.1.0", ResourceAttributes.DEPLOYMENT_ENVIRONMENT: settings.otel_environment, } ) # ========================================= # TRACING SETUP # ========================================= _tracer_provider = TracerProvider(resource=resource) if settings.otel_exporter_otlp_endpoint: otlp_exporter = OTLPSpanExporter( endpoint=settings.otel_exporter_otlp_endpoint, insecure=settings.otel_exporter_otlp_insecure, ) _tracer_provider.add_span_processor(BatchSpanProcessor(otlp_exporter)) logger.info(f"OTLP exporter configured: {settings.otel_exporter_otlp_endpoint}") else: _tracer_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter())) logger.info("Console span exporter configured (no OTLP endpoint)") trace.set_tracer_provider(_tracer_provider) # ========================================= # METRICS SETUP # ========================================= # Prometheus metric reader exposes metrics at /metrics endpoint prometheus_reader = PrometheusMetricReader() _meter_provider = MeterProvider(resource=resource, metric_readers=[prometheus_reader]) metrics.set_meter_provider(_meter_provider) # Start Prometheus HTTP server on port 9464 prometheus_port = settings.prometheus_port try: start_http_server(port=prometheus_port, registry=REGISTRY) logger.info(f"Prometheus metrics server started on port {prometheus_port}") except OSError as e: logger.warning(f"Could not start Prometheus server on port {prometheus_port}: {e}") # Create custom metrics meter = metrics.get_meter(__name__) _request_counter = meter.create_counter( name="http_requests_total", description="Total number of HTTP requests", unit="1", ) _request_duration = meter.create_histogram( name="http_request_duration_seconds", description="HTTP request duration in seconds", unit="s", ) _active_requests = meter.create_up_down_counter( name="http_requests_active", description="Number of active HTTP requests", unit="1", ) _error_counter = meter.create_counter( name="http_errors_total", description="Total number of HTTP errors", unit="1", ) # Instrument system metrics (CPU, memory, etc.) SystemMetricsInstrumentor().instrument() logger.info("System metrics instrumentation enabled") # ========================================= # LIBRARY INSTRUMENTATION # ========================================= FastAPIInstrumentor.instrument_app( app, excluded_urls="healthz,readyz,metrics", tracer_provider=_tracer_provider, meter_provider=_meter_provider, ) AsyncPGInstrumentor().instrument(tracer_provider=_tracer_provider) HTTPXClientInstrumentor().instrument(tracer_provider=_tracer_provider) RedisInstrumentor().instrument(tracer_provider=_tracer_provider) # Inject trace context into logs LoggingInstrumentor().instrument( set_logging_format=True, log_level=logging.INFO, ) logger.info( f"OpenTelemetry initialized: service={settings.otel_service_name}, " f"env={settings.otel_environment}, metrics_port={prometheus_port}" ) async def shutdown_telemetry() -> None: """Gracefully shutdown the tracer and meter providers.""" global _tracer_provider, _meter_provider if _tracer_provider: _tracer_provider.shutdown() _tracer_provider = None logger.info("Tracer provider shutdown complete") if _meter_provider: _meter_provider.shutdown() _meter_provider = None logger.info("Meter provider shutdown complete") def get_tracer(name: str) -> trace.Tracer: """Get a tracer instance for manual span creation.""" return trace.get_tracer(name) def get_meter(name: str) -> metrics.Meter: """Get a meter instance for custom metrics.""" return metrics.get_meter(name) def get_current_trace_id() -> str | None: """Get the current trace ID for request correlation.""" span = trace.get_current_span() if span and span.get_span_context().is_valid: return format(span.get_span_context().trace_id, "032x") return None def get_current_span_id() -> str | None: """Get the current span ID.""" span = trace.get_current_span() if span and span.get_span_context().is_valid: return format(span.get_span_context().span_id, "016x") return None @contextmanager def create_span(name: str, attributes: dict[str, Any] | None = None): """Context manager for creating manual spans.""" tracer = get_tracer(__name__) with tracer.start_as_current_span(name, attributes=attributes) as span: yield span def add_span_attributes(attributes: dict[str, Any]) -> None: """Add attributes to the current span.""" span = trace.get_current_span() if span: for key, value in attributes.items(): span.set_attribute(key, value) def record_exception(exception: Exception) -> None: """Record an exception on the current span.""" span = trace.get_current_span() if span: span.record_exception(exception) span.set_status(trace.Status(trace.StatusCode.ERROR, str(exception))) # ========================================= # CUSTOM METRICS HELPERS # ========================================= def record_request(method: str, endpoint: str, status_code: int) -> None: """Record a request metric.""" if _request_counter: _request_counter.add( 1, { "method": method, "endpoint": endpoint, "status_code": str(status_code), }, ) def record_request_duration(method: str, endpoint: str, duration: float) -> None: """Record request duration in seconds.""" if _request_duration: _request_duration.record( duration, { "method": method, "endpoint": endpoint, }, ) def increment_active_requests(method: str, endpoint: str) -> None: """Increment active requests counter.""" if _active_requests: _active_requests.add(1, {"method": method, "endpoint": endpoint}) def decrement_active_requests(method: str, endpoint: str) -> None: """Decrement active requests counter.""" if _active_requests: _active_requests.add(-1, {"method": method, "endpoint": endpoint}) def record_error(method: str, endpoint: str, error_type: str) -> None: """Record an error metric.""" if _error_counter: _error_counter.add( 1, { "method": method, "endpoint": endpoint, "error_type": error_type, }, )