Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
231 lines
6.1 KiB
YAML
231 lines
6.1 KiB
YAML
# Compose file format version. Current Docker Compose releases treat this key
# as informational only; it is kept for older tooling that still reads it.
version: '3.8'

services:
  # PostgreSQL database, reached by the other services under the host name
  # "postgres" (see the DATABASE_URL values in this file).
  postgres:
    container_name: incidentops-postgres
    image: postgres:16-alpine
    environment:
      POSTGRES_DB: incidentops
      POSTGRES_USER: incidentops
      POSTGRES_PASSWORD: incidentops
    volumes:
      - postgres_data:/var/lib/postgresql/data
    ports:
      - '5432:5432'
    # Healthy once pg_isready succeeds for the incidentops user; services that
    # declare `condition: service_healthy` wait for this.
    healthcheck:
      test:
        - CMD-SHELL
        - pg_isready -U incidentops
      interval: 10s
      timeout: 5s
      retries: 5

# Redis: Celery broker (db 0) and result backend (db 1); also handed to api/worker as REDIS_URL
  redis:
    container_name: incidentops-redis
    image: redis:7-alpine
    volumes:
      - redis_data:/data
    ports:
      - '6379:6379'
    # Healthy once `redis-cli ping` exits successfully.
    healthcheck:
      test:
        - CMD
        - redis-cli
        - ping
      interval: 10s
      timeout: 5s
      retries: 5

# API service
  api:
    build:
      context: .
      dockerfile: Dockerfile
      target: api
    container_name: incidentops-api
    ports:
      - '8000:8000'  # HTTP API (same port the healthcheck probes)
      - '9464:9464'  # Prometheus metrics
    environment:
      DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
      REDIS_URL: redis://redis:6379/0
      # Development fallback only: export JWT_SECRET_KEY (shell or .env file) to
      # override it without editing this file.
      JWT_SECRET_KEY: '${JWT_SECRET_KEY:-dev-secret-key-change-in-production}'
      JWT_ALGORITHM: HS256
      # Quoted so the values are passed on as strings rather than YAML integers.
      ACCESS_TOKEN_EXPIRE_MINUTES: '30'
      REFRESH_TOKEN_EXPIRE_DAYS: '30'
      # OpenTelemetry
      OTEL_ENABLED: 'true'
      OTEL_SERVICE_NAME: incidentops-api
      OTEL_ENVIRONMENT: development
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
      OTEL_EXPORTER_OTLP_INSECURE: 'true'
      OTEL_LOG_LEVEL: INFO
      # Metrics
      PROMETHEUS_PORT: '9464'
    # postgres and redis must pass their healthchecks; otel-collector and
    # prometheus only need to have been started.
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      otel-collector:
        condition: service_started
      prometheus:
        condition: service_started
    # NOTE(review): this probe needs curl inside the api image - confirm the
    # Dockerfile's api target installs it, otherwise the service never turns healthy.
    healthcheck:
      test:
        - CMD
        - curl
        - -f
        - http://localhost:8000/v1/healthz
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

# Worker service (Celery)
  worker:
    build:
      context: .
      dockerfile: Dockerfile
      target: worker
    container_name: incidentops-worker
    environment:
      DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
      REDIS_URL: redis://redis:6379/0
      # Broker lives on Redis db 0, task results on db 1.
      CELERY_BROKER_URL: redis://redis:6379/0
      CELERY_RESULT_BACKEND: redis://redis:6379/1
      # OpenTelemetry
      OTEL_ENABLED: 'true'
      OTEL_SERVICE_NAME: incidentops-worker
      OTEL_ENVIRONMENT: development
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
      OTEL_EXPORTER_OTLP_INSECURE: 'true'
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      # The worker exports telemetry to otel-collector:4317 (see the endpoint
      # above), so start the collector first, the same way the api service does.
      otel-collector:
        condition: service_started

# Web frontend (Next.js)
  web:
    container_name: incidentops-web
    build:
      dockerfile: Dockerfile.web
      context: .
    environment:
      # NOTE(review): Next.js normally inlines NEXT_PUBLIC_* values at build
      # time - confirm Dockerfile.web picks this up when it is set at runtime.
      NEXT_PUBLIC_API_URL: http://localhost:8000
    ports:
      - '3000:3000'
    # Short form: only waits for the api container to be started.
    depends_on:
      - api

# Database migrations: one-shot job, only created when the "migrate" profile is enabled
  migrate:
    container_name: incidentops-migrate
    build:
      context: .
      dockerfile: Dockerfile
      target: api
    # Reuses the api image target and overrides its command with the migration script.
    command:
      - python
      - migrations/migrate.py
      - apply
    environment:
      DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
    depends_on:
      postgres:
        condition: service_healthy
    profiles:
      - migrate

# Flower for Celery monitoring (dev only); created only with the "monitoring" profile
  flower:
    image: mher/flower:2.0
    container_name: incidentops-flower
    ports:
      - '5555:5555'
    environment:
      CELERY_BROKER_URL: redis://redis:6379/0
      # Quoted because the value contains a colon.
      # NOTE(review): presumably user:password for Flower's basic auth - confirm.
      FLOWER_BASIC_AUTH: 'admin:admin'
    depends_on:
      # Wait for a healthy broker, the same way api and worker depend on redis.
      redis:
        condition: service_healthy
    profiles:
      - monitoring

# ============================================
# Observability Stack
# ============================================

  # OpenTelemetry Collector: the OTLP endpoint the api and worker services
  # point at (otel-collector:4317).
  otel-collector:
    container_name: incidentops-otel-collector
    image: otel/opentelemetry-collector-contrib:0.96.0
    command:
      - '--config=/etc/otel-collector/config.yaml'
    volumes:
      # Collector configuration, mounted read-only from the repository.
      - ./observability/otel-collector/config.yaml:/etc/otel-collector/config.yaml:ro
    ports:
      - '4317:4317'  # OTLP gRPC
      - '4318:4318'  # OTLP HTTP
    depends_on:
      - tempo
      - loki

# Tempo - distributed tracing backend
  tempo:
    container_name: incidentops-tempo
    image: grafana/tempo:2.4.1
    command:
      - '-config.file=/etc/tempo/config.yaml'
    volumes:
      - ./observability/tempo/config.yaml:/etc/tempo/config.yaml:ro
      - tempo_data:/var/tempo
    ports:
      - '3200:3200'  # Tempo HTTP
      # Tempo OTLP gRPC, published on host port 4320 because otel-collector
      # already binds host port 4317.
      - '4320:4317'

# Loki - log aggregation
  loki:
    container_name: incidentops-loki
    image: grafana/loki:2.9.6
    command:
      - '-config.file=/etc/loki/config.yaml'
    volumes:
      - ./observability/loki/config.yaml:/etc/loki/config.yaml:ro
      - loki_data:/loki
    ports:
      - '3100:3100'  # Loki HTTP

# Prometheus - metrics storage
  prometheus:
    container_name: incidentops-prometheus
    image: prom/prometheus:v2.51.0
    # Flags passed to the Prometheus binary: config file location, TSDB
    # directory (backed by the prometheus_data volume) and the lifecycle flag.
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.enable-lifecycle'
    volumes:
      - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    ports:
      - '9090:9090'  # Prometheus UI

# Grafana - visualization
  grafana:
    container_name: incidentops-grafana
    image: grafana/grafana:10.4.1
    environment:
      # Local admin account with self-service sign-up switched off.
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: admin
      GF_USERS_ALLOW_SIGN_UP: 'false'
      GF_EXPLORE_ENABLED: 'true'
      GF_FEATURE_TOGGLES_ENABLE: traceqlEditor tempoSearch tempoBackendSearch tempoApmTable
    volumes:
      # Provisioning files and dashboards are mounted read-only from the repo;
      # Grafana's own state lives in the grafana_data volume.
      - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - grafana_data:/var/lib/grafana
    ports:
      - '3001:3000'  # Grafana UI (3001 to avoid conflict with web frontend)
    depends_on:
      - tempo
      - loki
      - prometheus

# Named volumes referenced by the services in this file; each uses the
# engine's default configuration.
volumes:
  postgres_data: {}
  redis_data: {}
  tempo_data: {}
  loki_data: {}
  prometheus_data: {}
  grafana_data: {}

# Give the project's default network a fixed name.
networks:
  default:
    name: incidentops-network