---
# Docker Compose stack for IncidentOps.
#
# App services: api (HTTP + metrics), worker (Celery), web (Next.js),
# plus a one-shot `migrate` job (profile: migrate) and Flower (profile:
# monitoring). Observability: OTel Collector -> Tempo (traces) /
# Loki (logs) / Prometheus (metrics) -> Grafana.
#
# NOTE: the top-level `version:` key is obsolete under the Compose
# Specification (Compose v2 ignores it and warns), so it is omitted.

services:
  # PostgreSQL - primary datastore
  postgres:
    image: postgres:16-alpine
    container_name: incidentops-postgres
    environment:
      POSTGRES_USER: incidentops
      POSTGRES_PASSWORD: incidentops  # dev-only credentials
      POSTGRES_DB: incidentops
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U incidentops"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Redis - Celery broker / result backend
  redis:
    image: redis:7-alpine
    container_name: incidentops-redis
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

  # API service
  api:
    build:
      context: .
      dockerfile: Dockerfile
      target: api
    container_name: incidentops-api
    ports:
      - "8000:8000"
      - "9464:9464"  # Prometheus metrics
    environment:
      DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
      REDIS_URL: redis://redis:6379/0
      JWT_SECRET_KEY: dev-secret-key-change-in-production
      JWT_ALGORITHM: HS256
      # Quoted so they stay strings rather than YAML integers.
      ACCESS_TOKEN_EXPIRE_MINUTES: "30"
      REFRESH_TOKEN_EXPIRE_DAYS: "30"
      # OpenTelemetry
      OTEL_ENABLED: "true"
      OTEL_SERVICE_NAME: incidentops-api
      OTEL_ENVIRONMENT: development
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
      OTEL_LOG_LEVEL: INFO
      # Metrics
      PROMETHEUS_PORT: "9464"
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      otel-collector:
        condition: service_started
      prometheus:
        condition: service_started
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/v1/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  # Worker service (Celery)
  worker:
    build:
      context: .
      dockerfile: Dockerfile
      target: worker
    container_name: incidentops-worker
    environment:
      DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
      REDIS_URL: redis://redis:6379/0
      CELERY_BROKER_URL: redis://redis:6379/0
      CELERY_RESULT_BACKEND: redis://redis:6379/1
      # OpenTelemetry
      OTEL_ENABLED: "true"
      OTEL_SERVICE_NAME: incidentops-worker
      OTEL_ENVIRONMENT: development
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      # Consistent with api: wait for the collector so early spans
      # are not dropped at startup.
      otel-collector:
        condition: service_started

  # Web frontend (Next.js)
  web:
    build:
      context: .
      dockerfile: Dockerfile.web
    container_name: incidentops-web
    ports:
      - "3000:3000"
    environment:
      NEXT_PUBLIC_API_URL: http://localhost:8000
    depends_on:
      api:
        condition: service_healthy  # api defines a healthcheck; wait for it

  # Database migrations (one-shot; run with:
  #   docker compose --profile migrate up migrate)
  migrate:
    build:
      context: .
      dockerfile: Dockerfile
      target: api
    container_name: incidentops-migrate
    command: python migrations/migrate.py apply
    environment:
      DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
    depends_on:
      postgres:
        condition: service_healthy
    profiles:
      - migrate

  # Flower for Celery monitoring (dev only; profile: monitoring)
  flower:
    image: mher/flower:2.0
    container_name: incidentops-flower
    ports:
      - "5555:5555"
    environment:
      CELERY_BROKER_URL: redis://redis:6379/0
      FLOWER_BASIC_AUTH: admin:admin  # dev-only credentials
    depends_on:
      redis:
        condition: service_healthy  # redis defines a healthcheck
    profiles:
      - monitoring

  # ============================================
  # Observability Stack
  # ============================================

  # OpenTelemetry Collector - receives traces/logs from apps
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.96.0
    container_name: incidentops-otel-collector
    command: ["--config=/etc/otel-collector/config.yaml"]
    volumes:
      - ./observability/otel-collector/config.yaml:/etc/otel-collector/config.yaml:ro
    ports:
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP HTTP
    depends_on:
      - tempo
      - loki

  # Tempo - distributed tracing backend
  tempo:
    image: grafana/tempo:2.4.1
    container_name: incidentops-tempo
    command: ["-config.file=/etc/tempo/config.yaml"]
    volumes:
      - ./observability/tempo/config.yaml:/etc/tempo/config.yaml:ro
      - tempo_data:/var/tempo
    ports:
      - "3200:3200"  # Tempo HTTP
      - "4320:4317"  # Tempo OTLP gRPC (different host port to avoid conflict)

  # Loki - log aggregation
  loki:
    image: grafana/loki:2.9.6
    container_name: incidentops-loki
    command: ["-config.file=/etc/loki/config.yaml"]
    volumes:
      - ./observability/loki/config.yaml:/etc/loki/config.yaml:ro
      - loki_data:/loki
    ports:
      - "3100:3100"  # Loki HTTP

  # Prometheus - metrics storage
  prometheus:
    image: prom/prometheus:v2.51.0
    container_name: incidentops-prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.enable-lifecycle"
    volumes:
      - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"  # Prometheus UI

  # Grafana - visualization
  grafana:
    image: grafana/grafana:10.4.1
    container_name: incidentops-grafana
    environment:
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: admin  # dev-only credentials
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_EXPLORE_ENABLED: "true"
      # Grafana expects a comma-separated list here; space-separated
      # values are treated as a single (unknown) toggle name.
      GF_FEATURE_TOGGLES_ENABLE: traceqlEditor,tempoSearch,tempoBackendSearch,tempoApmTable
    volumes:
      - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - grafana_data:/var/lib/grafana
    ports:
      - "3001:3000"  # Grafana UI (3001 to avoid conflict with web frontend)
    depends_on:
      - tempo
      - loki
      - prometheus

volumes:
  postgres_data:
  redis_data:
  tempo_data:
  loki_data:
  prometheus_data:
  grafana_data:

networks:
  default:
    name: incidentops-network