---
# Docker Compose stack for IncidentOps.
#
# App services: api (HTTP + metrics), worker (Celery), web (Next.js),
# plus a one-shot `migrate` job (profile: migrate) and Flower (profile:
# monitoring). Observability: OTel Collector -> Tempo (traces) /
# Loki (logs) / Prometheus (metrics) -> Grafana.
#
# NOTE: the top-level `version:` key is obsolete under the Compose
# Specification (Compose v2 ignores it and warns), so it is omitted.

services:
  # PostgreSQL - primary datastore
  postgres:
    image: postgres:16-alpine
    container_name: incidentops-postgres
    environment:
      POSTGRES_USER: incidentops
      POSTGRES_PASSWORD: incidentops  # dev-only credentials
      POSTGRES_DB: incidentops
    ports:
      - "5432:5432"
    volumes:
      - postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U incidentops"]
      interval: 10s
      timeout: 5s
      retries: 5

  # Redis - Celery broker / result backend
  redis:
    image: redis:7-alpine
    container_name: incidentops-redis
    ports:
      - "6379:6379"
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5

  # API service
  api:
    build:
      context: .
      dockerfile: Dockerfile
      target: api
    container_name: incidentops-api
    ports:
      - "8000:8000"
      - "9464:9464"  # Prometheus metrics
    environment:
      DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
      REDIS_URL: redis://redis:6379/0
      JWT_SECRET_KEY: dev-secret-key-change-in-production
      JWT_ALGORITHM: HS256
      # Quoted so they stay strings rather than YAML integers.
      ACCESS_TOKEN_EXPIRE_MINUTES: "30"
      REFRESH_TOKEN_EXPIRE_DAYS: "30"
      # OpenTelemetry
      OTEL_ENABLED: "true"
      OTEL_SERVICE_NAME: incidentops-api
      OTEL_ENVIRONMENT: development
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
      OTEL_LOG_LEVEL: INFO
      # Metrics
      PROMETHEUS_PORT: "9464"
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      otel-collector:
        condition: service_started
      prometheus:
        condition: service_started
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:8000/v1/healthz"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 10s

  # Worker service (Celery)
  worker:
    build:
      context: .
      dockerfile: Dockerfile
      target: worker
    container_name: incidentops-worker
    environment:
      DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
      REDIS_URL: redis://redis:6379/0
      CELERY_BROKER_URL: redis://redis:6379/0
      CELERY_RESULT_BACKEND: redis://redis:6379/1
      # OpenTelemetry
      OTEL_ENABLED: "true"
      OTEL_SERVICE_NAME: incidentops-worker
      OTEL_ENVIRONMENT: development
      OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
      OTEL_EXPORTER_OTLP_INSECURE: "true"
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
      # Consistent with api: wait for the collector so early spans
      # are not dropped at startup.
      otel-collector:
        condition: service_started

  # Web frontend (Next.js)
  web:
    build:
      context: .
      dockerfile: Dockerfile.web
    container_name: incidentops-web
    ports:
      - "3000:3000"
    environment:
      NEXT_PUBLIC_API_URL: http://localhost:8000
    depends_on:
      api:
        condition: service_healthy  # api defines a healthcheck; wait for it

  # Database migrations (one-shot; run with:
  #   docker compose --profile migrate up migrate)
  migrate:
    build:
      context: .
      dockerfile: Dockerfile
      target: api
    container_name: incidentops-migrate
    command: python migrations/migrate.py apply
    environment:
      DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
    depends_on:
      postgres:
        condition: service_healthy
    profiles:
      - migrate

  # Flower for Celery monitoring (dev only; profile: monitoring)
  flower:
    image: mher/flower:2.0
    container_name: incidentops-flower
    ports:
      - "5555:5555"
    environment:
      CELERY_BROKER_URL: redis://redis:6379/0
      FLOWER_BASIC_AUTH: admin:admin  # dev-only credentials
    depends_on:
      redis:
        condition: service_healthy  # redis defines a healthcheck
    profiles:
      - monitoring

  # ============================================
  # Observability Stack
  # ============================================

  # OpenTelemetry Collector - receives traces/logs from apps
  otel-collector:
    image: otel/opentelemetry-collector-contrib:0.96.0
    container_name: incidentops-otel-collector
    command: ["--config=/etc/otel-collector/config.yaml"]
    volumes:
      - ./observability/otel-collector/config.yaml:/etc/otel-collector/config.yaml:ro
    ports:
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP HTTP
    depends_on:
      - tempo
      - loki

  # Tempo - distributed tracing backend
  tempo:
    image: grafana/tempo:2.4.1
    container_name: incidentops-tempo
    command: ["-config.file=/etc/tempo/config.yaml"]
    volumes:
      - ./observability/tempo/config.yaml:/etc/tempo/config.yaml:ro
      - tempo_data:/var/tempo
    ports:
      - "3200:3200"  # Tempo HTTP
      - "4320:4317"  # Tempo OTLP gRPC (different host port to avoid conflict)

  # Loki - log aggregation
  loki:
    image: grafana/loki:2.9.6
    container_name: incidentops-loki
    command: ["-config.file=/etc/loki/config.yaml"]
    volumes:
      - ./observability/loki/config.yaml:/etc/loki/config.yaml:ro
      - loki_data:/loki
    ports:
      - "3100:3100"  # Loki HTTP

  # Prometheus - metrics storage
  prometheus:
    image: prom/prometheus:v2.51.0
    container_name: incidentops-prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--web.enable-lifecycle"
    volumes:
      - ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"  # Prometheus UI

  # Grafana - visualization
  grafana:
    image: grafana/grafana:10.4.1
    container_name: incidentops-grafana
    environment:
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD: admin  # dev-only credentials
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_EXPLORE_ENABLED: "true"
      # Grafana expects a comma-separated list here; space-separated
      # values are treated as a single (unknown) toggle name.
      GF_FEATURE_TOGGLES_ENABLE: traceqlEditor,tempoSearch,tempoBackendSearch,tempoApmTable
    volumes:
      - ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
      - ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
      - grafana_data:/var/lib/grafana
    ports:
      - "3001:3000"  # Grafana UI (3001 to avoid conflict with web frontend)
    depends_on:
      - tempo
      - loki
      - prometheus

volumes:
  postgres_data:
  redis_data:
  tempo_data:
  loki_data:
  prometheus_data:
  grafana_data:

networks:
  default:
    name: incidentops-network