Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
151 lines
3.9 KiB
YAML
151 lines
3.9 KiB
YAML
# Skaffold pipeline configuration for the incidentops project.
apiVersion: skaffold/v4beta11
kind: Config
metadata:
  name: incidentops
# Image builds. Both artifacts are built from the same Dockerfile and differ
# only in the build target they select.
build:
  artifacts:
    - image: incidentops/api
      docker:
        dockerfile: Dockerfile
        target: api
      # Manual file sync: Python sources under app/ and worker/ are copied
      # to /app in the container.
      sync:
        manual:
          - src: "app/**/*.py"
            dest: /app
          - src: "worker/**/*.py"
            dest: /app

    - image: incidentops/worker
      docker:
        dockerfile: Dockerfile
        target: worker
      # Same sync rules as the api artifact.
      sync:
        manual:
          - src: "app/**/*.py"
            dest: /app
          - src: "worker/**/*.py"
            dest: /app

    # Web frontend disabled until implemented
    # - image: incidentops/web
    #   docker:
    #     dockerfile: Dockerfile.web
    #   context: .
    #   sync:
    #     manual:
    #       - src: "web/src/**/*"
    #         dest: /app

  # Local builder: images are not pushed, and BuildKit is enabled.
  local:
    push: false
    useBuildkit: true
# Default deployment: a single Helm release of the chart in helm/incidentops,
# installed into the "incidentops" namespace (created if missing).
deploy:
  helm:
    releases:
      - name: incidentops
        chartPath: helm/incidentops
        valuesFiles:
          - helm/incidentops/values.yaml
        setValues:
          web.replicaCount: 0  # Disabled until frontend is implemented
          migration.enabled: true
        # Image repository/tag values are filled from the built artifacts.
        # The migration image reuses the api artifact's repository and tag.
        setValueTemplates:
          api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
          api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
          worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
          worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
          migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
          migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
        createNamespace: true
        namespace: incidentops
# Profiles that adjust the build and deploy settings per environment.
profiles:
  # dev: activated by the `dev` command. Same release as the default deploy
  # section, plus single replicas for the api and worker.
  - name: dev
    activation:
      - command: dev
    build:
      local:
        push: false
    deploy:
      helm:
        releases:
          - name: incidentops
            chartPath: helm/incidentops
            valuesFiles:
              - helm/incidentops/values.yaml
            setValues:
              api.replicaCount: 1
              worker.replicaCount: 1
              web.replicaCount: 0  # Disabled until frontend is implemented
              migration.enabled: true
            setValueTemplates:
              api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
              api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
              worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
              worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
              migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
              migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
            createNamespace: true
            namespace: incidentops

  # production: activated when SKAFFOLD_PROFILE=production is set in the
  # environment. Pushes built images, adds values-production.yaml, and
  # deploys into the "incidentops-prod" namespace.
  # NOTE(review): this release defines no setValues/setValueTemplates, unlike
  # the default and dev releases - confirm that is intentional.
  - name: production
    activation:
      - env: SKAFFOLD_PROFILE=production
    build:
      local:
        push: true
    deploy:
      helm:
        releases:
          - name: incidentops
            chartPath: helm/incidentops
            valuesFiles:
              - helm/incidentops/values.yaml
              - helm/incidentops/values-production.yaml
            createNamespace: true
            namespace: incidentops-prod

  # kind: activated by kube contexts matching the pattern below; patches
  # /build/local/push to false.
  - name: kind
    activation:
      - kubeContext: "kind-.*"
    patches:
      - op: add
        path: /build/local/push
        value: false
# Port forwards from services in the "incidentops" namespace to localhost.
# NOTE(review): the namespace is hard-coded here, while the production
# profile deploys to "incidentops-prod" - confirm these forwards are only
# meant for the default/dev deployment.
portForward:
  - resourceType: service
    resourceName: incidentops-api
    namespace: incidentops
    port: 8000
    localPort: 8000
  # Web frontend disabled until implemented
  # - resourceType: service
  #   resourceName: incidentops-web
  #   namespace: incidentops
  #   port: 3000
  #   localPort: 3000
  # Observability
  # Grafana's service port 80 is exposed locally on 3001.
  - resourceType: service
    resourceName: incidentops-grafana
    namespace: incidentops
    port: 80
    localPort: 3001
  - resourceType: service
    resourceName: incidentops-prometheus
    namespace: incidentops
    port: 9090
    localPort: 9090
  - resourceType: service
    resourceName: incidentops-tempo
    namespace: incidentops
    port: 3200
    localPort: 3200
  - resourceType: service
    resourceName: incidentops-loki
    namespace: incidentops
    port: 3100
    localPort: 3100