Files
incidentops/helm/incidentops/templates/api-deployment.yaml
minhtrannhat 46ede7757d feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
2026-01-07 20:51:13 -05:00

105 lines
3.3 KiB
YAML

apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-api
labels:
{{- include "incidentops.api.labels" . | nindent 4 }}
spec:
{{- if not .Values.api.autoscaling.enabled }}
replicas: {{ .Values.api.replicaCount }}
{{- end }}
selector:
matchLabels:
{{- include "incidentops.api.selectorLabels" . | nindent 6 }}
template:
metadata:
annotations:
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }}
{{- with .Values.api.podAnnotations }}
{{- toYaml . | nindent 8 }}
{{- end }}
labels:
{{- include "incidentops.api.selectorLabels" . | nindent 8 }}
spec:
{{- with .Values.global.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: wait-for-postgres
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
echo "Waiting for PostgreSQL..."
sleep 2
done
echo "PostgreSQL is ready"
- name: wait-for-redis
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
echo "Waiting for Redis..."
sleep 2
done
echo "Redis is ready"
containers:
- name: api
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: {{ include "incidentops.api.image" . }}
imagePullPolicy: {{ .Values.api.image.pullPolicy }}
ports:
- name: http
containerPort: 8000
protocol: TCP
{{- if .Values.metrics.enabled }}
- name: metrics
containerPort: {{ .Values.metrics.port }}
protocol: TCP
{{- end }}
envFrom:
- configMapRef:
name: {{ include "incidentops.fullname" . }}-config
- secretRef:
name: {{ include "incidentops.fullname" . }}-secret
livenessProbe:
httpGet:
path: /v1/healthz
port: http
initialDelaySeconds: 10
periodSeconds: 30
timeoutSeconds: 5
failureThreshold: 3
readinessProbe:
httpGet:
path: /v1/readyz
port: http
initialDelaySeconds: 5
periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
resources:
{{- toYaml .Values.api.resources | nindent 12 }}
{{- with .Values.api.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.api.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.api.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}