Files
incidentops/helm/incidentops/templates/prometheus-deployment.yaml
minhtrannhat 46ede7757d feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
2026-01-07 20:51:13 -05:00

164 lines
5.1 KiB
YAML

{{- if and .Values.observability.enabled .Values.metrics.enabled }}
# ConfigMap holding prometheus.yml, the configuration file mounted into the
# Prometheus container at /etc/prometheus.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
data:
  prometheus.yml: |
    global:
      scrape_interval: {{ .Values.observability.prometheus.scrapeInterval | default "15s" }}
      evaluation_interval: 15s
    scrape_configs:
      {{- /* Prometheus scrapes its own HTTP endpoint. */}}
      - job_name: "prometheus"
        static_configs:
          - targets: ["localhost:9090"]
      {{- /* The api and worker jobs differ only in the component label they keep, so both are rendered from one loop. */}}
      {{- range $component := list "api" "worker" }}
      - job_name: "incidentops-{{ $component }}"
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - {{ $.Release.Namespace }}
        relabel_configs:
          {{- /* Keep only pods whose app.kubernetes.io/component label matches this job. */}}
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: {{ $component }}
          {{- /* Keep only the container port named "metrics". */}}
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: metrics
          {{- /* Copy the discovered namespace and pod name onto the scraped series. */}}
          - source_labels: [__meta_kubernetes_namespace]
            target_label: namespace
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
        metrics_path: /metrics
        scrape_interval: 10s
      {{- end }}
---
# Single-replica Prometheus server. Configuration comes from the
# "<fullname>-prometheus" ConfigMap; TSDB data lives on a PersistentVolumeClaim
# when persistence is enabled and on an emptyDir otherwise.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
spec:
  replicas: 1
  {{- if .Values.observability.prometheus.persistence.enabled }}
  # The data volume is a ReadWriteOnce claim shared by every revision of this
  # Deployment. The default RollingUpdate strategy starts the replacement pod
  # while the old one still holds the volume, which can block the rollout, so
  # the old pod is stopped first.
  strategy:
    type: Recreate
  {{- end }}
  selector:
    matchLabels:
      {{- include "incidentops.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: prometheus
  template:
    metadata:
      labels:
        {{- include "incidentops.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: prometheus
      annotations:
        # Hash of the templated inputs of prometheus.yml in this chart's
        # Prometheus ConfigMap (release namespace and scrape interval). When
        # either changes, the pod template changes and Prometheus restarts with
        # the new config. Extend this expression if more values are templated
        # into the config.
        checksum/config: {{ printf "%s/%s" .Release.Namespace (.Values.observability.prometheus.scrapeInterval | default "15s") | sha256sum }}
    spec:
      serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
      # Run as a fixed non-root UID; fsGroup applies the same GID to mounted
      # volumes. NOTE(review): 65534 is presumably the user the Prometheus
      # image runs as - confirm against the image in use.
      securityContext:
        fsGroup: 65534
        runAsUser: 65534
        runAsNonRoot: true
      containers:
        - name: prometheus
          image: "{{ .Values.observability.prometheus.image.repository }}:{{ .Values.observability.prometheus.image.tag }}"
          imagePullPolicy: {{ .Values.observability.prometheus.image.pullPolicy }}
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time={{ .Values.observability.prometheus.retention }}"
            # Enables the HTTP reload/shutdown endpoints of Prometheus.
            - "--web.enable-lifecycle"
          ports:
            - name: http
              containerPort: 9090
              protocol: TCP
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            - name: data
              mountPath: /prometheus
          resources:
            {{- toYaml .Values.observability.prometheus.resources | nindent 12 }}
          readinessProbe:
            httpGet:
              path: /-/ready
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 30
            periodSeconds: 30
      volumes:
        - name: config
          configMap:
            name: {{ include "incidentops.fullname" . }}-prometheus
        - name: data
          {{- if .Values.observability.prometheus.persistence.enabled }}
          persistentVolumeClaim:
            claimName: {{ include "incidentops.fullname" . }}-prometheus
          {{- else }}
          # Without persistence the TSDB is lost whenever the pod is replaced.
          emptyDir: {}
          {{- end }}
---
# ClusterIP Service in front of the Prometheus pods, listening on port 9090.
apiVersion: v1
kind: Service
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
spec:
  type: ClusterIP
  # Selects pods carrying the chart's selector labels plus the prometheus
  # component label.
  selector:
    {{- include "incidentops.selectorLabels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
  ports:
    # targetPort refers to the container port by its name ("http").
    - name: http
      protocol: TCP
      port: 9090
      targetPort: http
{{- if .Values.observability.prometheus.persistence.enabled }}
---
# Claim mounted as the Prometheus data volume; rendered only when persistence
# is enabled. Its name matches the claimName used by the Prometheus Deployment.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
spec:
  resources:
    requests:
      storage: {{ .Values.observability.prometheus.persistence.size }}
  accessModes:
    - ReadWriteOnce
{{- end }}
{{- end }}