feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions

View File

@@ -0,0 +1,294 @@
{
"title": "IncidentOps API Overview",
"uid": "incidentops-api",
"tags": ["incidentops", "api"],
"timezone": "browser",
"editable": true,
"panels": [
{
"id": 1,
"title": "Request Rate",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "Requests/sec",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"unit": "reqps"
}
}
},
{
"id": 2,
"title": "Request Duration (p50, p95, p99)",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p50",
"refId": "A"
},
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p95",
"refId": "B"
},
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p99",
"refId": "C"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"unit": "s"
}
}
},
{
"id": 3,
"title": "Error Rate",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
"legendFormat": "Error %",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"fixedColor": "red", "mode": "fixed"},
"unit": "percent",
"min": 0,
"max": 100
}
}
},
{
"id": 4,
"title": "Requests by Status Code",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "{{http_status_code}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"unit": "reqps"
}
}
},
{
"id": 5,
"title": "Requests by Endpoint",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "{{http_route}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"unit": "reqps"
}
}
},
{
"id": 6,
"title": "System CPU Usage",
"type": "gauge",
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 16},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "avg(system_cpu_utilization{job=\"incidentops-api\"}) * 100",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 60},
{"color": "red", "value": 80}
]
},
"unit": "percent",
"min": 0,
"max": 100
}
}
},
{
  "id": 7,
  "title": "Memory Usage",
  "description": "RSS memory of the API process. The query divides the byte value by 1024 / 1024, so the series and the 256 / 512 thresholds are in MiB.",
  "type": "gauge",
  "gridPos": {"h": 6, "w": 6, "x": 6, "y": 16},
  "targets": [
    {
      "datasource": {"type": "prometheus", "uid": "prometheus"},
      "expr": "process_runtime_cpython_memory_bytes{job=\"incidentops-api\", type=\"rss\"} / 1024 / 1024",
      "refId": "A"
    }
  ],
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "thresholds"},
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null},
          {"color": "yellow", "value": 256},
          {"color": "red", "value": 512}
        ]
      },
      "unit": "mbytes"
    }
  }
},
{
"id": 8,
"title": "Active Threads",
"type": "stat",
"gridPos": {"h": 6, "w": 6, "x": 12, "y": 16},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "process_runtime_cpython_thread_count{job=\"incidentops-api\"}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 50},
{"color": "red", "value": 100}
]
}
}
}
},
{
"id": 9,
"title": "GC Collections",
"type": "stat",
"gridPos": {"h": 6, "w": 6, "x": 18, "y": 16},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(process_runtime_cpython_gc_count{job=\"incidentops-api\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null}
]
},
"unit": "cps"
}
}
},
{
"id": 10,
"title": "Recent Logs",
"type": "logs",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 22},
"targets": [
{
"datasource": {"type": "loki", "uid": "loki"},
"expr": "{service_name=\"incidentops-api\"} | json",
"refId": "A"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true,
"sortOrder": "Descending"
}
},
{
"id": 11,
"title": "Error Logs",
"type": "logs",
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 32},
"targets": [
{
"datasource": {"type": "loki", "uid": "loki"},
"expr": "{service_name=\"incidentops-api\"} |= \"ERROR\" | json",
"refId": "A"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true,
"sortOrder": "Descending"
}
},
{
"id": 12,
"title": "Recent Traces",
"type": "traces",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 40},
"targets": [
{
"datasource": {"type": "tempo", "uid": "tempo"},
"queryType": "traceqlSearch",
"filters": [
{
"id": "service-name",
"operator": "=",
"scope": "resource",
"tag": "service.name",
"value": ["incidentops-api"]
}
],
"refId": "A"
}
]
}
],
"schemaVersion": 38,
"version": 2
}

View File

@@ -0,0 +1,12 @@
# Grafana dashboard provisioning: a single file-based provider.
apiVersion: 1

providers:
  - name: default
    orgId: 1
    # Dashboards are placed in the "IncidentOps" folder (uid "incidentops").
    folder: IncidentOps
    folderUid: incidentops
    type: file
    disableDeletion: false
    editable: true
    options:
      # Directory the provider is pointed at for dashboard definitions.
      path: /var/lib/grafana/dashboards

View File

@@ -0,0 +1,48 @@
# Grafana datasource provisioning: Prometheus (metrics), Tempo (traces) and
# Loki (logs, marked as the default datasource). The uids declared here are
# the ones the datasources use to cross-reference each other below.
apiVersion: 1

datasources:
  - name: Prometheus
    type: prometheus
    uid: prometheus
    url: http://prometheus:9090
    access: proxy
    isDefault: false
    jsonData:
      httpMethod: POST
      # Exemplars carrying a `trace_id` label link to the Tempo datasource.
      exemplarTraceIdDestinations:
        - name: trace_id
          datasourceUid: tempo

  - name: Tempo
    type: tempo
    uid: tempo
    url: http://tempo:3200
    access: proxy
    isDefault: false
    jsonData:
      # Trace -> logs: query Loki over a window widened by one hour on each
      # side of the span, filtered by trace ID and span ID.
      tracesToLogsV2:
        datasourceUid: loki
        spanStartTimeShift: '-1h'
        spanEndTimeShift: '1h'
        filterByTraceID: true
        filterBySpanID: true
      tracesToMetrics:
        datasourceUid: prometheus
      nodeGraph:
        enabled: true
      lokiSearch:
        datasourceUid: loki

  - name: Loki
    type: loki
    uid: loki
    url: http://loki:3100
    access: proxy
    isDefault: true
    jsonData:
      derivedFields:
        - datasourceUid: tempo
          # `\s*` tolerates optional whitespace after the colon, so both
          # compact JSON ("trace_id":"abc") and spaced JSON
          # ("trace_id": "abc") log lines yield a trace link.
          matcherRegex: '"trace_id":\s*"([a-f0-9]+)"'
          name: TraceID
          # `$$` keeps the literal `${__value.raw}` from being treated as an
          # environment-variable reference when the file is provisioned.
          url: '$${__value.raw}'
          urlDisplayLabel: 'View Trace'

View File

@@ -0,0 +1,41 @@
# Loki configuration: single replica, filesystem storage under /loki,
# in-memory ring kvstore, authentication disabled.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  path_prefix: /loki
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory

query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

schema_config:
  configs:
    # Quoted so the value stays a string and is not parsed as a date.
    - from: '2020-10-24'
      store: tsdb
      object_store: filesystem
      schema: v13
      index:
        prefix: index_
        period: 24h

ruler:
  alertmanager_url: http://localhost:9093

limits_config:
  # NOTE(review): no `compactor` section is visible in this file; confirm
  # that retention is actually enabled somewhere, otherwise this period may
  # not be enforced.
  retention_period: 168h  # 7 days
  allow_structured_metadata: true
  volume_enabled: true

View File

@@ -0,0 +1,38 @@
# OpenTelemetry Collector configuration: receives OTLP, sends traces to
# Tempo and logs to Loki.
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: '0.0.0.0:4317'
      http:
        endpoint: '0.0.0.0:4318'

processors:
  batch:
    timeout: 1s
    send_batch_size: 1024
  memory_limiter:
    check_interval: 1s
    limit_mib: 256
    spike_limit_mib: 64

exporters:
  otlp/tempo:
    endpoint: 'tempo:4317'
    tls:
      insecure: true
  loki:
    endpoint: http://loki:3100/loki/api/v1/push
    # NOTE(review): only the `exporter` and `job` default labels are enabled
    # here, while the Grafana dashboard's log panels select on a
    # `service_name` label - confirm that label is produced for logs that
    # arrive via this pipeline.
    default_labels_enabled:
      exporter: true
      job: true

service:
  pipelines:
    # Both pipelines list memory_limiter ahead of batch.
    traces:
      receivers:
        - otlp
      processors:
        - memory_limiter
        - batch
      exporters:
        - otlp/tempo
    logs:
      receivers:
        - otlp
      processors:
        - memory_limiter
        - batch
      exporters:
        - loki

View File

@@ -0,0 +1,23 @@
# Prometheus configuration: three static scrape jobs. The global interval is
# 15s; the two IncidentOps jobs override it with 10s.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

scrape_configs:
  # Scrape Prometheus itself
  - job_name: prometheus
    static_configs:
      - targets:
          - 'localhost:9090'

  # Scrape IncidentOps API metrics
  - job_name: incidentops-api
    metrics_path: /metrics
    scrape_interval: 10s
    static_configs:
      - targets:
          - 'api:9464'

  # Scrape IncidentOps Worker metrics (when metrics are enabled)
  - job_name: incidentops-worker
    metrics_path: /metrics
    scrape_interval: 10s
    static_configs:
      - targets:
          - 'worker:9464'

View File

@@ -0,0 +1,32 @@
# Tempo configuration: OTLP ingest over gRPC (4317) and HTTP (4318), local
# filesystem storage under /var/tempo, HTTP API on port 3200.
server:
  http_listen_port: 3200

distributor:
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: '0.0.0.0:4317'
        http:
          endpoint: '0.0.0.0:4318'

ingester:
  trace_idle_period: 10s
  max_block_bytes: 1048576
  max_block_duration: 5m

compactor:
  compaction:
    block_retention: 168h  # 7 days

storage:
  trace:
    backend: local
    local:
      path: /var/tempo/traces
    wal:
      path: /var/tempo/wal

querier:
  search:
    query_timeout: 30s