feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
This commit is contained in:
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions

View File

@@ -4,7 +4,7 @@ from __future__ import annotations
import os
from contextlib import asynccontextmanager
from typing import AsyncGenerator, Callable
from typing import AsyncGenerator, Callable, Generator
from uuid import UUID, uuid4
import asyncpg
@@ -15,8 +15,11 @@ import pytest
os.environ.setdefault("DATABASE_URL", "postgresql://incidentops:incidentops@localhost:5432/incidentops_test")
os.environ.setdefault("JWT_SECRET_KEY", "test-secret-key-for-testing-only")
os.environ.setdefault("REDIS_URL", "redis://localhost:6379/1")
os.environ.setdefault("TASK_QUEUE_DRIVER", "inmemory")
os.environ.setdefault("TASK_QUEUE_BROKER_URL", "redis://localhost:6379/2")
from app.main import app
from app.taskqueue import task_queue
# Module-level setup: create database and run migrations once
@@ -163,3 +166,14 @@ async def db_admin(clean_database: None) -> AsyncGenerator[asyncpg.Connection, N
yield conn
finally:
await conn.close()
@pytest.fixture(autouse=True)
def reset_task_queue() -> Generator[None, None, None]:
    """Clear the shared task queue's state on both sides of every test.

    A queue object that does not expose a ``reset`` attribute is left untouched.
    """

    def _clear() -> None:
        # Not every queue object offers ``reset``; skip the ones that do not.
        if hasattr(task_queue, "reset"):
            task_queue.reset()

    _clear()
    yield
    _clear()

View File

@@ -14,6 +14,7 @@ from app.core import exceptions as exc, security
from app.db import Database
from app.schemas.incident import CommentRequest, IncidentCreate, TransitionRequest
from app.services.incident import IncidentService
from app.taskqueue import InMemoryTaskQueue
pytestmark = pytest.mark.asyncio
@@ -43,10 +44,24 @@ class _SingleConnectionDatabase(Database):
@pytest.fixture
async def incident_service(db_conn: asyncpg.Connection):
def incident_task_queue() -> InMemoryTaskQueue:
    """Provide a new in-memory queue so tests can assert on dispatch behavior."""
    queue = InMemoryTaskQueue()
    return queue
@pytest.fixture
async def incident_service(
db_conn: asyncpg.Connection,
incident_task_queue: InMemoryTaskQueue,
):
"""IncidentService bound to the per-test database connection."""
return IncidentService(database=_SingleConnectionDatabase(db_conn))
return IncidentService(
database=_SingleConnectionDatabase(db_conn),
task_queue=incident_task_queue,
escalation_delay_seconds=60,
)
async def _seed_user_org_service(conn: asyncpg.Connection) -> tuple[CurrentUser, UUID]:
@@ -94,7 +109,9 @@ async def _seed_user_org_service(conn: asyncpg.Connection) -> tuple[CurrentUser,
async def test_create_incident_persists_and_records_event(
incident_service: IncidentService, db_conn: asyncpg.Connection
incident_service: IncidentService,
db_conn: asyncpg.Connection,
incident_task_queue: InMemoryTaskQueue,
) -> None:
current_user, service_id = await _seed_user_org_service(db_conn)
@@ -121,6 +138,12 @@ async def test_create_incident_persists_and_records_event(
assert event["event_type"] == "created"
assert event["actor_user_id"] == current_user.user_id
assert incident_task_queue.dispatched is not None
assert len(incident_task_queue.dispatched) == 2
first, second = incident_task_queue.dispatched
assert first[0] == "incident_triggered"
assert second[0] == "escalate_if_unacked"
async def test_get_incidents_paginates_by_created_at(
incident_service: IncidentService, db_conn: asyncpg.Connection

View File

@@ -0,0 +1,199 @@
"""End-to-end Celery worker tests against the real Redis broker."""
from __future__ import annotations
import asyncio
import inspect
from uuid import UUID, uuid4
import asyncpg
import pytest
import redis
from app.config import settings
from app.repositories.incident import IncidentRepository
from app.taskqueue import CeleryTaskQueue
from celery.contrib.testing.worker import start_worker
from worker.celery_app import celery_app
pytestmark = pytest.mark.asyncio
@pytest.fixture(scope="module", autouse=True)
def ensure_redis_available() -> None:
    """Skip every test in this module when the Redis broker does not answer a ping.

    The probe connection is closed whether or not the ping succeeds.
    """
    broker_url = settings.resolved_task_queue_broker_url
    connection = redis.Redis.from_url(broker_url)
    try:
        connection.ping()
    except redis.RedisError as exc:  # pragma: no cover - diagnostic-only path
        pytest.skip(f"Redis broker unavailable: {exc}")
    finally:
        connection.close()
@pytest.fixture(scope="module")
def celery_worker_instance(ensure_redis_available: None):
    """Keep a real Celery worker running against Redis while the module's tests execute.

    Requesting ``ensure_redis_available`` makes the broker reachability check
    run before the worker is started.
    """
    worker_options = {
        "loglevel": "INFO",
        "pool": "solo",
        "concurrency": 1,
        # Consume from both the default and the critical queue.
        "queues": [
            settings.task_queue_default_queue,
            settings.task_queue_critical_queue,
        ],
        "perform_ping_check": False,
    }
    with start_worker(celery_app, **worker_options):
        yield
@pytest.fixture(autouse=True)
def purge_celery_queues():
    """Isolate tests by discarding pending Celery tasks on setup and again on teardown."""
    purge = celery_app.control.purge
    purge()
    yield
    purge()
@pytest.fixture
def celery_queue() -> CeleryTaskQueue:
    """Build a ``CeleryTaskQueue`` bound to the configured default and critical queues."""
    queue_names = {
        "default_queue": settings.task_queue_default_queue,
        "critical_queue": settings.task_queue_critical_queue,
    }
    return CeleryTaskQueue(**queue_names)
async def _seed_incident_with_target(conn: asyncpg.Connection) -> tuple[UUID, UUID]:
    """Insert an org, a service, an incident and one enabled webhook target.

    The org and service slugs embed a prefix of their generated UUIDs.

    Returns:
        ``(org_id, incident_id)`` of the rows that were created.
    """
    org_id, service_id, incident_id, target_id = (uuid4() for _ in range(4))
    org_slug = f"celery-{org_id.hex[:6]}"
    service_slug = f"svc-{service_id.hex[:6]}"

    await conn.execute(
        "INSERT INTO orgs (id, name, slug) VALUES ($1, $2, $3)",
        org_id,
        "Celery Org",
        org_slug,
    )
    await conn.execute(
        "INSERT INTO services (id, org_id, name, slug) VALUES ($1, $2, $3, $4)",
        service_id,
        org_id,
        "API",
        service_slug,
    )

    # The incident goes through the repository rather than raw SQL.
    await IncidentRepository(conn).create(
        incident_id=incident_id,
        org_id=org_id,
        service_id=service_id,
        title="Latency spike",
        description="",
        severity="high",
    )

    await conn.execute(
        """
        INSERT INTO notification_targets (id, org_id, name, target_type, webhook_url, enabled)
        VALUES ($1, $2, $3, $4, $5, $6)
        """,
        target_id,
        org_id,
        "Primary Webhook",
        "webhook",
        "https://example.com/hook",
        True,
    )
    return org_id, incident_id
async def _wait_until(predicate, timeout: float = 5.0, interval: float = 0.1) -> None:
    """Poll ``predicate`` until it is truthy, failing once ``timeout`` has elapsed.

    Args:
        predicate: Zero-argument callable; it may return a plain value or an
            awaitable, which is awaited before its truthiness is checked.
        timeout: Seconds, measured on the running loop's clock, before giving up.
        interval: Seconds to sleep between polls.

    Raises:
        AssertionError: If the predicate is still falsy once the deadline passes.
    """
    loop = asyncio.get_running_loop()
    expires_at = loop.time() + timeout

    async def _evaluate():
        outcome = predicate()
        return (await outcome) if inspect.isawaitable(outcome) else outcome

    # The predicate is always evaluated before the deadline is checked, so it
    # runs at least once even when ``timeout`` is zero.
    while not await _evaluate():
        if loop.time() >= expires_at:
            raise AssertionError("Timed out waiting for Celery worker to finish")
        await asyncio.sleep(interval)
async def _attempt_sent(conn: asyncpg.Connection, incident_id: UUID) -> bool:
    """Report whether the fetched notification attempt for the incident has status ``sent``.

    Only the single row returned by ``fetchrow`` is inspected; a missing row
    counts as not sent.
    """
    query = "SELECT status FROM notification_attempts WHERE incident_id = $1"
    attempt = await conn.fetchrow(query, incident_id)
    if not attempt:
        return False
    return bool(attempt["status"] == "sent")
async def _attempt_count(conn: asyncpg.Connection, incident_id: UUID) -> int:
    """Return how many notification attempts are stored for ``incident_id``."""
    query = "SELECT COUNT(*) FROM notification_attempts WHERE incident_id = $1"
    total = await conn.fetchval(query, incident_id)
    # A falsy result (``None`` or zero) is normalised to 0.
    return int(total) if total else 0
async def _attempt_count_is(conn: asyncpg.Connection, incident_id: UUID, expected: int) -> bool:
    """Tell whether the incident currently has exactly ``expected`` notification attempts."""
    observed = await _attempt_count(conn, incident_id)
    return observed == expected
async def test_incident_triggered_task_marks_attempt_sent(
    db_admin: asyncpg.Connection,
    celery_worker_instance: None,
    celery_queue: CeleryTaskQueue,
) -> None:
    """Dispatching ``incident_triggered`` ends with a notification attempt marked ``sent``."""
    org_id, incident_id = await _seed_incident_with_target(db_admin)
    actor_id = uuid4()

    celery_queue.incident_triggered(
        incident_id=incident_id,
        org_id=org_id,
        triggered_by=actor_id,
    )

    def attempt_is_sent():
        # Returns a coroutine; ``_wait_until`` awaits it on each poll.
        return _attempt_sent(db_admin, incident_id)

    await _wait_until(attempt_is_sent)
async def test_escalate_task_refires_when_incident_still_triggered(
    db_admin: asyncpg.Connection,
    celery_worker_instance: None,
    celery_queue: CeleryTaskQueue,
) -> None:
    """An escalation check on a freshly seeded incident produces exactly one attempt."""
    org_id, incident_id = await _seed_incident_with_target(db_admin)

    # Zero delay so the check is scheduled to run without waiting.
    celery_queue.schedule_escalation_check(
        incident_id=incident_id,
        org_id=org_id,
        delay_seconds=0,
    )

    def one_attempt_recorded():
        return _attempt_count_is(db_admin, incident_id, 1)

    await _wait_until(one_attempt_recorded)
async def test_escalate_task_skips_when_incident_acknowledged(
    db_admin: asyncpg.Connection,
    celery_worker_instance: None,
    celery_queue: CeleryTaskQueue,
) -> None:
    """An escalation check records no attempt once the incident is acknowledged."""
    org_id, incident_id = await _seed_incident_with_target(db_admin)

    # Acknowledge the incident before the escalation check is scheduled.
    await db_admin.execute(
        "UPDATE incidents SET status = 'acknowledged' WHERE id = $1",
        incident_id,
    )

    celery_queue.schedule_escalation_check(
        incident_id=incident_id,
        org_id=org_id,
        delay_seconds=0,
    )

    # NOTE(review): fixed one-second pause, presumably to let the worker
    # consume the task before the negative assertion - confirm it is enough.
    await asyncio.sleep(1)

    recorded = await _attempt_count(db_admin, incident_id)
    assert recorded == 0

View File

@@ -0,0 +1,96 @@
"""Tests for worker notification helpers."""
from __future__ import annotations
from uuid import UUID, uuid4
import asyncpg
import pytest
from app.repositories.incident import IncidentRepository
from worker.tasks.notifications import NotificationDispatch, prepare_notification_dispatches
pytestmark = pytest.mark.asyncio
async def _seed_incident(conn: asyncpg.Connection) -> tuple[UUID, UUID, UUID]:
    """Insert an org, a service under it, and a high-severity incident.

    Returns:
        ``(org_id, service_id, incident_id)`` of the rows that were created.
    """
    org_id, service_id, incident_id = uuid4(), uuid4(), uuid4()

    org_row = (org_id, "Notif Org", "notif-org")
    await conn.execute(
        "INSERT INTO orgs (id, name, slug) VALUES ($1, $2, $3)",
        *org_row,
    )

    service_row = (service_id, org_id, "API", "api")
    await conn.execute(
        "INSERT INTO services (id, org_id, name, slug) VALUES ($1, $2, $3, $4)",
        *service_row,
    )

    # The incident goes through the repository rather than raw SQL.
    await IncidentRepository(conn).create(
        incident_id=incident_id,
        org_id=org_id,
        service_id=service_id,
        title="Outage",
        description="",
        severity="high",
    )
    return org_id, service_id, incident_id
async def test_prepare_notification_dispatches_creates_attempts(db_conn: asyncpg.Connection) -> None:
    """One enabled target yields one dispatch backed by a ``pending`` attempt row."""
    org_id, _, incident_id = await _seed_incident(db_conn)

    target_id = uuid4()
    target_row = (target_id, org_id, "Primary Webhook", "webhook", True)
    await db_conn.execute(
        """
        INSERT INTO notification_targets (id, org_id, name, target_type, enabled)
        VALUES ($1, $2, $3, $4, $5)
        """,
        *target_row,
    )

    dispatches = await prepare_notification_dispatches(
        db_conn,
        incident_id=incident_id,
        org_id=org_id,
    )

    assert len(dispatches) == 1
    only_dispatch = dispatches[0]
    assert isinstance(only_dispatch, NotificationDispatch)
    assert only_dispatch.target["name"] == "Primary Webhook"

    # The attempt referenced by the dispatch must exist and still be pending.
    stored = await db_conn.fetchrow(
        "SELECT status FROM notification_attempts WHERE id = $1",
        only_dispatch.attempt_id,
    )
    assert stored is not None
    assert stored["status"] == "pending"
async def test_prepare_notification_dispatches_skips_disabled_targets(db_conn: asyncpg.Connection) -> None:
    """A target stored with ``enabled = False`` produces no dispatches."""
    org_id, _, incident_id = await _seed_incident(db_conn)

    disabled_target = (uuid4(), org_id, "Disabled", "email", False)
    await db_conn.execute(
        """
        INSERT INTO notification_targets (id, org_id, name, target_type, enabled)
        VALUES ($1, $2, $3, $4, $5)
        """,
        *disabled_target,
    )

    dispatches = await prepare_notification_dispatches(
        db_conn,
        incident_id=incident_id,
        org_id=org_id,
    )

    assert dispatches == []