feat: add observability stack and background task infrastructure

Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
2026-01-07 20:51:13 -05:00
parent f427d191e0
commit 46ede7757d
45 changed files with 3742 additions and 76 deletions
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -4,7 +4,7 @@ from __future__ import annotations

 import os
 from contextlib import asynccontextmanager
-from typing import AsyncGenerator, Callable
+from typing import AsyncGenerator, Callable, Generator
 from uuid import UUID, uuid4

 import asyncpg
@@ -15,8 +15,11 @@ import pytest
 os.environ.setdefault("DATABASE_URL", "postgresql://incidentops:incidentops@localhost:5432/incidentops_test")
 os.environ.setdefault("JWT_SECRET_KEY", "test-secret-key-for-testing-only")
 os.environ.setdefault("REDIS_URL", "redis://localhost:6379/1")
+os.environ.setdefault("TASK_QUEUE_DRIVER", "inmemory")
+os.environ.setdefault("TASK_QUEUE_BROKER_URL", "redis://localhost:6379/2")

 from app.main import app
+from app.taskqueue import task_queue


 # Module-level setup: create database and run migrations once
@@ -163,3 +166,14 @@ async def db_admin(clean_database: None) -> AsyncGenerator[asyncpg.Connection, N
        yield conn
    finally:
        await conn.close()
+
+
+@pytest.fixture(autouse=True)
+def reset_task_queue() -> Generator[None, None, None]:
+    """Wipe the shared task queue on both sides of every test.
+
+    The reset only happens when the imported ``task_queue`` object exposes a
+    ``reset`` attribute; otherwise the fixture simply yields.
+    """
+
+    def _reset_if_supported() -> None:
+        # Looked up on every call so setup and teardown are checked independently.
+        if hasattr(task_queue, "reset"):
+            task_queue.reset()
+
+    _reset_if_supported()
+    yield
+    _reset_if_supported()
--- a/tests/services/test_incident_service.py
+++ b/tests/services/test_incident_service.py
@@ -14,6 +14,7 @@ from app.core import exceptions as exc, security
 from app.db import Database
 from app.schemas.incident import CommentRequest, IncidentCreate, TransitionRequest
 from app.services.incident import IncidentService
+from app.taskqueue import InMemoryTaskQueue


 pytestmark = pytest.mark.asyncio
@@ -43,10 +44,24 @@ class _SingleConnectionDatabase(Database):


@pytest.fixture
-async def incident_service(db_conn: asyncpg.Connection):
+def incident_task_queue() -> InMemoryTaskQueue:
+    """Provide a fresh in-memory queue so tests can inspect what was dispatched."""
+
+    recording_queue = InMemoryTaskQueue()
+    return recording_queue
+
+
+@pytest.fixture
+async def incident_service(
+    db_conn: asyncpg.Connection,
+    incident_task_queue: InMemoryTaskQueue,
+):
    """IncidentService bound to the per-test database connection."""

-    return IncidentService(database=_SingleConnectionDatabase(db_conn))
+    return IncidentService(
+        database=_SingleConnectionDatabase(db_conn),
+        task_queue=incident_task_queue,
+        escalation_delay_seconds=60,
+    )


 async def _seed_user_org_service(conn: asyncpg.Connection) -> tuple[CurrentUser, UUID]:
@@ -94,7 +109,9 @@ async def _seed_user_org_service(conn: asyncpg.Connection) -> tuple[CurrentUser,


 async def test_create_incident_persists_and_records_event(
-    incident_service: IncidentService, db_conn: asyncpg.Connection
+    incident_service: IncidentService,
+    db_conn: asyncpg.Connection,
+    incident_task_queue: InMemoryTaskQueue,
 ) -> None:
    current_user, service_id = await _seed_user_org_service(db_conn)

@@ -121,6 +138,12 @@ async def test_create_incident_persists_and_records_event(
    assert event["event_type"] == "created"
    assert event["actor_user_id"] == current_user.user_id

+    assert incident_task_queue.dispatched is not None
+    assert len(incident_task_queue.dispatched) == 2
+    first, second = incident_task_queue.dispatched
+    assert first[0] == "incident_triggered"
+    assert second[0] == "escalate_if_unacked"
+

 async def test_get_incidents_paginates_by_created_at(
    incident_service: IncidentService, db_conn: asyncpg.Connection
--- a/tests/worker/test_celery_tasks.py
+++ b/tests/worker/test_celery_tasks.py
@@ -0,0 +1,199 @@
+"""End-to-end Celery worker tests against the real Redis broker."""
+
+from __future__ import annotations
+
+import asyncio
+import inspect
+from uuid import UUID, uuid4
+
+import asyncpg
+import pytest
+import redis
+
+from app.config import settings
+from app.repositories.incident import IncidentRepository
+from app.taskqueue import CeleryTaskQueue
+from celery.contrib.testing.worker import start_worker
+
+from worker.celery_app import celery_app
+
+
+pytestmark = pytest.mark.asyncio
+
+
+@pytest.fixture(scope="module", autouse=True)
+def ensure_redis_available() -> None:
+    """Ping the configured task-queue broker once per module; skip when it does not answer."""
+
+    broker = redis.Redis.from_url(settings.resolved_task_queue_broker_url)
+    ping_error = None
+    try:
+        broker.ping()
+    except redis.RedisError as exc:  # pragma: no cover - diagnostic-only path
+        ping_error = exc
+    finally:
+        broker.close()
+
+    # The probe connection is already closed by the time the skip is raised.
+    if ping_error is not None:  # pragma: no cover - diagnostic-only path
+        pytest.skip(f"Redis broker unavailable: {ping_error}")
+
+
+@pytest.fixture(scope="module")
+def celery_worker_instance(ensure_redis_available: None):
+    """Keep an embedded Celery worker running while this module's tests execute.
+
+    The worker is started for the default and critical queue names taken from
+    settings, with the ``solo`` pool, a concurrency of one, and the startup
+    ping check disabled.
+    """
+
+    worker_options = {
+        "loglevel": "INFO",
+        "pool": "solo",
+        "concurrency": 1,
+        "queues": [settings.task_queue_default_queue, settings.task_queue_critical_queue],
+        "perform_ping_check": False,
+    }
+    with start_worker(celery_app, **worker_options):
+        yield
+
+
+@pytest.fixture(autouse=True)
+def purge_celery_queues():
+    """Purge waiting Celery messages on both sides of each test."""
+
+    discard_pending = celery_app.control.purge
+    discard_pending()
+    yield
+    discard_pending()
+
+
+@pytest.fixture
+def celery_queue() -> CeleryTaskQueue:
+    """Build a ``CeleryTaskQueue`` using the default and critical queue names from settings."""
+
+    queue_names = {
+        "default_queue": settings.task_queue_default_queue,
+        "critical_queue": settings.task_queue_critical_queue,
+    }
+    return CeleryTaskQueue(**queue_names)
+
+
+async def _seed_incident_with_target(conn: asyncpg.Connection) -> tuple[UUID, UUID]:
+    """Insert an org, a service, an incident and one enabled webhook target.
+
+    Returns ``(org_id, incident_id)`` for the rows that were just created.
+    """
+
+    org_id, service_id, incident_id, target_id = uuid4(), uuid4(), uuid4(), uuid4()
+
+    # Slugs embed a slice of the generated id so each call produces distinct values.
+    org_slug = f"celery-{org_id.hex[:6]}"
+    service_slug = f"svc-{service_id.hex[:6]}"
+
+    await conn.execute(
+        "INSERT INTO orgs (id, name, slug) VALUES ($1, $2, $3)",
+        *(org_id, "Celery Org", org_slug),
+    )
+    await conn.execute(
+        "INSERT INTO services (id, org_id, name, slug) VALUES ($1, $2, $3, $4)",
+        *(service_id, org_id, "API", service_slug),
+    )
+
+    incident_fields = {
+        "incident_id": incident_id,
+        "org_id": org_id,
+        "service_id": service_id,
+        "title": "Latency spike",
+        "description": "",
+        "severity": "high",
+    }
+    await IncidentRepository(conn).create(**incident_fields)
+
+    await conn.execute(
+        """
+        INSERT INTO notification_targets (id, org_id, name, target_type, webhook_url, enabled)
+        VALUES ($1, $2, $3, $4, $5, $6)
+        """,
+        *(target_id, org_id, "Primary Webhook", "webhook", "https://example.com/hook", True),
+    )
+
+    return org_id, incident_id
+
+
+async def _wait_until(predicate, timeout: float = 5.0, interval: float = 0.1) -> None:
+    """Poll ``predicate`` until it yields a truthy value.
+
+    ``predicate`` is called with no arguments; if what it returns is awaitable,
+    that is awaited first. The predicate is always evaluated before the deadline
+    is checked, and ``interval`` seconds are slept between polls.
+
+    Raises:
+        AssertionError: if the predicate is still falsy once ``timeout`` seconds
+            have elapsed on the running loop's clock.
+    """
+
+    clock = asyncio.get_running_loop().time
+    give_up_at = clock() + timeout
+    while True:
+        outcome = predicate()
+        satisfied = (await outcome) if inspect.isawaitable(outcome) else outcome
+        if satisfied:
+            return
+        if clock() >= give_up_at:
+            raise AssertionError("Timed out waiting for Celery worker to finish")
+        await asyncio.sleep(interval)
+
+
+async def _attempt_sent(conn: asyncpg.Connection, incident_id: UUID) -> bool:
+    """Report whether the fetched attempt row for ``incident_id`` has status ``sent``.
+
+    Returns ``False`` when no attempt row is found.
+    """
+
+    query = "SELECT status FROM notification_attempts WHERE incident_id = $1"
+    attempt = await conn.fetchrow(query, incident_id)
+    if not attempt:
+        return False
+    return attempt["status"] == "sent"
+
+
+async def _attempt_count(conn: asyncpg.Connection, incident_id: UUID) -> int:
+    """Count the notification attempts recorded for ``incident_id`` (0 when none)."""
+
+    query = "SELECT COUNT(*) FROM notification_attempts WHERE incident_id = $1"
+    total = await conn.fetchval(query, incident_id)
+    return int(total) if total else 0
+
+
+async def _attempt_count_is(conn: asyncpg.Connection, incident_id: UUID, expected: int) -> bool:
+    """Tell whether the number of attempts for ``incident_id`` equals ``expected``."""
+
+    actual = await _attempt_count(conn, incident_id)
+    return actual == expected
+
+
+async def test_incident_triggered_task_marks_attempt_sent(
+    db_admin: asyncpg.Connection,
+    celery_worker_instance: None,
+    celery_queue: CeleryTaskQueue,
+) -> None:
+    """Dispatching ``incident_triggered`` should end with the attempt marked ``sent``."""
+
+    org_id, incident_id = await _seed_incident_with_target(db_admin)
+    actor_id = uuid4()
+
+    celery_queue.incident_triggered(incident_id=incident_id, org_id=org_id, triggered_by=actor_id)
+
+    async def _sent() -> bool:
+        return await _attempt_sent(db_admin, incident_id)
+
+    # Poll the database until the attempt is marked sent or the wait times out.
+    await _wait_until(_sent)
+
+
+async def test_escalate_task_refires_when_incident_still_triggered(
+    db_admin: asyncpg.Connection,
+    celery_worker_instance: None,
+    celery_queue: CeleryTaskQueue,
+) -> None:
+    """An escalation check on a freshly seeded incident should record one attempt."""
+
+    org_id, incident_id = await _seed_incident_with_target(db_admin)
+
+    # Zero delay: the check is scheduled without any countdown.
+    celery_queue.schedule_escalation_check(incident_id=incident_id, org_id=org_id, delay_seconds=0)
+
+    async def _exactly_one_attempt() -> bool:
+        return await _attempt_count_is(db_admin, incident_id, 1)
+
+    await _wait_until(_exactly_one_attempt)
+
+
+async def test_escalate_task_skips_when_incident_acknowledged(
+    db_admin: asyncpg.Connection,
+    celery_worker_instance: None,
+    celery_queue: CeleryTaskQueue,
+) -> None:
+    """An escalation check on an acknowledged incident should record no attempts."""
+
+    org_id, incident_id = await _seed_incident_with_target(db_admin)
+
+    acknowledge_sql = "UPDATE incidents SET status = 'acknowledged' WHERE id = $1"
+    await db_admin.execute(acknowledge_sql, incident_id)
+
+    celery_queue.schedule_escalation_check(incident_id=incident_id, org_id=org_id, delay_seconds=0)
+
+    # Fixed pause before asserting that nothing was written for this incident.
+    await asyncio.sleep(1)
+    recorded_attempts = await _attempt_count(db_admin, incident_id)
+    assert recorded_attempts == 0
--- a/tests/worker/test_notifications.py
+++ b/tests/worker/test_notifications.py
@@ -0,0 +1,96 @@
+"""Tests for worker notification helpers."""
+
+from __future__ import annotations
+
+from uuid import UUID, uuid4
+
+import asyncpg
+import pytest
+
+from app.repositories.incident import IncidentRepository
+from worker.tasks.notifications import NotificationDispatch, prepare_notification_dispatches
+
+
+pytestmark = pytest.mark.asyncio
+
+
+async def _seed_incident(conn: asyncpg.Connection) -> tuple[UUID, UUID, UUID]:
+    """Insert an org, a service and an incident for the notification tests.
+
+    Slugs are derived from the generated ids instead of being fixed strings,
+    so the helper can be called more than once against the same database
+    without reusing a slug.
+    NOTE(review): presumably the slug columns carry unique constraints -
+    confirm against the schema.
+
+    Returns:
+        ``(org_id, service_id, incident_id)`` of the rows that were created.
+    """
+
+    org_id = uuid4()
+    service_id = uuid4()
+    incident_id = uuid4()
+
+    await conn.execute(
+        "INSERT INTO orgs (id, name, slug) VALUES ($1, $2, $3)",
+        org_id,
+        "Notif Org",
+        f"notif-{org_id.hex[:6]}",
+    )
+    await conn.execute(
+        "INSERT INTO services (id, org_id, name, slug) VALUES ($1, $2, $3, $4)",
+        service_id,
+        org_id,
+        "API",
+        f"api-{service_id.hex[:6]}",
+    )
+
+    repo = IncidentRepository(conn)
+    await repo.create(
+        incident_id=incident_id,
+        org_id=org_id,
+        service_id=service_id,
+        title="Outage",
+        description="",
+        severity="high",
+    )
+
+    return org_id, service_id, incident_id
+
+
+async def test_prepare_notification_dispatches_creates_attempts(db_conn: asyncpg.Connection) -> None:
+    """One enabled target should yield one dispatch backed by a ``pending`` attempt row."""
+
+    org_id, _service_id, incident_id = await _seed_incident(db_conn)
+
+    enabled_target = (uuid4(), org_id, "Primary Webhook", "webhook", True)
+    await db_conn.execute(
+        """
+        INSERT INTO notification_targets (id, org_id, name, target_type, enabled)
+        VALUES ($1, $2, $3, $4, $5)
+        """,
+        *enabled_target,
+    )
+
+    dispatches = await prepare_notification_dispatches(db_conn, incident_id=incident_id, org_id=org_id)
+
+    assert len(dispatches) == 1
+    dispatch = dispatches[0]
+    assert isinstance(dispatch, NotificationDispatch)
+    assert dispatch.target["name"] == "Primary Webhook"
+
+    # The dispatch must point at an attempt row that was actually written.
+    status_query = "SELECT status FROM notification_attempts WHERE id = $1"
+    attempt = await db_conn.fetchrow(status_query, dispatch.attempt_id)
+    assert attempt is not None and attempt["status"] == "pending"
+
+
+async def test_prepare_notification_dispatches_skips_disabled_targets(db_conn: asyncpg.Connection) -> None:
+    """A target inserted with ``enabled = False`` should produce no dispatches."""
+
+    org_id, _service_id, incident_id = await _seed_incident(db_conn)
+
+    disabled_target = (uuid4(), org_id, "Disabled", "email", False)
+    await db_conn.execute(
+        """
+        INSERT INTO notification_targets (id, org_id, name, target_type, enabled)
+        VALUES ($1, $2, $3, $4, $5)
+        """,
+        *disabled_target,
+    )
+
+    dispatches = await prepare_notification_dispatches(db_conn, incident_id=incident_id, org_id=org_id)
+
+    assert dispatches == []