Files
incidentops/app/taskqueue.py
minhtrannhat 46ede7757d feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
2026-01-07 20:51:13 -05:00

189 lines
4.9 KiB
Python

"""Task queue abstractions for scheduling background work."""
from __future__ import annotations

import asyncio
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import Any
from uuid import UUID

from app.config import settings

try:
    from worker.celery_app import celery_app
except Exception:  # pragma: no cover - celery app may not import during docs builds
    celery_app = None  # type: ignore[assignment]
class TaskQueue(ABC):
    """Interface for enqueueing background work."""

    async def startup(self) -> None:  # pragma: no cover - default no-op
        """Initialize the queue backend; the default implementation does nothing."""

    async def shutdown(self) -> None:  # pragma: no cover - default no-op
        """Release queue resources; the default implementation does nothing."""

    async def ping(self) -> bool:
        """Report whether the queue backend is reachable (always True by default)."""
        return True

    def reset(self) -> None:  # pragma: no cover - optional for in-memory impls
        """Clear any recorded state; meaningful only for in-memory test queues."""

    @abstractmethod
    def incident_triggered(
        self,
        *,
        incident_id: UUID,
        org_id: UUID,
        triggered_by: UUID | None,
    ) -> None:
        """Fan out notifications for a freshly triggered incident."""

    @abstractmethod
    def schedule_escalation_check(
        self,
        *,
        incident_id: UUID,
        org_id: UUID,
        delay_seconds: int,
    ) -> None:
        """Arrange a deferred check that escalates the incident if unacknowledged."""
class CeleryTaskQueue(TaskQueue):
    """Celery-backed task queue that can use Redis or SQS brokers."""

    def __init__(self, default_queue: str, critical_queue: str) -> None:
        """Bind to the module-level Celery app and remember target queue names.

        Raises:
            RuntimeError: if the Celery application failed to import at module load.
        """
        if celery_app is None:  # pragma: no cover - guarded by try/except
            raise RuntimeError("Celery application is unavailable")
        self._celery = celery_app
        self._default_queue = default_queue
        self._critical_queue = critical_queue

    def incident_triggered(
        self,
        *,
        incident_id: UUID,
        org_id: UUID,
        triggered_by: UUID | None,
    ) -> None:
        """Send the incident fan-out task on the default queue.

        UUIDs are stringified because Celery payloads must be JSON-serializable.
        """
        self._celery.send_task(
            "worker.tasks.notifications.incident_triggered",
            kwargs={
                "incident_id": str(incident_id),
                "org_id": str(org_id),
                "triggered_by": str(triggered_by) if triggered_by else None,
            },
            queue=self._default_queue,
        )

    def schedule_escalation_check(
        self,
        *,
        incident_id: UUID,
        org_id: UUID,
        delay_seconds: int,
    ) -> None:
        """Schedule the escalation task on the critical queue after a delay.

        Negative delays are clamped to 0 so the task runs immediately rather
        than being rejected by the broker.
        """
        self._celery.send_task(
            "worker.tasks.notifications.escalate_if_unacked",
            kwargs={
                "incident_id": str(incident_id),
                "org_id": str(org_id),
            },
            countdown=max(delay_seconds, 0),
            queue=self._critical_queue,
        )

    async def ping(self) -> bool:
        """Check broker reachability without blocking the event loop.

        The blocking connect is pushed onto the default thread-pool executor.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._ping_sync)

    def _ping_sync(self) -> bool:
        """Synchronously probe the Celery broker connection.

        Returns True only if a connection can be acquired and opened; any
        failure is reported as False so health checks degrade gracefully.
        """
        # Fix: acquiring the connection object itself can raise (e.g. a
        # malformed broker URL). Previously that exception escaped ping();
        # treat it as "unreachable" like any other connection failure.
        try:
            connection = self._celery.connection()
        except Exception:
            return False
        try:
            connection.connect()
            return True
        except Exception:
            return False
        finally:
            try:
                connection.release()
            except Exception:  # pragma: no cover - release best effort
                pass
@dataclass
class InMemoryTaskQueue(TaskQueue):
    """Test-friendly queue that records dispatched tasks in memory."""

    # Recorded (task_name, payload) tuples. Uses field(default_factory=list)
    # instead of the None-plus-__post_init__ workaround so each instance gets
    # its own list by construction (the idiomatic dataclass mutable default).
    # The type still admits None and __post_init__ normalizes it, so callers
    # that explicitly passed dispatched=None keep working.
    dispatched: list[tuple[str, dict[str, Any]]] | None = field(default_factory=list)

    def __post_init__(self) -> None:
        """Normalize an explicit None into a fresh list for backward compatibility."""
        if self.dispatched is None:
            self.dispatched = []

    def incident_triggered(
        self,
        *,
        incident_id: UUID,
        org_id: UUID,
        triggered_by: UUID | None,
    ) -> None:
        """Record an incident fan-out dispatch without performing any work."""
        self.dispatched.append(
            (
                "incident_triggered",
                {
                    "incident_id": incident_id,
                    "org_id": org_id,
                    "triggered_by": triggered_by,
                },
            )
        )

    def schedule_escalation_check(
        self,
        *,
        incident_id: UUID,
        org_id: UUID,
        delay_seconds: int,
    ) -> None:
        """Record a scheduled escalation check without performing any work."""
        self.dispatched.append(
            (
                "escalate_if_unacked",
                {
                    "incident_id": incident_id,
                    "org_id": org_id,
                    "delay_seconds": delay_seconds,
                },
            )
        )

    def reset(self) -> None:
        """Drop all recorded dispatches (used between tests)."""
        if self.dispatched is not None:
            self.dispatched.clear()
def _build_task_queue() -> TaskQueue:
    """Instantiate the queue implementation selected by configuration.

    The "inmemory" driver yields the recording queue used in tests; any
    other value selects the Celery-backed queue with configured queue names.
    """
    if settings.task_queue_driver != "inmemory":
        return CeleryTaskQueue(
            default_queue=settings.task_queue_default_queue,
            critical_queue=settings.task_queue_critical_queue,
        )
    return InMemoryTaskQueue()
# Module-level singleton chosen at import time from settings; application code
# and request handlers import this instance directly.
task_queue = _build_task_queue()

# Explicit public API of this module.
__all__ = [
    "CeleryTaskQueue",
    "InMemoryTaskQueue",
    "TaskQueue",
    "task_queue",
]