Files
incidentops/app/services/incident.py
minhtrannhat 46ede7757d feat: add observability stack and background task infrastructure
Add OpenTelemetry instrumentation with distributed tracing and metrics:
- Structured JSON logging with trace context correlation
- Auto-instrumentation for FastAPI, asyncpg, httpx, redis
- OTLP exporter for traces and Prometheus metrics endpoint

Implement Celery worker and notification task system:
- Celery app with Redis/SQS broker support and configurable queues
- Notification tasks for incident fan-out, webhooks, and escalations
- Pluggable TaskQueue abstraction with in-memory driver for testing

Add Grafana observability stack (Loki, Tempo, Prometheus, Grafana):
- OpenTelemetry Collector for receiving OTLP traces and logs
- Tempo for distributed tracing backend
- Loki for log aggregation with Promtail DaemonSet
- Prometheus for metrics scraping with RBAC configuration
- Grafana with pre-provisioned datasources and API overview dashboard
- Helm templates for all observability components

Enhance application infrastructure:
- Global exception handlers with structured ErrorResponse schema
- Request logging middleware with timing metrics
- Health check updated to verify task queue connectivity
- Non-root user in Dockerfile for security
- Init containers in Helm deployments for dependency ordering
- Production Helm values with autoscaling and retention policies
2026-01-07 20:51:13 -05:00

248 lines
8.5 KiB
Python

"""Incident service implementing incident lifecycle operations."""
from __future__ import annotations
from datetime import datetime
from typing import cast
from uuid import UUID, uuid4
import asyncpg
from asyncpg.pool import PoolConnectionProxy
from app.api.deps import CurrentUser, ensure_org_access
from app.config import settings
from app.core import exceptions as exc
from app.db import Database, db
from app.repositories import IncidentRepository, ServiceRepository
from app.schemas.common import PaginatedResponse
from app.schemas.incident import (
CommentRequest,
IncidentCreate,
IncidentEventResponse,
IncidentResponse,
TransitionRequest,
)
from app.taskqueue import TaskQueue
from app.taskqueue import task_queue as default_task_queue
_ALLOWED_TRANSITIONS: dict[str, set[str]] = {
"triggered": {"acknowledged"},
"acknowledged": {"mitigated"},
"mitigated": {"resolved"},
"resolved": set(),
}
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
    """Present a pooled connection proxy as a plain ``asyncpg.Connection``.

    This exists purely for the type checker: ``typing.cast`` does no runtime
    conversion or validation, so the object passed in is returned as-is.
    """
    narrowed = cast(asyncpg.Connection, conn)
    return narrowed
class IncidentService:
    """Encapsulates incident lifecycle operations within an org context.

    Collaborators (database handle, task queue, escalation delay) may be
    injected through the constructor; any argument left as ``None`` falls back
    to the corresponding module-level default.
    """

    def __init__(
        self,
        database: Database | None = None,
        task_queue: TaskQueue | None = None,
        escalation_delay_seconds: int | None = None,
    ) -> None:
        """Store collaborators, substituting defaults only for ``None``.

        Args:
            database: Database handle; defaults to the shared ``db``.
            task_queue: Queue used to enqueue notification work; defaults to
                the shared ``task_queue`` imported from ``app.taskqueue``.
            escalation_delay_seconds: Delay passed to the escalation check;
                defaults to ``settings.notification_escalation_delay_seconds``.
                A value of ``0`` or less means no escalation check is
                scheduled by ``create_incident``.
        """
        # Compare against None instead of relying on truthiness: an injected
        # object that happens to be falsy (e.g. a test double whose
        # ``__len__`` returns 0) must not be silently replaced by the default.
        self.db = database if database is not None else db
        self.task_queue = task_queue if task_queue is not None else default_task_queue
        self.escalation_delay_seconds = (
            escalation_delay_seconds
            if escalation_delay_seconds is not None
            else settings.notification_escalation_delay_seconds
        )

    async def create_incident(
        self,
        current_user: CurrentUser,
        service_id: UUID,
        data: IncidentCreate,
    ) -> IncidentResponse:
        """Create an incident for a service in the active org and record the creation event.

        The incident row and its ``"created"`` timeline event are written in a
        single transaction. Notification work is enqueued only after that
        transaction block has exited without error.

        Args:
            current_user: The authenticated caller; supplies ``org_id`` and
                ``user_id`` for the new incident and its event.
            service_id: Service the incident is raised against.
            data: Title, description and severity for the incident.

        Returns:
            The newly created incident.

        Raises:
            exc.NotFoundError: If no service exists with ``service_id``.

        Any error raised by ``ensure_org_access`` propagates unchanged.
        """
        async with self.db.transaction() as conn:
            db_conn = _as_conn(conn)
            service_repo = ServiceRepository(db_conn)
            incident_repo = IncidentRepository(db_conn)

            service = await service_repo.get_by_id(service_id)
            if service is None:
                raise exc.NotFoundError("Service not found")
            ensure_org_access(service["org_id"], current_user)

            incident_id = uuid4()
            incident = await incident_repo.create(
                incident_id=incident_id,
                org_id=current_user.org_id,
                service_id=service_id,
                title=data.title,
                description=data.description,
                severity=data.severity,
            )
            await incident_repo.add_event(
                uuid4(),
                incident_id,
                "created",
                actor_user_id=current_user.user_id,
                payload={
                    "title": data.title,
                    "severity": data.severity,
                    "description": data.description,
                },
            )
            incident_response = IncidentResponse(**incident)

        # Enqueue outside the transaction block so that a failed (rolled back)
        # creation never leaves notification tasks pointing at a missing row.
        self.task_queue.incident_triggered(
            incident_id=incident_response.id,
            org_id=current_user.org_id,
            triggered_by=current_user.user_id,
        )
        # A non-positive delay disables the follow-up escalation check.
        if self.escalation_delay_seconds > 0:
            self.task_queue.schedule_escalation_check(
                incident_id=incident_response.id,
                org_id=current_user.org_id,
                delay_seconds=self.escalation_delay_seconds,
            )
        return incident_response

    async def get_incidents(
        self,
        current_user: CurrentUser,
        *,
        status: str | None = None,
        cursor: datetime | None = None,
        limit: int = 20,
    ) -> PaginatedResponse[IncidentResponse]:
        """Return paginated incidents for the active organization.

        Args:
            current_user: The authenticated caller; its ``org_id`` scopes the query.
            status: Optional status filter forwarded to the repository.
            cursor: Optional pagination cursor forwarded to the repository.
            limit: Maximum number of incidents to return in this page.

        Returns:
            A page of at most ``limit`` incidents. ``has_more`` is true when
            the repository returned more than ``limit`` rows, in which case
            ``next_cursor`` is the ISO-8601 ``created_at`` of the last item
            returned; otherwise ``next_cursor`` is ``None``.
        """
        async with self.db.connection() as conn:
            incident_repo = IncidentRepository(_as_conn(conn))
            rows = await incident_repo.get_by_org(
                org_id=current_user.org_id,
                status=status,
                cursor=cursor,
                limit=limit,
            )

        # NOTE(review): this presumes the repository fetches one row beyond
        # ``limit`` as a look-ahead; confirm against IncidentRepository.get_by_org.
        has_more = len(rows) > limit
        items = rows[:limit]
        next_cursor = items[-1]["created_at"].isoformat() if has_more and items else None
        incidents = [IncidentResponse(**row) for row in items]
        return PaginatedResponse[IncidentResponse](
            items=incidents,
            next_cursor=next_cursor,
            has_more=has_more,
        )

    async def get_incident(self, current_user: CurrentUser, incident_id: UUID) -> IncidentResponse:
        """Return a single incident, ensuring it belongs to the active org.

        Raises:
            exc.NotFoundError: If no incident exists with ``incident_id``.

        Any error raised by ``ensure_org_access`` propagates unchanged.
        """
        async with self.db.connection() as conn:
            incident_repo = IncidentRepository(_as_conn(conn))
            incident = await incident_repo.get_by_id(incident_id)

        if incident is None:
            raise exc.NotFoundError("Incident not found")
        ensure_org_access(incident["org_id"], current_user)
        return IncidentResponse(**incident)

    async def get_incident_events(
        self, current_user: CurrentUser, incident_id: UUID
    ) -> list[IncidentEventResponse]:
        """Return the timeline events for an incident in the active org.

        The incident is loaded and access-checked before its events are read.

        Raises:
            exc.NotFoundError: If no incident exists with ``incident_id``.

        Any error raised by ``ensure_org_access`` propagates unchanged.
        """
        async with self.db.connection() as conn:
            incident_repo = IncidentRepository(_as_conn(conn))
            incident = await incident_repo.get_by_id(incident_id)
            if incident is None:
                raise exc.NotFoundError("Incident not found")
            ensure_org_access(incident["org_id"], current_user)
            events = await incident_repo.get_events(incident_id)

        return [IncidentEventResponse(**event) for event in events]

    async def transition_incident(
        self,
        current_user: CurrentUser,
        incident_id: UUID,
        data: TransitionRequest,
    ) -> IncidentResponse:
        """Transition an incident status with optimistic locking and event recording.

        The status update and the ``"status_changed"`` timeline event are
        written in one transaction.

        Args:
            current_user: The authenticated caller, recorded as the event actor.
            incident_id: Incident to transition.
            data: Target status, the incident version the caller last saw,
                and an optional note.

        Returns:
            The incident as returned by the repository after the update.

        Raises:
            exc.NotFoundError: If no incident exists with ``incident_id``.
            exc.BadRequestError: If the requested transition is not allowed.
            exc.ConflictError: If the repository update returns ``None``,
                which is treated as a version mismatch.

        Any error raised by ``ensure_org_access`` propagates unchanged.
        """
        async with self.db.transaction() as conn:
            db_conn = _as_conn(conn)
            incident_repo = IncidentRepository(db_conn)

            incident = await incident_repo.get_by_id(incident_id)
            if incident is None:
                raise exc.NotFoundError("Incident not found")
            ensure_org_access(incident["org_id"], current_user)
            self._validate_transition(incident["status"], data.to_status)

            # The caller-supplied version is handed to the repository; a None
            # result means the update did not apply.
            updated = await incident_repo.update_status(
                incident_id,
                data.to_status,
                data.version,
            )
            if updated is None:
                raise exc.ConflictError("Incident version mismatch")

            payload = {"from": incident["status"], "to": data.to_status}
            # Only record a note when one was actually provided (non-empty).
            if data.note:
                payload["note"] = data.note
            await incident_repo.add_event(
                uuid4(),
                incident_id,
                "status_changed",
                actor_user_id=current_user.user_id,
                payload=payload,
            )

        return IncidentResponse(**updated)

    async def add_comment(
        self,
        current_user: CurrentUser,
        incident_id: UUID,
        data: CommentRequest,
    ) -> IncidentEventResponse:
        """Add a comment event to the incident timeline.

        Args:
            current_user: The authenticated caller, recorded as the event actor.
            incident_id: Incident to comment on.
            data: Request carrying the comment ``content``.

        Returns:
            The stored ``"comment_added"`` event.

        Raises:
            exc.NotFoundError: If no incident exists with ``incident_id``.

        Any error raised by ``ensure_org_access`` propagates unchanged.
        """
        async with self.db.connection() as conn:
            incident_repo = IncidentRepository(_as_conn(conn))
            incident = await incident_repo.get_by_id(incident_id)
            if incident is None:
                raise exc.NotFoundError("Incident not found")
            ensure_org_access(incident["org_id"], current_user)

            event = await incident_repo.add_event(
                uuid4(),
                incident_id,
                "comment_added",
                actor_user_id=current_user.user_id,
                payload={"content": data.content},
            )

        return IncidentEventResponse(**event)

    def _validate_transition(self, current_status: str, to_status: str) -> None:
        """Validate a requested status transition against the allowed state machine.

        Raises:
            exc.BadRequestError: If the incident is already in ``to_status``,
                or if ``to_status`` is not listed as a successor of
                ``current_status`` in ``_ALLOWED_TRANSITIONS``.
        """
        # Report a no-op transition separately so the caller gets a clearer message.
        if current_status == to_status:
            raise exc.BadRequestError("Incident is already in the requested status")
        # Unknown current statuses have no successors, so every target is rejected.
        allowed = _ALLOWED_TRANSITIONS.get(current_status, set())
        if to_status not in allowed:
            raise exc.BadRequestError("Invalid incident status transition")
# Public API of this module: only the service class is exported.
__all__ = ["IncidentService"]