Compare commits

..

38 Commits

Author SHA1 Message Date
minhtrannhat f1115497c4 chore: broaden gitignore for configs 2025-01-21 12:00:00 -05:00
minhtrannhat 672b4ae893 chore: add Skaffold configuration for local development 2025-01-20 12:00:00 -05:00
minhtrannhat 58daa46912 chore(helm): add secrets configuration template 2025-01-19 12:00:00 -05:00
minhtrannhat 51d9aa09f0 chore(helm): add ingress configuration 2025-01-18 12:00:00 -05:00
minhtrannhat 1b9ab0f9e6 chore(helm): add web deployment and service templates 2025-01-17 12:00:00 -05:00
minhtrannhat ae037b8ae9 chore(helm): add worker deployment template 2025-01-16 12:00:00 -05:00
minhtrannhat f61eb6a79b chore(helm): add API deployment and service templates 2025-01-15 12:00:00 -05:00
minhtrannhat cda843a80e chore(helm): add Helm template helpers 2025-01-14 12:00:00 -05:00
minhtrannhat 885c288283 chore(helm): add Helm chart configuration 2025-01-13 12:00:00 -05:00
minhtrannhat 112a6eeba6 chore: add Docker Compose configuration 2025-01-12 12:00:00 -05:00
minhtrannhat 3abbd4a9aa build(web): add Dockerfile for web service 2025-01-11 12:00:00 -05:00
minhtrannhat f17fa5eb76 feat(web): add dashboard page 2025-01-10 12:00:00 -05:00
minhtrannhat 8ada5d1946 feat(web): add registration page 2025-01-09 12:00:00 -05:00
minhtrannhat 53418cf41c feat(web): add login page 2025-01-08 12:00:00 -05:00
minhtrannhat f635386b4d feat(web): add landing page 2025-01-07 12:00:00 -05:00
minhtrannhat d6ac0ddd3a feat(web): add root layout component 2025-01-06 12:00:00 -05:00
minhtrannhat a0e9fd71e6 feat(web): add global styles 2025-01-05 12:00:00 -05:00
minhtrannhat 03bc133e2c feat(web): add API client library 2025-01-04 12:00:00 -05:00
minhtrannhat 1d3ef9ef90 feat(web): add TypeScript type definitions 2025-01-03 12:00:00 -05:00
minhtrannhat 1d98cd5a73 feat(web): add Next.js project configuration 2025-01-02 12:00:00 -05:00
minhtrannhat 1a5e1d6c38 build(worker): add Dockerfile for worker service 2025-01-01 12:00:00 -05:00
minhtrannhat 8cac9b4377 feat(worker): configure Hangfire worker startup 2024-12-31 12:00:00 -05:00
minhtrannhat 06db4231cf feat(worker): implement background jobs for incidents 2024-12-30 12:00:00 -05:00
minhtrannhat 8ac4d814ee feat(worker): add worker project configuration 2024-12-29 12:00:00 -05:00
minhtrannhat 9e73887efc build(api): add Dockerfile for API service 2024-12-28 12:00:00 -05:00
minhtrannhat 4db3e56811 feat(api): configure API application startup 2024-12-27 12:00:00 -05:00
minhtrannhat d4c5f257af feat(api): add incident management endpoints 2024-12-26 12:00:00 -05:00
minhtrannhat 929327eca3 feat(api): add organization management endpoints 2024-12-25 12:00:00 -05:00
minhtrannhat 97905f9e19 feat(api): add authentication and health check endpoints 2024-12-24 12:00:00 -05:00
minhtrannhat 0aac1b6dc7 feat(api): add authentication middleware and request context 2024-12-23 12:00:00 -05:00
minhtrannhat a6d5a696a6 feat(api): add API project configuration 2024-12-22 12:00:00 -05:00
minhtrannhat 3e70ba560b feat(infrastructure): add dependency injection configuration 2024-12-21 12:00:00 -05:00
minhtrannhat 92f9ed001c feat(infrastructure): add job interfaces for background processing 2024-12-20 12:00:00 -05:00
minhtrannhat 38aa3fb12e feat(infrastructure): add FluentMigrator migrations for database schema 2024-12-19 12:00:00 -05:00
minhtrannhat 370408af95 feat(infrastructure): add data access and authentication services 2024-12-18 12:00:00 -05:00
minhtrannhat 7a09f8e2f6 feat(contracts): add API DTOs and request/response contracts 2024-12-17 12:00:00 -05:00
minhtrannhat 9357cbe026 feat(domain): add domain entities and enums 2024-12-16 12:00:00 -05:00
minhtrannhat 49ec9cd997 chore: initialize project structure and specification 2024-12-15 12:00:00 -05:00
191 changed files with 3926 additions and 13007 deletions
+64 -10
View File
@@ -1,11 +1,65 @@
# Python-generated files # .NET
__pycache__/ bin/
*.py[oc] obj/
build/ *.user
dist/ *.suo
wheels/ *.userosscache
*.egg-info *.sln.docstates
*.userprefs
.vs/
# Virtual environments # Build results
.venv [Dd]ebug/
.pytest_cache/ [Rr]elease/
x64/
x86/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/
# NuGet
*.nupkg
*.snupkg
.nuget/
packages/
# Node.js
node_modules/
.next/
out/
.npm/
# IDE
.idea/
*.swp
*.swo
*~
# OS
.DS_Store
Thumbs.db
# Environment
.env
.env.local
.env.*.local
appsettings.Local.json
appsettings.*.Local.json
appsettings*.json
# Project artifacts
**/Properties/
*.http
# Helm
helm/incidentops/charts/
# Docker
.docker/
# Kubernetes
*.kubeconfig
-1
View File
@@ -1 +0,0 @@
3.14
-38
View File
@@ -1,38 +0,0 @@
# Multi-stage Dockerfile for API and Worker services
FROM python:3.14-slim AS base
WORKDIR /app
# Install uv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
# Install Python dependencies
COPY pyproject.toml uv.lock README.md ./
RUN uv sync --no-cache --no-dev
# Copy application code
COPY app/ ./app/
COPY worker/ ./worker/
COPY migrations/ ./migrations/
# Set up non-root user and cache directory
RUN useradd -m -u 1000 appuser && \
mkdir -p /app/.cache && \
chown -R appuser:appuser /app
ENV UV_CACHE_DIR=/app/.cache
# API service target
FROM base AS api
USER appuser
EXPOSE 8000
CMD ["uv", "run", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
# Worker service target
FROM base AS worker
USER appuser
CMD ["uv", "run", "celery", "-A", "worker.celery_app", "worker", "--loglevel=info", "-Q", "critical,default,low"]
+9
View File
@@ -0,0 +1,9 @@
<Solution>
<Folder Name="/src/">
<Project Path="src/IncidentOps.Api/IncidentOps.Api.csproj" />
<Project Path="src/IncidentOps.Contracts/IncidentOps.Contracts.csproj" />
<Project Path="src/IncidentOps.Domain/IncidentOps.Domain.csproj" />
<Project Path="src/IncidentOps.Infrastructure/IncidentOps.Infrastructure.csproj" />
<Project Path="src/IncidentOps.Worker/IncidentOps.Worker.csproj" />
</Folder>
</Solution>
-86
View File
@@ -1,86 +0,0 @@
# IncidentOps
A fullstack on-call & incident management platform
## Environment Configuration
| Variable | Description | Default |
|----------|-------------|---------|
| `DATABASE_URL` | Postgres connection string | — |
| `REDIS_URL` | Legacy redis endpoint, also used if no broker override is supplied | `redis://localhost:6379/0` |
| `TASK_QUEUE_DRIVER` | Task queue implementation (`celery` or `inmemory`) | `celery` |
| `TASK_QUEUE_BROKER_URL` | Celery broker URL (falls back to `REDIS_URL` when unset) | `None` |
| `TASK_QUEUE_BACKEND` | Celery transport semantics (`redis` or `sqs`) | `redis` |
| `TASK_QUEUE_DEFAULT_QUEUE` | Queue used for fan-out + notification deliveries | `default` |
| `TASK_QUEUE_CRITICAL_QUEUE` | Queue used for escalation + delayed work | `critical` |
| `TASK_QUEUE_VISIBILITY_TIMEOUT` | Visibility timeout passed to `sqs` transport | `600` |
| `TASK_QUEUE_POLLING_INTERVAL` | Polling interval for `sqs` transport (seconds) | `1.0` |
| `NOTIFICATION_ESCALATION_DELAY_SECONDS` | Delay before re-checking unacknowledged incidents | `900` |
| `AWS_REGION` | Region used when `TASK_QUEUE_BACKEND=sqs` | `None` |
| `JWT_SECRET_KEY` | Symmetric JWT signing key | — |
| `JWT_ALGORITHM` | JWT algorithm | `HS256` |
| `JWT_ISSUER` | JWT issuer claim | `incidentops` |
| `JWT_AUDIENCE` | JWT audience claim | `incidentops-api` |
### Task Queue Modes
- **Development / Tests** Set `TASK_QUEUE_DRIVER=inmemory` to bypass Celery entirely (default for local pytest). The API will enqueue events into an in-memory recorder while the worker code remains importable.
- **Celery + Redis** Set `TASK_QUEUE_DRIVER=celery` and either leave `TASK_QUEUE_BROKER_URL` unset (and rely on `REDIS_URL`) or point it to another Redis endpoint. This is the default production-style configuration.
- **Celery + Amazon SQS** Provide `TASK_QUEUE_BROKER_URL=sqs://` (Celery automatically discovers credentials), set `TASK_QUEUE_BACKEND=sqs`, and configure `AWS_REGION`. Optional tuning is available via the visibility timeout and polling interval variables above.
### Running the Worker
The worker automatically discovers tasks under `worker/tasks`. Use the same environment variables as the API:
```
uv run celery -A worker.celery_app worker --loglevel=info
```
## Setup
### Docker Compose
```
docker compose up --build -d
```
### K8S with Skaffold and Helm
```
# Install with infrastructure only (for testing)
helm install incidentops helm/incidentops -n incidentops --create-namespace \
--set migration.enabled=false \
--set api.replicaCount=0 \
--set worker.replicaCount=0 \
--set web.replicaCount=0
# Full install (requires building app images first)
helm install incidentops helm/incidentops -n incidentops --create-namespace
# Create a cluster
kind create cluster --name incidentops
# We then deploy
skaffold dev
# One-time deployment
skaffold run
# Production deployment
skaffold run -p production
```
### Accessing Dashboards
When running with `skaffold dev`, the following dashboards are port-forwarded automatically:
| Dashboard | URL | Description |
|-----------|-----|-------------|
| **OpenAPI (Swagger)** | http://localhost:8000/docs | Interactive API documentation |
| **OpenAPI (ReDoc)** | http://localhost:8000/redoc | Alternative API docs |
| **Grafana** | http://localhost:3001 | Metrics, logs, and traces |
| **Prometheus** | http://localhost:9090 | Raw metrics queries |
| **Tempo** | http://localhost:3200 | Distributed tracing backend |
| **Loki** | http://localhost:3100 | Log aggregation backend |
Grafana comes pre-configured with datasources for Prometheus, Loki, and Tempo.
-163
View File
@@ -1,163 +0,0 @@
# IncidentOps Specification
Multi-tenant incident management API. Org context embedded in JWT — no `orgId` in URLs.
## Architecture
| Service | Stack | Purpose |
|---------|-------|---------|
| **api** | FastAPI, asyncpg | REST API, JWT auth, RBAC |
| **worker** | Celery, Redis | Notifications, escalations |
| **web** | Next.js | Dashboard (future) |
**Infrastructure:** PostgreSQL, Redis, ingress-nginx, Helm/Skaffold
## Auth
### JWT Access Token Claims
- `sub`: user_id (uuid)
- `org_id`: active org (uuid)
- `org_role`: `admin | member | viewer`
- `iss`: issuer (configurable, default: `incidentops`)
- `aud`: audience (configurable, default: `incidentops-api`)
- `jti`: unique token ID (uuid)
- `iat`: issued at (unix timestamp)
- `exp`: expiration (unix timestamp)
### Refresh Token
- Opaque token returned in JSON (not cookie)
- Stored hashed in DB with `active_org_id`
- Rotated on refresh and org-switch
### Endpoints
| Endpoint | Description |
|----------|-------------|
| `POST /v1/auth/register` | Create user + default org, return tokens |
| `POST /v1/auth/login` | Authenticate, return tokens |
| `POST /v1/auth/refresh` | Rotate refresh token, mint new access token |
| `POST /v1/auth/switch-org` | Change active org, rotate tokens |
| `POST /v1/auth/logout` | Revoke refresh token |
## Authorization
### Roles
| Role | Permissions |
|------|-------------|
| viewer | Read-only |
| member | + create incidents, transitions, comments |
| admin | + manage members, notification targets |
### Enforcement
- Role check via dependency injection
- Ownership check: resource `org_id` must match JWT `org_id`
## API Routes
All under `/v1`. Auth required unless noted.
### Org (implicit from JWT)
- `GET /org` — current org summary
- `GET /org/members` (admin)
- `GET /org/services`
- `POST /org/services` (member+)
- `GET /org/notification-targets` (admin)
- `POST /org/notification-targets` (admin)
### Incidents
- `GET /incidents?status=&cursor=&limit=`
- `POST /services/{serviceId}/incidents` (member+)
- `GET /incidents/{incidentId}`
- `GET /incidents/{incidentId}/events`
- `POST /incidents/{incidentId}/transition` (member+)
- `POST /incidents/{incidentId}/comment` (member+)
### Health
- `GET /healthz` — liveness
- `GET /readyz` — readiness (postgres + redis)
## Incident State Machine
```
Triggered → Acknowledged → Mitigated → Resolved
```
- Transitions validated at application level
- Optimistic locking via `version` column
- All changes recorded in `incident_events`
## Database Schema
| Table | Purpose |
|-------|---------|
| `users` | User accounts |
| `orgs` | Organizations |
| `org_members` | User-org membership + role |
| `services` | Org-scoped services |
| `incidents` | Org-scoped incidents with version |
| `incident_events` | Append-only timeline |
| `refresh_tokens` | Token rotation + active org |
| `notification_targets` | Webhook/email/slack configs |
| `notification_attempts` | Delivery tracking (idempotent) |
## Background Jobs (Celery)
| Task | Queue | Purpose |
|------|-------|---------|
| `incident_triggered` | default | Fan-out to notification targets |
| `send_webhook` | default | HTTP POST with retry |
| `escalate_if_unacked` | critical | Delayed escalation (stretch) |
## Config (Environment)
| Variable | Required | Default |
|----------|----------|---------|
| `DATABASE_URL` | Yes | — |
| `REDIS_URL` | No | `redis://localhost:6379/0` |
| `JWT_SECRET_KEY` | Yes | — |
| `JWT_ALGORITHM` | No | `HS256` |
| `JWT_ISSUER` | No | `incidentops` |
| `JWT_AUDIENCE` | No | `incidentops-api` |
| `ACCESS_TOKEN_EXPIRE_MINUTES` | No | `15` |
| `REFRESH_TOKEN_EXPIRE_DAYS` | No | `30` |
## Development
Use `uv` for all Python operations:
```bash
# Install dependencies
uv sync
# Run tests
uv run pytest tests/
# Run the API server
uv run uvicorn app.main:app --reload
# Run migrations
uv run python migrations/migrate.py
```
## Project Structure
```
incidentops/
├── app/
│ ├── main.py # FastAPI entry
│ ├── config.py # pydantic-settings
│ ├── db.py # asyncpg pool
│ ├── core/ # security, exceptions
│ ├── api/v1/ # route handlers
│ ├── schemas/ # pydantic models
│ ├── repositories/ # data access
│ └── services/ # business logic
├── worker/
│ ├── celery_app.py
│ └── tasks/
├── migrations/
│ └── *.sql + migrate.py
├── helm/
├── Dockerfile
├── docker-compose.yml
└── pyproject.toml
```
View File
View File
-101
View File
@@ -1,101 +0,0 @@
"""Shared FastAPI dependencies (auth, RBAC, ownership)."""
from __future__ import annotations
from dataclasses import dataclass
from typing import Callable
from uuid import UUID
from fastapi import Depends
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
from app.core import exceptions as exc, security
from app.db import db
from app.repositories import OrgRepository, UserRepository
bearer_scheme = HTTPBearer(auto_error=False)
ROLE_RANKS: dict[str, int] = {"viewer": 0, "member": 1, "admin": 2}
@dataclass(slots=True)
class CurrentUser:
"""Authenticated user context derived from the access token."""
user_id: UUID
email: str
org_id: UUID
org_role: str
token: str
async def get_current_user(
credentials: HTTPAuthorizationCredentials | None = Depends(bearer_scheme),
) -> CurrentUser:
"""Extract and validate the current user from the Authorization header."""
if credentials is None or credentials.scheme.lower() != "bearer":
raise exc.UnauthorizedError("Missing bearer token")
try:
payload = security.TokenPayload(security.decode_access_token(credentials.credentials))
except security.JWTError as err: # pragma: no cover - jose error types
raise exc.UnauthorizedError("Invalid access token") from err
async with db.connection() as conn:
user_repo = UserRepository(conn)
user = await user_repo.get_by_id(payload.user_id)
if user is None:
raise exc.UnauthorizedError("User not found")
org_repo = OrgRepository(conn)
membership = await org_repo.get_member(payload.user_id, payload.org_id)
if membership is None:
raise exc.ForbiddenError("Organization access denied")
return CurrentUser(
user_id=payload.user_id,
email=user["email"],
org_id=payload.org_id,
org_role=membership["role"],
token=credentials.credentials,
)
class RoleChecker:
"""Dependency that enforces a minimum organization role."""
def __init__(self, minimum_role: str) -> None:
if minimum_role not in ROLE_RANKS:
raise ValueError(f"Unknown role '{minimum_role}'")
self.minimum_role = minimum_role
def __call__(self, current_user: CurrentUser = Depends(get_current_user)) -> CurrentUser:
if ROLE_RANKS[current_user.org_role] < ROLE_RANKS[self.minimum_role]:
raise exc.ForbiddenError("Insufficient role for this operation")
return current_user
def require_role(min_role: str) -> Callable[[CurrentUser], CurrentUser]:
"""Factory that returns a dependency enforcing the specified role."""
return RoleChecker(min_role)
def ensure_org_access(resource_org_id: UUID, current_user: CurrentUser) -> None:
"""Verify that the resource belongs to the active org in the token."""
if resource_org_id != current_user.org_id:
raise exc.ForbiddenError("Resource does not belong to the active organization")
__all__ = [
"CurrentUser",
"ROLE_RANKS",
"RoleChecker",
"bearer_scheme",
"ensure_org_access",
"get_current_user",
"require_role",
]
View File
-59
View File
@@ -1,59 +0,0 @@
"""Authentication API endpoints."""
from fastapi import APIRouter, Depends, status
from app.api.deps import CurrentUser, get_current_user
from app.schemas.auth import (
LoginRequest,
LogoutRequest,
RefreshRequest,
RegisterRequest,
SwitchOrgRequest,
TokenResponse,
)
from app.services import AuthService
router = APIRouter(prefix="/auth", tags=["auth"])
auth_service = AuthService()
@router.post("/register", response_model=TokenResponse, status_code=status.HTTP_201_CREATED)
async def register_user(payload: RegisterRequest) -> TokenResponse:
"""Register a new user and default org, returning auth tokens."""
return await auth_service.register_user(payload)
@router.post("/login", response_model=TokenResponse)
async def login_user(payload: LoginRequest) -> TokenResponse:
"""Authenticate an existing user and issue tokens."""
return await auth_service.login_user(payload)
@router.post("/refresh", response_model=TokenResponse)
async def refresh_tokens(payload: RefreshRequest) -> TokenResponse:
"""Rotate refresh token and mint a new access token."""
return await auth_service.refresh_tokens(payload)
@router.post("/switch-org", response_model=TokenResponse)
async def switch_org(
payload: SwitchOrgRequest,
current_user: CurrentUser = Depends(get_current_user),
) -> TokenResponse:
"""Switch the active organization for the authenticated user."""
return await auth_service.switch_org(current_user, payload)
@router.post("/logout", status_code=status.HTTP_204_NO_CONTENT)
async def logout(
payload: LogoutRequest,
current_user: CurrentUser = Depends(get_current_user),
) -> None:
"""Revoke the provided refresh token for the current session."""
await auth_service.logout(current_user, payload)
-47
View File
@@ -1,47 +0,0 @@
"""Health check endpoints."""
from fastapi import APIRouter, Response, status
from app.db import db
from app.taskqueue import task_queue
router = APIRouter()
@router.get("/healthz")
async def healthz() -> dict[str, str]:
"""Liveness probe - returns 200 if the service is running."""
return {"status": "ok"}
@router.get("/readyz")
async def readyz(response: Response) -> dict[str, str | dict[str, bool]]:
"""
Readiness probe - checks database and task queue connectivity.
- Check Postgres status
- Check configured task queue backend
- Return overall healthiness
"""
checks = {
"postgres": False,
"task_queue": False,
}
try:
if db.pool:
async with db.connection() as conn:
await conn.fetchval("SELECT 1")
checks["postgres"] = True
except Exception:
pass
checks["task_queue"] = await task_queue.ping()
all_healthy = all(checks.values())
if not all_healthy:
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
return {
"status": "ok" if all_healthy else "degraded",
"checks": checks,
}
-103
View File
@@ -1,103 +0,0 @@
"""Incident API endpoints."""
from datetime import datetime
from uuid import UUID
from fastapi import APIRouter, Depends, Query, status
from app.api.deps import CurrentUser, get_current_user, require_role
from app.schemas.common import PaginatedResponse
from app.schemas.incident import (
CommentRequest,
IncidentEventResponse,
IncidentResponse,
IncidentStatus,
TransitionRequest,
IncidentCreate,
)
from app.services import IncidentService
router = APIRouter(tags=["incidents"])
incident_service = IncidentService()
@router.get("/incidents", response_model=PaginatedResponse[IncidentResponse])
async def list_incidents(
status: IncidentStatus | None = Query(default=None),
cursor: datetime | None = Query(default=None, description="Cursor (created_at)"),
limit: int = Query(default=20, ge=1, le=100),
current_user: CurrentUser = Depends(get_current_user),
) -> PaginatedResponse[IncidentResponse]:
"""List incidents for the active organization."""
return await incident_service.get_incidents(
current_user,
status=status,
cursor=cursor,
limit=limit,
)
@router.post(
"/services/{service_id}/incidents",
response_model=IncidentResponse,
status_code=status.HTTP_201_CREATED,
)
async def create_incident(
service_id: UUID,
payload: IncidentCreate,
current_user: CurrentUser = Depends(require_role("member")),
) -> IncidentResponse:
"""Create a new incident for the given service (member+)."""
return await incident_service.create_incident(current_user, service_id, payload)
@router.get("/incidents/{incident_id}", response_model=IncidentResponse)
async def get_incident(
incident_id: UUID,
current_user: CurrentUser = Depends(get_current_user),
) -> IncidentResponse:
"""Fetch a single incident by ID."""
return await incident_service.get_incident(current_user, incident_id)
@router.get("/incidents/{incident_id}/events", response_model=list[IncidentEventResponse])
async def get_incident_events(
incident_id: UUID,
current_user: CurrentUser = Depends(get_current_user),
) -> list[IncidentEventResponse]:
"""Get the event timeline for an incident."""
return await incident_service.get_incident_events(current_user, incident_id)
@router.post(
"/incidents/{incident_id}/transition",
response_model=IncidentResponse,
)
async def transition_incident(
incident_id: UUID,
payload: TransitionRequest,
current_user: CurrentUser = Depends(require_role("member")),
) -> IncidentResponse:
"""Transition an incident status (member+)."""
return await incident_service.transition_incident(current_user, incident_id, payload)
@router.post(
"/incidents/{incident_id}/comment",
response_model=IncidentEventResponse,
status_code=status.HTTP_201_CREATED,
)
async def add_comment(
incident_id: UUID,
payload: CommentRequest,
current_user: CurrentUser = Depends(require_role("member")),
) -> IncidentEventResponse:
"""Add a comment to the incident timeline (member+)."""
return await incident_service.add_comment(current_user, incident_id, payload)
-72
View File
@@ -1,72 +0,0 @@
"""Organization API endpoints."""
from fastapi import APIRouter, Depends, status
from app.api.deps import CurrentUser, get_current_user, require_role
from app.schemas.org import (
MemberResponse,
NotificationTargetCreate,
NotificationTargetResponse,
OrgResponse,
ServiceCreate,
ServiceResponse,
)
from app.services import OrgService
router = APIRouter(prefix="/org", tags=["org"])
org_service = OrgService()
@router.get("", response_model=OrgResponse)
async def get_org(current_user: CurrentUser = Depends(get_current_user)) -> OrgResponse:
"""Return the active organization summary for the authenticated user."""
return await org_service.get_current_org(current_user)
@router.get("/members", response_model=list[MemberResponse])
async def list_members(current_user: CurrentUser = Depends(require_role("admin"))) -> list[MemberResponse]:
"""List members of the current organization (admin only)."""
return await org_service.get_members(current_user)
@router.get("/services", response_model=list[ServiceResponse])
async def list_services(current_user: CurrentUser = Depends(get_current_user)) -> list[ServiceResponse]:
"""List services for the current organization."""
return await org_service.get_services(current_user)
@router.post("/services", response_model=ServiceResponse, status_code=status.HTTP_201_CREATED)
async def create_service(
payload: ServiceCreate,
current_user: CurrentUser = Depends(require_role("member")),
) -> ServiceResponse:
"""Create a new service within the current organization (member+)."""
return await org_service.create_service(current_user, payload)
@router.get("/notification-targets", response_model=list[NotificationTargetResponse])
async def list_notification_targets(
current_user: CurrentUser = Depends(require_role("admin")),
) -> list[NotificationTargetResponse]:
"""List notification targets for the current organization (admin only)."""
return await org_service.get_notification_targets(current_user)
@router.post(
"/notification-targets",
response_model=NotificationTargetResponse,
status_code=status.HTTP_201_CREATED,
)
async def create_notification_target(
payload: NotificationTargetCreate,
current_user: CurrentUser = Depends(require_role("admin")),
) -> NotificationTargetResponse:
"""Create a notification target for the current organization (admin only)."""
return await org_service.create_notification_target(current_user, payload)
-66
View File
@@ -1,66 +0,0 @@
"""Application configuration via pydantic-settings."""
from typing import Literal
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
"""Application settings loaded from environment variables."""
model_config = SettingsConfigDict(
env_file=".env",
env_file_encoding="utf-8",
case_sensitive=False,
)
# Database
database_url: str
# Redis (legacy default for Celery broker)
redis_url: str = "redis://localhost:6379/0"
# Task queue
task_queue_driver: Literal["celery", "inmemory"] = "celery"
task_queue_broker_url: str | None = None
task_queue_backend: Literal["redis", "sqs"] = "redis"
task_queue_default_queue: str = "default"
task_queue_critical_queue: str = "critical"
task_queue_visibility_timeout: int = 600
task_queue_polling_interval: float = 1.0
notification_escalation_delay_seconds: int = 900
# AWS (used when task_queue_backend="sqs")
aws_region: str | None = None
# JWT
jwt_secret_key: str
jwt_algorithm: str = "HS256"
jwt_issuer: str = "incidentops"
jwt_audience: str = "incidentops-api"
access_token_expire_minutes: int = 15
refresh_token_expire_days: int = 30
# Application
debug: bool = False
api_v1_prefix: str = "/v1"
# OpenTelemetry
otel_enabled: bool = True
otel_service_name: str = "incidentops-api"
otel_environment: str = "development"
otel_exporter_otlp_endpoint: str | None = None # e.g., "http://tempo:4317"
otel_exporter_otlp_insecure: bool = True
otel_log_level: str = "INFO"
# Metrics
prometheus_port: int = 9464 # Port for Prometheus metrics endpoint
@property
def resolved_task_queue_broker_url(self) -> str:
"""Return the broker URL with redis fallback for backwards compatibility."""
return self.task_queue_broker_url or self.redis_url
settings = Settings() # type: ignore[call-arg]
View File
-59
View File
@@ -1,59 +0,0 @@
"""Custom HTTP exceptions for the API."""
from fastapi import HTTPException, status
class NotFoundError(HTTPException):
"""Resource not found."""
def __init__(self, detail: str = "Resource not found") -> None:
super().__init__(status_code=status.HTTP_404_NOT_FOUND, detail=detail)
class ConflictError(HTTPException):
"""Conflict with current state (e.g., version mismatch)."""
def __init__(self, detail: str = "Conflict with current state") -> None:
super().__init__(status_code=status.HTTP_409_CONFLICT, detail=detail)
class UnauthorizedError(HTTPException):
"""Authentication required or failed."""
def __init__(self, detail: str = "Not authenticated") -> None:
super().__init__(
status_code=status.HTTP_401_UNAUTHORIZED,
detail=detail,
headers={"WWW-Authenticate": "Bearer"},
)
class ForbiddenError(HTTPException):
"""Insufficient permissions."""
def __init__(self, detail: str = "Insufficient permissions") -> None:
super().__init__(status_code=status.HTTP_403_FORBIDDEN, detail=detail)
class BadRequestError(HTTPException):
"""Invalid request data."""
def __init__(self, detail: str = "Invalid request") -> None:
super().__init__(status_code=status.HTTP_400_BAD_REQUEST, detail=detail)
class ValidationError(HTTPException):
"""Validation failed."""
def __init__(self, detail: str = "Validation failed") -> None:
super().__init__(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=detail)
__all__ = [
"BadRequestError",
"ConflictError",
"ForbiddenError",
"NotFoundError",
"UnauthorizedError",
"ValidationError",
]
-164
View File
@@ -1,164 +0,0 @@
"""Structured JSON logging configuration with OpenTelemetry integration."""
import json
import logging
import sys
from datetime import datetime, timezone
from typing import Any
from app.config import settings
class JSONFormatter(logging.Formatter):
"""
JSON log formatter that outputs structured logs with trace context.
Log format includes:
- timestamp: ISO 8601 format
- level: Log level name
- message: Log message
- logger: Logger name
- trace_id: OpenTelemetry trace ID (if available)
- span_id: OpenTelemetry span ID (if available)
- Extra fields from log record
"""
def format(self, record: logging.LogRecord) -> str:
log_data: dict[str, Any] = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"level": record.levelname,
"message": record.getMessage(),
"logger": record.name,
}
# Add trace context if available (injected by OpenTelemetry LoggingInstrumentor)
if hasattr(record, "otelTraceID") and record.otelTraceID != "0":
log_data["trace_id"] = record.otelTraceID
if hasattr(record, "otelSpanID") and record.otelSpanID != "0":
log_data["span_id"] = record.otelSpanID
# Add exception info if present
if record.exc_info:
log_data["exception"] = self.formatException(record.exc_info)
# Add extra fields (excluding standard LogRecord attributes)
standard_attrs = {
"name",
"msg",
"args",
"created",
"filename",
"funcName",
"levelname",
"levelno",
"lineno",
"module",
"msecs",
"pathname",
"process",
"processName",
"relativeCreated",
"stack_info",
"exc_info",
"exc_text",
"thread",
"threadName",
"taskName",
"message",
"otelTraceID",
"otelSpanID",
"otelTraceSampled",
"otelServiceName",
}
for key, value in record.__dict__.items():
if key not in standard_attrs and not key.startswith("_"):
log_data[key] = value
return json.dumps(log_data, default=str)
class DevelopmentFormatter(logging.Formatter):
"""
Human-readable formatter for development with color support.
Format: [TIME] LEVEL logger - message [trace_id]
"""
COLORS = {
"DEBUG": "\033[36m", # Cyan
"INFO": "\033[32m", # Green
"WARNING": "\033[33m", # Yellow
"ERROR": "\033[31m", # Red
"CRITICAL": "\033[35m", # Magenta
}
RESET = "\033[0m"
def format(self, record: logging.LogRecord) -> str:
color = self.COLORS.get(record.levelname, "")
reset = self.RESET
# Format timestamp
timestamp = datetime.now(timezone.utc).strftime("%H:%M:%S.%f")[:-3]
# Build message
msg = f"[{timestamp}] {color}{record.levelname:8}{reset} {record.name} - {record.getMessage()}"
# Add trace context if available
if hasattr(record, "otelTraceID") and record.otelTraceID != "0":
msg += f" [{record.otelTraceID[:8]}...]"
# Add exception if present
if record.exc_info:
msg += f"\n{self.formatException(record.exc_info)}"
return msg
def setup_logging() -> None:
"""
Configure application logging.
- JSON format in production (OTEL enabled)
- Human-readable format in development
- Integrates with OpenTelemetry trace context
"""
# Determine log level
log_level = getattr(logging, settings.otel_log_level.upper(), logging.INFO)
# Choose formatter based on environment
if settings.otel_enabled and not settings.debug:
formatter = JSONFormatter()
else:
formatter = DevelopmentFormatter()
# Configure root logger
root_logger = logging.getLogger()
root_logger.setLevel(log_level)
# Remove existing handlers
for handler in root_logger.handlers[:]:
root_logger.removeHandler(handler)
# Add stdout handler
handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(formatter)
root_logger.addHandler(handler)
# Reduce noise from third-party libraries (keep uvicorn access at INFO so requests are logged)
logging.getLogger("uvicorn.access").setLevel(logging.INFO)
logging.getLogger("asyncpg").setLevel(logging.WARNING)
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("httpcore").setLevel(logging.WARNING)
logging.info(
"Logging configured",
extra={
"log_level": settings.otel_log_level,
"format": "json" if settings.otel_enabled and not settings.debug else "dev",
},
)
def get_logger(name: str) -> logging.Logger:
    """Return the logger registered under ``name`` (created on first use)."""
    named_logger = logging.getLogger(name)
    return named_logger
-106
View File
@@ -1,106 +0,0 @@
"""Security utilities for JWT and password hashing."""
import hashlib
import secrets
from datetime import UTC, datetime, timedelta
from typing import Any
from uuid import UUID, uuid4
import bcrypt
from jose import JWTError, jwt
from app.config import settings
def hash_password(password: str) -> str:
    """Hash a password with bcrypt using a freshly generated salt."""
    salt = bcrypt.gensalt()
    digest = bcrypt.hashpw(password.encode(), salt)
    return digest.decode()
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Check a plaintext password against its stored bcrypt hash."""
    candidate = plain_password.encode()
    stored = hashed_password.encode()
    return bcrypt.checkpw(candidate, stored)
def create_access_token(
    sub: str,
    org_id: str,
    org_role: str,
    expires_delta: timedelta | None = None,
) -> str:
    """Create a signed JWT access token carrying org context claims.

    Args:
        sub: Subject (user id) claim value.
        org_id: Organization the token is scoped to.
        org_role: Caller's role within that organization.
        expires_delta: Optional custom TTL; defaults to the configured
            access-token lifetime.
    """
    if expires_delta is None:
        expires_delta = timedelta(minutes=settings.access_token_expire_minutes)

    issued_at = datetime.now(UTC)
    claims = {
        "sub": sub,
        "org_id": org_id,
        "org_role": org_role,
        "iss": settings.jwt_issuer,
        "aud": settings.jwt_audience,
        "jti": str(uuid4()),  # unique per-token id
        "iat": issued_at,
        "exp": issued_at + expires_delta,
    }
    return jwt.encode(claims, settings.jwt_secret_key, algorithm=settings.jwt_algorithm)
def decode_access_token(token: str) -> dict[str, Any]:
    """Decode and validate a JWT access token.

    Signature, issuer, audience, and expiry are all verified by the library.

    Raises:
        JWTError: If token is invalid or expired.
    """
    decoded = jwt.decode(
        token,
        settings.jwt_secret_key,
        algorithms=[settings.jwt_algorithm],
        issuer=settings.jwt_issuer,
        audience=settings.jwt_audience,
    )
    return decoded
def generate_refresh_token() -> str:
    """Produce a cryptographically secure, URL-safe refresh token."""
    # 32 random bytes -> 43 URL-safe base64 characters.
    return secrets.token_urlsafe(nbytes=32)
def hash_token(token: str) -> str:
    """Return the hex SHA-256 digest of a refresh token for at-rest storage."""
    digest = hashlib.sha256(token.encode())
    return digest.hexdigest()
def get_refresh_token_expiry() -> datetime:
    """Compute the absolute UTC expiry for a refresh token issued now."""
    ttl = timedelta(days=settings.refresh_token_expire_days)
    return datetime.now(UTC) + ttl
class TokenPayload:
    """Parsed JWT token payload with typed attributes for each claim.

    Raises KeyError if a required claim is missing and ValueError if an
    id claim is not a valid UUID string.
    """

    def __init__(self, payload: dict[str, Any]) -> None:
        # Identity claims, parsed into UUIDs.
        self.user_id = UUID(payload["sub"])
        self.org_id = UUID(payload["org_id"])
        self.jti = UUID(payload["jti"])
        # Authorization / provenance claims.
        self.org_role = payload["org_role"]
        self.issuer = payload["iss"]
        self.audience = payload["aud"]
        # Temporal claims, kept as decoded by the JWT library.
        self.issued_at = payload["iat"]
        self.expires_at = payload["exp"]
# Public surface of this module. JWTError is re-exported here so callers can
# catch it without importing from `jose` directly.
__all__ = [
    "JWTError",
    "TokenPayload",
    "create_access_token",
    "decode_access_token",
    "generate_refresh_token",
    "get_refresh_token_expiry",
    "hash_password",
    "hash_token",
    "verify_password",
]
-271
View File
@@ -1,271 +0,0 @@
"""OpenTelemetry instrumentation for tracing, metrics, and logging."""
import logging
from contextlib import contextmanager
from typing import Any
from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.instrumentation.asyncpg import AsyncPGInstrumentor
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.logging import LoggingInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
from opentelemetry.semconv.resource import ResourceAttributes
from prometheus_client import REGISTRY, start_http_server
from app.config import settings
logger = logging.getLogger(__name__)

# Provider singletons: populated by setup_telemetry() and cleared again by
# shutdown_telemetry(). None whenever telemetry is not initialized.
_tracer_provider: TracerProvider | None = None
_meter_provider: MeterProvider | None = None

# Custom metric instruments, created in setup_telemetry(); None until then,
# which is why the record_* helpers below guard on truthiness.
_request_counter = None
_request_duration = None
_active_requests = None
_error_counter = None
def setup_telemetry(app: Any) -> None:
    """
    Initialize OpenTelemetry with tracing, metrics, and logging instrumentation.

    Configures:
    - OTLP exporter for traces (to Tempo/Jaeger)
    - Prometheus exporter for metrics (scraped by Prometheus)
    - Auto-instrumentation for FastAPI, asyncpg, httpx, redis
    - System metrics (CPU, memory, etc.)
    - Logging instrumentation for trace context injection

    No-op when settings.otel_enabled is false.
    """
    global _tracer_provider, _meter_provider
    global _request_counter, _request_duration, _active_requests, _error_counter
    if not settings.otel_enabled:
        logger.info("OpenTelemetry disabled")
        return
    # Create resource with service info (attached to every span and metric).
    resource = Resource.create(
        {
            ResourceAttributes.SERVICE_NAME: settings.otel_service_name,
            ResourceAttributes.SERVICE_VERSION: "0.1.0",
            ResourceAttributes.DEPLOYMENT_ENVIRONMENT: settings.otel_environment,
        }
    )
    # =========================================
    # TRACING SETUP
    # =========================================
    _tracer_provider = TracerProvider(resource=resource)
    if settings.otel_exporter_otlp_endpoint:
        # Export spans to the configured OTLP/gRPC endpoint in batches.
        otlp_exporter = OTLPSpanExporter(
            endpoint=settings.otel_exporter_otlp_endpoint,
            insecure=settings.otel_exporter_otlp_insecure,
        )
        _tracer_provider.add_span_processor(BatchSpanProcessor(otlp_exporter))
        logger.info(f"OTLP exporter configured: {settings.otel_exporter_otlp_endpoint}")
    else:
        # No collector configured: write spans to stdout for local debugging.
        _tracer_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
        logger.info("Console span exporter configured (no OTLP endpoint)")
    trace.set_tracer_provider(_tracer_provider)
    # =========================================
    # METRICS SETUP
    # =========================================
    # Prometheus metric reader exposes metrics at /metrics endpoint
    prometheus_reader = PrometheusMetricReader()
    _meter_provider = MeterProvider(resource=resource, metric_readers=[prometheus_reader])
    metrics.set_meter_provider(_meter_provider)
    # Start Prometheus HTTP server on port 9464
    prometheus_port = settings.prometheus_port
    try:
        start_http_server(port=prometheus_port, registry=REGISTRY)
        logger.info(f"Prometheus metrics server started on port {prometheus_port}")
    except OSError as e:
        # Port may already be bound (e.g. another worker process); continue
        # without the scrape endpoint rather than failing startup.
        logger.warning(f"Could not start Prometheus server on port {prometheus_port}: {e}")
    # Create custom metrics used by the record_* helpers in this module.
    meter = metrics.get_meter(__name__)
    _request_counter = meter.create_counter(
        name="http_requests_total",
        description="Total number of HTTP requests",
        unit="1",
    )
    _request_duration = meter.create_histogram(
        name="http_request_duration_seconds",
        description="HTTP request duration in seconds",
        unit="s",
    )
    _active_requests = meter.create_up_down_counter(
        name="http_requests_active",
        description="Number of active HTTP requests",
        unit="1",
    )
    _error_counter = meter.create_counter(
        name="http_errors_total",
        description="Total number of HTTP errors",
        unit="1",
    )
    # Instrument system metrics (CPU, memory, etc.)
    SystemMetricsInstrumentor().instrument()
    logger.info("System metrics instrumentation enabled")
    # =========================================
    # LIBRARY INSTRUMENTATION
    # =========================================
    # Health/metrics endpoints are excluded so probes don't generate spans.
    FastAPIInstrumentor.instrument_app(
        app,
        excluded_urls="healthz,readyz,metrics",
        tracer_provider=_tracer_provider,
        meter_provider=_meter_provider,
    )
    AsyncPGInstrumentor().instrument(tracer_provider=_tracer_provider)
    HTTPXClientInstrumentor().instrument(tracer_provider=_tracer_provider)
    RedisInstrumentor().instrument(tracer_provider=_tracer_provider)
    # Inject trace context into logs
    LoggingInstrumentor().instrument(
        set_logging_format=True,
        log_level=logging.INFO,
    )
    logger.info(
        f"OpenTelemetry initialized: service={settings.otel_service_name}, "
        f"env={settings.otel_environment}, metrics_port={prometheus_port}"
    )
async def shutdown_telemetry() -> None:
    """Gracefully shutdown the tracer and meter providers.

    Safe to call multiple times: providers are set to None after shutdown.
    """
    global _tracer_provider, _meter_provider

    if _tracer_provider is not None:
        _tracer_provider.shutdown()
        _tracer_provider = None
        logger.info("Tracer provider shutdown complete")

    if _meter_provider is not None:
        _meter_provider.shutdown()
        _meter_provider = None
        logger.info("Meter provider shutdown complete")
def get_tracer(name: str) -> trace.Tracer:
    """Get a tracer instance for manual span creation."""
    tracer = trace.get_tracer(name)
    return tracer
def get_meter(name: str) -> metrics.Meter:
    """Get a meter instance for custom metrics."""
    meter = metrics.get_meter(name)
    return meter
def get_current_trace_id() -> str | None:
    """Return the active trace id as 32 hex chars, or None outside a trace."""
    span = trace.get_current_span()
    if not span:
        return None
    ctx = span.get_span_context()
    if not ctx.is_valid:
        return None
    return f"{ctx.trace_id:032x}"
def get_current_span_id() -> str | None:
    """Return the active span id as 16 hex chars, or None outside a trace."""
    span = trace.get_current_span()
    if not span:
        return None
    ctx = span.get_span_context()
    if not ctx.is_valid:
        return None
    return f"{ctx.span_id:016x}"
@contextmanager
def create_span(name: str, attributes: dict[str, Any] | None = None):
    """Context manager for creating manual spans.

    The span is current for the duration of the `with` block and is yielded so
    callers can attach further attributes or events.
    """
    with get_tracer(__name__).start_as_current_span(name, attributes=attributes) as span:
        yield span
def add_span_attributes(attributes: dict[str, Any]) -> None:
    """Attach each key/value pair to the currently active span, if any."""
    span = trace.get_current_span()
    if not span:
        return
    for key, value in attributes.items():
        span.set_attribute(key, value)
def record_exception(exception: Exception) -> None:
    """Record an exception on the current span and mark the span as errored."""
    span = trace.get_current_span()
    if not span:
        return
    span.record_exception(exception)
    span.set_status(trace.Status(trace.StatusCode.ERROR, str(exception)))
# =========================================
# CUSTOM METRICS HELPERS
# =========================================
def record_request(method: str, endpoint: str, status_code: int) -> None:
    """Count one completed HTTP request, labeled by method/endpoint/status."""
    if not _request_counter:
        return  # telemetry not initialized
    labels = {
        "method": method,
        "endpoint": endpoint,
        "status_code": str(status_code),
    }
    _request_counter.add(1, labels)
def record_request_duration(method: str, endpoint: str, duration: float) -> None:
    """Record request duration in seconds."""
    if not _request_duration:
        return  # telemetry not initialized
    labels = {"method": method, "endpoint": endpoint}
    _request_duration.record(duration, labels)
def increment_active_requests(method: str, endpoint: str) -> None:
    """Increment active requests counter."""
    if not _active_requests:
        return  # telemetry not initialized
    _active_requests.add(1, {"method": method, "endpoint": endpoint})
def decrement_active_requests(method: str, endpoint: str) -> None:
    """Decrement active requests counter."""
    if not _active_requests:
        return  # telemetry not initialized
    _active_requests.add(-1, {"method": method, "endpoint": endpoint})
def record_error(method: str, endpoint: str, error_type: str) -> None:
    """Count one HTTP error, labeled by method/endpoint/error type."""
    if not _error_counter:
        return  # telemetry not initialized
    labels = {
        "method": method,
        "endpoint": endpoint,
        "error_type": error_type,
    }
    _error_counter.add(1, labels)
-74
View File
@@ -1,74 +0,0 @@
"""Database connection management using asyncpg."""
from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from contextvars import ContextVar
import asyncpg
from asyncpg.pool import PoolConnectionProxy
class Database:
    """Manages asyncpg connection pool."""

    # Shared pool; None until connect() succeeds.
    pool: asyncpg.Pool | None = None

    async def connect(self, dsn: str) -> None:
        """Create connection pool."""
        self.pool = await asyncpg.create_pool(
            dsn,
            min_size=5,
            max_size=20,
            command_timeout=60,
        )

    async def disconnect(self) -> None:
        """Close connection pool."""
        pool = self.pool
        if pool:
            await pool.close()

    @asynccontextmanager
    async def connection(self) -> AsyncGenerator[asyncpg.Connection | PoolConnectionProxy, None]:
        """Acquire a connection from the pool for the duration of the block."""
        pool = self.pool
        if pool is None:
            raise RuntimeError("Database not connected")
        async with pool.acquire() as acquired:
            yield acquired

    @asynccontextmanager
    async def transaction(self) -> AsyncGenerator[asyncpg.Connection | PoolConnectionProxy, None]:
        """Acquire a connection with an active transaction (commit on exit,
        rollback on exception)."""
        pool = self.pool
        if pool is None:
            raise RuntimeError("Database not connected")
        async with pool.acquire() as acquired:
            async with acquired.transaction():
                yield acquired
# Global instance
db = Database()

# Per-context connection holder so nested dependencies within one request
# reuse a single connection instead of acquiring several from the pool.
_connection_ctx: ContextVar[asyncpg.Connection | PoolConnectionProxy | None] = ContextVar(
    "db_connection",
    default=None,
)
async def get_conn() -> AsyncGenerator[asyncpg.Connection | PoolConnectionProxy, None]:
    """Dependency that reuses the same DB connection within a request context."""
    # Nested dependency: hand back the connection already bound to this context.
    current = _connection_ctx.get()
    if current is not None:
        yield current
        return

    if db.pool is None:
        raise RuntimeError("Database not connected")

    async with db.pool.acquire() as acquired:
        # Bind the acquired connection to the context for the duration of the
        # request, and always restore the previous state afterwards.
        token = _connection_ctx.set(acquired)
        try:
            yield acquired
        finally:
            _connection_ctx.reset(token)
-282
View File
@@ -1,282 +0,0 @@
"""FastAPI application entry point."""
import logging
import time
from contextlib import asynccontextmanager
from typing import AsyncGenerator
from fastapi import FastAPI, Request, status
from fastapi.encoders import jsonable_encoder
from fastapi.exceptions import RequestValidationError
from fastapi.openapi.utils import get_openapi
from fastapi.responses import JSONResponse
from starlette.exceptions import HTTPException as StarletteHTTPException
from app.api.v1 import auth, health, incidents, org
from app.config import settings
from app.core.logging import setup_logging
from app.core.telemetry import (
get_current_trace_id,
record_exception,
setup_telemetry,
shutdown_telemetry,
)
from app.db import db
from app.schemas.common import ErrorDetail, ErrorResponse
from app.taskqueue import task_queue
# Initialize logging before anything else so the startup messages emitted in
# this module are formatted by the configured handler.
setup_logging()
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Manage application lifecycle - connect/disconnect resources.

    Code before `yield` runs at startup; code after runs at shutdown.
    """
    # Startup: open the DB pool first, then start the task queue.
    logger.info("Starting IncidentOps API")
    await db.connect(settings.database_url)
    await task_queue.startup()
    logger.info("Startup complete")
    yield
    # Shutdown: stop the task queue, close the DB pool, then shut down the
    # telemetry providers (flushing pending exports) last.
    logger.info("Shutting down IncidentOps API")
    await task_queue.shutdown()
    await db.disconnect()
    await shutdown_telemetry()
    logger.info("Shutdown complete")
# Application instance; resource startup/shutdown is handled by `lifespan`.
app = FastAPI(
    title="IncidentOps",
    description="Incident management API with multi-tenant org support",
    version="0.1.0",
    docs_url="/docs",
    redoc_url="/redoc",
    openapi_url="/openapi.json",
    lifespan=lifespan,
)

# Set up OpenTelemetry instrumentation (requires the app object to exist).
setup_telemetry(app)
@app.middleware("http")
async def request_logging_middleware(request: Request, call_next):
    """Log one structured line per request: method, path, status, duration."""
    # Use a monotonic clock for durations: time.time() can jump backwards or
    # forwards on wall-clock adjustments (NTP), producing negative or bogus
    # timings; perf_counter() is monotonic and high-resolution.
    start = time.perf_counter()
    response = await call_next(request)
    duration_ms = (time.perf_counter() - start) * 1000
    logger.info(
        "request",
        extra={
            "method": request.method,
            "path": request.url.path,
            "status_code": response.status_code,
            "duration_ms": round(duration_ms, 2),
        },
    )
    return response
# Tag metadata displayed in the generated OpenAPI docs for each router group.
app.openapi_tags = [
    {"name": "auth", "description": "Registration, login, token lifecycle"},
    {"name": "org", "description": "Organization membership, services, and notifications"},
    {"name": "incidents", "description": "Incident lifecycle and timelines"},
    {"name": "health", "description": "Service health probes"},
]
# ---------------------------------------------------------------------------
# Global Exception Handlers
# ---------------------------------------------------------------------------
def _build_error_response(
    error: str,
    message: str,
    status_code: int,
    details: list[ErrorDetail] | None = None,
) -> JSONResponse:
    """Build a structured error response with trace context.

    The current trace id is attached as request_id so clients can quote it
    when reporting problems.
    """
    payload = ErrorResponse(
        error=error,
        message=message,
        details=details,
        request_id=get_current_trace_id(),
    )
    body = jsonable_encoder(payload)
    return JSONResponse(status_code=status_code, content=body)
@app.exception_handler(StarletteHTTPException)
async def http_exception_handler(
    request: Request, exc: StarletteHTTPException
) -> JSONResponse:
    """Handle HTTP exceptions with structured error responses."""
    # Translate numeric status codes into stable machine-readable error names;
    # anything unmapped falls back to a generic "error".
    status_to_error = {
        400: "bad_request",
        401: "unauthorized",
        403: "forbidden",
        404: "not_found",
        409: "conflict",
        422: "validation_error",
        429: "rate_limited",
        500: "internal_error",
        502: "bad_gateway",
        503: "service_unavailable",
    }
    error_name = status_to_error.get(exc.status_code, "error")

    logger.warning(
        "HTTP exception",
        extra={
            "status_code": exc.status_code,
            "error": error_name,
            "detail": exc.detail,
            "path": str(request.url.path),
            "method": request.method,
        },
    )

    return _build_error_response(
        error=error_name,
        message=str(exc.detail),
        status_code=exc.status_code,
    )
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(
    request: Request, exc: RequestValidationError
) -> JSONResponse:
    """Handle Pydantic validation errors with detailed error responses."""
    # Convert each pydantic error into our ErrorDetail schema; location parts
    # are stringified since pydantic mixes strings and integer indices.
    details: list[ErrorDetail] = []
    for err in exc.errors():
        details.append(
            ErrorDetail(
                loc=[str(part) for part in err["loc"]],
                msg=err["msg"],
                type=err["type"],
            )
        )

    logger.warning(
        "Validation error",
        extra={
            "path": str(request.url.path),
            "method": request.method,
            "error_count": len(details),
        },
    )

    return _build_error_response(
        error="validation_error",
        message="Request validation failed",
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
        details=details,
    )
@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Handle unexpected exceptions with logging and safe error response."""
    # Attach the exception to the active span so traces show the failure.
    record_exception(exc)

    logger.exception(
        "Unhandled exception",
        extra={
            "path": str(request.url.path),
            "method": request.method,
            "exception_type": type(exc).__name__,
        },
    )

    # Only expose internals when debugging; production gets a generic message.
    if settings.debug:
        message = f"{type(exc).__name__}: {exc}"
    else:
        message = "An unexpected error occurred"

    return _build_error_response(
        error="internal_error",
        message=message,
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
    )
# ---------------------------------------------------------------------------
# OpenAPI Customization
# ---------------------------------------------------------------------------
def custom_openapi() -> dict:
    """Add JWT bearer security scheme and error responses to OpenAPI schema.

    The generated schema is memoized on the app instance, so customization
    runs only once.
    """
    if app.openapi_schema:
        return app.openapi_schema
    openapi_schema = get_openapi(
        title=app.title,
        version=app.version,
        description=app.description,
        routes=app.routes,
        tags=app.openapi_tags,
    )
    # Add security schemes
    components = openapi_schema.setdefault("components", {})
    security_schemes = components.setdefault("securitySchemes", {})
    security_schemes["BearerToken"] = {
        "type": "http",
        "scheme": "bearer",
        "bearerFormat": "JWT",
        "description": "Paste the JWT access token returned by /auth endpoints",
    }
    # Declare bearer auth as the default security requirement for the schema.
    openapi_schema["security"] = [{"BearerToken": []}]
    # Add common error response schemas matching the handlers above.
    schemas = components.setdefault("schemas", {})
    schemas["ErrorResponse"] = {
        "type": "object",
        "properties": {
            "error": {"type": "string", "description": "Error type identifier"},
            "message": {"type": "string", "description": "Human-readable error message"},
            "details": {
                "type": "array",
                "items": {"$ref": "#/components/schemas/ErrorDetail"},
                "nullable": True,
                "description": "Validation error details",
            },
            "request_id": {
                "type": "string",
                "nullable": True,
                "description": "Trace ID for debugging",
            },
        },
        "required": ["error", "message"],
    }
    schemas["ErrorDetail"] = {
        "type": "object",
        "properties": {
            "loc": {
                "type": "array",
                "items": {"oneOf": [{"type": "string"}, {"type": "integer"}]},
                "description": "Error location path",
            },
            "msg": {"type": "string", "description": "Error message"},
            "type": {"type": "string", "description": "Error type"},
        },
        "required": ["loc", "msg", "type"],
    }
    # Memoize so subsequent calls return the cached schema.
    app.openapi_schema = openapi_schema
    return app.openapi_schema
# Replace the default schema generator with the customized one above.
app.openapi = custom_openapi  # type: ignore[assignment]

# Include routers — all mounted under the versioned API prefix.
app.include_router(auth.router, prefix=settings.api_v1_prefix)
app.include_router(incidents.router, prefix=settings.api_v1_prefix)
app.include_router(org.router, prefix=settings.api_v1_prefix)
app.include_router(health.router, prefix=settings.api_v1_prefix, tags=["health"])
-17
View File
@@ -1,17 +0,0 @@
"""Repository layer for database operations."""
from app.repositories.incident import IncidentRepository
from app.repositories.notification import NotificationRepository
from app.repositories.org import OrgRepository
from app.repositories.refresh_token import RefreshTokenRepository
from app.repositories.service import ServiceRepository
from app.repositories.user import UserRepository
# Repository classes re-exported at the package level so callers can import
# them from `app.repositories` directly.
__all__ = [
    "IncidentRepository",
    "NotificationRepository",
    "OrgRepository",
    "RefreshTokenRepository",
    "ServiceRepository",
    "UserRepository",
]
-161
View File
@@ -1,161 +0,0 @@
"""Incident repository for database operations."""
from datetime import datetime
from typing import Any
from uuid import UUID
import asyncpg
class IncidentRepository:
    """Database operations for incidents.

    All methods execute on the caller-supplied connection, so they participate
    in whatever transaction the caller has open.
    """

    def __init__(self, conn: asyncpg.Connection) -> None:
        self.conn = conn

    async def create(
        self,
        incident_id: UUID,
        org_id: UUID,
        service_id: UUID,
        title: str,
        description: str | None,
        severity: str,
    ) -> dict:
        """Create a new incident.

        New incidents always start in the 'triggered' status.
        """
        row = await self.conn.fetchrow(
            """
            INSERT INTO incidents (id, org_id, service_id, title, description, status, severity)
            VALUES ($1, $2, $3, $4, $5, 'triggered', $6)
            RETURNING id, org_id, service_id, title, description, status, severity,
                      version, created_at, updated_at
            """,
            incident_id,
            org_id,
            service_id,
            title,
            description,
            severity,
        )
        return dict(row)

    async def get_by_id(self, incident_id: UUID) -> dict | None:
        """Get incident by ID. Returns None when the incident does not exist."""
        row = await self.conn.fetchrow(
            """
            SELECT id, org_id, service_id, title, description, status, severity,
                   version, created_at, updated_at
            FROM incidents
            WHERE id = $1
            """,
            incident_id,
        )
        return dict(row) if row else None

    async def get_by_org(
        self,
        org_id: UUID,
        status: str | None = None,
        cursor: datetime | None = None,
        limit: int = 20,
    ) -> list[dict]:
        """Get incidents for an organization with optional filtering and pagination.

        Uses keyset pagination on created_at (descending): `cursor` selects rows
        strictly older than it. Returns up to limit + 1 rows so the caller can
        detect whether another page exists.
        """
        # Optional predicates are appended with sequentially numbered
        # placeholders ($2, $3, ...) to keep the query fully parameterized.
        query = """
            SELECT id, org_id, service_id, title, description, status, severity,
                   version, created_at, updated_at
            FROM incidents
            WHERE org_id = $1
        """
        params: list[Any] = [org_id]
        param_idx = 2
        if status:
            query += f" AND status = ${param_idx}"
            params.append(status)
            param_idx += 1
        if cursor:
            query += f" AND created_at < ${param_idx}"
            params.append(cursor)
            param_idx += 1
        query += f" ORDER BY created_at DESC LIMIT ${param_idx}"
        params.append(limit + 1)  # Fetch one extra to check if there are more
        rows = await self.conn.fetch(query, *params)
        return [dict(row) for row in rows]

    async def update_status(
        self,
        incident_id: UUID,
        new_status: str,
        expected_version: int,
    ) -> dict | None:
        """Update incident status with optimistic locking.

        Returns updated incident if successful, None if version mismatch.
        """
        # The WHERE clause on version makes the update a no-op on concurrent
        # modification; a successful update bumps version by one.
        row = await self.conn.fetchrow(
            """
            UPDATE incidents
            SET status = $2, version = version + 1, updated_at = now()
            WHERE id = $1 AND version = $3
            RETURNING id, org_id, service_id, title, description, status, severity,
                      version, created_at, updated_at
            """,
            incident_id,
            new_status,
            expected_version,
        )
        return dict(row) if row else None

    async def add_event(
        self,
        event_id: UUID,
        incident_id: UUID,
        event_type: str,
        actor_user_id: UUID | None,
        payload: dict[str, Any] | None,
    ) -> dict:
        """Add an event to the incident timeline.

        The payload dict is serialized to JSON for storage and parsed back
        before returning, so callers always see a dict (or None).
        """
        import json
        # NOTE: a falsy payload (None or an empty dict) is stored as NULL.
        row = await self.conn.fetchrow(
            """
            INSERT INTO incident_events (id, incident_id, event_type, actor_user_id, payload)
            VALUES ($1, $2, $3, $4, $5)
            RETURNING id, incident_id, event_type, actor_user_id, payload, created_at
            """,
            event_id,
            incident_id,
            event_type,
            actor_user_id,
            json.dumps(payload) if payload else None,
        )
        result = dict(row)
        # Parse JSON payload back to dict
        if result["payload"]:
            result["payload"] = json.loads(result["payload"])
        return result

    async def get_events(self, incident_id: UUID) -> list[dict]:
        """Get all events for an incident, oldest first."""
        import json
        rows = await self.conn.fetch(
            """
            SELECT id, incident_id, event_type, actor_user_id, payload, created_at
            FROM incident_events
            WHERE incident_id = $1
            ORDER BY created_at
            """,
            incident_id,
        )
        results = []
        for row in rows:
            result = dict(row)
            # Stored payloads are JSON text; decode them for the caller.
            if result["payload"]:
                result["payload"] = json.loads(result["payload"])
            results.append(result)
        return results
-199
View File
@@ -1,199 +0,0 @@
"""Notification repository for database operations."""
from datetime import datetime
from uuid import UUID
import asyncpg
class NotificationRepository:
    """Database operations for notification targets and attempts.

    All methods execute on the caller-supplied connection, so they participate
    in whatever transaction the caller has open.
    """

    def __init__(self, conn: asyncpg.Connection) -> None:
        self.conn = conn

    async def create_target(
        self,
        target_id: UUID,
        org_id: UUID,
        name: str,
        target_type: str,
        webhook_url: str | None = None,
        enabled: bool = True,
    ) -> dict:
        """Create a new notification target."""
        row = await self.conn.fetchrow(
            """
            INSERT INTO notification_targets (id, org_id, name, target_type, webhook_url, enabled)
            VALUES ($1, $2, $3, $4, $5, $6)
            RETURNING id, org_id, name, target_type, webhook_url, enabled, created_at
            """,
            target_id,
            org_id,
            name,
            target_type,
            webhook_url,
            enabled,
        )
        return dict(row)

    async def get_target_by_id(self, target_id: UUID) -> dict | None:
        """Get notification target by ID. Returns None if not found."""
        row = await self.conn.fetchrow(
            """
            SELECT id, org_id, name, target_type, webhook_url, enabled, created_at
            FROM notification_targets
            WHERE id = $1
            """,
            target_id,
        )
        return dict(row) if row else None

    async def get_targets_by_org(
        self,
        org_id: UUID,
        enabled_only: bool = False,
    ) -> list[dict]:
        """Get all notification targets for an organization, ordered by name.

        When enabled_only is True, disabled targets are filtered out.
        """
        query = """
            SELECT id, org_id, name, target_type, webhook_url, enabled, created_at
            FROM notification_targets
            WHERE org_id = $1
        """
        if enabled_only:
            query += " AND enabled = true"
        query += " ORDER BY name"
        rows = await self.conn.fetch(query, org_id)
        return [dict(row) for row in rows]

    async def update_target(
        self,
        target_id: UUID,
        name: str | None = None,
        webhook_url: str | None = None,
        enabled: bool | None = None,
    ) -> dict | None:
        """Update a notification target.

        Only the fields passed as non-None are updated; with no changes the
        current row is returned unchanged. Returns None if the target does
        not exist.
        """
        # Build the SET clause dynamically; values are always bound via
        # numbered placeholders, never interpolated.
        updates = []
        params = [target_id]
        param_idx = 2
        if name is not None:
            updates.append(f"name = ${param_idx}")
            params.append(name)
            param_idx += 1
        if webhook_url is not None:
            updates.append(f"webhook_url = ${param_idx}")
            params.append(webhook_url)
            param_idx += 1
        if enabled is not None:
            updates.append(f"enabled = ${param_idx}")
            params.append(enabled)
            param_idx += 1
        if not updates:
            # Nothing to change — return the current state of the row.
            return await self.get_target_by_id(target_id)
        query = f"""
            UPDATE notification_targets
            SET {", ".join(updates)}
            WHERE id = $1
            RETURNING id, org_id, name, target_type, webhook_url, enabled, created_at
        """
        row = await self.conn.fetchrow(query, *params)
        return dict(row) if row else None

    async def delete_target(self, target_id: UUID) -> bool:
        """Delete a notification target. Returns True if deleted."""
        # asyncpg's execute() returns the command status tag, e.g. "DELETE 1".
        result = await self.conn.execute(
            "DELETE FROM notification_targets WHERE id = $1",
            target_id,
        )
        return result == "DELETE 1"

    async def create_attempt(
        self,
        attempt_id: UUID,
        incident_id: UUID,
        target_id: UUID,
    ) -> dict:
        """Create a notification attempt (idempotent via unique constraint)."""
        # The no-op DO UPDATE (id = existing id) makes RETURNING yield the
        # pre-existing row when (incident_id, target_id) already exists.
        row = await self.conn.fetchrow(
            """
            INSERT INTO notification_attempts (id, incident_id, target_id, status)
            VALUES ($1, $2, $3, 'pending')
            ON CONFLICT (incident_id, target_id) DO UPDATE SET id = notification_attempts.id
            RETURNING id, incident_id, target_id, status, error, sent_at, created_at
            """,
            attempt_id,
            incident_id,
            target_id,
        )
        return dict(row)

    async def get_attempt(self, incident_id: UUID, target_id: UUID) -> dict | None:
        """Get notification attempt for incident and target. None if absent."""
        row = await self.conn.fetchrow(
            """
            SELECT id, incident_id, target_id, status, error, sent_at, created_at
            FROM notification_attempts
            WHERE incident_id = $1 AND target_id = $2
            """,
            incident_id,
            target_id,
        )
        return dict(row) if row else None

    async def update_attempt_success(
        self,
        attempt_id: UUID,
        sent_at: datetime,
    ) -> dict | None:
        """Mark notification attempt as successful, clearing any prior error."""
        row = await self.conn.fetchrow(
            """
            UPDATE notification_attempts
            SET status = 'sent', sent_at = $2, error = NULL
            WHERE id = $1
            RETURNING id, incident_id, target_id, status, error, sent_at, created_at
            """,
            attempt_id,
            sent_at,
        )
        return dict(row) if row else None

    async def update_attempt_failure(
        self,
        attempt_id: UUID,
        error: str,
    ) -> dict | None:
        """Mark notification attempt as failed with the given error text."""
        row = await self.conn.fetchrow(
            """
            UPDATE notification_attempts
            SET status = 'failed', error = $2
            WHERE id = $1
            RETURNING id, incident_id, target_id, status, error, sent_at, created_at
            """,
            attempt_id,
            error,
        )
        return dict(row) if row else None

    async def get_pending_attempts(self, incident_id: UUID) -> list[dict]:
        """Get all pending notification attempts for an incident.

        Each row is joined with its target so callers get the delivery
        details (type, webhook URL, target name) in one query.
        """
        rows = await self.conn.fetch(
            """
            SELECT na.id, na.incident_id, na.target_id, na.status, na.error,
                   na.sent_at, na.created_at,
                   nt.target_type, nt.webhook_url, nt.name as target_name
            FROM notification_attempts na
            JOIN notification_targets nt ON nt.id = na.target_id
            WHERE na.incident_id = $1 AND na.status = 'pending'
            """,
            incident_id,
        )
        return [dict(row) for row in rows]
-125
View File
@@ -1,125 +0,0 @@
"""Organization repository for database operations."""
from uuid import UUID
import asyncpg
class OrgRepository:
    """Database operations for organizations.

    All methods execute on the caller-supplied connection, so they participate
    in whatever transaction the caller has open.
    """

    def __init__(self, conn: asyncpg.Connection) -> None:
        self.conn = conn

    async def create(
        self,
        org_id: UUID,
        name: str,
        slug: str,
    ) -> dict:
        """Create a new organization."""
        row = await self.conn.fetchrow(
            """
            INSERT INTO orgs (id, name, slug)
            VALUES ($1, $2, $3)
            RETURNING id, name, slug, created_at
            """,
            org_id,
            name,
            slug,
        )
        return dict(row)

    async def get_by_id(self, org_id: UUID) -> dict | None:
        """Get organization by ID. Returns None if not found."""
        row = await self.conn.fetchrow(
            """
            SELECT id, name, slug, created_at
            FROM orgs
            WHERE id = $1
            """,
            org_id,
        )
        return dict(row) if row else None

    async def get_by_slug(self, slug: str) -> dict | None:
        """Get organization by slug. Returns None if not found."""
        row = await self.conn.fetchrow(
            """
            SELECT id, name, slug, created_at
            FROM orgs
            WHERE slug = $1
            """,
            slug,
        )
        return dict(row) if row else None

    async def add_member(
        self,
        member_id: UUID,
        user_id: UUID,
        org_id: UUID,
        role: str,
    ) -> dict:
        """Add a member to an organization with the given role."""
        row = await self.conn.fetchrow(
            """
            INSERT INTO org_members (id, user_id, org_id, role)
            VALUES ($1, $2, $3, $4)
            RETURNING id, user_id, org_id, role, created_at
            """,
            member_id,
            user_id,
            org_id,
            role,
        )
        return dict(row)

    async def get_member(self, user_id: UUID, org_id: UUID) -> dict | None:
        """Get membership for a user in an organization. None if not a member."""
        row = await self.conn.fetchrow(
            """
            SELECT om.id, om.user_id, om.org_id, om.role, om.created_at
            FROM org_members om
            WHERE om.user_id = $1 AND om.org_id = $2
            """,
            user_id,
            org_id,
        )
        return dict(row) if row else None

    async def get_members(self, org_id: UUID) -> list[dict]:
        """Get all members of an organization (with emails), oldest first."""
        rows = await self.conn.fetch(
            """
            SELECT om.id, om.user_id, u.email, om.role, om.created_at
            FROM org_members om
            JOIN users u ON u.id = om.user_id
            WHERE om.org_id = $1
            ORDER BY om.created_at
            """,
            org_id,
        )
        return [dict(row) for row in rows]

    async def get_user_orgs(self, user_id: UUID) -> list[dict]:
        """Get all organizations a user belongs to, with the user's role."""
        rows = await self.conn.fetch(
            """
            SELECT o.id, o.name, o.slug, o.created_at, om.role
            FROM orgs o
            JOIN org_members om ON om.org_id = o.id
            WHERE om.user_id = $1
            ORDER BY o.created_at
            """,
            user_id,
        )
        return [dict(row) for row in rows]

    async def slug_exists(self, slug: str) -> bool:
        """Check if organization slug exists."""
        result = await self.conn.fetchval(
            "SELECT EXISTS(SELECT 1 FROM orgs WHERE slug = $1)",
            slug,
        )
        return result
-396
View File
@@ -1,396 +0,0 @@
"""Refresh token repository for database operations.
Security considerations implemented:
- Atomic rotation using SELECT FOR UPDATE to prevent race conditions
- Token chain tracking via rotated_to for reuse/theft detection
- Defense-in-depth validation with user_id and active_org_id checks
- Uses RETURNING for robust row counting instead of string parsing
"""
from datetime import datetime
from uuid import UUID
import asyncpg
class RefreshTokenRepository:
"""Database operations for refresh tokens."""
def __init__(self, conn: asyncpg.Connection) -> None:
    # Queries run on the caller's connection, so they join its transaction.
    self.conn = conn
async def create(
self,
token_id: UUID,
user_id: UUID,
token_hash: str,
active_org_id: UUID,
expires_at: datetime,
) -> dict:
"""Create a new refresh token."""
row = await self.conn.fetchrow(
"""
INSERT INTO refresh_tokens (id, user_id, token_hash, active_org_id, expires_at)
VALUES ($1, $2, $3, $4, $5)
RETURNING id, user_id, token_hash, active_org_id, expires_at,
revoked_at, rotated_to, created_at
""",
token_id,
user_id,
token_hash,
active_org_id,
expires_at,
)
return dict(row)
async def get_by_hash(self, token_hash: str) -> dict | None:
"""Get refresh token by hash (includes revoked/expired for auditing)."""
row = await self.conn.fetchrow(
"""
SELECT id, user_id, token_hash, active_org_id, expires_at,
revoked_at, rotated_to, created_at
FROM refresh_tokens
WHERE token_hash = $1
""",
token_hash,
)
return dict(row) if row else None
async def get_valid_by_hash(
self,
token_hash: str,
user_id: UUID | None = None,
active_org_id: UUID | None = None,
) -> dict | None:
"""Get refresh token by hash, only if valid.
Validates:
- Token exists and matches hash
- Token is not revoked
- Token is not expired
- Token has not been rotated (rotated_to is NULL)
- Optionally: user_id matches (defense-in-depth)
- Optionally: active_org_id matches (defense-in-depth)
Args:
token_hash: The hashed token value
user_id: If provided, token must belong to this user
active_org_id: If provided, token must be bound to this org
Returns:
Token dict if valid, None otherwise
"""
query = """
SELECT id, user_id, token_hash, active_org_id, expires_at,
revoked_at, rotated_to, created_at
FROM refresh_tokens
WHERE token_hash = $1
AND revoked_at IS NULL
AND rotated_to IS NULL
AND expires_at > clock_timestamp()
"""
params: list = [token_hash]
param_idx = 2
if user_id is not None:
query += f" AND user_id = ${param_idx}"
params.append(user_id)
param_idx += 1
if active_org_id is not None:
query += f" AND active_org_id = ${param_idx}"
params.append(active_org_id)
row = await self.conn.fetchrow(query, *params)
return dict(row) if row else None
async def get_valid_for_rotation(
self,
token_hash: str,
user_id: UUID | None = None,
) -> dict | None:
"""Get and lock a valid token for rotation using SELECT FOR UPDATE.
This acquires a row-level lock to prevent concurrent rotation attempts.
Must be called within a transaction.
Args:
token_hash: The hashed token value
user_id: If provided, token must belong to this user
Returns:
Token dict if valid and locked, None otherwise
"""
query = """
SELECT id, user_id, token_hash, active_org_id, expires_at,
revoked_at, rotated_to, created_at
FROM refresh_tokens
WHERE token_hash = $1
AND revoked_at IS NULL
AND rotated_to IS NULL
AND expires_at > clock_timestamp()
"""
params: list = [token_hash]
if user_id is not None:
query += " AND user_id = $2"
params.append(user_id)
query += " FOR UPDATE"
row = await self.conn.fetchrow(query, *params)
return dict(row) if row else None
async def check_token_reuse(self, token_hash: str) -> dict | None:
"""Check if a token has already been rotated (potential theft).
If a token is presented that has rotated_to set, it means:
1. The token was legitimately rotated earlier
2. Someone is now trying to use the old token
3. This indicates the token may have been stolen
Returns:
Token dict if this is a reused/stolen token, None if not found or not rotated
"""
row = await self.conn.fetchrow(
"""
SELECT id, user_id, token_hash, active_org_id, expires_at,
revoked_at, rotated_to, created_at
FROM refresh_tokens
WHERE token_hash = $1 AND rotated_to IS NOT NULL
""",
token_hash,
)
return dict(row) if row else None
async def revoke_token_chain(self, token_id: UUID) -> int:
"""Revoke a token and all tokens in its chain (for breach response).
When token reuse is detected, this revokes:
1. The original stolen token
2. Any token it was rotated to (and their rotations, recursively)
Args:
token_id: The ID of the compromised token
Returns:
Count of tokens revoked
"""
# Use recursive CTE to find all tokens in the chain
rows = await self.conn.fetch(
"""
WITH RECURSIVE token_chain AS (
-- Start with the given token
SELECT id, rotated_to
FROM refresh_tokens
WHERE id = $1
UNION ALL
-- Follow the chain via rotated_to
SELECT rt.id, rt.rotated_to
FROM refresh_tokens rt
INNER JOIN token_chain tc ON rt.id = tc.rotated_to
)
UPDATE refresh_tokens
SET revoked_at = clock_timestamp()
WHERE id IN (SELECT id FROM token_chain)
AND revoked_at IS NULL
RETURNING id
""",
token_id,
)
return len(rows)
async def rotate(
self,
old_token_hash: str,
new_token_id: UUID,
new_token_hash: str,
new_expires_at: datetime,
new_active_org_id: UUID | None = None,
expected_user_id: UUID | None = None,
) -> dict | None:
"""Atomically rotate a refresh token.
This method:
1. Validates the old token (not expired, not revoked, not already rotated)
2. Locks the row to prevent concurrent rotation
3. Marks old token as rotated (sets rotated_to)
4. Creates new token with updated org if specified
5. All in a single atomic operation
Args:
old_token_hash: Hash of the token being rotated
new_token_id: UUID for the new token
new_token_hash: Hash for the new token
new_expires_at: Expiry time for the new token
new_active_org_id: New org ID (for org-switch), or None to keep current
expected_user_id: If provided, validates token belongs to this user
Returns:
New token dict if rotation succeeded, None if old token invalid/expired
"""
# First, get and lock the old token
old_token = await self.get_valid_for_rotation(old_token_hash, expected_user_id)
if old_token is None:
return None
# Determine the org for the new token
active_org_id = new_active_org_id or old_token["active_org_id"]
user_id = old_token["user_id"]
# Create the new token
new_token = await self.conn.fetchrow(
"""
INSERT INTO refresh_tokens (id, user_id, token_hash, active_org_id, expires_at)
VALUES ($1, $2, $3, $4, $5)
RETURNING id, user_id, token_hash, active_org_id, expires_at,
revoked_at, rotated_to, created_at
""",
new_token_id,
user_id,
new_token_hash,
active_org_id,
new_expires_at,
)
# Mark the old token as rotated (not revoked - for reuse detection)
await self.conn.execute(
"""
UPDATE refresh_tokens
SET rotated_to = $2
WHERE id = $1
""",
old_token["id"],
new_token_id,
)
return dict(new_token)
async def revoke(self, token_id: UUID) -> bool:
"""Revoke a refresh token by ID.
Returns:
True if token was revoked, False if not found or already revoked
"""
row = await self.conn.fetchrow(
"""
UPDATE refresh_tokens
SET revoked_at = clock_timestamp()
WHERE id = $1 AND revoked_at IS NULL
RETURNING id
""",
token_id,
)
return row is not None
async def revoke_by_hash(self, token_hash: str) -> bool:
"""Revoke a refresh token by hash.
Returns:
True if token was revoked, False if not found or already revoked
"""
row = await self.conn.fetchrow(
"""
UPDATE refresh_tokens
SET revoked_at = clock_timestamp()
WHERE token_hash = $1 AND revoked_at IS NULL
RETURNING id
""",
token_hash,
)
return row is not None
async def revoke_all_for_user(self, user_id: UUID) -> int:
"""Revoke all active refresh tokens for a user.
Use this for:
- User-initiated logout from all devices
- Password change
- Account compromise response
Returns:
Count of tokens revoked
"""
rows = await self.conn.fetch(
"""
UPDATE refresh_tokens
SET revoked_at = clock_timestamp()
WHERE user_id = $1 AND revoked_at IS NULL
RETURNING id
""",
user_id,
)
return len(rows)
async def revoke_all_for_user_except(self, user_id: UUID, keep_token_id: UUID) -> int:
"""Revoke all tokens for a user except one (logout other sessions).
Args:
user_id: The user whose tokens to revoke
keep_token_id: The token ID to keep active (current session)
Returns:
Count of tokens revoked
"""
rows = await self.conn.fetch(
"""
UPDATE refresh_tokens
SET revoked_at = clock_timestamp()
WHERE user_id = $1 AND revoked_at IS NULL AND id != $2
RETURNING id
""",
user_id,
keep_token_id,
)
return len(rows)
async def get_active_tokens_for_user(self, user_id: UUID) -> list[dict]:
"""Get all active (non-revoked, non-expired, non-rotated) tokens for a user.
Useful for:
- Showing active sessions
- Auditing
Returns:
List of active token records
"""
rows = await self.conn.fetch(
"""
SELECT id, user_id, token_hash, active_org_id, expires_at,
revoked_at, rotated_to, created_at
FROM refresh_tokens
WHERE user_id = $1
AND revoked_at IS NULL
AND rotated_to IS NULL
AND expires_at > clock_timestamp()
ORDER BY created_at DESC
""",
user_id,
)
return [dict(row) for row in rows]
async def cleanup_expired(self, older_than_days: int = 30) -> int:
"""Delete expired tokens older than specified days.
Note: This performs a hard delete. For audit trails, I think we should:
- Archiving to a separate table first
- Using partitioning with retention policies
- Only calling this for tokens well past their expiry
Args:
older_than_days: Only delete tokens expired more than this many days ago
Returns:
Count of tokens deleted
"""
rows = await self.conn.fetch(
"""
DELETE FROM refresh_tokens
WHERE expires_at < clock_timestamp() - interval '1 day' * $1
RETURNING id
""",
older_than_days,
)
return len(rows)
-80
View File
@@ -1,80 +0,0 @@
"""Service repository for database operations."""
from uuid import UUID
import asyncpg
class ServiceRepository:
    """Data-access layer for rows in the services table."""

    def __init__(self, conn: asyncpg.Connection) -> None:
        self.conn = conn

    async def create(
        self,
        service_id: UUID,
        org_id: UUID,
        name: str,
        slug: str,
    ) -> dict:
        """Insert a service row and return it as a dict."""
        insert_sql = """
            INSERT INTO services (id, org_id, name, slug)
            VALUES ($1, $2, $3, $4)
            RETURNING id, org_id, name, slug, created_at
            """
        record = await self.conn.fetchrow(insert_sql, service_id, org_id, name, slug)
        return dict(record)

    async def get_by_id(self, service_id: UUID) -> dict | None:
        """Look a service up by primary key; None when absent."""
        select_sql = """
            SELECT id, org_id, name, slug, created_at
            FROM services
            WHERE id = $1
            """
        record = await self.conn.fetchrow(select_sql, service_id)
        if not record:
            return None
        return dict(record)

    async def get_by_org(self, org_id: UUID) -> list[dict]:
        """List all services belonging to an organization, ordered by name."""
        list_sql = """
            SELECT id, org_id, name, slug, created_at
            FROM services
            WHERE org_id = $1
            ORDER BY name
            """
        records = await self.conn.fetch(list_sql, org_id)
        return list(map(dict, records))

    async def get_by_slug(self, org_id: UUID, slug: str) -> dict | None:
        """Look a service up by (org, slug); None when absent."""
        select_sql = """
            SELECT id, org_id, name, slug, created_at
            FROM services
            WHERE org_id = $1 AND slug = $2
            """
        record = await self.conn.fetchrow(select_sql, org_id, slug)
        if not record:
            return None
        return dict(record)

    async def slug_exists(self, org_id: UUID, slug: str) -> bool:
        """True when the slug is already used by a service in this org."""
        found = await self.conn.fetchval(
            "SELECT EXISTS(SELECT 1 FROM services WHERE org_id = $1 AND slug = $2)",
            org_id,
            slug,
        )
        return found
-63
View File
@@ -1,63 +0,0 @@
"""User repository for database operations."""
from uuid import UUID
import asyncpg
class UserRepository:
    """Data-access layer for rows in the users table."""

    def __init__(self, conn: asyncpg.Connection) -> None:
        self.conn = conn

    async def create(
        self,
        user_id: UUID,
        email: str,
        password_hash: str,
    ) -> dict:
        """Insert a user row; only the password hash is stored, never the password."""
        inserted = await self.conn.fetchrow(
            """
            INSERT INTO users (id, email, password_hash)
            VALUES ($1, $2, $3)
            RETURNING id, email, created_at
            """,
            user_id,
            email,
            password_hash,
        )
        return dict(inserted)

    async def get_by_id(self, user_id: UUID) -> dict | None:
        """Look a user up by primary key; None when absent."""
        record = await self.conn.fetchrow(
            """
            SELECT id, email, password_hash, created_at
            FROM users
            WHERE id = $1
            """,
            user_id,
        )
        if not record:
            return None
        return dict(record)

    async def get_by_email(self, email: str) -> dict | None:
        """Look a user up by email; None when absent."""
        record = await self.conn.fetchrow(
            """
            SELECT id, email, password_hash, created_at
            FROM users
            WHERE email = $1
            """,
            email,
        )
        if not record:
            return None
        return dict(record)

    async def exists_by_email(self, email: str) -> bool:
        """True when a user row already exists for this email."""
        found = await self.conn.fetchval(
            "SELECT EXISTS(SELECT 1 FROM users WHERE email = $1)",
            email,
        )
        return found
-54
View File
@@ -1,54 +0,0 @@
"""Pydantic schemas for request/response models."""
from app.schemas.auth import (
LoginRequest,
LogoutRequest,
RefreshRequest,
RegisterRequest,
SwitchOrgRequest,
TokenResponse,
)
from app.schemas.common import CursorParams, ErrorDetail, ErrorResponse, PaginatedResponse
from app.schemas.incident import (
CommentRequest,
IncidentCreate,
IncidentEventResponse,
IncidentResponse,
TransitionRequest,
)
from app.schemas.org import (
MemberResponse,
NotificationTargetCreate,
NotificationTargetResponse,
OrgResponse,
ServiceCreate,
ServiceResponse,
)
# Public API surface re-exported by app.schemas; kept in sync with the
# imports above, grouped by source submodule.
__all__ = [
    # Auth
    "LoginRequest",
    "LogoutRequest",
    "RefreshRequest",
    "RegisterRequest",
    "SwitchOrgRequest",
    "TokenResponse",
    # Common
    "CursorParams",
    "ErrorDetail",
    "ErrorResponse",
    "PaginatedResponse",
    # Incident
    "CommentRequest",
    "IncidentCreate",
    "IncidentEventResponse",
    "IncidentResponse",
    "TransitionRequest",
    # Org
    "MemberResponse",
    "NotificationTargetCreate",
    "NotificationTargetResponse",
    "OrgResponse",
    "ServiceCreate",
    "ServiceResponse",
]
-48
View File
@@ -1,48 +0,0 @@
"""Authentication schemas."""
from uuid import UUID
from pydantic import BaseModel, EmailStr, Field
class RegisterRequest(BaseModel):
    """Request body for user registration."""

    email: EmailStr
    # Plaintext password; length bounds are enforced by Pydantic validation.
    password: str = Field(min_length=8, max_length=128)
    org_name: str = Field(min_length=1, max_length=100, description="Name for the default org")


class LoginRequest(BaseModel):
    """Request body for user login."""

    email: EmailStr
    password: str


class RefreshRequest(BaseModel):
    """Request body for token refresh."""

    refresh_token: str


class SwitchOrgRequest(BaseModel):
    """Request body for switching active organization."""

    org_id: UUID
    refresh_token: str


class LogoutRequest(BaseModel):
    """Request body for logging out and revoking a refresh token."""

    refresh_token: str


class TokenResponse(BaseModel):
    """Response containing access and refresh tokens."""

    access_token: str
    refresh_token: str
    token_type: str = "bearer"
    expires_in: int = Field(description="Access token expiry in seconds")
-61
View File
@@ -1,61 +0,0 @@
"""Common schemas used across the API."""
from pydantic import BaseModel, Field
class ErrorDetail(BaseModel):
    """Individual error detail for validation errors."""

    loc: list[str | int] = Field(description="Location of the error (field path)")
    msg: str = Field(description="Error message")
    type: str = Field(description="Error type identifier")


class ErrorResponse(BaseModel):
    """Structured error response returned by all error handlers."""

    error: str = Field(description="Error type (e.g., 'not_found', 'validation_error')")
    message: str = Field(description="Human-readable error message")
    details: list[ErrorDetail] | None = Field(
        default=None, description="Additional error details for validation errors"
    )
    request_id: str | None = Field(
        default=None, description="Request trace ID for debugging"
    )

    # Example payloads surfaced in the generated JSON schema / API docs.
    model_config = {
        "json_schema_extra": {
            "examples": [
                {
                    "error": "not_found",
                    "message": "Incident not found",
                    "request_id": "abc123def456",
                },
                {
                    "error": "validation_error",
                    "message": "Request validation failed",
                    "details": [
                        {"loc": ["body", "title"], "msg": "Field required", "type": "missing"}
                    ],
                    "request_id": "abc123def456",
                },
            ]
        }
    }


class CursorParams(BaseModel):
    """Pagination parameters using cursor-based pagination."""

    cursor: str | None = Field(default=None, description="Cursor for pagination")
    limit: int = Field(default=20, ge=1, le=100, description="Number of items per page")


class PaginatedResponse[T](BaseModel):
    """Generic paginated response wrapper."""

    items: list[T]
    next_cursor: str | None = Field(
        default=None, description="Cursor for next page, null if no more items"
    )
    has_more: bool = Field(description="Whether there are more items")
-57
View File
@@ -1,57 +0,0 @@
"""Incident-related schemas."""
from datetime import datetime
from typing import Any, Literal
from uuid import UUID
from pydantic import BaseModel, Field
# Closed vocabularies for incident lifecycle state and severity.
IncidentStatus = Literal["triggered", "acknowledged", "mitigated", "resolved"]
IncidentSeverity = Literal["critical", "high", "medium", "low"]


class IncidentCreate(BaseModel):
    """Request body for creating an incident."""

    title: str = Field(min_length=1, max_length=200)
    description: str | None = Field(default=None, max_length=5000)
    severity: IncidentSeverity = "medium"


class IncidentResponse(BaseModel):
    """Incident response."""

    id: UUID
    service_id: UUID
    title: str
    description: str | None
    status: IncidentStatus
    severity: IncidentSeverity
    # Optimistic-locking version; clients echo it back in TransitionRequest.
    version: int
    created_at: datetime
    updated_at: datetime


class IncidentEventResponse(BaseModel):
    """Incident event response."""

    id: UUID
    incident_id: UUID
    event_type: str
    actor_user_id: UUID | None
    payload: dict[str, Any] | None
    created_at: datetime


class TransitionRequest(BaseModel):
    """Request body for transitioning incident status."""

    to_status: IncidentStatus
    version: int = Field(description="Current version for optimistic locking")
    note: str | None = Field(default=None, max_length=1000)


class CommentRequest(BaseModel):
    """Request body for adding a comment to an incident."""

    content: str = Field(min_length=1, max_length=5000)
-69
View File
@@ -1,69 +0,0 @@
"""Organization-related schemas."""
from datetime import datetime
from typing import Literal
from uuid import UUID
from pydantic import BaseModel, Field, HttpUrl
class OrgResponse(BaseModel):
    """Organization summary response."""

    id: UUID
    name: str
    slug: str
    created_at: datetime


class MemberResponse(BaseModel):
    """Organization member response."""

    id: UUID
    user_id: UUID
    email: str
    role: Literal["admin", "member", "viewer"]
    created_at: datetime


class ServiceCreate(BaseModel):
    """Request body for creating a service."""

    name: str = Field(min_length=1, max_length=100)
    slug: str = Field(
        min_length=1,
        max_length=50,
        pattern=r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
        description="URL-friendly identifier (lowercase, hyphens allowed)",
    )


class ServiceResponse(BaseModel):
    """Service response."""

    id: UUID
    name: str
    slug: str
    created_at: datetime


class NotificationTargetCreate(BaseModel):
    """Request body for creating a notification target."""

    name: str = Field(min_length=1, max_length=100)
    target_type: Literal["webhook", "email", "slack"]
    webhook_url: HttpUrl | None = Field(
        default=None, description="Required for webhook type"
    )
    enabled: bool = True


class NotificationTargetResponse(BaseModel):
    """Notification target response."""

    id: UUID
    name: str
    target_type: Literal["webhook", "email", "slack"]
    webhook_url: str | None
    enabled: bool
    created_at: datetime
-7
View File
@@ -1,7 +0,0 @@
"""Service layer entrypoints."""
from app.services.auth import AuthService
from app.services.incident import IncidentService
from app.services.org import OrgService
__all__ = ["AuthService", "OrgService", "IncidentService"]
-269
View File
@@ -1,269 +0,0 @@
"""Authentication service providing business logic for auth flows."""
from __future__ import annotations
import re
from typing import cast
from uuid import UUID, uuid4
import asyncpg
from asyncpg.pool import PoolConnectionProxy
from app.api.deps import CurrentUser
from app.config import settings
from app.core import exceptions as exc, security
from app.db import Database, db
from app.repositories import OrgRepository, RefreshTokenRepository, UserRepository
from app.schemas.auth import (
LoginRequest,
LogoutRequest,
RefreshRequest,
RegisterRequest,
SwitchOrgRequest,
TokenResponse,
)
# Matches runs of characters that are not lowercase alphanumerics; used by
# AuthService._slugify to collapse them into single hyphens.
_SLUG_PATTERN = re.compile(r"[^a-z0-9]+")
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
"""Helper to satisfy typing when a pool proxy is returned."""
return cast(asyncpg.Connection, conn)
class AuthService:
    """Encapsulates authentication workflows (register/login/refresh/logout)."""

    def __init__(self, database: Database | None = None) -> None:
        self.db = database or db
        # Access-token TTL in seconds, derived from the minutes-based setting.
        self._access_token_expires_in = settings.access_token_expire_minutes * 60

    async def register_user(self, data: RegisterRequest) -> TokenResponse:
        """Create a new user, default org, membership, and token pair.

        Raises:
            ConflictError: If the email is already registered.
        """
        async with self.db.transaction() as conn:
            db_conn = _as_conn(conn)
            user_repo = UserRepository(db_conn)
            org_repo = OrgRepository(db_conn)
            refresh_repo = RefreshTokenRepository(db_conn)
            if await user_repo.exists_by_email(data.email):
                raise exc.ConflictError("Email already registered")
            user_id = uuid4()
            org_id = uuid4()
            member_id = uuid4()
            password_hash = security.hash_password(data.password)
            await user_repo.create(user_id, data.email, password_hash)
            slug = await self._generate_unique_org_slug(org_repo, data.org_name)
            await org_repo.create(org_id, data.org_name, slug)
            # The registering user becomes the admin of their default org.
            await org_repo.add_member(member_id, user_id, org_id, "admin")
            return await self._issue_token_pair(
                refresh_repo,
                user_id=user_id,
                org_id=org_id,
                role="admin",
            )

    async def login_user(self, data: LoginRequest) -> TokenResponse:
        """Authenticate a user and issue tokens for their first organization.

        Raises:
            UnauthorizedError: On unknown email or wrong password (same
                message for both, to avoid leaking which one failed).
            ForbiddenError: If the user has no org memberships.
        """
        async with self.db.connection() as conn:
            db_conn = _as_conn(conn)
            user_repo = UserRepository(db_conn)
            org_repo = OrgRepository(db_conn)
            refresh_repo = RefreshTokenRepository(db_conn)
            user = await user_repo.get_by_email(data.email)
            if not user or not security.verify_password(data.password, user["password_hash"]):
                raise exc.UnauthorizedError("Invalid email or password")
            orgs = await org_repo.get_user_orgs(user["id"])
            if not orgs:
                raise exc.ForbiddenError("User does not belong to any organization")
            # get_user_orgs orders by org creation time; the first is the default.
            active_org = orgs[0]
            return await self._issue_token_pair(
                refresh_repo,
                user_id=user["id"],
                org_id=active_org["id"],
                role=active_org["role"],
            )

    async def refresh_tokens(self, data: RefreshRequest) -> TokenResponse:
        """Rotate refresh token and mint a new access token.

        Raises:
            UnauthorizedError: If the presented token is invalid, or if a
                rotated token is being replayed (handled via
                _handle_invalid_refresh).
        """
        old_hash = security.hash_token(data.refresh_token)
        new_refresh_token = security.generate_refresh_token()
        new_refresh_hash = security.hash_token(new_refresh_token)
        new_refresh_id = uuid4()
        new_refresh_expiry = security.get_refresh_token_expiry()
        rotated: dict | None = None
        membership: dict | None = None
        async with self.db.transaction() as conn:
            db_conn = _as_conn(conn)
            refresh_repo = RefreshTokenRepository(db_conn)
            rotated = await refresh_repo.rotate(
                old_token_hash=old_hash,
                new_token_id=new_refresh_id,
                new_token_hash=new_refresh_hash,
                new_expires_at=new_refresh_expiry,
            )
            if rotated is not None:
                org_repo = OrgRepository(db_conn)
                membership = await org_repo.get_member(rotated["user_id"], rotated["active_org_id"])
                if membership is None:
                    # Raising here rolls the rotation back.
                    raise exc.UnauthorizedError("Invalid refresh token")
        # NOTE(review): source indentation was lost; the reuse-detection path
        # is placed after the transaction (it opens its own connection) —
        # confirm against the original layout.
        if rotated is None or membership is None:
            await self._handle_invalid_refresh(old_hash)
        assert rotated is not None and membership is not None
        access_token = security.create_access_token(
            sub=str(rotated["user_id"]),
            org_id=str(rotated["active_org_id"]),
            org_role=membership["role"],
        )
        return TokenResponse(
            access_token=access_token,
            refresh_token=new_refresh_token,
            expires_in=self._access_token_expires_in,
        )

    async def switch_org(
        self,
        current_user: CurrentUser,
        data: SwitchOrgRequest,
    ) -> TokenResponse:
        """Switch active organization (rotates refresh token + issues new JWT).

        Raises:
            ForbiddenError: If the caller is not a member of the target org.
            UnauthorizedError: If the refresh token is invalid or reused.
        """
        target_org_id = data.org_id
        old_hash = security.hash_token(data.refresh_token)
        new_refresh_token = security.generate_refresh_token()
        new_refresh_hash = security.hash_token(new_refresh_token)
        new_refresh_expiry = security.get_refresh_token_expiry()
        rotated: dict | None = None
        membership: dict | None = None
        async with self.db.transaction() as conn:
            db_conn = _as_conn(conn)
            org_repo = OrgRepository(db_conn)
            # Membership in the target org is checked before touching the token.
            membership = await org_repo.get_member(current_user.user_id, target_org_id)
            if membership is None:
                raise exc.ForbiddenError("Not a member of the requested organization")
            refresh_repo = RefreshTokenRepository(db_conn)
            rotated = await refresh_repo.rotate(
                old_token_hash=old_hash,
                new_token_id=uuid4(),
                new_token_hash=new_refresh_hash,
                new_expires_at=new_refresh_expiry,
                new_active_org_id=target_org_id,
                expected_user_id=current_user.user_id,
            )
        # NOTE(review): placed after the transaction for the same reason as in
        # refresh_tokens — confirm against the original layout.
        if rotated is None:
            await self._handle_invalid_refresh(old_hash)
        access_token = security.create_access_token(
            sub=str(current_user.user_id),
            org_id=str(target_org_id),
            org_role=membership["role"],
        )
        return TokenResponse(
            access_token=access_token,
            refresh_token=new_refresh_token,
            expires_in=self._access_token_expires_in,
        )

    async def logout(self, current_user: CurrentUser, data: LogoutRequest) -> None:
        """Revoke the provided refresh token for the current session.

        Unknown tokens are ignored (idempotent logout); tokens belonging to a
        different user raise ForbiddenError.
        """
        token_hash = security.hash_token(data.refresh_token)
        async with self.db.transaction() as conn:
            refresh_repo = RefreshTokenRepository(_as_conn(conn))
            token = await refresh_repo.get_by_hash(token_hash)
            if token and token["user_id"] != current_user.user_id:
                raise exc.ForbiddenError("Refresh token does not belong to this user")
            if not token:
                return
            await refresh_repo.revoke(token["id"])

    async def _issue_token_pair(
        self,
        refresh_repo: RefreshTokenRepository,
        *,
        user_id: UUID,
        org_id: UUID,
        role: str,
    ) -> TokenResponse:
        """Create access/refresh tokens and persist the refresh token (hashed)."""
        access_token = security.create_access_token(
            sub=str(user_id),
            org_id=str(org_id),
            org_role=role,
        )
        refresh_token = security.generate_refresh_token()
        await refresh_repo.create(
            token_id=uuid4(),
            user_id=user_id,
            token_hash=security.hash_token(refresh_token),
            active_org_id=org_id,
            expires_at=security.get_refresh_token_expiry(),
        )
        return TokenResponse(
            access_token=access_token,
            refresh_token=refresh_token,
            expires_in=self._access_token_expires_in,
        )

    async def _handle_invalid_refresh(self, token_hash: str) -> None:
        """Raise appropriate errors for invalid/compromised refresh tokens.

        Always raises: either a reuse-detected error (after revoking the
        whole rotation chain) or a generic invalid-token error.
        """
        async with self.db.connection() as conn:
            refresh_repo = RefreshTokenRepository(_as_conn(conn))
            reused = await refresh_repo.check_token_reuse(token_hash)
            if reused:
                # Replay of an already-rotated token: treat as theft and
                # revoke every descendant token.
                await refresh_repo.revoke_token_chain(reused["id"])
                raise exc.UnauthorizedError("Refresh token reuse detected")
            raise exc.UnauthorizedError("Invalid refresh token")

    async def _generate_unique_org_slug(
        self,
        org_repo: OrgRepository,
        org_name: str,
    ) -> str:
        """Slugify the org name and append a counter until unique."""
        base_slug = self._slugify(org_name)
        candidate = base_slug
        counter = 1
        while await org_repo.slug_exists(candidate):
            suffix = f"-{counter}"
            # Trim the base so candidate stays within the 50-char slug budget.
            max_base_len = 50 - len(suffix)
            candidate = f"{base_slug[:max_base_len]}{suffix}"
            counter += 1
        return candidate

    def _slugify(self, value: str) -> str:
        """Convert arbitrary text into a URL-friendly slug (<= 50 chars, never empty)."""
        slug = _SLUG_PATTERN.sub("-", value.strip().lower()).strip("-")
        return slug[:50] or "org"
-247
View File
@@ -1,247 +0,0 @@
"""Incident service implementing incident lifecycle operations."""
from __future__ import annotations
from datetime import datetime
from typing import cast
from uuid import UUID, uuid4
import asyncpg
from asyncpg.pool import PoolConnectionProxy
from app.api.deps import CurrentUser, ensure_org_access
from app.config import settings
from app.core import exceptions as exc
from app.db import Database, db
from app.repositories import IncidentRepository, ServiceRepository
from app.schemas.common import PaginatedResponse
from app.schemas.incident import (
CommentRequest,
IncidentCreate,
IncidentEventResponse,
IncidentResponse,
TransitionRequest,
)
from app.taskqueue import TaskQueue
from app.taskqueue import task_queue as default_task_queue
# Incident status state machine: each key maps a current status to the set of
# statuses it may transition to. "resolved" is terminal (no outgoing edges).
_ALLOWED_TRANSITIONS: dict[str, set[str]] = {
    "triggered": {"acknowledged"},
    "acknowledged": {"mitigated"},
    "mitigated": {"resolved"},
    "resolved": set(),
}
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
"""Helper to satisfy typing when a pool proxy is returned."""
return cast(asyncpg.Connection, conn)
class IncidentService:
"""Encapsulates incident lifecycle operations within an org context."""
def __init__(
self,
database: Database | None = None,
task_queue: TaskQueue | None = None,
escalation_delay_seconds: int | None = None,
) -> None:
self.db = database or db
self.task_queue = task_queue or default_task_queue
self.escalation_delay_seconds = (
escalation_delay_seconds
if escalation_delay_seconds is not None
else settings.notification_escalation_delay_seconds
)
async def create_incident(
self,
current_user: CurrentUser,
service_id: UUID,
data: IncidentCreate,
) -> IncidentResponse:
"""Create an incident for a service in the active org and record the creation event."""
async with self.db.transaction() as conn:
db_conn = _as_conn(conn)
service_repo = ServiceRepository(db_conn)
incident_repo = IncidentRepository(db_conn)
service = await service_repo.get_by_id(service_id)
if service is None:
raise exc.NotFoundError("Service not found")
ensure_org_access(service["org_id"], current_user)
incident_id = uuid4()
incident = await incident_repo.create(
incident_id=incident_id,
org_id=current_user.org_id,
service_id=service_id,
title=data.title,
description=data.description,
severity=data.severity,
)
await incident_repo.add_event(
uuid4(),
incident_id,
"created",
actor_user_id=current_user.user_id,
payload={
"title": data.title,
"severity": data.severity,
"description": data.description,
},
)
incident_response = IncidentResponse(**incident)
self.task_queue.incident_triggered(
incident_id=incident_response.id,
org_id=current_user.org_id,
triggered_by=current_user.user_id,
)
if self.escalation_delay_seconds > 0:
self.task_queue.schedule_escalation_check(
incident_id=incident_response.id,
org_id=current_user.org_id,
delay_seconds=self.escalation_delay_seconds,
)
return incident_response
async def get_incidents(
self,
current_user: CurrentUser,
*,
status: str | None = None,
cursor: datetime | None = None,
limit: int = 20,
) -> PaginatedResponse[IncidentResponse]:
"""Return paginated incidents for the active organization."""
async with self.db.connection() as conn:
incident_repo = IncidentRepository(_as_conn(conn))
rows = await incident_repo.get_by_org(
org_id=current_user.org_id,
status=status,
cursor=cursor,
limit=limit,
)
has_more = len(rows) > limit
items = rows[:limit]
next_cursor = items[-1]["created_at"].isoformat() if has_more and items else None
incidents = [IncidentResponse(**row) for row in items]
return PaginatedResponse[IncidentResponse](
items=incidents,
next_cursor=next_cursor,
has_more=has_more,
)
async def get_incident(self, current_user: CurrentUser, incident_id: UUID) -> IncidentResponse:
"""Return a single incident, ensuring it belongs to the active org."""
async with self.db.connection() as conn:
incident_repo = IncidentRepository(_as_conn(conn))
incident = await incident_repo.get_by_id(incident_id)
if incident is None:
raise exc.NotFoundError("Incident not found")
ensure_org_access(incident["org_id"], current_user)
return IncidentResponse(**incident)
async def get_incident_events(
self, current_user: CurrentUser, incident_id: UUID
) -> list[IncidentEventResponse]:
"""Return the timeline events for an incident in the active org."""
async with self.db.connection() as conn:
incident_repo = IncidentRepository(_as_conn(conn))
incident = await incident_repo.get_by_id(incident_id)
if incident is None:
raise exc.NotFoundError("Incident not found")
ensure_org_access(incident["org_id"], current_user)
events = await incident_repo.get_events(incident_id)
return [IncidentEventResponse(**event) for event in events]
async def transition_incident(
self,
current_user: CurrentUser,
incident_id: UUID,
data: TransitionRequest,
) -> IncidentResponse:
"""Transition an incident status with optimistic locking and event recording."""
async with self.db.transaction() as conn:
db_conn = _as_conn(conn)
incident_repo = IncidentRepository(db_conn)
incident = await incident_repo.get_by_id(incident_id)
if incident is None:
raise exc.NotFoundError("Incident not found")
ensure_org_access(incident["org_id"], current_user)
self._validate_transition(incident["status"], data.to_status)
updated = await incident_repo.update_status(
incident_id,
data.to_status,
data.version,
)
if updated is None:
raise exc.ConflictError("Incident version mismatch")
payload = {"from": incident["status"], "to": data.to_status}
if data.note:
payload["note"] = data.note
await incident_repo.add_event(
uuid4(),
incident_id,
"status_changed",
actor_user_id=current_user.user_id,
payload=payload,
)
return IncidentResponse(**updated)
async def add_comment(
    self,
    current_user: CurrentUser,
    incident_id: UUID,
    data: CommentRequest,
) -> IncidentEventResponse:
    """Append a comment event to an incident's timeline.

    The incident must exist and belong to the caller's active org before
    the event is written.
    """
    async with self.db.connection() as conn:
        repo = IncidentRepository(_as_conn(conn))
        record = await repo.get_by_id(incident_id)
        if record is None:
            raise exc.NotFoundError("Incident not found")
        ensure_org_access(record["org_id"], current_user)
        created = await repo.add_event(
            uuid4(),
            incident_id,
            "comment_added",
            actor_user_id=current_user.user_id,
            payload={"content": data.content},
        )
        return IncidentEventResponse(**created)
def _validate_transition(self, current_status: str, to_status: str) -> None:
    """Reject transitions not permitted by the incident state machine.

    Raises BadRequestError both for no-op transitions (same status) and for
    target states not reachable from the current one per _ALLOWED_TRANSITIONS.
    """
    if current_status == to_status:
        raise exc.BadRequestError("Incident is already in the requested status")
    if to_status not in _ALLOWED_TRANSITIONS.get(current_status, set()):
        raise exc.BadRequestError("Invalid incident status transition")
__all__ = ["IncidentService"]
-115
View File
@@ -1,115 +0,0 @@
"""Organization service providing org-scoped operations."""
from __future__ import annotations
from typing import cast
from uuid import UUID, uuid4
import asyncpg
from asyncpg.pool import PoolConnectionProxy
from app.api.deps import CurrentUser
from app.core import exceptions as exc
from app.db import Database, db
from app.repositories import NotificationRepository, OrgRepository, ServiceRepository
from app.schemas.org import (
MemberResponse,
NotificationTargetCreate,
NotificationTargetResponse,
OrgResponse,
ServiceCreate,
ServiceResponse,
)
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
    """Helper to satisfy typing when a pool proxy is returned."""
    # cast() is a typing no-op at runtime: the proxy object is passed through
    # unchanged and is treated as a Connection by the repositories.
    return cast(asyncpg.Connection, conn)
class OrgService:
    """Organization-level operations, always scoped to the caller's active org."""

    def __init__(self, database: Database | None = None) -> None:
        # Fall back to the module-level singleton when nothing is injected.
        self.db = database or db

    async def get_current_org(self, current_user: CurrentUser) -> OrgResponse:
        """Return the summary of the caller's active organization."""
        async with self.db.connection() as conn:
            repo = OrgRepository(_as_conn(conn))
            record = await repo.get_by_id(current_user.org_id)
            if record is None:
                raise exc.NotFoundError("Organization not found")
            return OrgResponse(**record)

    async def get_members(self, current_user: CurrentUser) -> list[MemberResponse]:
        """List the members of the caller's active organization."""
        async with self.db.connection() as conn:
            repo = OrgRepository(_as_conn(conn))
            rows = await repo.get_members(current_user.org_id)
            return [MemberResponse(**row) for row in rows]

    async def create_service(self, current_user: CurrentUser, data: ServiceCreate) -> ServiceResponse:
        """Create a service in the active org; the slug must be unique per org."""
        async with self.db.connection() as conn:
            repo = ServiceRepository(_as_conn(conn))
            # Fast pre-check for a friendly error; the unique constraint
            # below remains the authoritative guard against races.
            if await repo.slug_exists(current_user.org_id, data.slug):
                raise exc.ConflictError("Service slug already exists in this organization")
            try:
                created = await repo.create(
                    service_id=uuid4(),
                    org_id=current_user.org_id,
                    name=data.name,
                    slug=data.slug,
                )
            except asyncpg.UniqueViolationError as err:  # pragma: no cover - race protection
                raise exc.ConflictError("Service slug already exists in this organization") from err
            return ServiceResponse(**created)

    async def get_services(self, current_user: CurrentUser) -> list[ServiceResponse]:
        """List the services that belong to the active organization."""
        async with self.db.connection() as conn:
            repo = ServiceRepository(_as_conn(conn))
            rows = await repo.get_by_org(current_user.org_id)
            return [ServiceResponse(**row) for row in rows]

    async def create_notification_target(
        self,
        current_user: CurrentUser,
        data: NotificationTargetCreate,
    ) -> NotificationTargetResponse:
        """Create a notification target for the active organization."""
        # Webhook targets are unusable without a URL, so reject early.
        if data.target_type == "webhook" and data.webhook_url is None:
            raise exc.BadRequestError("webhook_url is required for webhook targets")
        async with self.db.connection() as conn:
            repo = NotificationRepository(_as_conn(conn))
            created = await repo.create_target(
                target_id=uuid4(),
                org_id=current_user.org_id,
                name=data.name,
                target_type=data.target_type,
                webhook_url=str(data.webhook_url) if data.webhook_url else None,
                enabled=data.enabled,
            )
            return NotificationTargetResponse(**created)

    async def get_notification_targets(self, current_user: CurrentUser) -> list[NotificationTargetResponse]:
        """List the notification targets configured for the active organization."""
        async with self.db.connection() as conn:
            repo = NotificationRepository(_as_conn(conn))
            rows = await repo.get_targets_by_org(current_user.org_id)
            return [NotificationTargetResponse(**row) for row in rows]
__all__ = ["OrgService"]
-188
View File
@@ -1,188 +0,0 @@
"""Task queue abstractions for scheduling background work."""
from __future__ import annotations
import asyncio
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any
from uuid import UUID
from app.config import settings
try:
from worker.celery_app import celery_app
except Exception: # pragma: no cover - celery app may not import during docs builds
celery_app = None # type: ignore[assignment]
class TaskQueue(ABC):
"""Interface for enqueueing background work."""
async def startup(self) -> None: # pragma: no cover - default no-op
"""Hook for queue initialization."""
async def shutdown(self) -> None: # pragma: no cover - default no-op
"""Hook for queue teardown."""
async def ping(self) -> bool:
"""Check if the queue backend is reachable."""
return True
def reset(self) -> None: # pragma: no cover - optional for in-memory impls
"""Reset any in-memory state (used in tests)."""
@abstractmethod
def incident_triggered(
self,
*,
incident_id: UUID,
org_id: UUID,
triggered_by: UUID | None,
) -> None:
"""Fan out an incident triggered notification."""
@abstractmethod
def schedule_escalation_check(
self,
*,
incident_id: UUID,
org_id: UUID,
delay_seconds: int,
) -> None:
"""Schedule a delayed escalation check."""
class CeleryTaskQueue(TaskQueue):
    """Celery-backed task queue that can use Redis or SQS brokers."""

    def __init__(self, default_queue: str, critical_queue: str) -> None:
        # The celery app import is best-effort at module load time; refuse
        # to construct a queue when it was unavailable.
        if celery_app is None:  # pragma: no cover - guarded by try/except
            raise RuntimeError("Celery application is unavailable")
        self._celery = celery_app
        self._default_queue = default_queue
        self._critical_queue = critical_queue

    def incident_triggered(
        self,
        *,
        incident_id: UUID,
        org_id: UUID,
        triggered_by: UUID | None,
    ) -> None:
        """Enqueue the fan-out notification task on the default queue."""
        task_kwargs = {
            "incident_id": str(incident_id),
            "org_id": str(org_id),
            "triggered_by": str(triggered_by) if triggered_by else None,
        }
        self._celery.send_task(
            "worker.tasks.notifications.incident_triggered",
            kwargs=task_kwargs,
            queue=self._default_queue,
        )

    def schedule_escalation_check(
        self,
        *,
        incident_id: UUID,
        org_id: UUID,
        delay_seconds: int,
    ) -> None:
        """Enqueue a delayed escalation check on the critical queue."""
        self._celery.send_task(
            "worker.tasks.notifications.escalate_if_unacked",
            kwargs={
                "incident_id": str(incident_id),
                "org_id": str(org_id),
            },
            # Clamp negative delays to zero so the task runs immediately.
            countdown=max(delay_seconds, 0),
            queue=self._critical_queue,
        )

    async def ping(self) -> bool:
        """Probe the broker without blocking the event loop."""
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self._ping_sync)

    def _ping_sync(self) -> bool:
        """Blocking broker connect/release; intended for a thread-pool executor."""
        connection = self._celery.connection()
        try:
            connection.connect()
            return True
        except Exception:
            return False
        finally:
            try:
                connection.release()
            except Exception:  # pragma: no cover - release best effort
                pass
@dataclass
class InMemoryTaskQueue(TaskQueue):
    """Test-friendly queue that records dispatched tasks in memory."""

    # Each entry is (task_name, kwargs).  Defaults to None (not []) so every
    # instance receives its own list in __post_init__.
    dispatched: list[tuple[str, dict[str, Any]]] | None = None

    def __post_init__(self) -> None:
        if self.dispatched is None:
            self.dispatched = []

    def incident_triggered(
        self,
        *,
        incident_id: UUID,
        org_id: UUID,
        triggered_by: UUID | None,
    ) -> None:
        """Record the fan-out request instead of dispatching it."""
        entry = (
            "incident_triggered",
            {
                "incident_id": incident_id,
                "org_id": org_id,
                "triggered_by": triggered_by,
            },
        )
        self.dispatched.append(entry)

    def schedule_escalation_check(
        self,
        *,
        incident_id: UUID,
        org_id: UUID,
        delay_seconds: int,
    ) -> None:
        """Record the escalation request instead of scheduling it."""
        entry = (
            "escalate_if_unacked",
            {
                "incident_id": incident_id,
                "org_id": org_id,
                "delay_seconds": delay_seconds,
            },
        )
        self.dispatched.append(entry)

    def reset(self) -> None:
        """Drop all recorded tasks (used between tests)."""
        if self.dispatched is not None:
            self.dispatched.clear()
def _build_task_queue() -> TaskQueue:
    """Instantiate the queue implementation selected by settings.

    The "inmemory" driver yields the test double; anything else uses the
    Celery-backed queue with the configured queue names.
    """
    if settings.task_queue_driver != "inmemory":
        return CeleryTaskQueue(
            default_queue=settings.task_queue_default_queue,
            critical_queue=settings.task_queue_critical_queue,
        )
    return InMemoryTaskQueue()
task_queue = _build_task_queue()
__all__ = [
"CeleryTaskQueue",
"InMemoryTaskQueue",
"TaskQueue",
"task_queue",
]
+17 -184
View File
@@ -1,230 +1,63 @@
version: "3.8" version: '3.8'
services: services:
postgres: postgres:
image: postgres:16-alpine image: postgres:16-alpine
container_name: incidentops-postgres
environment: environment:
POSTGRES_USER: incidentops POSTGRES_USER: postgres
POSTGRES_PASSWORD: incidentops POSTGRES_PASSWORD: postgres
POSTGRES_DB: incidentops POSTGRES_DB: incidentops
ports: ports:
- "5432:5432" - "5432:5432"
volumes: volumes:
- postgres_data:/var/lib/postgresql/data - postgres_data:/var/lib/postgresql/data
healthcheck: healthcheck:
test: ["CMD-SHELL", "pg_isready -U incidentops"] test: ["CMD-SHELL", "pg_isready -U postgres"]
interval: 10s interval: 5s
timeout: 5s timeout: 5s
retries: 5 retries: 5
# For Celery broker
redis: redis:
image: redis:7-alpine image: redis:7-alpine
container_name: incidentops-redis
ports: ports:
- "6379:6379" - "6379:6379"
volumes:
- redis_data:/data
healthcheck: healthcheck:
test: ["CMD", "redis-cli", "ping"] test: ["CMD", "redis-cli", "ping"]
interval: 10s interval: 5s
timeout: 5s timeout: 5s
retries: 5 retries: 5
# api services
api: api:
build: build:
context: . context: .
dockerfile: Dockerfile dockerfile: src/IncidentOps.Api/Dockerfile
target: api
container_name: incidentops-api
ports: ports:
- "8000:8000" - "8080:8080"
- "9464:9464" # Prometheus metrics
environment: environment:
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops - ConnectionStrings__Postgres=Host=postgres;Port=5432;Database=incidentops;Username=postgres;Password=postgres
REDIS_URL: redis://redis:6379/0 - Redis__ConnectionString=redis:6379
JWT_SECRET_KEY: dev-secret-key-change-in-production - Jwt__SigningKey=your-super-secret-key-that-should-be-at-least-32-characters-long
JWT_ALGORITHM: HS256 - Jwt__Issuer=incidentops
ACCESS_TOKEN_EXPIRE_MINUTES: 30 - Jwt__Audience=incidentops
REFRESH_TOKEN_EXPIRE_DAYS: 30 - Cors__Origins__0=http://localhost:3000
# OpenTelemetry
OTEL_ENABLED: "true"
OTEL_SERVICE_NAME: incidentops-api
OTEL_ENVIRONMENT: development
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
OTEL_EXPORTER_OTLP_INSECURE: "true"
OTEL_LOG_LEVEL: INFO
# Metrics
PROMETHEUS_PORT: "9464"
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy
redis: redis:
condition: service_healthy condition: service_healthy
otel-collector:
condition: service_started
prometheus:
condition: service_started
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8000/v1/healthz"]
interval: 30s
timeout: 10s
retries: 3
start_period: 10s
# Worker service (Celery)
worker: worker:
build: build:
context: . context: .
dockerfile: Dockerfile dockerfile: src/IncidentOps.Worker/Dockerfile
target: worker
container_name: incidentops-worker
environment: environment:
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops - ConnectionStrings__Postgres=Host=postgres;Port=5432;Database=incidentops;Username=postgres;Password=postgres
REDIS_URL: redis://redis:6379/0 - Redis__ConnectionString=redis:6379
CELERY_BROKER_URL: redis://redis:6379/0
CELERY_RESULT_BACKEND: redis://redis:6379/1
# OpenTelemetry
OTEL_ENABLED: "true"
OTEL_SERVICE_NAME: incidentops-worker
OTEL_ENVIRONMENT: development
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
OTEL_EXPORTER_OTLP_INSECURE: "true"
depends_on: depends_on:
postgres: postgres:
condition: service_healthy condition: service_healthy
redis: redis:
condition: service_healthy condition: service_healthy
# Web frontend (Next.js)
web:
build:
context: .
dockerfile: Dockerfile.web
container_name: incidentops-web
ports:
- "3000:3000"
environment:
NEXT_PUBLIC_API_URL: http://localhost:8000
depends_on:
- api
# Database migrations (run once)
migrate:
build:
context: .
dockerfile: Dockerfile
target: api
container_name: incidentops-migrate
command: python migrations/migrate.py apply
environment:
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
depends_on:
postgres:
condition: service_healthy
profiles:
- migrate
# Flower for Celery monitoring (dev only)
flower:
image: mher/flower:2.0
container_name: incidentops-flower
ports:
- "5555:5555"
environment:
CELERY_BROKER_URL: redis://redis:6379/0
FLOWER_BASIC_AUTH: admin:admin
depends_on:
- redis
profiles:
- monitoring
# ============================================
# Observability Stack
# ============================================
# OpenTelemetry Collector - receives traces/logs from apps
otel-collector:
image: otel/opentelemetry-collector-contrib:0.96.0
container_name: incidentops-otel-collector
command: ["--config=/etc/otel-collector/config.yaml"]
volumes:
- ./observability/otel-collector/config.yaml:/etc/otel-collector/config.yaml:ro
ports:
- "4317:4317" # OTLP gRPC
- "4318:4318" # OTLP HTTP
depends_on:
- tempo
- loki
# Tempo - distributed tracing backend
tempo:
image: grafana/tempo:2.4.1
container_name: incidentops-tempo
command: ["-config.file=/etc/tempo/config.yaml"]
volumes:
- ./observability/tempo/config.yaml:/etc/tempo/config.yaml:ro
- tempo_data:/var/tempo
ports:
- "3200:3200" # Tempo HTTP
- "4320:4317" # Tempo OTLP gRPC (different host port to avoid conflict)
# Loki - log aggregation
loki:
image: grafana/loki:2.9.6
container_name: incidentops-loki
command: ["-config.file=/etc/loki/config.yaml"]
volumes:
- ./observability/loki/config.yaml:/etc/loki/config.yaml:ro
- loki_data:/loki
ports:
- "3100:3100" # Loki HTTP
# Prometheus - metrics storage
prometheus:
image: prom/prometheus:v2.51.0
container_name: incidentops-prometheus
command:
- "--config.file=/etc/prometheus/prometheus.yml"
- "--storage.tsdb.path=/prometheus"
- "--web.enable-lifecycle"
volumes:
- ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
- prometheus_data:/prometheus
ports:
- "9090:9090" # Prometheus UI
# Grafana - visualization
grafana:
image: grafana/grafana:10.4.1
container_name: incidentops-grafana
environment:
GF_SECURITY_ADMIN_USER: admin
GF_SECURITY_ADMIN_PASSWORD: admin
GF_USERS_ALLOW_SIGN_UP: "false"
GF_EXPLORE_ENABLED: "true"
GF_FEATURE_TOGGLES_ENABLE: traceqlEditor tempoSearch tempoBackendSearch tempoApmTable
volumes:
- ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
- ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
- grafana_data:/var/lib/grafana
ports:
- "3001:3000" # Grafana UI (3001 to avoid conflict with web frontend)
depends_on:
- tempo
- loki
- prometheus
volumes: volumes:
postgres_data: postgres_data:
redis_data:
tempo_data:
loki_data:
prometheus_data:
grafana_data:
networks:
default:
name: incidentops-network
+657
View File
@@ -0,0 +1,657 @@
# IncidentOps Specification
A multi-tenant incident management system with implicit active-org context from JWT.
---
## Project Structure
```
incidentops/
├── IncidentOps.sln
├── docker-compose.yml
├── skaffold.yaml
├── .gitignore
├── src/
│ ├── IncidentOps.Api/ # ASP.NET Core REST API
│ │ ├── Auth/
│ │ │ ├── ClaimsPrincipalExtensions.cs
│ │ │ ├── RequestContext.cs
│ │ │ └── RoleRequirement.cs
│ │ ├── Controllers/
│ │ │ ├── AuthController.cs
│ │ │ ├── HealthController.cs
│ │ │ ├── IncidentsController.cs
│ │ │ └── OrgController.cs
│ │ ├── Dockerfile
│ │ ├── Program.cs
│ │ ├── appsettings.json
│ │ └── appsettings.Development.json
│ │
│ ├── IncidentOps.Worker/ # Hangfire Worker Service
│ │ ├── Jobs/
│ │ │ ├── EscalateIfUnackedJob.cs
│ │ │ ├── IncidentTriggeredJob.cs
│ │ │ └── SendWebhookNotificationJob.cs
│ │ ├── Dockerfile
│ │ ├── Program.cs
│ │ └── appsettings.json
│ │
│ ├── IncidentOps.Domain/ # Domain Entities & Enums
│ │ ├── Entities/
│ │ │ ├── Incident.cs
│ │ │ ├── IncidentEvent.cs
│ │ │ ├── NotificationAttempt.cs
│ │ │ ├── NotificationTarget.cs
│ │ │ ├── Org.cs
│ │ │ ├── OrgMember.cs
│ │ │ ├── RefreshToken.cs
│ │ │ ├── Service.cs
│ │ │ └── User.cs
│ │ └── Enums/
│ │ ├── IncidentEventType.cs
│ │ ├── IncidentStatus.cs
│ │ ├── NotificationTargetType.cs
│ │ └── OrgRole.cs
│ │
│ ├── IncidentOps.Infrastructure/ # Data Access & Services
│ │ ├── Auth/
│ │ │ ├── IPasswordService.cs
│ │ │ ├── ITokenService.cs
│ │ │ └── JwtSettings.cs
│ │ ├── Data/
│ │ │ ├── DbConnectionFactory.cs
│ │ │ └── Repositories/
│ │ │ ├── IIncidentEventRepository.cs
│ │ │ ├── IIncidentRepository.cs
│ │ │ ├── INotificationTargetRepository.cs
│ │ │ ├── IOrgMemberRepository.cs
│ │ │ ├── IOrgRepository.cs
│ │ │ ├── IRefreshTokenRepository.cs
│ │ │ ├── IServiceRepository.cs
│ │ │ └── IUserRepository.cs
│ │ ├── Jobs/
│ │ │ ├── IEscalateIfUnackedJob.cs
│ │ │ ├── IIncidentTriggeredJob.cs
│ │ │ └── ISendWebhookNotificationJob.cs
│ │ ├── Migrations/
│ │ │ ├── Migration0001_InitialSchema.cs
│ │ │ ├── Migration0002_RefreshTokens.cs
│ │ │ └── Migration0003_NotificationTargets.cs
│ │ └── ServiceCollectionExtensions.cs
│ │
│ └── IncidentOps.Contracts/ # DTOs / API Contracts
│ ├── Auth/
│ │ ├── AuthResponse.cs
│ │ ├── LoginRequest.cs
│ │ ├── LogoutRequest.cs
│ │ ├── MeResponse.cs
│ │ ├── RefreshRequest.cs
│ │ ├── RegisterRequest.cs
│ │ └── SwitchOrgRequest.cs
│ ├── Incidents/
│ │ ├── CommentRequest.cs
│ │ ├── CreateIncidentRequest.cs
│ │ ├── IncidentDto.cs
│ │ ├── IncidentEventDto.cs
│ │ ├── IncidentListResponse.cs
│ │ └── TransitionRequest.cs
│ ├── Orgs/
│ │ ├── CreateNotificationTargetRequest.cs
│ │ ├── NotificationTargetDto.cs
│ │ ├── OrgDto.cs
│ │ └── OrgMemberDto.cs
│ └── Services/
│ ├── CreateServiceRequest.cs
│ └── ServiceDto.cs
├── web/ # Next.js Frontend
│ ├── app/
│ │ ├── dashboard/page.tsx
│ │ ├── login/page.tsx
│ │ ├── register/page.tsx
│ │ ├── layout.tsx
│ │ ├── page.tsx
│ │ └── globals.css
│ ├── lib/
│ │ └── api.ts
│ ├── types/
│ │ └── index.ts
│ ├── Dockerfile
│ ├── package.json
│ ├── tsconfig.json
│ └── next.config.js
├── helm/incidentops/ # Helm Chart
│ ├── Chart.yaml
│ ├── values.yaml
│ └── templates/
│ ├── _helpers.tpl
│ ├── api-deployment.yaml
│ ├── api-service.yaml
│ ├── worker-deployment.yaml
│ ├── web-deployment.yaml
│ ├── web-service.yaml
│ ├── ingress.yaml
│ └── secrets.yaml
└── docs/
└── specs.md
```
---
## 1. Architecture (microservices-lite)
### Deployables
1. **api-service** (.NET 10, ASP.NET Core)
- REST API (implicit org scope from JWT)
- JWT access + refresh (both returned in JSON)
- RBAC enforced using `org_role` claim + DB ownership checks
- Writes incidents + timeline events
- Enqueues background jobs to Hangfire
2. **worker-service** (.NET 10 Worker Service)
- Runs **Hangfire Server** using Redis storage
- Executes jobs: notification send, escalation checks, rollups
- Writes notification attempts and system events
3. **web** (Next.js 14 + TypeScript)
- Auth pages + dashboard + incident detail
### Dependencies (in kind via Helm)
- PostgreSQL (Bitnami)
- Redis (Bitnami) - Hangfire storage
- ingress-nginx
- (later) Prometheus/Grafana/OTel
---
## 2. Auth Model (active org in JWT, implicit org scope)
### JWT Access Token Claims
| Claim | Description |
|-------|-------------|
| `sub` | userId (uuid) |
| `org_id` | activeOrgId (uuid) |
| `org_role` | `admin\|member\|viewer` |
| `iss` | Issuer |
| `aud` | Audience |
| `iat` | Issued at |
| `exp` | Expiration |
| `jti` | (optional) Token ID |
### Refresh Token Model (JSON, not cookie)
- Random opaque token returned in JSON
- Stored hashed in DB
- Rotated on refresh and switch-org
- Refresh token row stores `active_org_id` (per-session org selection)
### DB: `refresh_tokens`
```sql
id uuid PRIMARY KEY
user_id uuid NOT NULL
token_hash text NOT NULL UNIQUE
active_org_id uuid NOT NULL
expires_at timestamptz NOT NULL
revoked_at timestamptz NULL
created_at timestamptz NOT NULL
```
### Auth Endpoints
| Method | Endpoint | Description |
|--------|----------|-------------|
| POST | `/v1/auth/register` | Create user + default org |
| POST | `/v1/auth/login` | Authenticate, return tokens |
| POST | `/v1/auth/refresh` | Rotate refresh token |
| POST | `/v1/auth/switch-org` | Switch active org context |
| POST | `/v1/auth/logout` | Revoke refresh token |
#### Registration Flow
On `POST /v1/auth/register { email, password, displayName }`:
1. Create user record
2. Create a default org automatically (e.g., "John's Org")
3. Create org_member with role=Admin
4. Return access + refresh tokens
---
## 3. Authorization Rules (implicit org scope)
### Request Context
Middleware extracts from JWT:
- `UserId` from `sub`
- `OrgId` from `org_id`
- `Role` from `org_role`
### Authorization Approach
- **Role check**: enforce viewer/member/admin by claim
- **Ownership check**: for any resource ID in path, load its `org_id` from DB and require it equals token `org_id`
- Prevents cross-tenant IDOR (insecure direct object reference) even though the org ID isn't in the URL
### Role Permissions
| Role | Permissions |
|------|-------------|
| viewer | Read-only access |
| member | Create incidents, transitions, comments |
| admin | Manage members, notification targets, on-call schedules |
---
## 4. API Surface (implicit org in JWT)
All routes under `/v1`. Unless noted, routes require auth.
### Auth
| Method | Endpoint | Auth | Description |
|--------|----------|------|-------------|
| POST | `/auth/register` | No | Register new user |
| POST | `/auth/login` | No | Login |
| POST | `/auth/refresh` | No | Refresh tokens |
| POST | `/auth/switch-org` | No | Switch org context |
| POST | `/auth/logout` | No | Logout |
| GET | `/me` | Yes | Get current user info |
### Org (current org context)
| Method | Endpoint | Role | Description |
|--------|----------|------|-------------|
| GET | `/org` | viewer+ | Current org summary + role |
| GET | `/org/members` | admin | List org members |
| POST | `/org/members` | admin | Invite/add member (stretch) |
| GET | `/org/services` | viewer+ | List services |
| POST | `/org/services` | member+ | Create service |
| GET | `/org/notification-targets` | admin | List notification targets |
| POST | `/org/notification-targets` | admin | Create notification target |
### Incidents
| Method | Endpoint | Role | Description |
|--------|----------|------|-------------|
| GET | `/incidents` | viewer+ | List incidents (cursor pagination) |
| POST | `/services/{serviceId}/incidents` | member+ | Create incident |
| GET | `/incidents/{incidentId}` | viewer+ | Get incident detail |
| GET | `/incidents/{incidentId}/events` | viewer+ | Get incident timeline |
| POST | `/incidents/{incidentId}/transition` | member+ | Transition incident state |
| POST | `/incidents/{incidentId}/comment` | member+ | Add comment |
### Health
| Method | Endpoint | Description |
|--------|----------|-------------|
| GET | `/healthz` | Liveness probe |
| GET | `/readyz` | Readiness probe (checks Postgres + Redis) |
---
## 5. Domain Workflows
### Incident State Machine
```
Triggered → Acknowledged → Mitigated → Resolved
```
### Enforcement
- Application-level validation (allowed transitions)
- DB optimistic concurrency using `incidents.version`
### Transition Write Pattern
```sql
UPDATE incidents
SET status = @newStatus, version = version + 1, updated_at = NOW()
WHERE id = @id AND org_id = @orgId AND version = @expectedVersion
```
- If 0 rows are updated → return `409 Conflict` (the client holds a stale version) or `404 Not Found` if the incident does not exist in the org
### Timeline Model
Append-only `incident_events` records for:
- Incident created
- Transitions (ack, mitigate, resolve)
- Comments
- Notifications sent/failed
- Escalations triggered
`actor_user_id` is null for system/worker actions.
---
## 6. PostgreSQL Schema (core tables)
### Users
```sql
CREATE TABLE users (
id uuid PRIMARY KEY,
email text NOT NULL UNIQUE,
password_hash text NOT NULL,
display_name text NOT NULL,
created_at timestamptz NOT NULL DEFAULT NOW()
);
```
### Orgs
```sql
CREATE TABLE orgs (
id uuid PRIMARY KEY,
name text NOT NULL,
slug text NOT NULL UNIQUE,
created_at timestamptz NOT NULL DEFAULT NOW()
);
```
### Org Members
```sql
CREATE TABLE org_members (
id uuid PRIMARY KEY,
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
user_id uuid NOT NULL REFERENCES users(id) ON DELETE CASCADE,
role text NOT NULL CHECK (role IN ('admin', 'member', 'viewer')),
created_at timestamptz NOT NULL DEFAULT NOW(),
UNIQUE(org_id, user_id)
);
```
### Services
```sql
CREATE TABLE services (
id uuid PRIMARY KEY,
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
name text NOT NULL,
slug text NOT NULL,
description text,
created_at timestamptz NOT NULL DEFAULT NOW(),
UNIQUE(org_id, slug)
);
```
### Incidents
```sql
CREATE TABLE incidents (
id uuid PRIMARY KEY,
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
service_id uuid NOT NULL REFERENCES services(id) ON DELETE CASCADE,
title text NOT NULL,
description text,
status text NOT NULL DEFAULT 'triggered'
CHECK (status IN ('triggered', 'acknowledged', 'mitigated', 'resolved')),
severity text NOT NULL DEFAULT 'sev3'
CHECK (severity IN ('sev1', 'sev2', 'sev3', 'sev4')),
version integer NOT NULL DEFAULT 1,
created_at timestamptz NOT NULL DEFAULT NOW(),
updated_at timestamptz
);
CREATE INDEX idx_incidents_org_status ON incidents(org_id, status);
```
### Incident Events
```sql
CREATE TABLE incident_events (
id uuid PRIMARY KEY,
incident_id uuid NOT NULL REFERENCES incidents(id) ON DELETE CASCADE,
event_type text NOT NULL,
actor_user_id uuid REFERENCES users(id),
payload jsonb,
created_at timestamptz NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_incident_events_incident ON incident_events(incident_id, created_at);
```
### Notification Targets
```sql
CREATE TABLE notification_targets (
id uuid PRIMARY KEY,
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
name text NOT NULL,
target_type text NOT NULL CHECK (target_type IN ('webhook', 'email', 'slack')),
configuration text NOT NULL,
is_enabled boolean NOT NULL DEFAULT true,
created_at timestamptz NOT NULL DEFAULT NOW(),
updated_at timestamptz
);
```
### Notification Attempts
```sql
CREATE TABLE notification_attempts (
id uuid PRIMARY KEY,
incident_id uuid NOT NULL REFERENCES incidents(id) ON DELETE CASCADE,
target_id uuid NOT NULL REFERENCES notification_targets(id) ON DELETE CASCADE,
success boolean NOT NULL,
error_message text,
attempt_number integer NOT NULL DEFAULT 1,
created_at timestamptz NOT NULL DEFAULT NOW(),
UNIQUE(incident_id, target_id)
);
```
### Refresh Tokens
```sql
CREATE TABLE refresh_tokens (
id uuid PRIMARY KEY,
user_id uuid NOT NULL REFERENCES users(id) ON DELETE CASCADE,
token_hash text NOT NULL UNIQUE,
active_org_id uuid NOT NULL REFERENCES orgs(id),
expires_at timestamptz NOT NULL,
revoked_at timestamptz,
created_at timestamptz NOT NULL DEFAULT NOW()
);
CREATE INDEX idx_refresh_tokens_user ON refresh_tokens(user_id);
```
---
## 7. Data Access (Dapper) and Migrations (FluentMigrator)
### Dapper Conventions
- Repositories receive `OrgId` as an explicit parameter and include it in WHERE clauses
- Keep SQL close to repositories (or separate `.sql` files)
- Use `NpgsqlConnection` + `IDbTransaction` for multi-statement operations
### FluentMigrator
| Migration | Tables |
|-----------|--------|
| 0001 | users, orgs, org_members, services, incidents, incident_events |
| 0002 | refresh_tokens |
| 0003 | notification_targets, notification_attempts |
---
## 8. Hangfire Job Design (Redis storage)
### Setup
- API configures Hangfire Client (enqueue)
- Worker hosts Hangfire Server (process)
### Queues
| Queue | Purpose |
|-------|---------|
| critical | Escalations |
| default | Notifications |
| low | Rollups |
### Jobs
#### 1. IncidentTriggeredJob(incidentId)
- Reads incident (must belong to org in incident row)
- Loads enabled notification targets for the org
- Inserts `notification_attempts` rows (idempotent)
- Enqueues per-target send jobs
#### 2. SendWebhookNotificationJob(incidentId, targetId)
- Attempts HTTP POST with incident summary payload
- Updates attempt status + writes `incident_event` of type `system.notification_sent` or `system.notification_failed`
- Throws on transient failures to trigger retry; safe due to DB idempotency
#### 3. EscalateIfUnackedJob(incidentId, step) (stretch)
- Runs delayed
- Checks status; if still Triggered, sends secondary notifications
### Operational Note
- Expose the Hangfire Dashboard **only in local development** and protect it (basic auth or require a dev token)
---
## 9. Kubernetes (kind) + Helm + Skaffold (local-only)
### Helm Umbrella Chart Deploys
- bitnami/postgresql
- bitnami/redis
- api Deployment/Service
- worker Deployment
- web Deployment/Service
- Ingress with host `incidentops.local`:
- `/api`, `/v1`, `/healthz`, `/readyz` → api-service
- `/` → web
### Configuration via Environment
| Variable | Description |
|----------|-------------|
| `ConnectionStrings__Postgres` | PostgreSQL connection string |
| `Redis__ConnectionString` | Redis connection string |
| `Jwt__Issuer` | JWT issuer |
| `Jwt__Audience` | JWT audience |
| `Jwt__SigningKey` | JWT signing key (secret) |
### Readiness
- API checks Postgres + Redis
- Worker checks Postgres + Redis at startup
### Skaffold
- Builds three images (api, worker, web)
- `helm upgrade --install` on changes
---
## 10. Frontend UX Requirements (implicit org)
- On login, display `activeOrg` from response
- Org switcher calls `/v1/auth/switch-org` and replaces tokens
- All subsequent API calls use only `Authorization` header; no orgId params
- Store tokens in localStorage or secure cookie
- Handle 401 by attempting token refresh
---
## 11. Key Highlights (README/Resume)
- "Multi-tenant org context embedded in JWT; org switching re-issues tokens."
- "DB ownership checks prevent cross-tenant resource access."
- "Optimistic concurrency for incident transitions."
- "Background jobs with retries + idempotent notification attempts."
- "Deployed locally to Kubernetes via Helm + Skaffold."
---
## 12. Technology Stack
| Layer | Technology |
|-------|------------|
| Runtime | .NET 10 |
| API Framework | ASP.NET Core |
| Worker | .NET Worker Service |
| Background Jobs | Hangfire with Redis |
| Database | PostgreSQL |
| ORM | Dapper |
| Migrations | FluentMigrator |
| Auth | JWT Bearer + BCrypt |
| Frontend | Next.js 14 + TypeScript |
| Container | Docker |
| Orchestration | Kubernetes (kind) |
| Deployment | Helm + Skaffold |
---
## 13. Local Development
### Prerequisites
- .NET 10 SDK
- Node.js 20+
- Docker
- kind (Kubernetes in Docker)
- Helm
- Skaffold
### Quick Start
```bash
# With Docker Compose (simplest)
docker-compose up -d
# Run API
cd src/IncidentOps.Api
dotnet run
# Run Worker (separate terminal)
cd src/IncidentOps.Worker
dotnet run
# Run Web (separate terminal)
cd web
npm install
npm run dev
```
### With Kubernetes (kind)
```bash
# Create cluster
kind create cluster --name incidentops
# Deploy with Skaffold
skaffold dev
# Access at http://incidentops.local (add to /etc/hosts)
```
---
## 14. API Request/Response Examples
### Register
```http
POST /v1/auth/register
Content-Type: application/json
{
"email": "user@example.com",
"password": "SecurePass123!",
"displayName": "John Doe"
}
```
Response:
```json
{
"accessToken": "eyJhbG...",
"refreshToken": "a1b2c3d4...",
"activeOrg": {
"id": "uuid",
"name": "John Doe's Org",
"slug": "org-abc123",
"role": "admin"
}
}
```
### Create Incident
```http
POST /v1/services/{serviceId}/incidents
Authorization: Bearer {accessToken}
Content-Type: application/json
{
"title": "Database connection timeout",
"description": "Users experiencing slow queries",
"severity": "sev2"
}
```
### Transition Incident
```http
POST /v1/incidents/{incidentId}/transition
Authorization: Bearer {accessToken}
Content-Type: application/json
{
"action": "ack",
"expectedVersion": 1
}
```
+11 -10
View File
@@ -1,15 +1,16 @@
apiVersion: v2 apiVersion: v2
name: incidentops name: incidentops
description: A Helm chart for IncidentOps - Incident Management Platform description: IncidentOps - Incident Management Platform
type: application type: application
version: 0.1.0 version: 0.1.0
appVersion: "0.1.0" appVersion: "1.0.0"
keywords: dependencies:
- incidentops - name: postgresql
- incident-management version: "14.0.0"
- on-call repository: "https://charts.bitnami.com/bitnami"
- alerting condition: postgresql.enabled
- name: redis
maintainers: version: "18.0.0"
- name: IncidentOps Team repository: "https://charts.bitnami.com/bitnami"
condition: redis.enabled
-33
View File
@@ -1,33 +0,0 @@
IncidentOps has been deployed!
{{- if .Values.ingress.enabled }}
Access the application at:
http{{ if $.Values.ingress.tls }}s{{ end }}://{{ .Values.ingress.host }}
{{- else }}
To access the application, run:
API:
kubectl port-forward svc/{{ include "incidentops.fullname" . }}-api {{ .Values.api.service.port }}:{{ .Values.api.service.port }} -n {{ .Release.Namespace }}
Then open: http://localhost:{{ .Values.api.service.port }}
Web:
kubectl port-forward svc/{{ include "incidentops.fullname" . }}-web {{ .Values.web.service.port }}:{{ .Values.web.service.port }} -n {{ .Release.Namespace }}
Then open: http://localhost:{{ .Values.web.service.port }}
{{- end }}
To check the status of your deployment:
kubectl get pods -n {{ .Release.Namespace }} -l "app.kubernetes.io/instance={{ .Release.Name }}"
{{- if .Values.migration.enabled }}
Database migrations will run automatically as a Helm hook.
Check migration status:
kubectl get jobs -n {{ .Release.Namespace }} -l "app.kubernetes.io/component=migration"
{{- end }}
For more information, visit the documentation.
+6 -161
View File
@@ -49,170 +49,15 @@ app.kubernetes.io/instance: {{ .Release.Name }}
{{- end }} {{- end }}
{{/* {{/*
API labels PostgreSQL connection string
*/}} */}}
{{- define "incidentops.api.labels" -}} {{- define "incidentops.postgresConnectionString" -}}
{{ include "incidentops.labels" . }} Host={{ .Release.Name }}-postgresql;Port=5432;Database={{ .Values.postgresql.auth.database }};Username={{ .Values.postgresql.auth.username }};Password={{ .Values.postgresql.auth.password }}
app.kubernetes.io/component: api
{{- end }}
{{- define "incidentops.api.selectorLabels" -}}
{{ include "incidentops.selectorLabels" . }}
app.kubernetes.io/component: api
{{- end }} {{- end }}
{{/* {{/*
Worker labels Redis connection string
*/}} */}}
{{- define "incidentops.worker.labels" -}} {{- define "incidentops.redisConnectionString" -}}
{{ include "incidentops.labels" . }} {{ .Release.Name }}-redis-master:6379
app.kubernetes.io/component: worker
{{- end }}
{{- define "incidentops.worker.selectorLabels" -}}
{{ include "incidentops.selectorLabels" . }}
app.kubernetes.io/component: worker
{{- end }}
{{/*
Web labels
*/}}
{{- define "incidentops.web.labels" -}}
{{ include "incidentops.labels" . }}
app.kubernetes.io/component: web
{{- end }}
{{- define "incidentops.web.selectorLabels" -}}
{{ include "incidentops.selectorLabels" . }}
app.kubernetes.io/component: web
{{- end }}
{{/*
Create the name of the service account to use
*/}}
{{- define "incidentops.serviceAccountName" -}}
{{- if .Values.serviceAccount.create }}
{{- default (include "incidentops.fullname" .) .Values.serviceAccount.name }}
{{- else }}
{{- default "default" .Values.serviceAccount.name }}
{{- end }}
{{- end }}
{{/*
PostgreSQL host
*/}}
{{- define "incidentops.postgresql.host" -}}
{{- if .Values.postgresql.enabled }}
{{- printf "%s-postgresql" (include "incidentops.fullname" .) }}
{{- else }}
{{- .Values.externalDatabase.host }}
{{- end }}
{{- end }}
{{/*
PostgreSQL port
*/}}
{{- define "incidentops.postgresql.port" -}}
{{- if .Values.postgresql.enabled }}
{{- printf "5432" }}
{{- else }}
{{- .Values.externalDatabase.port | default "5432" }}
{{- end }}
{{- end }}
{{/*
Database URL
*/}}
{{- define "incidentops.databaseUrl" -}}
{{- $host := include "incidentops.postgresql.host" . }}
{{- $port := include "incidentops.postgresql.port" . }}
{{- if .Values.postgresql.enabled }}
{{- printf "postgresql://%s:%s@%s:%s/%s" .Values.postgresql.auth.username .Values.postgresql.auth.password $host $port .Values.postgresql.auth.database }}
{{- else }}
{{- printf "postgresql://%s:%s@%s:%s/%s" .Values.externalDatabase.user .Values.externalDatabase.password $host $port .Values.externalDatabase.database }}
{{- end }}
{{- end }}
{{/*
Redis host
*/}}
{{- define "incidentops.redis.host" -}}
{{- if .Values.redis.enabled }}
{{- printf "%s-redis" (include "incidentops.fullname" .) }}
{{- else }}
{{- .Values.externalRedis.host }}
{{- end }}
{{- end }}
{{/*
Redis URL
*/}}
{{- define "incidentops.redisUrl" -}}
{{- $host := include "incidentops.redis.host" . }}
{{- if .Values.redis.enabled }}
{{- printf "redis://%s:6379/0" $host }}
{{- else }}
{{- printf "redis://%s:%s/%s" $host (.Values.externalRedis.port | default "6379") (.Values.externalRedis.database | default "0") }}
{{- end }}
{{- end }}
{{/*
Celery broker URL
*/}}
{{- define "incidentops.celeryBrokerUrl" -}}
{{ include "incidentops.redisUrl" . }}
{{- end }}
{{/*
Celery result backend URL
*/}}
{{- define "incidentops.celeryResultBackend" -}}
{{- $host := include "incidentops.redis.host" . }}
{{- if .Values.redis.enabled }}
{{- printf "redis://%s:6379/1" $host }}
{{- else }}
{{- printf "redis://%s:%s/%s" $host (.Values.externalRedis.port | default "6379") (add (.Values.externalRedis.database | default 0) 1) }}
{{- end }}
{{- end }}
{{/*
API image
*/}}
{{- define "incidentops.api.image" -}}
{{- $registry := .Values.global.imageRegistry | default "" }}
{{- $repository := .Values.api.image.repository }}
{{- $tag := .Values.api.image.tag | default .Chart.AppVersion }}
{{- if $registry }}
{{- printf "%s/%s:%s" $registry $repository $tag }}
{{- else }}
{{- printf "%s:%s" $repository $tag }}
{{- end }}
{{- end }}
{{/*
Worker image
*/}}
{{- define "incidentops.worker.image" -}}
{{- $registry := .Values.global.imageRegistry | default "" }}
{{- $repository := .Values.worker.image.repository }}
{{- $tag := .Values.worker.image.tag | default .Chart.AppVersion }}
{{- if $registry }}
{{- printf "%s/%s:%s" $registry $repository $tag }}
{{- else }}
{{- printf "%s:%s" $repository $tag }}
{{- end }}
{{- end }}
{{/*
Web image
*/}}
{{- define "incidentops.web.image" -}}
{{- $registry := .Values.global.imageRegistry | default "" }}
{{- $repository := .Values.web.image.repository }}
{{- $tag := .Values.web.image.tag | default .Chart.AppVersion }}
{{- if $registry }}
{{- printf "%s/%s:%s" $registry $repository $tag }}
{{- else }}
{{- printf "%s:%s" $repository $tag }}
{{- end }}
{{- end }} {{- end }}
+34 -77
View File
@@ -3,102 +3,59 @@ kind: Deployment
metadata: metadata:
name: {{ include "incidentops.fullname" . }}-api name: {{ include "incidentops.fullname" . }}-api
labels: labels:
{{- include "incidentops.api.labels" . | nindent 4 }} {{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: api
spec: spec:
{{- if not .Values.api.autoscaling.enabled }} replicas: {{ .Values.api.replicas }}
replicas: {{ .Values.api.replicaCount }}
{{- end }}
selector: selector:
matchLabels: matchLabels:
{{- include "incidentops.api.selectorLabels" . | nindent 6 }} {{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: api
template: template:
metadata: metadata:
annotations:
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }}
{{- with .Values.api.podAnnotations }}
{{- toYaml . | nindent 8 }}
{{- end }}
labels: labels:
{{- include "incidentops.api.selectorLabels" . | nindent 8 }} {{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: api
spec: spec:
{{- with .Values.global.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: wait-for-postgres
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
echo "Waiting for PostgreSQL..."
sleep 2
done
echo "PostgreSQL is ready"
- name: wait-for-redis
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
echo "Waiting for Redis..."
sleep 2
done
echo "Redis is ready"
containers: containers:
- name: api - name: api
securityContext: image: "{{ .Values.api.image }}:{{ .Values.api.tag }}"
{{- toYaml .Values.securityContext | nindent 12 }} imagePullPolicy: IfNotPresent
image: {{ include "incidentops.api.image" . }}
imagePullPolicy: {{ .Values.api.image.pullPolicy }}
ports: ports:
- name: http - name: http
containerPort: 8000 containerPort: {{ .Values.api.port }}
protocol: TCP protocol: TCP
{{- if .Values.metrics.enabled }} env:
- name: metrics - name: ConnectionStrings__Postgres
containerPort: {{ .Values.metrics.port }} value: {{ include "incidentops.postgresConnectionString" . | quote }}
protocol: TCP - name: Redis__ConnectionString
{{- end }} value: {{ include "incidentops.redisConnectionString" . | quote }}
envFrom: - name: Jwt__Issuer
- configMapRef: value: {{ .Values.jwt.issuer | quote }}
name: {{ include "incidentops.fullname" . }}-config - name: Jwt__Audience
- secretRef: value: {{ .Values.jwt.audience | quote }}
name: {{ include "incidentops.fullname" . }}-secret - name: Jwt__SigningKey
valueFrom:
secretKeyRef:
name: {{ include "incidentops.fullname" . }}-secrets
key: jwt-signing-key
- name: Jwt__AccessTokenExpirationMinutes
value: {{ .Values.jwt.accessTokenExpirationMinutes | quote }}
- name: Jwt__RefreshTokenExpirationDays
value: {{ .Values.jwt.refreshTokenExpirationDays | quote }}
- name: Cors__Origins__0
value: "http://{{ .Values.ingress.host }}"
livenessProbe: livenessProbe:
httpGet: httpGet:
path: /v1/healthz path: /healthz
port: http port: http
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 30 periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
readinessProbe: readinessProbe:
httpGet: httpGet:
path: /v1/readyz path: /readyz
port: http port: http
initialDelaySeconds: 5 initialDelaySeconds: 5
periodSeconds: 10 periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
resources: resources:
{{- toYaml .Values.api.resources | nindent 12 }} {{- toYaml .Values.api.resources | nindent 12 }}
{{- with .Values.api.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.api.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.api.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
-22
View File
@@ -1,22 +0,0 @@
{{- if .Values.api.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
name: {{ include "incidentops.fullname" . }}-api
labels:
{{- include "incidentops.api.labels" . | nindent 4 }}
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: {{ include "incidentops.fullname" . }}-api
minReplicas: {{ .Values.api.autoscaling.minReplicas }}
maxReplicas: {{ .Values.api.autoscaling.maxReplicas }}
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: {{ .Values.api.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
+6 -10
View File
@@ -3,19 +3,15 @@ kind: Service
metadata: metadata:
name: {{ include "incidentops.fullname" . }}-api name: {{ include "incidentops.fullname" . }}-api
labels: labels:
{{- include "incidentops.api.labels" . | nindent 4 }} {{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: api
spec: spec:
type: {{ .Values.api.service.type }} type: ClusterIP
ports: ports:
- port: {{ .Values.api.service.port }} - port: {{ .Values.api.port }}
targetPort: http targetPort: http
protocol: TCP protocol: TCP
name: http name: http
{{- if .Values.metrics.enabled }}
- port: {{ .Values.metrics.port }}
targetPort: metrics
protocol: TCP
name: metrics
{{- end }}
selector: selector:
{{- include "incidentops.api.selectorLabels" . | nindent 4 }} {{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: api
-23
View File
@@ -1,23 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
data:
JWT_ALGORITHM: {{ .Values.config.jwtAlgorithm | quote }}
ACCESS_TOKEN_EXPIRE_MINUTES: {{ .Values.config.accessTokenExpireMinutes | quote }}
REFRESH_TOKEN_EXPIRE_DAYS: {{ .Values.config.refreshTokenExpireDays | quote }}
# OpenTelemetry configuration
OTEL_ENABLED: {{ .Values.observability.enabled | quote }}
OTEL_SERVICE_NAME: "incidentops-api"
OTEL_ENVIRONMENT: {{ .Values.config.environment | default "production" | quote }}
{{- if .Values.observability.enabled }}
OTEL_EXPORTER_OTLP_ENDPOINT: "http://{{ include "incidentops.fullname" . }}-otel-collector:4317"
{{- end }}
OTEL_EXPORTER_OTLP_INSECURE: "true"
OTEL_LOG_LEVEL: {{ .Values.config.logLevel | default "INFO" | quote }}
# Metrics configuration
{{- if .Values.metrics.enabled }}
PROMETHEUS_PORT: {{ .Values.metrics.port | quote }}
{{- end }}
@@ -1,387 +0,0 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-grafana-datasources
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
data:
datasources.yaml: |
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
uid: prometheus
url: http://{{ include "incidentops.fullname" . }}-prometheus:9090
access: proxy
isDefault: false
jsonData:
httpMethod: POST
exemplarTraceIdDestinations:
- name: trace_id
datasourceUid: tempo
- name: Tempo
type: tempo
uid: tempo
url: http://{{ include "incidentops.fullname" . }}-tempo:3200
access: proxy
isDefault: false
jsonData:
tracesToLogsV2:
datasourceUid: loki
spanStartTimeShift: '-1h'
spanEndTimeShift: '1h'
filterByTraceID: true
filterBySpanID: true
tracesToMetrics:
datasourceUid: prometheus
spanStartTimeShift: '-1h'
spanEndTimeShift: '1h'
serviceMap:
datasourceUid: prometheus
nodeGraph:
enabled: true
lokiSearch:
datasourceUid: loki
- name: Loki
type: loki
uid: loki
url: http://{{ include "incidentops.fullname" . }}-loki:3100
access: proxy
isDefault: true
jsonData:
derivedFields:
- datasourceUid: tempo
matcherRegex: '"trace_id":"([a-f0-9]+)"'
name: TraceID
url: '$${__value.raw}'
urlDisplayLabel: 'View Trace'
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
data:
dashboards.yaml: |
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: 'IncidentOps'
folderUid: 'incidentops'
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
data:
api-overview.json: |
{
"title": "IncidentOps API Overview",
"uid": "incidentops-api",
"tags": ["incidentops", "api"],
"timezone": "browser",
"editable": true,
"panels": [
{
"id": 1,
"title": "Request Rate",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "Requests/sec",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps"
}
}
},
{
"id": 2,
"title": "Request Duration (p50, p95, p99)",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p50",
"refId": "A"
},
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p95",
"refId": "B"
},
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p99",
"refId": "C"
}
],
"fieldConfig": {
"defaults": {
"unit": "s"
}
}
},
{
"id": 3,
"title": "Error Rate",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
"legendFormat": "Error %",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "percent",
"min": 0,
"max": 100
}
}
},
{
"id": 4,
"title": "Requests by Status Code",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "{{ "{{" }}http_status_code{{ "}}" }}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps"
}
}
},
{
"id": 5,
"title": "Requests by Endpoint",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "{{ "{{" }}http_route{{ "}}" }}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"unit": "reqps"
}
}
},
{
"id": 6,
"title": "Recent Logs",
"type": "logs",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 16},
"targets": [
{
"datasource": {"type": "loki", "uid": "loki"},
"expr": "{service_name=\"incidentops-api\"} | json",
"refId": "A"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true,
"sortOrder": "Descending"
}
},
{
"id": 7,
"title": "Recent Traces",
"type": "traces",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 26},
"targets": [
{
"datasource": {"type": "tempo", "uid": "tempo"},
"queryType": "traceqlSearch",
"filters": [
{
"id": "service-name",
"operator": "=",
"scope": "resource",
"tag": "service.name",
"value": ["incidentops-api"]
}
],
"refId": "A"
}
]
}
],
"schemaVersion": 38,
"version": 2
}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
spec:
replicas: 1
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: grafana
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: grafana
annotations:
checksum/datasources: {{ .Values.observability.grafana.image.tag | sha256sum }}
spec:
securityContext:
fsGroup: 472
runAsUser: 472
containers:
- name: grafana
image: "{{ .Values.observability.grafana.image.repository }}:{{ .Values.observability.grafana.image.tag }}"
imagePullPolicy: {{ .Values.observability.grafana.image.pullPolicy }}
ports:
- name: http
containerPort: 3000
protocol: TCP
env:
- name: GF_SECURITY_ADMIN_USER
value: {{ .Values.observability.grafana.adminUser | quote }}
- name: GF_SECURITY_ADMIN_PASSWORD
valueFrom:
secretKeyRef:
name: {{ include "incidentops.fullname" . }}-grafana
key: admin-password
- name: GF_USERS_ALLOW_SIGN_UP
value: "false"
- name: GF_EXPLORE_ENABLED
value: "true"
- name: GF_FEATURE_TOGGLES_ENABLE
value: "traceqlEditor tempoSearch tempoBackendSearch tempoApmTable"
volumeMounts:
- name: datasources
mountPath: /etc/grafana/provisioning/datasources
- name: dashboards-provider
mountPath: /etc/grafana/provisioning/dashboards
- name: dashboards
mountPath: /var/lib/grafana/dashboards
- name: data
mountPath: /var/lib/grafana
resources:
{{- toYaml .Values.observability.grafana.resources | nindent 12 }}
readinessProbe:
httpGet:
path: /api/health
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /api/health
port: http
initialDelaySeconds: 30
periodSeconds: 30
volumes:
- name: datasources
configMap:
name: {{ include "incidentops.fullname" . }}-grafana-datasources
- name: dashboards-provider
configMap:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
- name: dashboards
configMap:
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
- name: data
{{- if .Values.observability.grafana.persistence.enabled }}
persistentVolumeClaim:
claimName: {{ include "incidentops.fullname" . }}-grafana
{{- else }}
emptyDir: {}
{{- end }}
---
apiVersion: v1
kind: Secret
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
type: Opaque
data:
admin-password: {{ .Values.observability.grafana.adminPassword | b64enc | quote }}
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
spec:
type: {{ .Values.observability.grafana.service.type }}
ports:
- name: http
port: 80
targetPort: http
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: grafana
{{- if .Values.observability.grafana.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.observability.grafana.persistence.size }}
{{- end }}
{{- end }}
@@ -1,38 +0,0 @@
{{- if and .Values.observability.enabled .Values.observability.grafana.ingress.enabled -}}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: {{ include "incidentops.fullname" . }}-grafana
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: grafana
{{- with .Values.observability.grafana.ingress.annotations }}
annotations:
{{- toYaml . | nindent 4 }}
{{- end }}
spec:
{{- if .Values.ingress.className }}
ingressClassName: {{ .Values.ingress.className }}
{{- end }}
{{- if .Values.observability.grafana.ingress.tls }}
tls:
{{- range .Values.observability.grafana.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules:
- host: {{ .Values.observability.grafana.ingress.host | quote }}
http:
paths:
- path: /
pathType: Prefix
backend:
service:
name: {{ include "incidentops.fullname" . }}-grafana
port:
number: 80
{{- end }}
+17 -13
View File
@@ -13,16 +13,6 @@ spec:
{{- if .Values.ingress.className }} {{- if .Values.ingress.className }}
ingressClassName: {{ .Values.ingress.className }} ingressClassName: {{ .Values.ingress.className }}
{{- end }} {{- end }}
{{- if .Values.ingress.tls }}
tls:
{{- range .Values.ingress.tls }}
- hosts:
{{- range .hosts }}
- {{ . | quote }}
{{- end }}
secretName: {{ .secretName }}
{{- end }}
{{- end }}
rules: rules:
- host: {{ .Values.ingress.host | quote }} - host: {{ .Values.ingress.host | quote }}
http: http:
@@ -33,19 +23,33 @@ spec:
service: service:
name: {{ include "incidentops.fullname" . }}-api name: {{ include "incidentops.fullname" . }}-api
port: port:
number: {{ .Values.api.service.port }} number: {{ .Values.api.port }}
- path: /v1 - path: /v1
pathType: Prefix pathType: Prefix
backend: backend:
service: service:
name: {{ include "incidentops.fullname" . }}-api name: {{ include "incidentops.fullname" . }}-api
port: port:
number: {{ .Values.api.service.port }} number: {{ .Values.api.port }}
- path: /healthz
pathType: Exact
backend:
service:
name: {{ include "incidentops.fullname" . }}-api
port:
number: {{ .Values.api.port }}
- path: /readyz
pathType: Exact
backend:
service:
name: {{ include "incidentops.fullname" . }}-api
port:
number: {{ .Values.api.port }}
- path: / - path: /
pathType: Prefix pathType: Prefix
backend: backend:
service: service:
name: {{ include "incidentops.fullname" . }}-web name: {{ include "incidentops.fullname" . }}-web
port: port:
number: {{ .Values.web.service.port }} number: {{ .Values.web.port }}
{{- end }} {{- end }}
@@ -1,155 +0,0 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-loki-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
data:
loki.yaml: |
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
common:
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
query_range:
results_cache:
cache:
embedded_cache:
enabled: true
max_size_mb: 100
schema_config:
configs:
- from: "2020-10-24"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
ruler:
alertmanager_url: http://localhost:9093
limits_config:
retention_period: {{ .Values.observability.loki.retention }}
allow_structured_metadata: true
volume_enabled: true
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-loki
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
spec:
replicas: 1
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: loki
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: loki
annotations:
checksum/config: {{ .Values.observability.loki.image.tag | sha256sum }}
spec:
containers:
- name: loki
image: "{{ .Values.observability.loki.image.repository }}:{{ .Values.observability.loki.image.tag }}"
imagePullPolicy: {{ .Values.observability.loki.image.pullPolicy }}
args:
- -config.file=/etc/loki/loki.yaml
ports:
- name: http
containerPort: 3100
protocol: TCP
- name: grpc
containerPort: 9096
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/loki
- name: data
mountPath: /loki
resources:
{{- toYaml .Values.observability.loki.resources | nindent 12 }}
readinessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 10
periodSeconds: 10
livenessProbe:
httpGet:
path: /ready
port: http
initialDelaySeconds: 30
periodSeconds: 30
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-loki-config
- name: data
{{- if .Values.observability.loki.persistence.enabled }}
persistentVolumeClaim:
claimName: {{ include "incidentops.fullname" . }}-loki
{{- else }}
emptyDir: {}
{{- end }}
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-loki
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
spec:
type: ClusterIP
ports:
- name: http
port: 3100
targetPort: http
protocol: TCP
- name: grpc
port: 9096
targetPort: grpc
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: loki
{{- if .Values.observability.loki.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: {{ include "incidentops.fullname" . }}-loki
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: loki
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: {{ .Values.observability.loki.persistence.size }}
{{- end }}
{{- end }}
@@ -1,51 +0,0 @@
{{- if .Values.migration.enabled }}
apiVersion: batch/v1
kind: Job
metadata:
name: {{ include "incidentops.fullname" . }}-migrate
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: migration
annotations:
"helm.sh/hook": post-install,post-upgrade
"helm.sh/hook-weight": "-5"
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
backoffLimit: {{ .Values.migration.backoffLimit }}
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: migration
spec:
{{- with .Values.global.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
restartPolicy: Never
containers:
- name: migrate
securityContext:
{{- toYaml .Values.securityContext | nindent 12 }}
image: "{{ .Values.migration.image.repository }}:{{ .Values.migration.image.tag }}"
imagePullPolicy: {{ .Values.migration.image.pullPolicy }}
command:
- uv
- run
- python
- migrations/migrate.py
- apply
envFrom:
- secretRef:
name: {{ include "incidentops.fullname" . }}-secret
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
{{- end }}
@@ -1,132 +0,0 @@
{{- if .Values.observability.enabled }}
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ include "incidentops.fullname" . }}-otel-collector-config
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
data:
otel-collector-config.yaml: |
extensions:
health_check:
endpoint: 0.0.0.0:13133
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 1s
send_batch_size: 1024
memory_limiter:
check_interval: 1s
limit_mib: 512
spike_limit_mib: 128
exporters:
otlp/tempo:
endpoint: {{ include "incidentops.fullname" . }}-tempo:4317
tls:
insecure: true
loki:
endpoint: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
default_labels_enabled:
exporter: true
job: true
service:
extensions: [health_check]
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/tempo]
logs:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [loki]
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: {{ include "incidentops.fullname" . }}-otel-collector
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
spec:
replicas: {{ .Values.observability.otelCollector.replicaCount }}
selector:
matchLabels:
{{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: otel-collector
template:
metadata:
labels:
{{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: otel-collector
annotations:
checksum/config: {{ .Values.observability.otelCollector.image.tag | sha256sum }}
spec:
containers:
- name: otel-collector
image: "{{ .Values.observability.otelCollector.image.repository }}:{{ .Values.observability.otelCollector.image.tag }}"
imagePullPolicy: {{ .Values.observability.otelCollector.image.pullPolicy }}
args:
- --config=/etc/otel-collector/otel-collector-config.yaml
ports:
- name: otlp-grpc
containerPort: 4317
protocol: TCP
- name: otlp-http
containerPort: 4318
protocol: TCP
volumeMounts:
- name: config
mountPath: /etc/otel-collector
resources:
{{- toYaml .Values.observability.otelCollector.resources | nindent 12 }}
livenessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 10
periodSeconds: 30
readinessProbe:
httpGet:
path: /
port: 13133
initialDelaySeconds: 5
periodSeconds: 10
volumes:
- name: config
configMap:
name: {{ include "incidentops.fullname" . }}-otel-collector-config
---
apiVersion: v1
kind: Service
metadata:
name: {{ include "incidentops.fullname" . }}-otel-collector
labels:
{{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
spec:
type: ClusterIP
ports:
- name: otlp-grpc
port: 4317
targetPort: otlp-grpc
protocol: TCP
- name: otlp-http
port: 4318
targetPort: otlp-http
protocol: TCP
selector:
{{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: otel-collector
{{- end }}
@@ -1,91 +0,0 @@
{{- if .Values.postgresql.enabled }}
# Single-node PostgreSQL for in-cluster use. A StatefulSet (replicas: 1) is
# used so the pod keeps a stable identity and its PVC across reschedules.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: {{ include "incidentops.fullname" . }}-postgresql
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: postgresql
spec:
  serviceName: {{ include "incidentops.fullname" . }}-postgresql
  replicas: 1
  selector:
    matchLabels:
      {{- include "incidentops.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: postgresql
  template:
    metadata:
      labels:
        {{- include "incidentops.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: postgresql
    spec:
      containers:
        - name: postgresql
          image: "{{ .Values.postgresql.image.repository }}:{{ .Values.postgresql.image.tag }}"
          imagePullPolicy: {{ .Values.postgresql.image.pullPolicy }}
          ports:
            - name: postgresql
              containerPort: 5432
              protocol: TCP
          env:
            # NOTE(review): credentials are injected as plain env values taken
            # straight from values.yaml, so they end up readable in the pod
            # spec. Prefer a Secret + secretKeyRef for POSTGRES_PASSWORD.
            - name: POSTGRES_USER
              value: {{ .Values.postgresql.auth.username | quote }}
            - name: POSTGRES_PASSWORD
              value: {{ .Values.postgresql.auth.password | quote }}
            - name: POSTGRES_DB
              value: {{ .Values.postgresql.auth.database | quote }}
            # Data lives in a subdirectory of the mount point — presumably so
            # initdb is not confused by the volume root (e.g. lost+found).
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          volumeMounts:
            - name: data
              mountPath: /var/lib/postgresql/data
          # Both probes shell out to pg_isready as the configured user.
          livenessProbe:
            exec:
              command:
                - pg_isready
                - -U
                - {{ .Values.postgresql.auth.username }}
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
          readinessProbe:
            exec:
              command:
                - pg_isready
                - -U
                - {{ .Values.postgresql.auth.username }}
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
          resources:
            {{- toYaml .Values.postgresql.resources | nindent 12 }}
  # One PVC per replica (only one here); sized from values.
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: {{ .Values.postgresql.persistence.size }}
---
apiVersion: v1
kind: Service
metadata:
  name: {{ include "incidentops.fullname" . }}-postgresql
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: postgresql
spec:
  type: ClusterIP
  ports:
    - port: 5432
      targetPort: postgresql
      protocol: TCP
      name: postgresql
  selector:
    {{- include "incidentops.selectorLabels" . | nindent 4 }}
    app.kubernetes.io/component: postgresql
{{- end }}
@@ -1,163 +0,0 @@
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
# Prometheus server: scrape config, single-replica Deployment, Service, and
# an optional PVC. Pod discovery is restricted to the release namespace and
# relies on the namespaced Role defined alongside this template.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
data:
  prometheus.yml: |
    global:
      scrape_interval: {{ .Values.observability.prometheus.scrapeInterval | default "15s" }}
      evaluation_interval: 15s
    scrape_configs:
      - job_name: "prometheus"
        static_configs:
          - targets: ["localhost:9090"]
      - job_name: "incidentops-api"
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - {{ .Release.Namespace }}
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: api
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: metrics
          - source_labels: [__meta_kubernetes_namespace]
            target_label: namespace
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
        metrics_path: /metrics
        scrape_interval: 10s
      - job_name: "incidentops-worker"
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names:
                - {{ .Release.Namespace }}
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            action: keep
            regex: worker
          - source_labels: [__meta_kubernetes_pod_container_port_name]
            action: keep
            regex: metrics
          - source_labels: [__meta_kubernetes_namespace]
            target_label: namespace
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
        metrics_path: /metrics
        scrape_interval: 10s
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      {{- include "incidentops.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: prometheus
  template:
    metadata:
      labels:
        {{- include "incidentops.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: prometheus
      annotations:
        # NOTE(review): this hashes only the image tag, so edits to the
        # scrape config above do NOT roll the pod (and --web.enable-lifecycle
        # is only useful if something POSTs /-/reload). Consider hashing the
        # rendered ConfigMap via $.Template.BasePath instead.
        checksum/config: {{ .Values.observability.prometheus.image.tag | sha256sum }}
    spec:
      serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
      # 65534 = nobody/nogroup; lets Prometheus write its TSDB on the volume
      # without running as root.
      securityContext:
        fsGroup: 65534
        runAsUser: 65534
        runAsNonRoot: true
      containers:
        - name: prometheus
          image: "{{ .Values.observability.prometheus.image.repository }}:{{ .Values.observability.prometheus.image.tag }}"
          imagePullPolicy: {{ .Values.observability.prometheus.image.pullPolicy }}
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention.time={{ .Values.observability.prometheus.retention }}"
            - "--web.enable-lifecycle"
          ports:
            - name: http
              containerPort: 9090
              protocol: TCP
          volumeMounts:
            - name: config
              mountPath: /etc/prometheus
            - name: data
              mountPath: /prometheus
          resources:
            {{- toYaml .Values.observability.prometheus.resources | nindent 12 }}
          readinessProbe:
            httpGet:
              path: /-/ready
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: http
            initialDelaySeconds: 30
            periodSeconds: 30
      volumes:
        - name: config
          configMap:
            name: {{ include "incidentops.fullname" . }}-prometheus
        # TSDB storage: PVC when persistence is on, otherwise ephemeral
        # (metrics are lost on pod restart).
        - name: data
          {{- if .Values.observability.prometheus.persistence.enabled }}
          persistentVolumeClaim:
            claimName: {{ include "incidentops.fullname" . }}-prometheus
          {{- else }}
          emptyDir: {}
          {{- end }}
---
apiVersion: v1
kind: Service
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 9090
      targetPort: http
      protocol: TCP
  selector:
    {{- include "incidentops.selectorLabels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
{{- if .Values.observability.prometheus.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{ .Values.observability.prometheus.persistence.size }}
{{- end }}
{{- end }}
@@ -1,29 +0,0 @@
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
# Namespaced RBAC for Prometheus pod discovery. A Role (not ClusterRole) is
# sufficient because the scrape config restricts kubernetes_sd to the
# release namespace.
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
rules:
  - apiGroups: [""]
    resources: ["pods", "endpoints", "services"]
    verbs: ["get", "list", "watch"]
---
# Binds the Role to the chart's shared ServiceAccount, which the Prometheus
# Deployment runs under.
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
subjects:
  - kind: ServiceAccount
    name: {{ include "incidentops.serviceAccountName" . }}
    namespace: {{ .Release.Namespace }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: {{ include "incidentops.fullname" . }}-prometheus
{{- end }}
@@ -1,169 +0,0 @@
{{- if and .Values.observability.enabled .Values.observability.promtail.enabled }}
# Promtail log shipper: config, dedicated ServiceAccount + ClusterRole (node
# and pod metadata spans namespaces), and a DaemonSet that tails host
# container logs and pushes them to Loki.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "incidentops.fullname" . }}-promtail-config
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: promtail
data:
  promtail.yaml: |
    server:
      http_listen_port: 3101
      grpc_listen_port: 0
    positions:
      filename: /run/promtail/positions.yaml
    clients:
      - url: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
    scrape_configs:
      - job_name: kubernetes-pods
        pipeline_stages:
          - cri: {}
        kubernetes_sd_configs:
          - role: pod
            namespaces:
              names: [{{ .Release.Namespace }}]
        relabel_configs:
          - source_labels: [__meta_kubernetes_pod_container_init]
            regex: "true"
            action: drop
          - source_labels: [__meta_kubernetes_pod_phase]
            regex: Pending|Failed|Succeeded
            action: drop
          # Multiple source labels are joined with ";" and matched against
          # `regex`. The default regex "(.*)" yields only one capture group,
          # so the previous replacement expanded $2/$3 as empty strings and
          # produced a glob that matched nothing. Three capture groups are
          # required for $1 (pod), $2 (namespace), $3 (container).
          - source_labels: [__meta_kubernetes_pod_name, __meta_kubernetes_pod_namespace, __meta_kubernetes_pod_container_name]
            regex: (.*);(.*);(.*)
            target_label: __path__
            replacement: /var/log/containers/$1_$2_$3-*.log
          - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
            regex: (.*)
            target_label: service_name
            replacement: {{ include "incidentops.fullname" . }}-$1
          - source_labels: [__meta_kubernetes_pod_namespace]
            target_label: namespace
          - source_labels: [__meta_kubernetes_pod_name]
            target_label: pod
          - source_labels: [__meta_kubernetes_pod_container_name]
            target_label: container
          - source_labels: [__meta_kubernetes_pod_uid]
            target_label: pod_uid
          - target_label: cluster
            replacement: {{ .Release.Namespace }}
      - job_name: containers-fallback
        pipeline_stages:
          - cri: {}
        static_configs:
          - labels:
              job: containers
              namespace: {{ .Release.Namespace }}
              service_name: incidentops-api
              __path__: /var/log/containers/incidentops-api-*_incidentops_api-*.log
          - labels:
              job: containers
              namespace: {{ .Release.Namespace }}
              service_name: incidentops-worker
              __path__: /var/log/containers/incidentops-worker-*_incidentops_worker-*.log
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "incidentops.fullname" . }}-promtail
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: promtail
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: {{ include "incidentops.fullname" . }}-promtail
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: promtail
rules:
  - apiGroups: [""]
    resources: ["pods", "pods/log", "namespaces", "services", "endpoints", "nodes"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: {{ include "incidentops.fullname" . }}-promtail
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: promtail
subjects:
  - kind: ServiceAccount
    name: {{ include "incidentops.fullname" . }}-promtail
    namespace: {{ .Release.Namespace }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: {{ include "incidentops.fullname" . }}-promtail
---
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: {{ include "incidentops.fullname" . }}-promtail
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: promtail
spec:
  selector:
    matchLabels:
      {{- include "incidentops.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: promtail
  template:
    metadata:
      labels:
        {{- include "incidentops.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: promtail
      annotations:
        # Roll the pods when any promtail setting changes; hashing only the
        # image tag (as before) missed config edits, and a tag change already
        # rolls the DaemonSet via the image field itself.
        checksum/config: {{ toYaml .Values.observability.promtail | sha256sum }}
    spec:
      serviceAccountName: {{ include "incidentops.fullname" . }}-promtail
      # Root is required to read other pods' log files under /var/log.
      securityContext:
        runAsUser: 0
      containers:
        - name: promtail
          image: "{{ .Values.observability.promtail.image.repository }}:{{ .Values.observability.promtail.image.tag }}"
          imagePullPolicy: {{ .Values.observability.promtail.image.pullPolicy }}
          args:
            - -config.file=/etc/promtail/promtail.yaml
          ports:
            - name: http-metrics
              containerPort: 3101
              protocol: TCP
          volumeMounts:
            - name: config
              mountPath: /etc/promtail
            - name: positions
              mountPath: /run/promtail
            - name: varlog
              mountPath: /var/log
              readOnly: true
            - name: varlogpods
              mountPath: /var/log/pods
              readOnly: true
            - name: varlogcontainers
              mountPath: /var/log/containers
              readOnly: true
          resources:
            {{- toYaml .Values.observability.promtail.resources | nindent 12 }}
      volumes:
        - name: config
          configMap:
            name: {{ include "incidentops.fullname" . }}-promtail-config
        # NOTE(review): positions in an emptyDir are lost on pod restart, so
        # logs are re-read from the beginning (duplicate entries in Loki).
        # A hostPath would persist read offsets across restarts.
        - name: positions
          emptyDir: {}
        - name: varlog
          hostPath:
            path: /var/log
        - name: varlogpods
          hostPath:
            path: /var/log/pods
        - name: varlogcontainers
          hostPath:
            path: /var/log/containers
{{- end }}
@@ -1,80 +0,0 @@
{{- if .Values.redis.enabled }}
# Single-node Redis as a StatefulSet so the pod keeps its PVC across
# reschedules.
# NOTE(review): no authentication is configured — any pod that can reach the
# Service can issue commands. Consider requirepass or a NetworkPolicy.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: {{ include "incidentops.fullname" . }}-redis
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: redis
spec:
  serviceName: {{ include "incidentops.fullname" . }}-redis
  replicas: 1
  selector:
    matchLabels:
      {{- include "incidentops.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: redis
  template:
    metadata:
      labels:
        {{- include "incidentops.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: redis
    spec:
      containers:
        - name: redis
          image: "{{ .Values.redis.image.repository }}:{{ .Values.redis.image.tag }}"
          imagePullPolicy: {{ .Values.redis.image.pullPolicy }}
          ports:
            - name: redis
              containerPort: 6379
              protocol: TCP
          volumeMounts:
            - name: data
              mountPath: /data
          # Both probes use redis-cli ping (expects PONG).
          livenessProbe:
            exec:
              command:
                - redis-cli
                - ping
            initialDelaySeconds: 30
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
          readinessProbe:
            exec:
              command:
                - redis-cli
                - ping
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 6
          resources:
            {{- toYaml .Values.redis.resources | nindent 12 }}
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes:
          - ReadWriteOnce
        resources:
          requests:
            storage: {{ .Values.redis.persistence.size }}
---
apiVersion: v1
kind: Service
metadata:
  name: {{ include "incidentops.fullname" . }}-redis
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: redis
spec:
  type: ClusterIP
  ports:
    - port: 6379
      targetPort: redis
      protocol: TCP
      name: redis
  selector:
    {{- include "incidentops.selectorLabels" . | nindent 4 }}
    app.kubernetes.io/component: redis
{{- end }}
-13
View File
@@ -1,13 +0,0 @@
# Application secret: connection URLs assembled by chart helpers plus the JWT
# signing key. stringData lets Helm render plain strings (the API server
# base64-encodes them on write).
# NOTE(review): jwtSecretKey comes straight from values.yaml; use an external
# secret manager in production rather than committing it to values files.
apiVersion: v1
kind: Secret
metadata:
  name: {{ include "incidentops.fullname" . }}-secret
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
type: Opaque
stringData:
  DATABASE_URL: {{ include "incidentops.databaseUrl" . | quote }}
  REDIS_URL: {{ include "incidentops.redisUrl" . | quote }}
  CELERY_BROKER_URL: {{ include "incidentops.celeryBrokerUrl" . | quote }}
  CELERY_RESULT_BACKEND: {{ include "incidentops.celeryResultBackend" . | quote }}
  JWT_SECRET_KEY: {{ .Values.secrets.jwtSecretKey | quote }}
+9
View File
@@ -0,0 +1,9 @@
# Holds the JWT signing key consumed by the API/worker deployments.
# NOTE(review): the key is read directly from values.yaml (which ships a
# placeholder default) — override it per environment or source it from an
# external secret manager in production.
apiVersion: v1
kind: Secret
metadata:
  name: {{ include "incidentops.fullname" . }}-secrets
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
type: Opaque
stringData:
  jwt-signing-key: {{ .Values.jwt.signingKey | quote }}
@@ -1,12 +0,0 @@
{{- if .Values.serviceAccount.create -}}
# Shared ServiceAccount for the chart's workloads; creation can be disabled
# to reuse a pre-existing account (serviceAccountName still resolves via the
# helper). Optional annotations (e.g. IAM bindings) come from values.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "incidentops.serviceAccountName" . }}
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
{{- end }}
@@ -1,153 +0,0 @@
{{- if .Values.observability.enabled }}
# Grafana Tempo (traces): config, single-replica Deployment with local-disk
# storage, Service exposing the OTLP receiver ports, optional PVC.
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "incidentops.fullname" . }}-tempo-config
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: tempo
data:
  tempo.yaml: |
    server:
      http_listen_port: 3200
    distributor:
      receivers:
        otlp:
          protocols:
            grpc:
              endpoint: 0.0.0.0:4317
            http:
              endpoint: 0.0.0.0:4318
    ingester:
      trace_idle_period: 10s
      max_block_bytes: 1048576
      max_block_duration: 5m
    compactor:
      compaction:
        block_retention: {{ .Values.observability.tempo.retention }}
    storage:
      trace:
        backend: local
        local:
          path: /var/tempo/traces
        wal:
          path: /var/tempo/wal
    querier:
      search:
        query_timeout: 30s
---
# NOTE(review): the `local` storage backend ties Tempo to a single replica;
# scaling this Deployment up would require object storage.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "incidentops.fullname" . }}-tempo
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: tempo
spec:
  replicas: 1
  selector:
    matchLabels:
      {{- include "incidentops.selectorLabels" . | nindent 6 }}
      app.kubernetes.io/component: tempo
  template:
    metadata:
      labels:
        {{- include "incidentops.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: tempo
      annotations:
        # NOTE(review): hashes only the image tag, so changes to the config
        # above do not roll the pod; consider hashing the rendered ConfigMap
        # via $.Template.BasePath instead.
        checksum/config: {{ .Values.observability.tempo.image.tag | sha256sum }}
    spec:
      containers:
        - name: tempo
          image: "{{ .Values.observability.tempo.image.repository }}:{{ .Values.observability.tempo.image.tag }}"
          imagePullPolicy: {{ .Values.observability.tempo.image.pullPolicy }}
          args:
            - -config.file=/etc/tempo/tempo.yaml
          ports:
            - name: http
              containerPort: 3200
              protocol: TCP
            - name: otlp-grpc
              containerPort: 4317
              protocol: TCP
            - name: otlp-http
              containerPort: 4318
              protocol: TCP
          volumeMounts:
            - name: config
              mountPath: /etc/tempo
            - name: data
              mountPath: /var/tempo
          resources:
            {{- toYaml .Values.observability.tempo.resources | nindent 12 }}
          readinessProbe:
            httpGet:
              path: /ready
              port: http
            initialDelaySeconds: 10
            periodSeconds: 10
          livenessProbe:
            httpGet:
              path: /ready
              port: http
            initialDelaySeconds: 30
            periodSeconds: 30
      volumes:
        - name: config
          configMap:
            name: {{ include "incidentops.fullname" . }}-tempo-config
        # Trace/WAL storage: PVC when persistence is on, else ephemeral
        # (traces are lost on pod restart).
        - name: data
          {{- if .Values.observability.tempo.persistence.enabled }}
          persistentVolumeClaim:
            claimName: {{ include "incidentops.fullname" . }}-tempo
          {{- else }}
          emptyDir: {}
          {{- end }}
---
apiVersion: v1
kind: Service
metadata:
  name: {{ include "incidentops.fullname" . }}-tempo
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: tempo
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 3200
      targetPort: http
      protocol: TCP
    - name: otlp-grpc
      port: 4317
      targetPort: otlp-grpc
      protocol: TCP
    - name: otlp-http
      port: 4318
      targetPort: otlp-http
      protocol: TCP
  selector:
    {{- include "incidentops.selectorLabels" . | nindent 4 }}
    app.kubernetes.io/component: tempo
{{- if .Values.observability.tempo.persistence.enabled }}
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: {{ include "incidentops.fullname" . }}-tempo
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: tempo
spec:
  accessModes:
    - ReadWriteOnce
  resources:
    requests:
      storage: {{ .Values.observability.tempo.persistence.size }}
{{- end }}
{{- end }}
+13 -41
View File
@@ -3,70 +3,42 @@ kind: Deployment
metadata: metadata:
name: {{ include "incidentops.fullname" . }}-web name: {{ include "incidentops.fullname" . }}-web
labels: labels:
{{- include "incidentops.web.labels" . | nindent 4 }} {{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: web
spec: spec:
{{- if not .Values.web.autoscaling.enabled }} replicas: {{ .Values.web.replicas }}
replicas: {{ .Values.web.replicaCount }}
{{- end }}
selector: selector:
matchLabels: matchLabels:
{{- include "incidentops.web.selectorLabels" . | nindent 6 }} {{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: web
template: template:
metadata: metadata:
{{- with .Values.web.podAnnotations }}
annotations:
{{- toYaml . | nindent 8 }}
{{- end }}
labels: labels:
{{- include "incidentops.web.selectorLabels" . | nindent 8 }} {{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: web
spec: spec:
{{- with .Values.global.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
containers: containers:
- name: web - name: web
securityContext: image: "{{ .Values.web.image }}:{{ .Values.web.tag }}"
{{- toYaml .Values.securityContext | nindent 12 }} imagePullPolicy: IfNotPresent
image: {{ include "incidentops.web.image" . }}
imagePullPolicy: {{ .Values.web.image.pullPolicy }}
ports: ports:
- name: http - name: http
containerPort: 3000 containerPort: {{ .Values.web.port }}
protocol: TCP protocol: TCP
env: env:
- name: NEXT_PUBLIC_API_URL - name: NEXT_PUBLIC_API_URL
value: "http://{{ include "incidentops.fullname" . }}-api:{{ .Values.api.service.port }}" value: "http://{{ .Values.ingress.host }}/api"
livenessProbe: livenessProbe:
httpGet: httpGet:
path: / path: /
port: http port: http
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 30 periodSeconds: 10
timeoutSeconds: 5
failureThreshold: 3
readinessProbe: readinessProbe:
httpGet: httpGet:
path: / path: /
port: http port: http
initialDelaySeconds: 5 initialDelaySeconds: 5
periodSeconds: 10 periodSeconds: 5
timeoutSeconds: 5
failureThreshold: 3
resources: resources:
{{- toYaml .Values.web.resources | nindent 12 }} {{- toYaml .Values.web.resources | nindent 12 }}
{{- with .Values.web.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.web.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.web.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
-22
View File
@@ -1,22 +0,0 @@
{{- if .Values.web.autoscaling.enabled }}
# CPU-based HPA for the web Deployment. The Deployment template omits
# `replicas` when autoscaling is enabled so the HPA owns the replica count.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "incidentops.fullname" . }}-web
  labels:
    {{- include "incidentops.web.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "incidentops.fullname" . }}-web
  minReplicas: {{ .Values.web.autoscaling.minReplicas }}
  maxReplicas: {{ .Values.web.autoscaling.maxReplicas }}
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: {{ .Values.web.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
+6 -4
View File
@@ -3,13 +3,15 @@ kind: Service
metadata: metadata:
name: {{ include "incidentops.fullname" . }}-web name: {{ include "incidentops.fullname" . }}-web
labels: labels:
{{- include "incidentops.web.labels" . | nindent 4 }} {{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: web
spec: spec:
type: {{ .Values.web.service.type }} type: ClusterIP
ports: ports:
- port: {{ .Values.web.service.port }} - port: {{ .Values.web.port }}
targetPort: http targetPort: http
protocol: TCP protocol: TCP
name: http name: http
selector: selector:
{{- include "incidentops.web.selectorLabels" . | nindent 4 }} {{- include "incidentops.selectorLabels" . | nindent 4 }}
app.kubernetes.io/component: web
@@ -3,104 +3,28 @@ kind: Deployment
metadata: metadata:
name: {{ include "incidentops.fullname" . }}-worker name: {{ include "incidentops.fullname" . }}-worker
labels: labels:
{{- include "incidentops.worker.labels" . | nindent 4 }} {{- include "incidentops.labels" . | nindent 4 }}
app.kubernetes.io/component: worker
spec: spec:
{{- if not .Values.worker.autoscaling.enabled }} replicas: {{ .Values.worker.replicas }}
replicas: {{ .Values.worker.replicaCount }}
{{- end }}
selector: selector:
matchLabels: matchLabels:
{{- include "incidentops.worker.selectorLabels" . | nindent 6 }} {{- include "incidentops.selectorLabels" . | nindent 6 }}
app.kubernetes.io/component: worker
template: template:
metadata: metadata:
annotations:
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }}
{{- with .Values.worker.podAnnotations }}
{{- toYaml . | nindent 8 }}
{{- end }}
labels: labels:
{{- include "incidentops.worker.selectorLabels" . | nindent 8 }} {{- include "incidentops.selectorLabels" . | nindent 8 }}
app.kubernetes.io/component: worker
spec: spec:
{{- with .Values.global.imagePullSecrets }}
imagePullSecrets:
{{- toYaml . | nindent 8 }}
{{- end }}
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
securityContext:
{{- toYaml .Values.podSecurityContext | nindent 8 }}
initContainers:
- name: wait-for-postgres
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
echo "Waiting for PostgreSQL..."
sleep 2
done
echo "PostgreSQL is ready"
- name: wait-for-redis
image: busybox:1.36
command:
- sh
- -c
- |
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
echo "Waiting for Redis..."
sleep 2
done
echo "Redis is ready"
containers: containers:
- name: worker - name: worker
securityContext: image: "{{ .Values.worker.image }}:{{ .Values.worker.tag }}"
{{- toYaml .Values.securityContext | nindent 12 }} imagePullPolicy: IfNotPresent
image: {{ include "incidentops.worker.image" . }} env:
imagePullPolicy: {{ .Values.worker.image.pullPolicy }} - name: ConnectionStrings__Postgres
command: value: {{ include "incidentops.postgresConnectionString" . | quote }}
- uv - name: Redis__ConnectionString
- run value: {{ include "incidentops.redisConnectionString" . | quote }}
- celery
- -A
- worker.celery_app
- worker
- --loglevel=info
- -Q
- {{ .Values.worker.queues }}
- --concurrency={{ .Values.worker.concurrency }}
envFrom:
- configMapRef:
name: {{ include "incidentops.fullname" . }}-config
- secretRef:
name: {{ include "incidentops.fullname" . }}-secret
livenessProbe:
exec:
command:
- uv
- run
- celery
- -A
- worker.celery_app
- inspect
- ping
- -d
- celery@$HOSTNAME
initialDelaySeconds: 30
periodSeconds: 60
timeoutSeconds: 10
failureThreshold: 3
resources: resources:
{{- toYaml .Values.worker.resources | nindent 12 }} {{- toYaml .Values.worker.resources | nindent 12 }}
{{- with .Values.worker.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.worker.affinity }}
affinity:
{{- toYaml . | nindent 8 }}
{{- end }}
{{- with .Values.worker.tolerations }}
tolerations:
{{- toYaml . | nindent 8 }}
{{- end }}
@@ -1,22 +0,0 @@
{{- if .Values.worker.autoscaling.enabled }}
# CPU-based HPA for the worker Deployment; mirrors the web HPA so the two
# stay structurally consistent.
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "incidentops.fullname" . }}-worker
  labels:
    {{- include "incidentops.worker.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "incidentops.fullname" . }}-worker
  minReplicas: {{ .Values.worker.autoscaling.minReplicas }}
  maxReplicas: {{ .Values.worker.autoscaling.maxReplicas }}
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: {{ .Values.worker.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
-142
View File
@@ -1,142 +0,0 @@
# Production values for incidentops
# Use external secrets management in production
api:
replicaCount: 3
autoscaling:
enabled: true
minReplicas: 3
maxReplicas: 10
targetCPUUtilizationPercentage: 70
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
worker:
replicaCount: 3
autoscaling:
enabled: true
minReplicas: 3
maxReplicas: 10
targetCPUUtilizationPercentage: 70
concurrency: 8
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
web:
replicaCount: 3
autoscaling:
enabled: true
minReplicas: 3
maxReplicas: 10
targetCPUUtilizationPercentage: 70
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
ingress:
enabled: true
className: nginx
annotations:
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
cert-manager.io/cluster-issuer: letsencrypt-prod
host: incidentops.example.com
tls:
- secretName: incidentops-tls
hosts:
- incidentops.example.com
postgresql:
persistence:
size: 50Gi
resources:
requests:
cpu: 500m
memory: 1Gi
limits:
cpu: 2000m
memory: 4Gi
redis:
persistence:
size: 10Gi
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 1Gi
# Application configuration
config:
environment: production
logLevel: INFO
# Observability Stack - Production settings
observability:
enabled: true
otelCollector:
replicaCount: 2
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
tempo:
retention: "720h" # 30 days
persistence:
enabled: true
size: 50Gi
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 2Gi
loki:
retention: "720h" # 30 days
persistence:
enabled: true
size: 100Gi
resources:
requests:
cpu: 250m
memory: 512Mi
limits:
cpu: 1000m
memory: 2Gi
grafana:
adminPassword: "" # Set via external secret in production
service:
type: ClusterIP
persistence:
enabled: true
size: 5Gi
resources:
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
+36 -243
View File
@@ -1,279 +1,72 @@
# Default values for incidentops
global:
imageRegistry: ""
imagePullSecrets: []
api: api:
replicaCount: 2 image: incidentops-api
image:
repository: incidentops/api
tag: latest tag: latest
pullPolicy: IfNotPresent replicas: 1
service: port: 8080
type: ClusterIP
port: 8000
resources: resources:
requests:
cpu: 100m
memory: 256Mi
limits: limits:
cpu: 500m
memory: 512Mi memory: 512Mi
autoscaling: cpu: 500m
enabled: false requests:
minReplicas: 2 memory: 256Mi
maxReplicas: 10 cpu: 100m
targetCPUUtilizationPercentage: 80
podAnnotations: {}
nodeSelector: {}
tolerations: []
affinity: {}
# Worker Service (Celery)
worker: worker:
replicaCount: 2 image: incidentops-worker
image:
repository: incidentops/worker
tag: latest tag: latest
pullPolicy: IfNotPresent replicas: 1
resources: resources:
requests:
cpu: 100m
memory: 256Mi
limits: limits:
cpu: 500m
memory: 512Mi memory: 512Mi
autoscaling: cpu: 500m
enabled: false requests:
minReplicas: 2 memory: 256Mi
maxReplicas: 10 cpu: 100m
targetCPUUtilizationPercentage: 80
queues: "critical,default,low"
concurrency: 4
podAnnotations: {}
nodeSelector: {}
tolerations: []
affinity: {}
# Web Frontend (Next.js)
web: web:
replicaCount: 2 image: incidentops-web
image:
repository: incidentops/web
tag: latest tag: latest
pullPolicy: IfNotPresent replicas: 1
service:
type: ClusterIP
port: 3000 port: 3000
resources: resources:
requests:
cpu: 50m
memory: 128Mi
limits: limits:
cpu: 200m
memory: 256Mi memory: 256Mi
autoscaling: cpu: 200m
enabled: false requests:
minReplicas: 2 memory: 128Mi
maxReplicas: 10 cpu: 50m
targetCPUUtilizationPercentage: 80
podAnnotations: {} jwt:
nodeSelector: {} issuer: incidentops
tolerations: [] audience: incidentops
affinity: {} signingKey: your-super-secret-key-that-should-be-at-least-32-characters-long
accessTokenExpirationMinutes: 15
refreshTokenExpirationDays: 7
# Ingress configuration
ingress: ingress:
enabled: true enabled: true
className: nginx className: nginx
host: incidentops.local
annotations: annotations:
nginx.ingress.kubernetes.io/proxy-body-size: "10m" nginx.ingress.kubernetes.io/proxy-body-size: "10m"
host: incidentops.local
tls: []
# Database migration job
migration:
enabled: true
image:
repository: incidentops/api
tag: latest
pullPolicy: IfNotPresent
backoffLimit: 3
# Application configuration
config:
jwtAlgorithm: HS256
accessTokenExpireMinutes: 30
refreshTokenExpireDays: 30
environment: development
logLevel: INFO
# Secrets (use external secrets in production)
secrets:
jwtSecretKey: "change-me-in-production"
# PostgreSQL configuration (using official postgres image)
postgresql: postgresql:
enabled: true enabled: true
image:
repository: postgres
tag: "16-alpine"
pullPolicy: IfNotPresent
auth: auth:
username: incidentops username: postgres
password: incidentops password: postgres
database: incidentops database: incidentops
primary:
persistence: persistence:
size: 8Gi enabled: true
resources: size: 1Gi
requests:
cpu: 100m
memory: 256Mi
limits:
cpu: 500m
memory: 512Mi
redis: redis:
enabled: true enabled: true
image: architecture: standalone
repository: redis auth:
tag: "7-alpine" enabled: false
pullPolicy: IfNotPresent master:
persistence: persistence:
size: 2Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
# Service Account
serviceAccount:
create: true
annotations: {}
name: ""
# Pod Security Context
podSecurityContext:
fsGroup: 1000
securityContext:
runAsNonRoot: true
runAsUser: 1000
# Observability Stack (Grafana + Loki + Tempo + OpenTelemetry Collector)
observability:
enabled: true enabled: true
otelCollector:
replicaCount: 1
image:
repository: otel/opentelemetry-collector-contrib
tag: "0.96.0"
pullPolicy: IfNotPresent
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
tempo:
image:
repository: grafana/tempo
tag: "2.4.1"
pullPolicy: IfNotPresent
retention: "168h" # 7 days
persistence:
enabled: false
size: 10Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
loki:
image:
repository: grafana/loki
tag: "2.9.6"
pullPolicy: IfNotPresent
retention: "168h" # 7 days
persistence:
enabled: false
size: 10Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
prometheus:
image:
repository: prom/prometheus
tag: "v2.51.0"
pullPolicy: IfNotPresent
retention: "15d"
scrapeInterval: "15s"
persistence:
enabled: false
size: 10Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 500m
memory: 512Mi
grafana:
image:
repository: grafana/grafana
tag: "10.4.1"
pullPolicy: IfNotPresent
adminUser: admin
adminPassword: "admin" # Change in production!
service:
type: ClusterIP
ingress:
enabled: false
host: grafana.incidentops.local
annotations: {}
tls: []
persistence:
enabled: false
size: 1Gi size: 1Gi
resources:
requests:
cpu: 50m
memory: 128Mi
limits:
cpu: 200m
memory: 256Mi
promtail:
enabled: true
image:
repository: grafana/promtail
tag: "2.9.6"
pullPolicy: IfNotPresent
resources:
requests:
cpu: 25m
memory: 64Mi
limits:
cpu: 200m
memory: 256Mi
# Metrics configuration
metrics:
enabled: true
port: 9464
-6
View File
@@ -1,6 +0,0 @@
def main():
    """Placeholder entry point for the incidentops package."""
    greeting = "Hello from incidentops!"
    print(greeting)


if __name__ == "__main__":
    main()
-61
View File
@@ -1,61 +0,0 @@
-- Initial schema for IncidentOps
-- Creates core tables: users, orgs, org_members, services, incidents, incident_events

-- Account records; email is the unique login identifier.
CREATE TABLE users (
    id UUID PRIMARY KEY,
    email TEXT NOT NULL UNIQUE,
    password_hash TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Tenants; slug is a globally unique URL-safe handle.
CREATE TABLE orgs (
    id UUID PRIMARY KEY,
    name TEXT NOT NULL,
    slug TEXT NOT NULL UNIQUE,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Membership join table: at most one row per (user, org), carrying the per-org role.
CREATE TABLE org_members (
    id UUID PRIMARY KEY,
    user_id UUID NOT NULL REFERENCES users(id),
    org_id UUID NOT NULL REFERENCES orgs(id),
    role TEXT NOT NULL CHECK (role IN ('admin', 'member', 'viewer')),
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    UNIQUE (user_id, org_id)
);

-- Services belong to an org; slug is unique only within its org.
CREATE TABLE services (
    id UUID PRIMARY KEY,
    org_id UUID NOT NULL REFERENCES orgs(id),
    name TEXT NOT NULL,
    slug TEXT NOT NULL,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    UNIQUE (org_id, slug)
);

-- Incidents, scoped to an org and a service.
-- NOTE(review): version is presumably an optimistic-concurrency counter — confirm
-- against the update path before relying on it.
CREATE TABLE incidents (
    id UUID PRIMARY KEY,
    org_id UUID NOT NULL REFERENCES orgs(id),
    service_id UUID NOT NULL REFERENCES services(id),
    title TEXT NOT NULL,
    description TEXT,
    status TEXT NOT NULL CHECK (status IN ('triggered', 'acknowledged', 'mitigated', 'resolved')),
    severity TEXT NOT NULL CHECK (severity IN ('critical', 'high', 'medium', 'low')),
    version INTEGER NOT NULL DEFAULT 1,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- List-query support: filter by status, and newest-first paging per org.
CREATE INDEX idx_incidents_org_status ON incidents(org_id, status);
CREATE INDEX idx_incidents_org_created ON incidents(org_id, created_at DESC);

-- Per-incident event log: typed events with an optional acting user and JSON payload.
CREATE TABLE incident_events (
    id UUID PRIMARY KEY,
    incident_id UUID NOT NULL REFERENCES incidents(id),
    event_type TEXT NOT NULL,
    actor_user_id UUID REFERENCES users(id),
    payload JSONB,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Timeline reads: events for one incident in chronological order.
CREATE INDEX idx_incident_events_incident ON incident_events(incident_id, created_at);
-15
View File
@@ -1,15 +0,0 @@
-- Refresh tokens table for JWT token rotation
-- Stores hashed refresh tokens with active org context

CREATE TABLE refresh_tokens (
    id UUID PRIMARY KEY,
    -- Owning user; a user may hold several live tokens (e.g. one per device).
    user_id UUID NOT NULL REFERENCES users(id),
    -- Hash of the opaque token; the raw token value is never stored.
    token_hash TEXT NOT NULL UNIQUE,
    -- Org context baked into the session so freshly minted access tokens carry a tenant.
    active_org_id UUID NOT NULL REFERENCES orgs(id),
    expires_at TIMESTAMPTZ NOT NULL,
    -- NULL while the token is live; set on logout/rotation.
    revoked_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

-- Lookup paths: all tokens per user, and the hash probe used on every refresh.
CREATE INDEX idx_refresh_tokens_user ON refresh_tokens(user_id);
CREATE INDEX idx_refresh_tokens_hash ON refresh_tokens(token_hash);
-25
View File
@@ -1,25 +0,0 @@
-- Notification system tables
-- Stores notification targets and delivery attempts

-- Per-org delivery destinations. target_type allows 'email' and 'slack',
-- but only a webhook destination column is modeled so far.
CREATE TABLE notification_targets (
    id UUID PRIMARY KEY,
    org_id UUID NOT NULL REFERENCES orgs(id),
    name TEXT NOT NULL,
    target_type TEXT NOT NULL CHECK (target_type IN ('webhook', 'email', 'slack')),
    -- Destination URL; nullable since non-webhook target types don't use it.
    webhook_url TEXT,
    enabled BOOLEAN NOT NULL DEFAULT true,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now()
);

CREATE INDEX idx_notification_targets_org ON notification_targets(org_id);

-- One delivery attempt row per (incident, target); the UNIQUE constraint makes
-- notification dispatch idempotent for a given incident/target pair.
CREATE TABLE notification_attempts (
    id UUID PRIMARY KEY,
    incident_id UUID NOT NULL REFERENCES incidents(id),
    target_id UUID NOT NULL REFERENCES notification_targets(id),
    status TEXT NOT NULL CHECK (status IN ('pending', 'sent', 'failed')),
    -- Free-text failure detail (presumably populated when status = 'failed').
    error TEXT,
    sent_at TIMESTAMPTZ,
    created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
    UNIQUE (incident_id, target_id)
);
@@ -1,18 +0,0 @@
-- Enhance refresh tokens for secure rotation and reuse detection
-- Adds rotated_to column to track token chains and detect stolen token reuse

-- Add rotated_to column to track which token this was rotated into
-- When a token is rotated, we store the ID of the new token here
-- If a token with rotated_to set is used again, it indicates token theft
ALTER TABLE refresh_tokens ADD COLUMN rotated_to UUID REFERENCES refresh_tokens(id);

-- Index for efficient cleanup queries on expires_at
CREATE INDEX idx_refresh_tokens_expires ON refresh_tokens(expires_at);

-- Index for finding active tokens per user (for revoke_all and listing)
-- Partial index: only live (unrevoked) rows are indexed, keeping it small.
CREATE INDEX idx_refresh_tokens_user_active ON refresh_tokens(user_id, revoked_at)
WHERE revoked_at IS NULL;

-- Index for reuse detection queries
-- Partial index over rotated tokens only: a hit here on an incoming token
-- means the token was already rotated once, i.e. possible theft.
CREATE INDEX idx_refresh_tokens_rotated ON refresh_tokens(rotated_to)
WHERE rotated_to IS NOT NULL;
-119
View File
@@ -1,119 +0,0 @@
"""
Simple migration runner using asyncpg.
Tracks applied migrations in a _migrations table.
Usage:
DATABASE_URL=postgresql://user:pass@localhost/db uv run python migrations/migrate.py apply
DATABASE_URL=postgresql://user:pass@localhost/db uv run python migrations/migrate.py status
"""
import asyncio
import os
import sys
from pathlib import Path
import asyncpg
MIGRATIONS_DIR = Path(__file__).parent
async def ensure_migrations_table(conn: asyncpg.Connection) -> None:
    """Idempotently create the _migrations bookkeeping table."""
    ddl = """
        CREATE TABLE IF NOT EXISTS _migrations (
            id SERIAL PRIMARY KEY,
            name TEXT NOT NULL UNIQUE,
            applied_at TIMESTAMPTZ NOT NULL DEFAULT now()
        )
    """
    await conn.execute(ddl)
async def get_applied_migrations(conn: asyncpg.Connection) -> set[str]:
    """Return the names of migrations already recorded in _migrations."""
    records = await conn.fetch("SELECT name FROM _migrations")
    applied: set[str] = set()
    for record in records:
        applied.add(record["name"])
    return applied
async def get_pending_migrations(conn: asyncpg.Connection) -> list[Path]:
    """Return on-disk *.sql migration files not yet applied, in sorted order."""
    already_applied = await get_applied_migrations(conn)
    pending: list[Path] = []
    for sql_file in sorted(MIGRATIONS_DIR.glob("*.sql")):
        if sql_file.name not in already_applied:
            pending.append(sql_file)
    return pending
async def apply_migration(conn: asyncpg.Connection, migration_file: Path) -> None:
    """Run one migration and record it in _migrations, atomically.

    The SQL and the bookkeeping INSERT share a transaction, so a failed
    migration leaves no partial record behind.
    """
    async with conn.transaction():
        await conn.execute(migration_file.read_text())
        await conn.execute(
            "INSERT INTO _migrations (name) VALUES ($1)",
            migration_file.name,
        )
    print(f"Applied: {migration_file.name}")
async def migrate(database_url: str) -> None:
    """Connect, apply every pending migration in order, then disconnect."""
    conn = await asyncpg.connect(database_url)
    try:
        await ensure_migrations_table(conn)
        pending = await get_pending_migrations(conn)
        if pending:
            for migration_file in pending:
                await apply_migration(conn, migration_file)
            print(f"Applied {len(pending)} migration(s).")
        else:
            print("No pending migrations.")
    finally:
        # Always release the connection, even if a migration fails.
        await conn.close()
async def status(database_url: str) -> None:
    """Print which migrations are applied and which are still pending."""
    conn = await asyncpg.connect(database_url)
    try:
        await ensure_migrations_table(conn)
        applied = await get_applied_migrations(conn)
        pending = await get_pending_migrations(conn)

        print("Applied migrations:")
        for name in sorted(applied):
            print(f" [x] {name}")

        print("\nPending migrations:")
        for path in pending:
            print(f" [ ] {path.name}")

        if not (applied or pending):
            print(" (none)")
    finally:
        await conn.close()
def main() -> None:
database_url = os.environ.get("DATABASE_URL")
if not database_url:
print("Error: DATABASE_URL environment variable is required")
sys.exit(1)
if len(sys.argv) < 2:
print("Usage: python migrate.py [apply|status]")
sys.exit(1)
command = sys.argv[1]
if command == "apply":
asyncio.run(migrate(database_url))
elif command == "status":
asyncio.run(status(database_url))
else:
print(f"Unknown command: {command}")
print("Usage: python migrate.py [apply|status]")
sys.exit(1)
if __name__ == "__main__":
main()
@@ -1,294 +0,0 @@
{
"title": "IncidentOps API Overview",
"uid": "incidentops-api",
"tags": ["incidentops", "api"],
"timezone": "browser",
"editable": true,
"panels": [
{
"id": 1,
"title": "Request Rate",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "Requests/sec",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"unit": "reqps"
}
}
},
{
"id": 2,
"title": "Request Duration (p50, p95, p99)",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p50",
"refId": "A"
},
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p95",
"refId": "B"
},
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
"legendFormat": "p99",
"refId": "C"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"unit": "s"
}
}
},
{
"id": 3,
"title": "Error Rate",
"type": "timeseries",
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
"legendFormat": "Error %",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"fixedColor": "red", "mode": "fixed"},
"unit": "percent",
"min": 0,
"max": 100
}
}
},
{
"id": 4,
"title": "Requests by Status Code",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "{{http_status_code}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"unit": "reqps"
}
}
},
{
"id": 5,
"title": "Requests by Endpoint",
"type": "timeseries",
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
"legendFormat": "{{http_route}}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "palette-classic"},
"unit": "reqps"
}
}
},
{
"id": 6,
"title": "System CPU Usage",
"type": "gauge",
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 16},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "avg(system_cpu_utilization{job=\"incidentops-api\"}) * 100",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 60},
{"color": "red", "value": 80}
]
},
"unit": "percent",
"min": 0,
"max": 100
}
}
},
{
"id": 7,
"title": "Memory Usage",
"type": "gauge",
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 16},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "process_runtime_cpython_memory_bytes{job=\"incidentops-api\", type=\"rss\"} / 1024 / 1024",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 256},
{"color": "red", "value": 512}
]
},
"unit": "decmbytes"
}
}
},
{
"id": 8,
"title": "Active Threads",
"type": "stat",
"gridPos": {"h": 6, "w": 6, "x": 12, "y": 16},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "process_runtime_cpython_thread_count{job=\"incidentops-api\"}",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null},
{"color": "yellow", "value": 50},
{"color": "red", "value": 100}
]
}
}
}
},
{
"id": 9,
"title": "GC Collections",
"type": "stat",
"gridPos": {"h": 6, "w": 6, "x": 18, "y": 16},
"targets": [
{
"datasource": {"type": "prometheus", "uid": "prometheus"},
"expr": "sum(rate(process_runtime_cpython_gc_count{job=\"incidentops-api\"}[5m]))",
"refId": "A"
}
],
"fieldConfig": {
"defaults": {
"color": {"mode": "thresholds"},
"thresholds": {
"mode": "absolute",
"steps": [
{"color": "green", "value": null}
]
},
"unit": "cps"
}
}
},
{
"id": 10,
"title": "Recent Logs",
"type": "logs",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 22},
"targets": [
{
"datasource": {"type": "loki", "uid": "loki"},
"expr": "{service_name=\"incidentops-api\"} | json",
"refId": "A"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true,
"sortOrder": "Descending"
}
},
{
"id": 11,
"title": "Error Logs",
"type": "logs",
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 32},
"targets": [
{
"datasource": {"type": "loki", "uid": "loki"},
"expr": "{service_name=\"incidentops-api\"} |= \"ERROR\" | json",
"refId": "A"
}
],
"options": {
"showTime": true,
"showLabels": true,
"wrapLogMessage": true,
"enableLogDetails": true,
"sortOrder": "Descending"
}
},
{
"id": 12,
"title": "Recent Traces",
"type": "traces",
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 40},
"targets": [
{
"datasource": {"type": "tempo", "uid": "tempo"},
"queryType": "traceqlSearch",
"filters": [
{
"id": "service-name",
"operator": "=",
"scope": "resource",
"tag": "service.name",
"value": ["incidentops-api"]
}
],
"refId": "A"
}
]
}
],
"schemaVersion": 38,
"version": 2
}
@@ -1,12 +0,0 @@
apiVersion: 1
providers:
- name: 'default'
orgId: 1
folder: 'IncidentOps'
folderUid: 'incidentops'
type: file
disableDeletion: false
editable: true
options:
path: /var/lib/grafana/dashboards
@@ -1,48 +0,0 @@
apiVersion: 1
datasources:
- name: Prometheus
type: prometheus
uid: prometheus
url: http://prometheus:9090
access: proxy
isDefault: false
jsonData:
httpMethod: POST
exemplarTraceIdDestinations:
- name: trace_id
datasourceUid: tempo
- name: Tempo
type: tempo
uid: tempo
url: http://tempo:3200
access: proxy
isDefault: false
jsonData:
tracesToLogsV2:
datasourceUid: loki
spanStartTimeShift: '-1h'
spanEndTimeShift: '1h'
filterByTraceID: true
filterBySpanID: true
tracesToMetrics:
datasourceUid: prometheus
nodeGraph:
enabled: true
lokiSearch:
datasourceUid: loki
- name: Loki
type: loki
uid: loki
url: http://loki:3100
access: proxy
isDefault: true
jsonData:
derivedFields:
- datasourceUid: tempo
matcherRegex: '"trace_id":"([a-f0-9]+)"'
name: TraceID
url: '$${__value.raw}'
urlDisplayLabel: 'View Trace'
-41
View File
@@ -1,41 +0,0 @@
auth_enabled: false
server:
http_listen_port: 3100
grpc_listen_port: 9096
common:
path_prefix: /loki
storage:
filesystem:
chunks_directory: /loki/chunks
rules_directory: /loki/rules
replication_factor: 1
ring:
kvstore:
store: inmemory
query_range:
results_cache:
cache:
embedded_cache:
enabled: true
max_size_mb: 100
schema_config:
configs:
- from: "2020-10-24"
store: tsdb
object_store: filesystem
schema: v13
index:
prefix: index_
period: 24h
ruler:
alertmanager_url: http://localhost:9093
limits_config:
retention_period: 168h # 7 days
allow_structured_metadata: true
volume_enabled: true
-38
View File
@@ -1,38 +0,0 @@
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
processors:
batch:
timeout: 1s
send_batch_size: 1024
memory_limiter:
check_interval: 1s
limit_mib: 256
spike_limit_mib: 64
exporters:
otlp/tempo:
endpoint: tempo:4317
tls:
insecure: true
loki:
endpoint: http://loki:3100/loki/api/v1/push
default_labels_enabled:
exporter: true
job: true
service:
pipelines:
traces:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [otlp/tempo]
logs:
receivers: [otlp]
processors: [memory_limiter, batch]
exporters: [loki]
-23
View File
@@ -1,23 +0,0 @@
global:
scrape_interval: 15s
evaluation_interval: 15s
scrape_configs:
# Scrape Prometheus itself
- job_name: "prometheus"
static_configs:
- targets: ["localhost:9090"]
# Scrape IncidentOps API metrics
- job_name: "incidentops-api"
static_configs:
- targets: ["api:9464"]
metrics_path: /metrics
scrape_interval: 10s
# Scrape IncidentOps Worker metrics (when metrics are enabled)
- job_name: "incidentops-worker"
static_configs:
- targets: ["worker:9464"]
metrics_path: /metrics
scrape_interval: 10s
-32
View File
@@ -1,32 +0,0 @@
server:
http_listen_port: 3200
distributor:
receivers:
otlp:
protocols:
grpc:
endpoint: 0.0.0.0:4317
http:
endpoint: 0.0.0.0:4318
ingester:
trace_idle_period: 10s
max_block_bytes: 1048576
max_block_duration: 5m
compactor:
compaction:
block_retention: 168h # 7 days
storage:
trace:
backend: local
local:
path: /var/tempo/traces
wal:
path: /var/tempo/wal
querier:
search:
query_timeout: 30s
-58
View File
@@ -1,58 +0,0 @@
[project]
name = "incidentops"
version = "0.1.0"
description = "Incident management API with multi-tenant org support"
readme = "README.md"
requires-python = ">=3.14"
dependencies = [
"fastapi>=0.115.0",
"uvicorn[standard]>=0.32.0",
"asyncpg>=0.30.0",
"pydantic[email]>=2.0.0",
"pydantic-settings>=2.0.0",
"python-jose[cryptography]>=3.3.0",
"bcrypt>=4.0.0",
"celery[redis]>=5.4.0",
"redis>=5.0.0",
"httpx>=0.28.0",
# OpenTelemetry
"opentelemetry-api>=1.27.0",
"opentelemetry-sdk>=1.27.0",
"opentelemetry-exporter-otlp>=1.27.0",
"opentelemetry-exporter-prometheus>=0.48b0",
"opentelemetry-instrumentation-fastapi>=0.48b0",
"opentelemetry-instrumentation-asyncpg>=0.48b0",
"opentelemetry-instrumentation-httpx>=0.48b0",
"opentelemetry-instrumentation-redis>=0.48b0",
"opentelemetry-instrumentation-logging>=0.48b0",
"opentelemetry-instrumentation-system-metrics>=0.48b0",
"prometheus-client>=0.20.0",
]
[project.optional-dependencies]
dev = [
"pytest>=8.0.0",
"pytest-asyncio>=0.24.0",
"ruff>=0.8.0",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[tool.hatch.build.targets.wheel]
packages = ["app", "migrations", "worker"]
[tool.ruff]
line-length = 100
target-version = "py314"
[tool.ruff.lint]
select = ["E", "F", "I", "N", "W", "UP"]
[tool.ruff.lint.per-file-ignores]
"tests/**/*.py" = ["E501"] # Allow longer lines in tests for descriptive method names
[tool.pytest.ini_options]
asyncio_mode = "auto"
testpaths = ["tests"]
+19 -128
View File
@@ -1,46 +1,24 @@
apiVersion: skaffold/v4beta11 apiVersion: skaffold/v4beta6
kind: Config kind: Config
metadata: metadata:
name: incidentops name: incidentops
build: build:
artifacts: artifacts:
- image: incidentops/api - image: incidentops-api
context: .
docker:
dockerfile: src/IncidentOps.Api/Dockerfile
- image: incidentops-worker
context: .
docker:
dockerfile: src/IncidentOps.Worker/Dockerfile
- image: incidentops-web
context: web
docker: docker:
dockerfile: Dockerfile dockerfile: Dockerfile
target: api
sync:
manual:
- src: "app/**/*.py"
dest: /app
- src: "worker/**/*.py"
dest: /app
- image: incidentops/worker
docker:
dockerfile: Dockerfile
target: worker
sync:
manual:
- src: "app/**/*.py"
dest: /app
- src: "worker/**/*.py"
dest: /app
# Web frontend disabled until implemented
# - image: incidentops/web
# docker:
# dockerfile: Dockerfile.web
# context: .
# sync:
# manual:
# - src: "web/src/**/*"
# dest: /app
local: local:
push: false push: false
useBuildkit: true useBuildkit: true
deploy: deploy:
helm: helm:
releases: releases:
@@ -49,102 +27,15 @@ deploy:
valuesFiles: valuesFiles:
- helm/incidentops/values.yaml - helm/incidentops/values.yaml
setValues: setValues:
web.replicaCount: 0 # Disabled until frontend is implemented api.image: incidentops-api
migration.enabled: true worker.image: incidentops-worker
setValueTemplates: web.image: incidentops-web
api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
createNamespace: true
namespace: incidentops
profiles:
- name: dev
activation:
- command: dev
build:
local:
push: false
deploy:
helm:
releases:
- name: incidentops
chartPath: helm/incidentops
valuesFiles:
- helm/incidentops/values.yaml
setValues:
api.replicaCount: 1
worker.replicaCount: 1
web.replicaCount: 0 # Disabled until frontend is implemented
migration.enabled: true
setValueTemplates:
api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
createNamespace: true
namespace: incidentops
- name: production
activation:
- env: SKAFFOLD_PROFILE=production
build:
local:
push: true
deploy:
helm:
releases:
- name: incidentops
chartPath: helm/incidentops
valuesFiles:
- helm/incidentops/values.yaml
- helm/incidentops/values-production.yaml
createNamespace: true
namespace: incidentops-prod
- name: kind
activation:
- kubeContext: kind-.*
patches:
- op: add
path: /build/local/push
value: false
portForward: portForward:
- resourceType: service - resourceType: service
resourceName: incidentops-api resourceName: incidentops-api
namespace: incidentops port: 8080
port: 8000 localPort: 8080
localPort: 8000
# Web frontend disabled until implemented
# - resourceType: service
# resourceName: incidentops-web
# namespace: incidentops
# port: 3000
# localPort: 3000
# Observability
- resourceType: service - resourceType: service
resourceName: incidentops-grafana resourceName: incidentops-web
namespace: incidentops port: 3000
port: 80 localPort: 3000
localPort: 3001
- resourceType: service
resourceName: incidentops-prometheus
namespace: incidentops
port: 9090
localPort: 9090
- resourceType: service
resourceName: incidentops-tempo
namespace: incidentops
port: 3200
localPort: 3200
- resourceType: service
resourceName: incidentops-loki
namespace: incidentops
port: 3100
localPort: 3100
@@ -0,0 +1,22 @@
using System.Security.Claims;
using IncidentOps.Domain.Enums;
namespace IncidentOps.Api.Auth;
/// <summary>
/// Extracts the per-request tenant context (user id, active org id, role)
/// from the authenticated principal's JWT claims.
/// </summary>
public static class ClaimsPrincipalExtensions
{
    /// <summary>
    /// Builds a <see cref="RequestContext"/> from the "sub", "org_id" and "org_role"
    /// claims. Throws <see cref="InvalidOperationException"/> if any is missing.
    /// </summary>
    public static RequestContext GetRequestContext(this ClaimsPrincipal principal)
    {
        // Local helper: fetch a claim or fail loudly with the original message.
        static string Require(ClaimsPrincipal p, string claimType, string error) =>
            p.FindFirstValue(claimType) ?? throw new InvalidOperationException(error);

        // Claims are read in the same order as before: sub, org_id, org_role.
        return new RequestContext
        {
            UserId = Guid.Parse(Require(principal, "sub", "Missing sub claim")),
            OrgId = Guid.Parse(Require(principal, "org_id", "Missing org_id claim")),
            Role = Enum.Parse<OrgRole>(
                Require(principal, "org_role", "Missing org_role claim"),
                ignoreCase: true)
        };
    }
}
@@ -0,0 +1,10 @@
using IncidentOps.Domain.Enums;
namespace IncidentOps.Api.Auth;
/// <summary>
/// Per-request tenant context resolved from the authenticated user's JWT claims
/// by ClaimsPrincipalExtensions.GetRequestContext.
/// </summary>
public class RequestContext
{
    /// <summary>Authenticated user's id (from the "sub" claim).</summary>
    public Guid UserId { get; set; }
    /// <summary>Active organization id (from the "org_id" claim).</summary>
    public Guid OrgId { get; set; }
    /// <summary>User's role within the active org (from the "org_role" claim).</summary>
    public OrgRole Role { get; set; }
}
@@ -0,0 +1,38 @@
using IncidentOps.Domain.Enums;
using Microsoft.AspNetCore.Authorization;
namespace IncidentOps.Api.Auth;
/// <summary>
/// Authorization requirement expressing the minimum org role a caller must hold.
/// </summary>
public class RoleRequirement : IAuthorizationRequirement
{
    public RoleRequirement(OrgRole minimumRole) => MinimumRole = minimumRole;

    /// <summary>The lowest role that satisfies this requirement.</summary>
    public OrgRole MinimumRole { get; }
}
/// <summary>
/// Succeeds a <see cref="RoleRequirement"/> when the caller's "org_role" claim
/// parses to a role at or above the required minimum. A missing or unparseable
/// claim does not explicitly fail the policy — the requirement is simply left
/// unmet, so other handlers may still satisfy it.
/// </summary>
public class RoleRequirementHandler : AuthorizationHandler<RoleRequirement>
{
    protected override Task HandleRequirementAsync(AuthorizationHandlerContext context, RoleRequirement requirement)
    {
        var claimValue = context.User.FindFirst("org_role")?.Value;

        // NOTE(review): the >= comparison assumes OrgRole's numeric values increase
        // with privilege (viewer < member < admin) — same assumption as the original.
        if (claimValue is not null
            && Enum.TryParse<OrgRole>(claimValue, ignoreCase: true, out var heldRole)
            && heldRole >= requirement.MinimumRole)
        {
            context.Succeed(requirement);
        }

        return Task.CompletedTask;
    }
}
@@ -0,0 +1,226 @@
using IncidentOps.Api.Auth;
using IncidentOps.Contracts.Auth;
using IncidentOps.Domain.Entities;
using IncidentOps.Domain.Enums;
using IncidentOps.Infrastructure.Auth;
using IncidentOps.Infrastructure.Data.Repositories;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
using OrgEntity = IncidentOps.Domain.Entities.Org;
namespace IncidentOps.Api.Controllers;
[ApiController]
[Route("v1/auth")]
public class AuthController : ControllerBase
{
    private readonly IUserRepository _userRepository;
    private readonly IOrgRepository _orgRepository;
    private readonly IOrgMemberRepository _orgMemberRepository;
    private readonly IRefreshTokenRepository _refreshTokenRepository;
    private readonly ITokenService _tokenService;
    private readonly IPasswordService _passwordService;
    private readonly JwtSettings _jwtSettings;

    public AuthController(
        IUserRepository userRepository,
        IOrgRepository orgRepository,
        IOrgMemberRepository orgMemberRepository,
        IRefreshTokenRepository refreshTokenRepository,
        ITokenService tokenService,
        IPasswordService passwordService,
        JwtSettings jwtSettings)
    {
        _userRepository = userRepository;
        _orgRepository = orgRepository;
        _orgMemberRepository = orgMemberRepository;
        _refreshTokenRepository = refreshTokenRepository;
        _tokenService = tokenService;
        _passwordService = passwordService;
        _jwtSettings = jwtSettings;
    }

    /// <summary>
    /// Registers a new user, provisions a personal default org with the user
    /// as admin, and signs them in.
    /// </summary>
    /// <returns>An auth payload (access + refresh token), or 409 if the email is taken.</returns>
    [HttpPost("register")]
    public async Task<ActionResult<AuthResponse>> Register([FromBody] RegisterRequest request)
    {
        // Fix: normalize casing before the duplicate check. The original checked the
        // raw email but stored the lowercased form, so "User@x.com" could register
        // twice alongside "user@x.com".
        var email = request.Email.ToLowerInvariant();

        var existingUser = await _userRepository.GetByEmailAsync(email);
        if (existingUser != null)
            return Conflict(new { message = "Email already registered" });

        var user = new User
        {
            Id = Guid.NewGuid(),
            Email = email,
            PasswordHash = _passwordService.HashPassword(request.Password),
            DisplayName = request.DisplayName,
            CreatedAt = DateTime.UtcNow
        };
        await _userRepository.CreateAsync(user);

        // Create a default org for the user; random slug avoids collisions.
        var org = new OrgEntity
        {
            Id = Guid.NewGuid(),
            Name = $"{request.DisplayName}'s Org",
            Slug = $"org-{Guid.NewGuid():N}".Substring(0, 20),
            CreatedAt = DateTime.UtcNow
        };
        await _orgRepository.CreateAsync(org);

        var member = new OrgMember
        {
            Id = Guid.NewGuid(),
            OrgId = org.Id,
            UserId = user.Id,
            Role = OrgRole.Admin,
            CreatedAt = DateTime.UtcNow
        };
        await _orgMemberRepository.CreateAsync(member);

        return await GenerateAuthResponse(user, org, member.Role);
    }

    /// <summary>
    /// Authenticates by email/password and issues tokens scoped to an active org
    /// (the requested one, or the user's first org when none is specified).
    /// </summary>
    [HttpPost("login")]
    public async Task<ActionResult<AuthResponse>> Login([FromBody] LoginRequest request)
    {
        // Emails are stored lowercased (see Register), so normalize before lookup.
        var user = await _userRepository.GetByEmailAsync(request.Email.ToLowerInvariant());
        if (user == null || !_passwordService.VerifyPassword(request.Password, user.PasswordHash))
            return Unauthorized(new { message = "Invalid credentials" });

        var orgs = await _orgRepository.GetByUserIdAsync(user.Id);
        if (orgs.Count == 0)
            return Unauthorized(new { message = "User has no organizations" });

        OrgEntity? activeOrg;
        if (request.OrgId.HasValue)
        {
            // Fix: a bad OrgId used to throw InvalidOperationException (HTTP 500);
            // treat it as an authentication failure like the sibling checks do.
            activeOrg = orgs.FirstOrDefault(o => o.Id == request.OrgId.Value);
            if (activeOrg == null)
                return Unauthorized(new { message = "User is not a member of the specified organization" });
        }
        else
        {
            activeOrg = orgs.First();
        }

        var member = await _orgMemberRepository.GetByUserAndOrgAsync(user.Id, activeOrg.Id);
        if (member == null)
            return Unauthorized(new { message = "User is not a member of the organization" });

        return await GenerateAuthResponse(user, activeOrg, member.Role);
    }

    /// <summary>
    /// Exchanges a live refresh token for a new token pair (rotation): the old
    /// token is revoked and a fresh one issued for the same active org.
    /// </summary>
    [HttpPost("refresh")]
    public async Task<ActionResult<AuthResponse>> Refresh([FromBody] RefreshRequest request)
    {
        var refreshToken = await FindLiveRefreshTokenAsync(request.RefreshToken);
        if (refreshToken == null)
            return Unauthorized(new { message = "Invalid refresh token" });

        var user = await _userRepository.GetByIdAsync(refreshToken.UserId);
        if (user == null)
            return Unauthorized(new { message = "User not found" });

        var org = await _orgRepository.GetByIdAsync(refreshToken.ActiveOrgId);
        if (org == null)
            return Unauthorized(new { message = "Organization not found" });

        var member = await _orgMemberRepository.GetByUserAndOrgAsync(user.Id, org.Id);
        if (member == null)
            return Unauthorized(new { message = "User is not a member of the organization" });

        // Rotate refresh token
        await _refreshTokenRepository.RevokeAsync(refreshToken.Id);
        return await GenerateAuthResponse(user, org, member.Role);
    }

    /// <summary>
    /// Re-issues tokens with a different active org for the same user,
    /// rotating the refresh token in the process.
    /// </summary>
    [HttpPost("switch-org")]
    public async Task<ActionResult<AuthResponse>> SwitchOrg([FromBody] SwitchOrgRequest request)
    {
        var refreshToken = await FindLiveRefreshTokenAsync(request.RefreshToken);
        if (refreshToken == null)
            return Unauthorized(new { message = "Invalid refresh token" });

        var user = await _userRepository.GetByIdAsync(refreshToken.UserId);
        if (user == null)
            return Unauthorized(new { message = "User not found" });

        var org = await _orgRepository.GetByIdAsync(request.OrgId);
        if (org == null)
            return NotFound(new { message = "Organization not found" });

        var member = await _orgMemberRepository.GetByUserAndOrgAsync(user.Id, org.Id);
        if (member == null)
            return Forbidden("User is not a member of the organization");

        // Rotate refresh token with new org
        await _refreshTokenRepository.RevokeAsync(refreshToken.Id);
        return await GenerateAuthResponse(user, org, member.Role);
    }

    /// <summary>Revokes the presented refresh token if it exists; always returns 204.</summary>
    [HttpPost("logout")]
    public async Task<IActionResult> Logout([FromBody] LogoutRequest request)
    {
        // Intentionally revokes even expired tokens — logout should never fail.
        var tokenHash = _tokenService.HashToken(request.RefreshToken);
        var refreshToken = await _refreshTokenRepository.GetByHashAsync(tokenHash);
        if (refreshToken != null)
        {
            await _refreshTokenRepository.RevokeAsync(refreshToken.Id);
        }
        return NoContent();
    }

    /// <summary>Returns the current user's profile and active-org context.</summary>
    [Authorize]
    [HttpGet("/v1/me")]
    public async Task<ActionResult<MeResponse>> Me()
    {
        var ctx = User.GetRequestContext();

        var user = await _userRepository.GetByIdAsync(ctx.UserId);
        if (user == null)
            return NotFound();

        var org = await _orgRepository.GetByIdAsync(ctx.OrgId);
        if (org == null)
            return NotFound();

        return new MeResponse(
            user.Id,
            user.Email,
            user.DisplayName,
            new ActiveOrgDto(org.Id, org.Name, org.Slug, ctx.Role.ToString().ToLowerInvariant())
        );
    }

    /// <summary>
    /// Looks up a refresh token by its hash and rejects it when expired.
    /// Fix: the original accepted tokens past ExpiresAt, so an old refresh token
    /// could be rotated indefinitely.
    /// NOTE(review): revoked-token filtering is presumed to happen inside
    /// IRefreshTokenRepository.GetByHashAsync — confirm, since the schema has revoked_at.
    /// </summary>
    private async Task<RefreshToken?> FindLiveRefreshTokenAsync(string rawToken)
    {
        var tokenHash = _tokenService.HashToken(rawToken);
        var refreshToken = await _refreshTokenRepository.GetByHashAsync(tokenHash);
        if (refreshToken == null || refreshToken.ExpiresAt <= DateTime.UtcNow)
            return null;
        return refreshToken;
    }

    /// <summary>
    /// Mints an access token and a new persisted refresh token for the given
    /// user/org/role, returning them with the active-org summary.
    /// </summary>
    private async Task<ActionResult<AuthResponse>> GenerateAuthResponse(User user, OrgEntity org, OrgRole role)
    {
        var accessToken = _tokenService.GenerateAccessToken(user.Id, org.Id, role);

        // Only the hash of the refresh token is stored; the raw value goes to the client.
        var refreshTokenValue = _tokenService.GenerateRefreshToken();
        var refreshTokenHash = _tokenService.HashToken(refreshTokenValue);
        var refreshToken = new RefreshToken
        {
            Id = Guid.NewGuid(),
            UserId = user.Id,
            TokenHash = refreshTokenHash,
            ActiveOrgId = org.Id,
            ExpiresAt = DateTime.UtcNow.AddDays(_jwtSettings.RefreshTokenExpirationDays),
            CreatedAt = DateTime.UtcNow
        };
        await _refreshTokenRepository.CreateAsync(refreshToken);

        return new AuthResponse(
            accessToken,
            refreshTokenValue,
            new ActiveOrgDto(org.Id, org.Name, org.Slug, role.ToString().ToLowerInvariant())
        );
    }

    /// <summary>403 helper — ControllerBase has no Forbidden(object) overload.</summary>
    private ObjectResult Forbidden(string message)
    {
        return StatusCode(403, new { message });
    }
}
@@ -0,0 +1,60 @@
using Microsoft.AspNetCore.Mvc;
using Npgsql;
using StackExchange.Redis;
namespace IncidentOps.Api.Controllers;
/// <summary>
/// Kubernetes-style health endpoints: /healthz (liveness) and /readyz (readiness).
/// </summary>
[ApiController]
public class HealthController : ControllerBase
{
    private readonly IConfiguration _configuration;

    public HealthController(IConfiguration configuration)
    {
        _configuration = configuration;
    }

    /// <summary>Liveness probe: the process is up; no dependencies are touched.</summary>
    [HttpGet("healthz")]
    public IActionResult Healthz() => Ok(new { status = "healthy" });

    /// <summary>
    /// Readiness probe: verifies PostgreSQL and Redis are reachable.
    /// Returns 200 with per-dependency results, or 503 if any check fails.
    /// </summary>
    [HttpGet("readyz")]
    public async Task<IActionResult> Readyz()
    {
        var checks = new Dictionary<string, string>();

        // Check PostgreSQL by opening (and disposing) a connection.
        try
        {
            var connectionString = _configuration.GetConnectionString("Postgres");
            await using var connection = new NpgsqlConnection(connectionString);
            await connection.OpenAsync();
            checks["postgres"] = "healthy";
        }
        catch (Exception ex)
        {
            checks["postgres"] = $"unhealthy: {ex.Message}";
        }

        // Check Redis with a PING.
        // Fix: dispose the multiplexer — the original leaked one connection per
        // readiness probe. (A shared singleton IConnectionMultiplexer would be
        // preferable for real traffic; connecting per probe is kept for simplicity.)
        try
        {
            var redisConnectionString = _configuration["Redis:ConnectionString"];
            using var redis = await ConnectionMultiplexer.ConnectAsync(redisConnectionString!);
            var db = redis.GetDatabase();
            await db.PingAsync();
            checks["redis"] = "healthy";
        }
        catch (Exception ex)
        {
            checks["redis"] = $"unhealthy: {ex.Message}";
        }

        var allHealthy = checks.Values.All(v => v == "healthy");
        return allHealthy
            ? Ok(new { status = "ready", checks })
            : StatusCode(503, new { status = "not ready", checks });
    }
}
@@ -0,0 +1,290 @@
using Hangfire;
using IncidentOps.Api.Auth;
using IncidentOps.Contracts.Incidents;
using IncidentOps.Domain.Entities;
using IncidentOps.Domain.Enums;
using IncidentOps.Infrastructure.Data.Repositories;
using IncidentOps.Infrastructure.Jobs;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
namespace IncidentOps.Api.Controllers;
[ApiController]
[Authorize]
public class IncidentsController : ControllerBase
{
    // All repository calls below are scoped by ctx.OrgId so one org can never
    // read or mutate another org's incidents.
    private readonly IIncidentRepository _incidentRepository;
    private readonly IIncidentEventRepository _incidentEventRepository;
    private readonly IServiceRepository _serviceRepository;
    private readonly IUserRepository _userRepository;
    private readonly IBackgroundJobClient _backgroundJobClient;

    public IncidentsController(
        IIncidentRepository incidentRepository,
        IIncidentEventRepository incidentEventRepository,
        IServiceRepository serviceRepository,
        IUserRepository userRepository,
        IBackgroundJobClient backgroundJobClient)
    {
        _incidentRepository = incidentRepository;
        _incidentEventRepository = incidentEventRepository;
        _serviceRepository = serviceRepository;
        _userRepository = userRepository;
        _backgroundJobClient = backgroundJobClient;
    }

    /// <summary>
    /// Lists incidents for the caller's org, optionally filtered by status,
    /// with cursor-based pagination.
    /// </summary>
    /// <param name="status">Optional status filter; unparseable values are silently ignored (no filter).</param>
    /// <param name="cursor">Opaque pagination cursor from a previous response's NextCursor.</param>
    /// <param name="limit">Page size. NOTE(review): not clamped — a huge or non-positive value is passed straight to the repository; confirm the repository guards this.</param>
    [HttpGet("v1/incidents")]
    public async Task<ActionResult<IncidentListResponse>> GetIncidents(
        [FromQuery] string? status = null,
        [FromQuery] string? cursor = null,
        [FromQuery] int limit = 20)
    {
        var ctx = User.GetRequestContext();
        IncidentStatus? statusFilter = null;
        if (!string.IsNullOrEmpty(status) && Enum.TryParse<IncidentStatus>(status, ignoreCase: true, out var parsed))
        {
            statusFilter = parsed;
        }
        // Fetch one extra row to detect whether another page exists.
        var incidents = await _incidentRepository.GetByOrgIdAsync(ctx.OrgId, statusFilter, limit + 1, cursor);
        var hasMore = incidents.Count > limit;
        var items = incidents.Take(limit).ToList();
        var dtos = new List<IncidentDto>();
        // NOTE(review): N+1 lookups — one service + one user query per incident.
        // Fine for small pages; a batch lookup API would be needed to do better.
        foreach (var incident in items)
        {
            var service = await _serviceRepository.GetByIdAsync(incident.ServiceId, ctx.OrgId);
            var assignedUser = incident.AssignedToUserId.HasValue
                ? await _userRepository.GetByIdAsync(incident.AssignedToUserId.Value)
                : null;
            dtos.Add(new IncidentDto(
                incident.Id,
                incident.ServiceId,
                service?.Name ?? "Unknown",
                incident.Title,
                incident.Description,
                incident.Status.ToString().ToLowerInvariant(),
                incident.Version,
                incident.AssignedToUserId,
                assignedUser?.DisplayName,
                incident.CreatedAt,
                incident.AcknowledgedAt,
                incident.MitigatedAt,
                incident.ResolvedAt
            ));
        }
        // Cursor is the last item's CreatedAt in round-trip ("O") format; the
        // repository presumably resumes after this timestamp — TODO confirm.
        var nextCursor = hasMore ? items.Last().CreatedAt.ToString("O") : null;
        return new IncidentListResponse(dtos, nextCursor);
    }

    /// <summary>
    /// Creates a triggered incident on a service, records a Created event, and
    /// enqueues the notification job. Requires Member role.
    /// </summary>
    /// <returns>201 with the new incident; 404 if the service is not in the caller's org.</returns>
    [HttpPost("v1/services/{serviceId}/incidents")]
    [Authorize(Policy = "Member")]
    public async Task<ActionResult<IncidentDto>> CreateIncident(Guid serviceId, [FromBody] CreateIncidentRequest request)
    {
        var ctx = User.GetRequestContext();
        var service = await _serviceRepository.GetByIdAsync(serviceId, ctx.OrgId);
        if (service == null)
            return NotFound(new { message = "Service not found" });
        var incident = new Incident
        {
            Id = Guid.NewGuid(),
            OrgId = ctx.OrgId,
            ServiceId = serviceId,
            Title = request.Title,
            Description = request.Description,
            Status = IncidentStatus.Triggered,
            // Version starts at 1 and is the optimistic-concurrency token used
            // by TransitionIncident below.
            Version = 1,
            CreatedAt = DateTime.UtcNow
        };
        await _incidentRepository.CreateAsync(incident);
        var incidentEvent = new IncidentEvent
        {
            Id = Guid.NewGuid(),
            IncidentId = incident.Id,
            EventType = IncidentEventType.Created,
            ActorUserId = ctx.UserId,
            CreatedAt = DateTime.UtcNow
        };
        await _incidentEventRepository.CreateAsync(incidentEvent);
        // Enqueue notification job
        _backgroundJobClient.Enqueue<IIncidentTriggeredJob>(j => j.ExecuteAsync(incident.Id));
        return CreatedAtAction(nameof(GetIncident), new { incidentId = incident.Id }, new IncidentDto(
            incident.Id,
            incident.ServiceId,
            service.Name,
            incident.Title,
            incident.Description,
            incident.Status.ToString().ToLowerInvariant(),
            incident.Version,
            null,
            null,
            incident.CreatedAt,
            null,
            null,
            null
        ));
    }

    /// <summary>Fetches a single incident (org-scoped) with service and assignee display data.</summary>
    [HttpGet("v1/incidents/{incidentId}")]
    public async Task<ActionResult<IncidentDto>> GetIncident(Guid incidentId)
    {
        var ctx = User.GetRequestContext();
        var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
        if (incident == null)
            return NotFound();
        var service = await _serviceRepository.GetByIdAsync(incident.ServiceId, ctx.OrgId);
        var assignedUser = incident.AssignedToUserId.HasValue
            ? await _userRepository.GetByIdAsync(incident.AssignedToUserId.Value)
            : null;
        return new IncidentDto(
            incident.Id,
            incident.ServiceId,
            service?.Name ?? "Unknown",
            incident.Title,
            incident.Description,
            incident.Status.ToString().ToLowerInvariant(),
            incident.Version,
            incident.AssignedToUserId,
            assignedUser?.DisplayName,
            incident.CreatedAt,
            incident.AcknowledgedAt,
            incident.MitigatedAt,
            incident.ResolvedAt
        );
    }

    /// <summary>Lists the event timeline for an incident; 404s if the incident is not in the caller's org.</summary>
    [HttpGet("v1/incidents/{incidentId}/events")]
    public async Task<ActionResult<IReadOnlyList<IncidentEventDto>>> GetIncidentEvents(Guid incidentId)
    {
        var ctx = User.GetRequestContext();
        // Load the incident first purely as an org-membership check.
        var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
        if (incident == null)
            return NotFound();
        var events = await _incidentEventRepository.GetByIncidentIdAsync(incidentId);
        var dtos = new List<IncidentEventDto>();
        foreach (var evt in events)
        {
            var actor = evt.ActorUserId.HasValue
                ? await _userRepository.GetByIdAsync(evt.ActorUserId.Value)
                : null;
            dtos.Add(new IncidentEventDto(
                evt.Id,
                evt.EventType.ToString().ToLowerInvariant(),
                evt.ActorUserId,
                actor?.DisplayName,
                evt.Payload,
                evt.CreatedAt
            ));
        }
        return dtos;
    }

    /// <summary>
    /// Advances an incident through the fixed lifecycle
    /// triggered -> acknowledged -> mitigated -> resolved, using optimistic
    /// concurrency (ExpectedVersion) to reject concurrent transitions.
    /// </summary>
    /// <returns>The updated incident; 400 for bad/disallowed actions, 409 on a version conflict.</returns>
    [HttpPost("v1/incidents/{incidentId}/transition")]
    [Authorize(Policy = "Member")]
    public async Task<ActionResult<IncidentDto>> TransitionIncident(Guid incidentId, [FromBody] TransitionRequest request)
    {
        var ctx = User.GetRequestContext();
        var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
        if (incident == null)
            return NotFound();
        var newStatus = request.Action.ToLowerInvariant() switch
        {
            "ack" or "acknowledge" => IncidentStatus.Acknowledged,
            "mitigate" => IncidentStatus.Mitigated,
            "resolve" => IncidentStatus.Resolved,
            _ => (IncidentStatus?)null
        };
        if (newStatus == null)
            return BadRequest(new { message = "Invalid action" });
        // Validate transition: each status allows exactly one forward step; no
        // skipping (e.g. triggered -> resolved) and no moves out of Resolved.
        var validTransitions = new Dictionary<IncidentStatus, IncidentStatus[]>
        {
            { IncidentStatus.Triggered, new[] { IncidentStatus.Acknowledged } },
            { IncidentStatus.Acknowledged, new[] { IncidentStatus.Mitigated } },
            { IncidentStatus.Mitigated, new[] { IncidentStatus.Resolved } }
        };
        if (!validTransitions.TryGetValue(incident.Status, out var allowedStatuses) || !allowedStatuses.Contains(newStatus.Value))
        {
            return BadRequest(new { message = $"Cannot transition from {incident.Status} to {newStatus}" });
        }
        var timestamp = DateTime.UtcNow;
        // TransitionAsync presumably performs a compare-and-swap on Version and
        // returns false when another writer got there first — TODO confirm.
        var success = await _incidentRepository.TransitionAsync(incidentId, ctx.OrgId, request.ExpectedVersion, newStatus.Value, timestamp);
        if (!success)
            return Conflict(new { message = "Concurrent modification detected. Please refresh and try again." });
        var eventType = newStatus.Value switch
        {
            IncidentStatus.Acknowledged => IncidentEventType.Acknowledged,
            IncidentStatus.Mitigated => IncidentEventType.Mitigated,
            IncidentStatus.Resolved => IncidentEventType.Resolved,
            // Unreachable: newStatus is restricted to the three cases above.
            _ => throw new InvalidOperationException()
        };
        await _incidentEventRepository.CreateAsync(new IncidentEvent
        {
            Id = Guid.NewGuid(),
            IncidentId = incidentId,
            EventType = eventType,
            ActorUserId = ctx.UserId,
            CreatedAt = timestamp
        });
        // Re-read so the response reflects the post-transition row (new Version,
        // timestamps) rather than the stale pre-transition entity.
        return await GetIncident(incidentId);
    }

    /// <summary>Appends a comment event to an incident's timeline. Requires Member role.</summary>
    [HttpPost("v1/incidents/{incidentId}/comment")]
    [Authorize(Policy = "Member")]
    public async Task<ActionResult<IncidentEventDto>> AddComment(Guid incidentId, [FromBody] CommentRequest request)
    {
        var ctx = User.GetRequestContext();
        var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
        if (incident == null)
            return NotFound();
        var incidentEvent = new IncidentEvent
        {
            Id = Guid.NewGuid(),
            IncidentId = incidentId,
            EventType = IncidentEventType.Comment,
            ActorUserId = ctx.UserId,
            // Comment text travels in the generic event Payload field.
            Payload = request.Content,
            CreatedAt = DateTime.UtcNow
        };
        await _incidentEventRepository.CreateAsync(incidentEvent);
        var user = await _userRepository.GetByIdAsync(ctx.UserId);
        return CreatedAtAction(nameof(GetIncidentEvents), new { incidentId }, new IncidentEventDto(
            incidentEvent.Id,
            incidentEvent.EventType.ToString().ToLowerInvariant(),
            ctx.UserId,
            user?.DisplayName,
            incidentEvent.Payload,
            incidentEvent.CreatedAt
        ));
    }
}
@@ -0,0 +1,151 @@
using IncidentOps.Api.Auth;
using IncidentOps.Contracts.Orgs;
using IncidentOps.Contracts.Services;
using IncidentOps.Domain.Entities;
using IncidentOps.Domain.Enums;
using IncidentOps.Infrastructure.Data.Repositories;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;
namespace IncidentOps.Api.Controllers;
[ApiController]
[Route("v1/org")]
[Authorize]
public class OrgController : ControllerBase
{
    private readonly IOrgRepository _orgRepository;
    private readonly IOrgMemberRepository _orgMemberRepository;
    private readonly IUserRepository _userRepository;
    private readonly IServiceRepository _serviceRepository;
    private readonly INotificationTargetRepository _notificationTargetRepository;

    public OrgController(
        IOrgRepository orgRepository,
        IOrgMemberRepository orgMemberRepository,
        IUserRepository userRepository,
        IServiceRepository serviceRepository,
        INotificationTargetRepository notificationTargetRepository)
    {
        _orgRepository = orgRepository;
        _orgMemberRepository = orgMemberRepository;
        _userRepository = userRepository;
        _serviceRepository = serviceRepository;
        _notificationTargetRepository = notificationTargetRepository;
    }

    /// <summary>Returns the caller's active org together with the caller's role in it.</summary>
    [HttpGet]
    public async Task<ActionResult<OrgDto>> GetCurrentOrg()
    {
        var ctx = User.GetRequestContext();
        var org = await _orgRepository.GetByIdAsync(ctx.OrgId);
        if (org is null)
        {
            return NotFound();
        }
        return new OrgDto(org.Id, org.Name, org.Slug, ctx.Role.ToString().ToLowerInvariant());
    }

    /// <summary>Lists org memberships joined with user data. Admin only.</summary>
    [HttpGet("members")]
    [Authorize(Policy = "Admin")]
    public async Task<ActionResult<IReadOnlyList<OrgMemberDto>>> GetMembers()
    {
        var ctx = User.GetRequestContext();
        var memberships = await _orgMemberRepository.GetByOrgIdAsync(ctx.OrgId);
        var dtos = new List<OrgMemberDto>();
        foreach (var membership in memberships)
        {
            var account = await _userRepository.GetByIdAsync(membership.UserId);
            if (account is null)
            {
                // Membership rows without a matching user record are omitted.
                continue;
            }
            dtos.Add(new OrgMemberDto(
                membership.Id,
                account.Id,
                account.Email,
                account.DisplayName,
                membership.Role.ToString().ToLowerInvariant(),
                membership.CreatedAt
            ));
        }
        return dtos;
    }

    /// <summary>Lists all services in the caller's org.</summary>
    [HttpGet("services")]
    public async Task<ActionResult<IReadOnlyList<ServiceDto>>> GetServices()
    {
        var ctx = User.GetRequestContext();
        var services = await _serviceRepository.GetByOrgIdAsync(ctx.OrgId);
        var dtos = new List<ServiceDto>();
        foreach (var svc in services)
        {
            dtos.Add(new ServiceDto(svc.Id, svc.Name, svc.Slug, svc.Description, svc.CreatedAt));
        }
        return dtos;
    }

    /// <summary>Creates a service in the caller's org. Requires Member role.</summary>
    [HttpPost("services")]
    [Authorize(Policy = "Member")]
    public async Task<ActionResult<ServiceDto>> CreateService([FromBody] CreateServiceRequest request)
    {
        var ctx = User.GetRequestContext();
        Service entity = new()
        {
            Id = Guid.NewGuid(),
            OrgId = ctx.OrgId,
            Name = request.Name,
            Slug = request.Slug,
            Description = request.Description,
            CreatedAt = DateTime.UtcNow
        };
        await _serviceRepository.CreateAsync(entity);
        var dto = new ServiceDto(entity.Id, entity.Name, entity.Slug, entity.Description, entity.CreatedAt);
        return CreatedAtAction(nameof(GetServices), dto);
    }

    /// <summary>Lists notification targets for the caller's org. Admin only.</summary>
    [HttpGet("notification-targets")]
    [Authorize(Policy = "Admin")]
    public async Task<ActionResult<IReadOnlyList<NotificationTargetDto>>> GetNotificationTargets()
    {
        var ctx = User.GetRequestContext();
        var targets = await _notificationTargetRepository.GetByOrgIdAsync(ctx.OrgId);
        var dtos = new List<NotificationTargetDto>();
        foreach (var target in targets)
        {
            dtos.Add(new NotificationTargetDto(
                target.Id,
                target.Name,
                target.TargetType.ToString().ToLowerInvariant(),
                target.Configuration,
                target.IsEnabled,
                target.CreatedAt
            ));
        }
        return dtos;
    }

    /// <summary>Creates a notification target. Admin only; 400 on an unrecognized target type.</summary>
    [HttpPost("notification-targets")]
    [Authorize(Policy = "Admin")]
    public async Task<ActionResult<NotificationTargetDto>> CreateNotificationTarget([FromBody] CreateNotificationTargetRequest request)
    {
        var ctx = User.GetRequestContext();
        // Guard clause: reject unknown target types before touching the database.
        if (!Enum.TryParse<NotificationTargetType>(request.TargetType, ignoreCase: true, out var targetType))
            return BadRequest(new { message = "Invalid target type" });
        NotificationTarget entity = new()
        {
            Id = Guid.NewGuid(),
            OrgId = ctx.OrgId,
            Name = request.Name,
            TargetType = targetType,
            Configuration = request.Configuration,
            IsEnabled = request.IsEnabled,
            CreatedAt = DateTime.UtcNow
        };
        await _notificationTargetRepository.CreateAsync(entity);
        return CreatedAtAction(nameof(GetNotificationTargets), new NotificationTargetDto(
            entity.Id,
            entity.Name,
            entity.TargetType.ToString().ToLowerInvariant(),
            entity.Configuration,
            entity.IsEnabled,
            entity.CreatedAt
        ));
    }
}
+23
View File
@@ -0,0 +1,23 @@
# ---- Build stage: restore and publish with the full SDK image ----
FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build
WORKDIR /src

# Copy csproj files first so the restore layer is cached unless dependencies change
COPY src/IncidentOps.Contracts/IncidentOps.Contracts.csproj src/IncidentOps.Contracts/
COPY src/IncidentOps.Domain/IncidentOps.Domain.csproj src/IncidentOps.Domain/
COPY src/IncidentOps.Infrastructure/IncidentOps.Infrastructure.csproj src/IncidentOps.Infrastructure/
COPY src/IncidentOps.Api/IncidentOps.Api.csproj src/IncidentOps.Api/
RUN dotnet restore src/IncidentOps.Api/IncidentOps.Api.csproj

# Copy source and build
COPY src/ src/
WORKDIR /src/src/IncidentOps.Api
RUN dotnet publish -c Release -o /app --no-restore

# ---- Runtime stage: slimmer ASP.NET base image ----
FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS runtime
WORKDIR /app
COPY --from=build /app .

ENV ASPNETCORE_URLS=http://+:8080
EXPOSE 8080

# Drop root: the aspnet base image ships a non-root 'app' user (.NET 8+), and
# the service only needs to bind the unprivileged port 8080.
USER app

ENTRYPOINT ["dotnet", "IncidentOps.Api.dll"]
@@ -0,0 +1,28 @@
<Project Sdk="Microsoft.NET.Sdk.Web">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <!-- Nullable reference types and implicit usings on across the solution. -->
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <AllowMissingPrunePackageData>true</AllowMissingPrunePackageData>
  </PropertyGroup>

  <!-- NuGet dependencies: FluentMigrator runs schema migrations at startup,
       Hangfire enqueues background jobs (server itself runs in the Worker),
       JwtBearer handles token auth, Npgsql/Redis are the data stores. -->
  <ItemGroup>
    <PackageReference Include="FluentMigrator.Runner" Version="7.2.0" />
    <PackageReference Include="FluentMigrator.Runner.Postgres" Version="7.2.0" />
    <PackageReference Include="Hangfire.AspNetCore" Version="1.8.22" />
    <PackageReference Include="Hangfire.Core" Version="1.8.22" />
    <PackageReference Include="Hangfire.Redis.StackExchange" Version="1.12.0" />
    <PackageReference Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="10.0.1" />
    <PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="10.0.0" />
    <PackageReference Include="Npgsql" Version="10.0.1" />
    <PackageReference Include="StackExchange.Redis" Version="2.10.1" />
  </ItemGroup>

  <!-- Layered project references: Api -> Infrastructure -> Domain/Contracts. -->
  <ItemGroup>
    <ProjectReference Include="..\IncidentOps.Infrastructure\IncidentOps.Infrastructure.csproj" />
    <ProjectReference Include="..\IncidentOps.Domain\IncidentOps.Domain.csproj" />
    <ProjectReference Include="..\IncidentOps.Contracts\IncidentOps.Contracts.csproj" />
  </ItemGroup>

</Project>
+108
View File
@@ -0,0 +1,108 @@
using System.Text;
using FluentMigrator.Runner;
using Hangfire;
using Hangfire.Redis.StackExchange;
using IncidentOps.Api.Auth;
using IncidentOps.Infrastructure;
using IncidentOps.Infrastructure.Auth;
using IncidentOps.Infrastructure.Migrations;
using Microsoft.AspNetCore.Authentication.JwtBearer;
using Microsoft.AspNetCore.Authorization;
using Microsoft.IdentityModel.Tokens;
using StackExchange.Redis;
// API host composition root: wires controllers, JWT auth, role policies,
// FluentMigrator, the Hangfire client, and CORS, then runs migrations and
// starts the pipeline.
var builder = WebApplication.CreateBuilder(args);

// Add controllers
builder.Services.AddControllers();
builder.Services.AddEndpointsApiExplorer();
builder.Services.AddOpenApi();

// Configure JWT settings. The signing key has no default on purpose: failing
// fast here beats issuing tokens with a guessable key.
var jwtSettings = new JwtSettings
{
    Issuer = builder.Configuration["Jwt:Issuer"] ?? "incidentops",
    Audience = builder.Configuration["Jwt:Audience"] ?? "incidentops",
    SigningKey = builder.Configuration["Jwt:SigningKey"] ?? throw new InvalidOperationException("JWT signing key not configured"),
    AccessTokenExpirationMinutes = builder.Configuration.GetValue<int>("Jwt:AccessTokenExpirationMinutes", 15),
    RefreshTokenExpirationDays = builder.Configuration.GetValue<int>("Jwt:RefreshTokenExpirationDays", 7)
};

// Configure Infrastructure (repositories, token services, etc.)
var connectionString = builder.Configuration.GetConnectionString("Postgres")
    ?? throw new InvalidOperationException("Postgres connection string not configured");
builder.Services.AddInfrastructure(connectionString, jwtSettings);

// Configure FluentMigrator; migrations are discovered from the assembly that
// contains the initial schema migration.
builder.Services.AddFluentMigratorCore()
    .ConfigureRunner(rb => rb
        .AddPostgres()
        .WithGlobalConnectionString(connectionString)
        .ScanIn(typeof(Migration0001_InitialSchema).Assembly).For.Migrations())
    .AddLogging(lb => lb.AddFluentMigratorConsole());

// Configure JWT Authentication: validate issuer, audience, lifetime, and
// signature against the symmetric key configured above.
builder.Services.AddAuthentication(JwtBearerDefaults.AuthenticationScheme)
    .AddJwtBearer(options =>
    {
        options.TokenValidationParameters = new TokenValidationParameters
        {
            ValidateIssuer = true,
            ValidateAudience = true,
            ValidateLifetime = true,
            ValidateIssuerSigningKey = true,
            ValidIssuer = jwtSettings.Issuer,
            ValidAudience = jwtSettings.Audience,
            IssuerSigningKey = new SymmetricSecurityKey(Encoding.UTF8.GetBytes(jwtSettings.SigningKey))
        };
    });

// Configure Authorization: three role policies backed by a single handler.
// Presumably the handler treats roles as a hierarchy (Admin satisfies Member,
// etc.) — confirm in RoleRequirementHandler.
builder.Services.AddSingleton<IAuthorizationHandler, RoleRequirementHandler>();
builder.Services.AddAuthorizationBuilder()
    .AddPolicy("Viewer", policy => policy.Requirements.Add(new RoleRequirement(IncidentOps.Domain.Enums.OrgRole.Viewer)))
    .AddPolicy("Member", policy => policy.Requirements.Add(new RoleRequirement(IncidentOps.Domain.Enums.OrgRole.Member)))
    .AddPolicy("Admin", policy => policy.Requirements.Add(new RoleRequirement(IncidentOps.Domain.Enums.OrgRole.Admin)));

// Configure Hangfire (client only - server runs in Worker). The multiplexer
// created here intentionally lives for the whole process lifetime.
var redisConnectionString = builder.Configuration["Redis:ConnectionString"]
    ?? throw new InvalidOperationException("Redis connection string not configured");
builder.Services.AddHangfire(configuration => configuration
    .SetDataCompatibilityLevel(CompatibilityLevel.Version_180)
    .UseSimpleAssemblyNameTypeSerializer()
    .UseRecommendedSerializerSettings()
    .UseRedisStorage(ConnectionMultiplexer.Connect(redisConnectionString)));

// Add CORS: explicit origin allow-list (required with AllowCredentials).
builder.Services.AddCors(options =>
{
    options.AddDefaultPolicy(policy =>
    {
        policy.WithOrigins(builder.Configuration.GetSection("Cors:Origins").Get<string[]>() ?? ["http://localhost:3000"])
            .AllowAnyHeader()
            .AllowAnyMethod()
            .AllowCredentials();
    });
});

var app = builder.Build();

// Run migrations at startup. NOTE(review): with multiple API replicas this can
// race; confirm the deployment serializes startup or that migrations tolerate it.
using (var scope = app.Services.CreateScope())
{
    var runner = scope.ServiceProvider.GetRequiredService<IMigrationRunner>();
    runner.MigrateUp();
}

// Configure the HTTP request pipeline
if (app.Environment.IsDevelopment())
{
    app.MapOpenApi();
}

// Middleware order matters: CORS before auth so preflight requests succeed,
// authentication before authorization.
app.UseCors();
app.UseAuthentication();
app.UseAuthorization();
app.MapControllers();

app.Run();
@@ -0,0 +1,5 @@
namespace IncidentOps.Contracts.Auth;

/// <summary>Payload returned by auth endpoints: the token pair plus the org the session is scoped to.</summary>
public record AuthResponse(string AccessToken, string RefreshToken, ActiveOrgDto ActiveOrg);

/// <summary>The caller's currently active org; Role is the caller's lower-cased role name within it.</summary>
public record ActiveOrgDto(Guid Id, string Name, string Slug, string Role);
@@ -0,0 +1,3 @@
namespace IncidentOps.Contracts.Auth;

/// <summary>Login credentials. OrgId, when supplied, presumably selects which org to activate — confirm against the login handler.</summary>
public record LoginRequest(string Email, string Password, Guid? OrgId = null);
@@ -0,0 +1,3 @@
namespace IncidentOps.Contracts.Auth;

/// <summary>Logout payload carrying the refresh token to revoke.</summary>
public record LogoutRequest(string RefreshToken);
@@ -0,0 +1,3 @@
namespace IncidentOps.Contracts.Auth;

/// <summary>Current-user payload: identity fields plus the org the session is active in.</summary>
public record MeResponse(Guid Id, string Email, string DisplayName, ActiveOrgDto ActiveOrg);
@@ -0,0 +1,3 @@
namespace IncidentOps.Contracts.Auth;

/// <summary>Token-refresh payload carrying the refresh token to exchange for a new pair.</summary>
public record RefreshRequest(string RefreshToken);

Some files were not shown because too many files have changed in this diff Show More