Compare commits

28 Commits
master..3abbd4a9aa

| SHA1 |
|------|
| 3abbd4a9aa |
| f17fa5eb76 |
| 8ada5d1946 |
| 53418cf41c |
| f635386b4d |
| d6ac0ddd3a |
| a0e9fd71e6 |
| 03bc133e2c |
| 1d3ef9ef90 |
| 1d98cd5a73 |
| 1a5e1d6c38 |
| 8cac9b4377 |
| 06db4231cf |
| 8ac4d814ee |
| 9e73887efc |
| 4db3e56811 |
| d4c5f257af |
| 929327eca3 |
| 97905f9e19 |
| 0aac1b6dc7 |
| a6d5a696a6 |
| 3e70ba560b |
| 92f9ed001c |
| 38aa3fb12e |
| 370408af95 |
| 7a09f8e2f6 |
| 9357cbe026 |
| 49ec9cd997 |
+59
-10
@@ -1,11 +1,60 @@
```gitignore
# Python-generated files
__pycache__/
*.py[oc]
build/
dist/
wheels/
*.egg-info

# .NET
bin/
obj/
*.user
*.suo
*.userosscache
*.sln.docstates
*.userprefs
.vs/

# Virtual environments
.venv
.pytest_cache/

# Build results
[Dd]ebug/
[Rr]elease/
x64/
x86/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Ll]og/
[Ll]ogs/

# NuGet
*.nupkg
*.snupkg
.nuget/
packages/

# Node.js
node_modules/
.next/
out/
.npm/

# IDE
.idea/
*.swp
*.swo
*~

# OS
.DS_Store
Thumbs.db

# Environment
.env
.env.local
.env.*.local
appsettings.Local.json
appsettings.*.Local.json

# Helm
helm/incidentops/charts/

# Docker
.docker/

# Kubernetes
*.kubeconfig
```

@@ -1 +0,0 @@
```
3.14
```

-38
@@ -1,38 +0,0 @@
```dockerfile
# Multi-stage Dockerfile for API and Worker services
FROM python:3.14-slim AS base

WORKDIR /app

# Install uv
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/

# Install Python dependencies
COPY pyproject.toml uv.lock README.md ./
RUN uv sync --no-cache --no-dev

# Copy application code
COPY app/ ./app/
COPY worker/ ./worker/
COPY migrations/ ./migrations/

# Set up non-root user and cache directory
RUN useradd -m -u 1000 appuser && \
    mkdir -p /app/.cache && \
    chown -R appuser:appuser /app

ENV UV_CACHE_DIR=/app/.cache

# API service target
FROM base AS api

USER appuser
EXPOSE 8000

CMD ["uv", "run", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

# Worker service target
FROM base AS worker

USER appuser

CMD ["uv", "run", "celery", "-A", "worker.celery_app", "worker", "--loglevel=info", "-Q", "critical,default,low"]
```

@@ -0,0 +1,9 @@
```xml
<Solution>
  <Folder Name="/src/">
    <Project Path="src/IncidentOps.Api/IncidentOps.Api.csproj" />
    <Project Path="src/IncidentOps.Contracts/IncidentOps.Contracts.csproj" />
    <Project Path="src/IncidentOps.Domain/IncidentOps.Domain.csproj" />
    <Project Path="src/IncidentOps.Infrastructure/IncidentOps.Infrastructure.csproj" />
    <Project Path="src/IncidentOps.Worker/IncidentOps.Worker.csproj" />
  </Folder>
</Solution>
```

@@ -1,86 +0,0 @@
# IncidentOps

A full-stack on-call and incident management platform.

## Environment Configuration

| Variable | Description | Default |
|----------|-------------|---------|
| `DATABASE_URL` | Postgres connection string | — |
| `REDIS_URL` | Legacy Redis endpoint, also used if no broker override is supplied | `redis://localhost:6379/0` |
| `TASK_QUEUE_DRIVER` | Task queue implementation (`celery` or `inmemory`) | `celery` |
| `TASK_QUEUE_BROKER_URL` | Celery broker URL (falls back to `REDIS_URL` when unset) | `None` |
| `TASK_QUEUE_BACKEND` | Celery transport semantics (`redis` or `sqs`) | `redis` |
| `TASK_QUEUE_DEFAULT_QUEUE` | Queue used for fan-out + notification deliveries | `default` |
| `TASK_QUEUE_CRITICAL_QUEUE` | Queue used for escalation + delayed work | `critical` |
| `TASK_QUEUE_VISIBILITY_TIMEOUT` | Visibility timeout passed to the `sqs` transport (seconds) | `600` |
| `TASK_QUEUE_POLLING_INTERVAL` | Polling interval for the `sqs` transport (seconds) | `1.0` |
| `NOTIFICATION_ESCALATION_DELAY_SECONDS` | Delay before re-checking unacknowledged incidents | `900` |
| `AWS_REGION` | Region used when `TASK_QUEUE_BACKEND=sqs` | `None` |
| `JWT_SECRET_KEY` | Symmetric JWT signing key | — |
| `JWT_ALGORITHM` | JWT algorithm | `HS256` |
| `JWT_ISSUER` | JWT issuer claim | `incidentops` |
| `JWT_AUDIENCE` | JWT audience claim | `incidentops-api` |

### Task Queue Modes

- **Development / Tests** – Set `TASK_QUEUE_DRIVER=inmemory` to bypass Celery entirely (default for local pytest). The API enqueues events into an in-memory recorder while the worker code remains importable.
- **Celery + Redis** – Set `TASK_QUEUE_DRIVER=celery` and either leave `TASK_QUEUE_BROKER_URL` unset (relying on `REDIS_URL`) or point it at another Redis endpoint. This is the default production-style configuration.
- **Celery + Amazon SQS** – Provide `TASK_QUEUE_BROKER_URL=sqs://` (Celery discovers AWS credentials automatically), set `TASK_QUEUE_BACKEND=sqs`, and configure `AWS_REGION`. Optional tuning is available via the visibility timeout and polling interval variables above; an illustrative mapping follows below.
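
For quick reference, the three modes map onto the environment variables above roughly as follows. This is an illustrative sketch only: the Redis hostname and AWS region are example values, not project defaults.

```python
# Illustrative environment combinations for the three task queue modes.
# Hostnames and the region below are examples, not defaults enforced by the app.
TASK_QUEUE_MODES = {
    "development-tests": {
        "TASK_QUEUE_DRIVER": "inmemory",
    },
    "celery-redis": {
        "TASK_QUEUE_DRIVER": "celery",
        "TASK_QUEUE_BACKEND": "redis",
        # Optional: omit to fall back to REDIS_URL.
        "TASK_QUEUE_BROKER_URL": "redis://redis:6379/0",
    },
    "celery-sqs": {
        "TASK_QUEUE_DRIVER": "celery",
        "TASK_QUEUE_BACKEND": "sqs",
        "TASK_QUEUE_BROKER_URL": "sqs://",
        "AWS_REGION": "eu-west-1",
        "TASK_QUEUE_VISIBILITY_TIMEOUT": "600",
        "TASK_QUEUE_POLLING_INTERVAL": "1.0",
    },
}
```
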
### Running the Worker

The worker automatically discovers tasks under `worker/tasks`. Use the same environment variables as the API:

```
uv run celery -A worker.celery_app worker --loglevel=info
```

## Setup

### Docker Compose

```
docker compose up --build -d
```

### K8S with Skaffold and Helm

```
# Create a local cluster
kind create cluster --name incidentops

# Install with infrastructure only (for testing)
helm install incidentops helm/incidentops -n incidentops --create-namespace \
  --set migration.enabled=false \
  --set api.replicaCount=0 \
  --set worker.replicaCount=0 \
  --set web.replicaCount=0

# Full install (requires building app images first)
helm install incidentops helm/incidentops -n incidentops --create-namespace

# Continuous development loop (build, deploy, watch)
skaffold dev

# One-time deployment
skaffold run

# Production deployment
skaffold run -p production
```

### Accessing Dashboards

When running with `skaffold dev`, the following dashboards are port-forwarded automatically:

| Dashboard | URL | Description |
|-----------|-----|-------------|
| **OpenAPI (Swagger)** | http://localhost:8000/docs | Interactive API documentation |
| **OpenAPI (ReDoc)** | http://localhost:8000/redoc | Alternative API docs |
| **Grafana** | http://localhost:3001 | Metrics, logs, and traces |
| **Prometheus** | http://localhost:9090 | Raw metrics queries |
| **Tempo** | http://localhost:3200 | Distributed tracing backend |
| **Loki** | http://localhost:3100 | Log aggregation backend |

Grafana comes pre-configured with datasources for Prometheus, Loki, and Tempo.

@@ -1,163 +0,0 @@
# IncidentOps Specification

Multi-tenant incident management API. Org context embedded in JWT — no `orgId` in URLs.

## Architecture

| Service | Stack | Purpose |
|---------|-------|---------|
| **api** | FastAPI, asyncpg | REST API, JWT auth, RBAC |
| **worker** | Celery, Redis | Notifications, escalations |
| **web** | Next.js | Dashboard (future) |

**Infrastructure:** PostgreSQL, Redis, ingress-nginx, Helm/Skaffold

## Auth

### JWT Access Token Claims
- `sub`: user_id (uuid)
- `org_id`: active org (uuid)
- `org_role`: `admin | member | viewer`
- `iss`: issuer (configurable, default: `incidentops`)
- `aud`: audience (configurable, default: `incidentops-api`)
- `jti`: unique token ID (uuid)
- `iat`: issued at (unix timestamp)
- `exp`: expiration (unix timestamp)

### Refresh Token
- Opaque token returned in JSON (not a cookie)
- Stored hashed in DB with `active_org_id`
- Rotated on refresh and org-switch (a rotation sketch follows below)
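
The hashing and expiry helpers used for this live in `app/core/security.py` further down in this diff. A minimal sketch of one rotation step, assuming a hypothetical `refresh_token_repo` with `get_by_hash`, `revoke`, and `create` methods (the real `AuthService` and repository code are not part of this diff, and the column names are assumptions):

```python
from uuid import UUID

from app.core import security


async def rotate_refresh_token(refresh_token_repo, presented_token: str, active_org_id: UUID) -> str:
    """Sketch of refresh-token rotation; repository API and column names are assumptions."""
    # Only the SHA-256 hash is stored, so look the row up by hash.
    row = await refresh_token_repo.get_by_hash(security.hash_token(presented_token))
    if row is None or row["revoked_at"] is not None:
        raise ValueError("Unknown or revoked refresh token")

    # Rotate: revoke the presented token and issue a new opaque one tied to the active org.
    await refresh_token_repo.revoke(row["id"])
    new_token = security.generate_refresh_token()
    await refresh_token_repo.create(
        token_hash=security.hash_token(new_token),
        active_org_id=active_org_id,
        expires_at=security.get_refresh_token_expiry(),
    )
    return new_token  # returned to the client in the JSON response, per the notes above
```
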
### Endpoints
| Endpoint | Description |
|----------|-------------|
| `POST /v1/auth/register` | Create user + default org, return tokens |
| `POST /v1/auth/login` | Authenticate, return tokens |
| `POST /v1/auth/refresh` | Rotate refresh token, mint new access token |
| `POST /v1/auth/switch-org` | Change active org, rotate tokens |
| `POST /v1/auth/logout` | Revoke refresh token |

## Authorization

### Roles
| Role | Permissions |
|------|-------------|
| viewer | Read-only |
| member | + create incidents, transitions, comments |
| admin | + manage members, notification targets |

### Enforcement
- Role check via dependency injection
- Ownership check: resource `org_id` must match JWT `org_id`

## API Routes

All under `/v1`. Auth required unless noted.

### Org (implicit from JWT)
- `GET /org` — current org summary
- `GET /org/members` (admin)
- `GET /org/services`
- `POST /org/services` (member+)
- `GET /org/notification-targets` (admin)
- `POST /org/notification-targets` (admin)

### Incidents
- `GET /incidents?status=&cursor=&limit=`
- `POST /services/{serviceId}/incidents` (member+)
- `GET /incidents/{incidentId}`
- `GET /incidents/{incidentId}/events`
- `POST /incidents/{incidentId}/transition` (member+)
- `POST /incidents/{incidentId}/comment` (member+)

### Health
- `GET /healthz` — liveness
- `GET /readyz` — readiness (postgres + redis)

## Incident State Machine

```
Triggered → Acknowledged → Mitigated → Resolved
```

- Transitions validated at the application level
- Optimistic locking via `version` column
- All changes recorded in `incident_events` (a transition sketch follows below)
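
A sketch of how the forward-only transitions and the `version` column can be combined. The `IncidentService` that actually does this is not shown in this diff, so treat the query and any column names beyond `status` and `version` as assumptions.

```python
import asyncpg

from app.core import exceptions as exc

# Forward-only transitions from the diagram above.
ALLOWED_TRANSITIONS: dict[str, set[str]] = {
    "triggered": {"acknowledged"},
    "acknowledged": {"mitigated"},
    "mitigated": {"resolved"},
    "resolved": set(),
}


async def apply_transition(conn: asyncpg.Connection, incident: dict, new_status: str):
    """Validate the transition, then update with an optimistic lock on `version`."""
    if new_status not in ALLOWED_TRANSITIONS[incident["status"]]:
        raise exc.BadRequestError(f"Cannot move {incident['status']} -> {new_status}")

    # The UPDATE only matches if nobody bumped `version` since we read the incident.
    updated = await conn.fetchrow(
        """
        UPDATE incidents
        SET status = $1, version = version + 1
        WHERE id = $2 AND version = $3
        RETURNING *
        """,
        new_status,
        incident["id"],
        incident["version"],
    )
    if updated is None:
        raise exc.ConflictError("Incident was modified concurrently")
    # An incident_events row recording the transition would be appended here.
    return updated
```
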
## Database Schema

| Table | Purpose |
|-------|---------|
| `users` | User accounts |
| `orgs` | Organizations |
| `org_members` | User-org membership + role |
| `services` | Org-scoped services |
| `incidents` | Org-scoped incidents with version |
| `incident_events` | Append-only timeline |
| `refresh_tokens` | Token rotation + active org |
| `notification_targets` | Webhook/email/slack configs |
| `notification_attempts` | Delivery tracking (idempotent) |

## Background Jobs (Celery)

| Task | Queue | Purpose |
|------|-------|---------|
| `incident_triggered` | default | Fan-out to notification targets |
| `send_webhook` | default | HTTP POST with retry |
| `escalate_if_unacked` | critical | Delayed escalation (stretch) |
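
A sketch of how these tasks might be declared and routed on the worker side. The `worker/tasks` modules are not part of this diff, so the task bodies, helper lookups, and routing calls below are illustrative; only the task and queue names come from the table above.

```python
"""Illustrative Celery task sketch; not the project's actual worker code."""

import httpx

from worker.celery_app import celery_app  # Celery app referenced by the Dockerfile CMD


def lookup_notification_target_ids(incident_id: str) -> list[str]:
    """Placeholder: the real implementation would read notification_targets for the org."""
    return []


def lookup_webhook_url(target_id: str) -> str:
    """Placeholder: the real implementation would read the target's webhook config."""
    return "https://example.invalid/webhook"


@celery_app.task(name="send_webhook", bind=True, max_retries=5)
def send_webhook(self, incident_id: str, target_id: str) -> None:
    """HTTP POST with retry."""
    try:
        httpx.post(lookup_webhook_url(target_id), json={"incident_id": incident_id}, timeout=10)
    except httpx.HTTPError as err:
        raise self.retry(exc=err, countdown=30)


@celery_app.task(name="incident_triggered")
def incident_triggered(incident_id: str) -> None:
    """Fan out one delivery per notification target on the default queue."""
    for target_id in lookup_notification_target_ids(incident_id):
        send_webhook.apply_async(args=[incident_id, target_id], queue="default")


# Delayed escalation would be sent to the critical queue with a countdown, e.g.:
# escalate_if_unacked.apply_async(args=[incident_id], queue="critical", countdown=900)
```
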
## Config (Environment)

| Variable | Required | Default |
|----------|----------|---------|
| `DATABASE_URL` | Yes | — |
| `REDIS_URL` | No | `redis://localhost:6379/0` |
| `JWT_SECRET_KEY` | Yes | — |
| `JWT_ALGORITHM` | No | `HS256` |
| `JWT_ISSUER` | No | `incidentops` |
| `JWT_AUDIENCE` | No | `incidentops-api` |
| `ACCESS_TOKEN_EXPIRE_MINUTES` | No | `15` |
| `REFRESH_TOKEN_EXPIRE_DAYS` | No | `30` |

## Development

Use `uv` for all Python operations:

```bash
# Install dependencies
uv sync

# Run tests
uv run pytest tests/

# Run the API server
uv run uvicorn app.main:app --reload

# Run migrations
uv run python migrations/migrate.py
```

## Project Structure

```
incidentops/
├── app/
│   ├── main.py        # FastAPI entry
│   ├── config.py      # pydantic-settings
│   ├── db.py          # asyncpg pool
│   ├── core/          # security, exceptions
│   ├── api/v1/        # route handlers
│   ├── schemas/       # pydantic models
│   ├── repositories/  # data access
│   └── services/      # business logic
├── worker/
│   ├── celery_app.py
│   └── tasks/
├── migrations/
│   └── *.sql + migrate.py
├── helm/
├── Dockerfile
├── docker-compose.yml
└── pyproject.toml
```

-101
@@ -1,101 +0,0 @@
```python
"""Shared FastAPI dependencies (auth, RBAC, ownership)."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Callable
from uuid import UUID

from fastapi import Depends
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer

from app.core import exceptions as exc, security
from app.db import db
from app.repositories import OrgRepository, UserRepository


bearer_scheme = HTTPBearer(auto_error=False)

ROLE_RANKS: dict[str, int] = {"viewer": 0, "member": 1, "admin": 2}


@dataclass(slots=True)
class CurrentUser:
    """Authenticated user context derived from the access token."""

    user_id: UUID
    email: str
    org_id: UUID
    org_role: str
    token: str


async def get_current_user(
    credentials: HTTPAuthorizationCredentials | None = Depends(bearer_scheme),
) -> CurrentUser:
    """Extract and validate the current user from the Authorization header."""

    if credentials is None or credentials.scheme.lower() != "bearer":
        raise exc.UnauthorizedError("Missing bearer token")

    try:
        payload = security.TokenPayload(security.decode_access_token(credentials.credentials))
    except security.JWTError as err:  # pragma: no cover - jose error types
        raise exc.UnauthorizedError("Invalid access token") from err

    async with db.connection() as conn:
        user_repo = UserRepository(conn)
        user = await user_repo.get_by_id(payload.user_id)
        if user is None:
            raise exc.UnauthorizedError("User not found")

        org_repo = OrgRepository(conn)
        membership = await org_repo.get_member(payload.user_id, payload.org_id)
        if membership is None:
            raise exc.ForbiddenError("Organization access denied")

    return CurrentUser(
        user_id=payload.user_id,
        email=user["email"],
        org_id=payload.org_id,
        org_role=membership["role"],
        token=credentials.credentials,
    )


class RoleChecker:
    """Dependency that enforces a minimum organization role."""

    def __init__(self, minimum_role: str) -> None:
        if minimum_role not in ROLE_RANKS:
            raise ValueError(f"Unknown role '{minimum_role}'")
        self.minimum_role = minimum_role

    def __call__(self, current_user: CurrentUser = Depends(get_current_user)) -> CurrentUser:
        if ROLE_RANKS[current_user.org_role] < ROLE_RANKS[self.minimum_role]:
            raise exc.ForbiddenError("Insufficient role for this operation")
        return current_user


def require_role(min_role: str) -> Callable[[CurrentUser], CurrentUser]:
    """Factory that returns a dependency enforcing the specified role."""

    return RoleChecker(min_role)


def ensure_org_access(resource_org_id: UUID, current_user: CurrentUser) -> None:
    """Verify that the resource belongs to the active org in the token."""

    if resource_org_id != current_user.org_id:
        raise exc.ForbiddenError("Resource does not belong to the active organization")


__all__ = [
    "CurrentUser",
    "ROLE_RANKS",
    "RoleChecker",
    "bearer_scheme",
    "ensure_org_access",
    "get_current_user",
    "require_role",
]
```

@@ -1,59 +0,0 @@
```python
"""Authentication API endpoints."""

from fastapi import APIRouter, Depends, status

from app.api.deps import CurrentUser, get_current_user
from app.schemas.auth import (
    LoginRequest,
    LogoutRequest,
    RefreshRequest,
    RegisterRequest,
    SwitchOrgRequest,
    TokenResponse,
)
from app.services import AuthService


router = APIRouter(prefix="/auth", tags=["auth"])
auth_service = AuthService()


@router.post("/register", response_model=TokenResponse, status_code=status.HTTP_201_CREATED)
async def register_user(payload: RegisterRequest) -> TokenResponse:
    """Register a new user and default org, returning auth tokens."""

    return await auth_service.register_user(payload)


@router.post("/login", response_model=TokenResponse)
async def login_user(payload: LoginRequest) -> TokenResponse:
    """Authenticate an existing user and issue tokens."""

    return await auth_service.login_user(payload)


@router.post("/refresh", response_model=TokenResponse)
async def refresh_tokens(payload: RefreshRequest) -> TokenResponse:
    """Rotate refresh token and mint a new access token."""

    return await auth_service.refresh_tokens(payload)


@router.post("/switch-org", response_model=TokenResponse)
async def switch_org(
    payload: SwitchOrgRequest,
    current_user: CurrentUser = Depends(get_current_user),
) -> TokenResponse:
    """Switch the active organization for the authenticated user."""

    return await auth_service.switch_org(current_user, payload)


@router.post("/logout", status_code=status.HTTP_204_NO_CONTENT)
async def logout(
    payload: LogoutRequest,
    current_user: CurrentUser = Depends(get_current_user),
) -> None:
    """Revoke the provided refresh token for the current session."""

    await auth_service.logout(current_user, payload)
```

@@ -1,47 +0,0 @@
```python
"""Health check endpoints."""

from fastapi import APIRouter, Response, status

from app.db import db
from app.taskqueue import task_queue

router = APIRouter()


@router.get("/healthz")
async def healthz() -> dict[str, str]:
    """Liveness probe - returns 200 if the service is running."""
    return {"status": "ok"}


@router.get("/readyz")
async def readyz(response: Response) -> dict[str, str | dict[str, bool]]:
    """
    Readiness probe - checks database and task queue connectivity.
    - Check Postgres status
    - Check configured task queue backend
    - Return overall healthiness
    """
    checks = {
        "postgres": False,
        "task_queue": False,
    }

    try:
        if db.pool:
            async with db.connection() as conn:
                await conn.fetchval("SELECT 1")
            checks["postgres"] = True
    except Exception:
        pass

    checks["task_queue"] = await task_queue.ping()

    all_healthy = all(checks.values())
    if not all_healthy:
        response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE

    return {
        "status": "ok" if all_healthy else "degraded",
        "checks": checks,
    }
```

@@ -1,103 +0,0 @@
```python
"""Incident API endpoints."""

from datetime import datetime
from uuid import UUID

from fastapi import APIRouter, Depends, Query, status

from app.api.deps import CurrentUser, get_current_user, require_role
from app.schemas.common import PaginatedResponse
from app.schemas.incident import (
    CommentRequest,
    IncidentEventResponse,
    IncidentResponse,
    IncidentStatus,
    TransitionRequest,
    IncidentCreate,
)
from app.services import IncidentService


router = APIRouter(tags=["incidents"])
incident_service = IncidentService()


@router.get("/incidents", response_model=PaginatedResponse[IncidentResponse])
async def list_incidents(
    status: IncidentStatus | None = Query(default=None),
    cursor: datetime | None = Query(default=None, description="Cursor (created_at)"),
    limit: int = Query(default=20, ge=1, le=100),
    current_user: CurrentUser = Depends(get_current_user),
) -> PaginatedResponse[IncidentResponse]:
    """List incidents for the active organization."""

    return await incident_service.get_incidents(
        current_user,
        status=status,
        cursor=cursor,
        limit=limit,
    )


@router.post(
    "/services/{service_id}/incidents",
    response_model=IncidentResponse,
    status_code=status.HTTP_201_CREATED,
)
async def create_incident(
    service_id: UUID,
    payload: IncidentCreate,
    current_user: CurrentUser = Depends(require_role("member")),
) -> IncidentResponse:
    """Create a new incident for the given service (member+)."""

    return await incident_service.create_incident(current_user, service_id, payload)


@router.get("/incidents/{incident_id}", response_model=IncidentResponse)
async def get_incident(
    incident_id: UUID,
    current_user: CurrentUser = Depends(get_current_user),
) -> IncidentResponse:
    """Fetch a single incident by ID."""

    return await incident_service.get_incident(current_user, incident_id)


@router.get("/incidents/{incident_id}/events", response_model=list[IncidentEventResponse])
async def get_incident_events(
    incident_id: UUID,
    current_user: CurrentUser = Depends(get_current_user),
) -> list[IncidentEventResponse]:
    """Get the event timeline for an incident."""

    return await incident_service.get_incident_events(current_user, incident_id)


@router.post(
    "/incidents/{incident_id}/transition",
    response_model=IncidentResponse,
)
async def transition_incident(
    incident_id: UUID,
    payload: TransitionRequest,
    current_user: CurrentUser = Depends(require_role("member")),
) -> IncidentResponse:
    """Transition an incident status (member+)."""

    return await incident_service.transition_incident(current_user, incident_id, payload)


@router.post(
    "/incidents/{incident_id}/comment",
    response_model=IncidentEventResponse,
    status_code=status.HTTP_201_CREATED,
)
async def add_comment(
    incident_id: UUID,
    payload: CommentRequest,
    current_user: CurrentUser = Depends(require_role("member")),
) -> IncidentEventResponse:
    """Add a comment to the incident timeline (member+)."""

    return await incident_service.add_comment(current_user, incident_id, payload)
```

@@ -1,72 +0,0 @@
```python
"""Organization API endpoints."""

from fastapi import APIRouter, Depends, status

from app.api.deps import CurrentUser, get_current_user, require_role
from app.schemas.org import (
    MemberResponse,
    NotificationTargetCreate,
    NotificationTargetResponse,
    OrgResponse,
    ServiceCreate,
    ServiceResponse,
)
from app.services import OrgService


router = APIRouter(prefix="/org", tags=["org"])
org_service = OrgService()


@router.get("", response_model=OrgResponse)
async def get_org(current_user: CurrentUser = Depends(get_current_user)) -> OrgResponse:
    """Return the active organization summary for the authenticated user."""

    return await org_service.get_current_org(current_user)


@router.get("/members", response_model=list[MemberResponse])
async def list_members(current_user: CurrentUser = Depends(require_role("admin"))) -> list[MemberResponse]:
    """List members of the current organization (admin only)."""

    return await org_service.get_members(current_user)


@router.get("/services", response_model=list[ServiceResponse])
async def list_services(current_user: CurrentUser = Depends(get_current_user)) -> list[ServiceResponse]:
    """List services for the current organization."""

    return await org_service.get_services(current_user)


@router.post("/services", response_model=ServiceResponse, status_code=status.HTTP_201_CREATED)
async def create_service(
    payload: ServiceCreate,
    current_user: CurrentUser = Depends(require_role("member")),
) -> ServiceResponse:
    """Create a new service within the current organization (member+)."""

    return await org_service.create_service(current_user, payload)


@router.get("/notification-targets", response_model=list[NotificationTargetResponse])
async def list_notification_targets(
    current_user: CurrentUser = Depends(require_role("admin")),
) -> list[NotificationTargetResponse]:
    """List notification targets for the current organization (admin only)."""

    return await org_service.get_notification_targets(current_user)


@router.post(
    "/notification-targets",
    response_model=NotificationTargetResponse,
    status_code=status.HTTP_201_CREATED,
)
async def create_notification_target(
    payload: NotificationTargetCreate,
    current_user: CurrentUser = Depends(require_role("admin")),
) -> NotificationTargetResponse:
    """Create a notification target for the current organization (admin only)."""

    return await org_service.create_notification_target(current_user, payload)
```

@@ -1,66 +0,0 @@
```python
"""Application configuration via pydantic-settings."""

from typing import Literal

from pydantic_settings import BaseSettings, SettingsConfigDict


class Settings(BaseSettings):
    """Application settings loaded from environment variables."""

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
    )

    # Database
    database_url: str

    # Redis (legacy default for Celery broker)
    redis_url: str = "redis://localhost:6379/0"

    # Task queue
    task_queue_driver: Literal["celery", "inmemory"] = "celery"
    task_queue_broker_url: str | None = None
    task_queue_backend: Literal["redis", "sqs"] = "redis"
    task_queue_default_queue: str = "default"
    task_queue_critical_queue: str = "critical"
    task_queue_visibility_timeout: int = 600
    task_queue_polling_interval: float = 1.0
    notification_escalation_delay_seconds: int = 900

    # AWS (used when task_queue_backend="sqs")
    aws_region: str | None = None

    # JWT
    jwt_secret_key: str
    jwt_algorithm: str = "HS256"
    jwt_issuer: str = "incidentops"
    jwt_audience: str = "incidentops-api"
    access_token_expire_minutes: int = 15
    refresh_token_expire_days: int = 30

    # Application
    debug: bool = False
    api_v1_prefix: str = "/v1"

    # OpenTelemetry
    otel_enabled: bool = True
    otel_service_name: str = "incidentops-api"
    otel_environment: str = "development"
    otel_exporter_otlp_endpoint: str | None = None  # e.g., "http://tempo:4317"
    otel_exporter_otlp_insecure: bool = True
    otel_log_level: str = "INFO"

    # Metrics
    prometheus_port: int = 9464  # Port for Prometheus metrics endpoint

    @property
    def resolved_task_queue_broker_url(self) -> str:
        """Return the broker URL with redis fallback for backwards compatibility."""

        return self.task_queue_broker_url or self.redis_url


settings = Settings()  # type: ignore[call-arg]
```

@@ -1,59 +0,0 @@
```python
"""Custom HTTP exceptions for the API."""

from fastapi import HTTPException, status


class NotFoundError(HTTPException):
    """Resource not found."""

    def __init__(self, detail: str = "Resource not found") -> None:
        super().__init__(status_code=status.HTTP_404_NOT_FOUND, detail=detail)


class ConflictError(HTTPException):
    """Conflict with current state (e.g., version mismatch)."""

    def __init__(self, detail: str = "Conflict with current state") -> None:
        super().__init__(status_code=status.HTTP_409_CONFLICT, detail=detail)


class UnauthorizedError(HTTPException):
    """Authentication required or failed."""

    def __init__(self, detail: str = "Not authenticated") -> None:
        super().__init__(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=detail,
            headers={"WWW-Authenticate": "Bearer"},
        )


class ForbiddenError(HTTPException):
    """Insufficient permissions."""

    def __init__(self, detail: str = "Insufficient permissions") -> None:
        super().__init__(status_code=status.HTTP_403_FORBIDDEN, detail=detail)


class BadRequestError(HTTPException):
    """Invalid request data."""

    def __init__(self, detail: str = "Invalid request") -> None:
        super().__init__(status_code=status.HTTP_400_BAD_REQUEST, detail=detail)


class ValidationError(HTTPException):
    """Validation failed."""

    def __init__(self, detail: str = "Validation failed") -> None:
        super().__init__(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail=detail)


__all__ = [
    "BadRequestError",
    "ConflictError",
    "ForbiddenError",
    "NotFoundError",
    "UnauthorizedError",
    "ValidationError",
]
```

@@ -1,164 +0,0 @@
```python
"""Structured JSON logging configuration with OpenTelemetry integration."""

import json
import logging
import sys
from datetime import datetime, timezone
from typing import Any

from app.config import settings


class JSONFormatter(logging.Formatter):
    """
    JSON log formatter that outputs structured logs with trace context.

    Log format includes:
    - timestamp: ISO 8601 format
    - level: Log level name
    - message: Log message
    - logger: Logger name
    - trace_id: OpenTelemetry trace ID (if available)
    - span_id: OpenTelemetry span ID (if available)
    - Extra fields from log record
    """

    def format(self, record: logging.LogRecord) -> str:
        log_data: dict[str, Any] = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
        }

        # Add trace context if available (injected by OpenTelemetry LoggingInstrumentor)
        if hasattr(record, "otelTraceID") and record.otelTraceID != "0":
            log_data["trace_id"] = record.otelTraceID
        if hasattr(record, "otelSpanID") and record.otelSpanID != "0":
            log_data["span_id"] = record.otelSpanID

        # Add exception info if present
        if record.exc_info:
            log_data["exception"] = self.formatException(record.exc_info)

        # Add extra fields (excluding standard LogRecord attributes)
        standard_attrs = {
            "name",
            "msg",
            "args",
            "created",
            "filename",
            "funcName",
            "levelname",
            "levelno",
            "lineno",
            "module",
            "msecs",
            "pathname",
            "process",
            "processName",
            "relativeCreated",
            "stack_info",
            "exc_info",
            "exc_text",
            "thread",
            "threadName",
            "taskName",
            "message",
            "otelTraceID",
            "otelSpanID",
            "otelTraceSampled",
            "otelServiceName",
        }
        for key, value in record.__dict__.items():
            if key not in standard_attrs and not key.startswith("_"):
                log_data[key] = value

        return json.dumps(log_data, default=str)


class DevelopmentFormatter(logging.Formatter):
    """
    Human-readable formatter for development with color support.

    Format: [TIME] LEVEL logger - message [trace_id]
    """

    COLORS = {
        "DEBUG": "\033[36m",  # Cyan
        "INFO": "\033[32m",  # Green
        "WARNING": "\033[33m",  # Yellow
        "ERROR": "\033[31m",  # Red
        "CRITICAL": "\033[35m",  # Magenta
    }
    RESET = "\033[0m"

    def format(self, record: logging.LogRecord) -> str:
        color = self.COLORS.get(record.levelname, "")
        reset = self.RESET

        # Format timestamp
        timestamp = datetime.now(timezone.utc).strftime("%H:%M:%S.%f")[:-3]

        # Build message
        msg = f"[{timestamp}] {color}{record.levelname:8}{reset} {record.name} - {record.getMessage()}"

        # Add trace context if available
        if hasattr(record, "otelTraceID") and record.otelTraceID != "0":
            msg += f" [{record.otelTraceID[:8]}...]"

        # Add exception if present
        if record.exc_info:
            msg += f"\n{self.formatException(record.exc_info)}"

        return msg


def setup_logging() -> None:
    """
    Configure application logging.

    - JSON format in production (OTEL enabled)
    - Human-readable format in development
    - Integrates with OpenTelemetry trace context
    """
    # Determine log level
    log_level = getattr(logging, settings.otel_log_level.upper(), logging.INFO)

    # Choose formatter based on environment
    if settings.otel_enabled and not settings.debug:
        formatter = JSONFormatter()
    else:
        formatter = DevelopmentFormatter()

    # Configure root logger
    root_logger = logging.getLogger()
    root_logger.setLevel(log_level)

    # Remove existing handlers
    for handler in root_logger.handlers[:]:
        root_logger.removeHandler(handler)

    # Add stdout handler
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    root_logger.addHandler(handler)

    # Reduce noise from third-party libraries (keep uvicorn access at INFO so requests are logged)
    logging.getLogger("uvicorn.access").setLevel(logging.INFO)
    logging.getLogger("asyncpg").setLevel(logging.WARNING)
    logging.getLogger("httpx").setLevel(logging.WARNING)
    logging.getLogger("httpcore").setLevel(logging.WARNING)

    logging.info(
        "Logging configured",
        extra={
            "log_level": settings.otel_log_level,
            "format": "json" if settings.otel_enabled and not settings.debug else "dev",
        },
    )


def get_logger(name: str) -> logging.Logger:
    """Get a logger instance with the given name."""
    return logging.getLogger(name)
```

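A short usage note: any `extra` fields that are not standard `LogRecord` attributes become top-level keys in the JSON output. The field values below are illustrative.

```python
from app.core.logging import get_logger, setup_logging

setup_logging()
log = get_logger("app.services.incident")

# With OTEL enabled and debug off this is emitted as a single JSON line, roughly:
# {"timestamp": "...", "level": "INFO", "message": "incident created",
#  "logger": "app.services.incident", "incident_id": "inc-123", "org_id": "org-456"}
log.info("incident created", extra={"incident_id": "inc-123", "org_id": "org-456"})
```
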
@@ -1,106 +0,0 @@
```python
"""Security utilities for JWT and password hashing."""

import hashlib
import secrets
from datetime import UTC, datetime, timedelta
from typing import Any
from uuid import UUID, uuid4

import bcrypt
from jose import JWTError, jwt

from app.config import settings


def hash_password(password: str) -> str:
    """Hash a password using bcrypt."""
    return bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode()


def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Verify a password against its hash."""
    return bcrypt.checkpw(plain_password.encode(), hashed_password.encode())


def create_access_token(
    sub: str,
    org_id: str,
    org_role: str,
    expires_delta: timedelta | None = None,
) -> str:
    """Create a JWT access token with org context."""
    if expires_delta is None:
        expires_delta = timedelta(minutes=settings.access_token_expire_minutes)

    now = datetime.now(UTC)
    expire = now + expires_delta

    payload = {
        "sub": sub,
        "org_id": org_id,
        "org_role": org_role,
        "iss": settings.jwt_issuer,
        "aud": settings.jwt_audience,
        "jti": str(uuid4()),
        "iat": now,
        "exp": expire,
    }

    return jwt.encode(payload, settings.jwt_secret_key, algorithm=settings.jwt_algorithm)


def decode_access_token(token: str) -> dict[str, Any]:
    """Decode and validate a JWT access token.

    Raises:
        JWTError: If token is invalid or expired.
    """
    return jwt.decode(
        token,
        settings.jwt_secret_key,
        algorithms=[settings.jwt_algorithm],
        issuer=settings.jwt_issuer,
        audience=settings.jwt_audience,
    )


def generate_refresh_token() -> str:
    """Generate a secure random refresh token."""
    return secrets.token_urlsafe(32)


def hash_token(token: str) -> str:
    """Hash a refresh token for storage."""
    return hashlib.sha256(token.encode()).hexdigest()


def get_refresh_token_expiry() -> datetime:
    """Get expiry datetime for a new refresh token."""
    return datetime.now(UTC) + timedelta(days=settings.refresh_token_expire_days)


class TokenPayload:
    """Parsed JWT token payload."""

    def __init__(self, payload: dict[str, Any]) -> None:
        self.user_id = UUID(payload["sub"])
        self.org_id = UUID(payload["org_id"])
        self.org_role = payload["org_role"]
        self.issuer = payload["iss"]
        self.audience = payload["aud"]
        self.jti = UUID(payload["jti"])
        self.issued_at = payload["iat"]
        self.expires_at = payload["exp"]


__all__ = [
    "JWTError",
    "TokenPayload",
    "create_access_token",
    "decode_access_token",
    "generate_refresh_token",
    "get_refresh_token_expiry",
    "hash_password",
    "hash_token",
    "verify_password",
]
```

@@ -1,271 +0,0 @@
```python
"""OpenTelemetry instrumentation for tracing, metrics, and logging."""

import logging
from contextlib import contextmanager
from typing import Any

from opentelemetry import metrics, trace
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
from opentelemetry.exporter.prometheus import PrometheusMetricReader
from opentelemetry.instrumentation.asyncpg import AsyncPGInstrumentor
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
from opentelemetry.instrumentation.logging import LoggingInstrumentor
from opentelemetry.instrumentation.redis import RedisInstrumentor
from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor
from opentelemetry.sdk.metrics import MeterProvider
from opentelemetry.sdk.resources import Resource
from opentelemetry.sdk.trace import TracerProvider
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
from opentelemetry.semconv.resource import ResourceAttributes
from prometheus_client import REGISTRY, start_http_server

from app.config import settings

logger = logging.getLogger(__name__)

_tracer_provider: TracerProvider | None = None
_meter_provider: MeterProvider | None = None

# Custom metrics
_request_counter = None
_request_duration = None
_active_requests = None
_error_counter = None


def setup_telemetry(app: Any) -> None:
    """
    Initialize OpenTelemetry with tracing, metrics, and logging instrumentation.

    Configures:
    - OTLP exporter for traces (to Tempo/Jaeger)
    - Prometheus exporter for metrics (scraped by Prometheus)
    - Auto-instrumentation for FastAPI, asyncpg, httpx, redis
    - System metrics (CPU, memory, etc.)
    - Logging instrumentation for trace context injection
    """
    global _tracer_provider, _meter_provider
    global _request_counter, _request_duration, _active_requests, _error_counter

    if not settings.otel_enabled:
        logger.info("OpenTelemetry disabled")
        return

    # Create resource with service info
    resource = Resource.create(
        {
            ResourceAttributes.SERVICE_NAME: settings.otel_service_name,
            ResourceAttributes.SERVICE_VERSION: "0.1.0",
            ResourceAttributes.DEPLOYMENT_ENVIRONMENT: settings.otel_environment,
        }
    )

    # =========================================
    # TRACING SETUP
    # =========================================
    _tracer_provider = TracerProvider(resource=resource)

    if settings.otel_exporter_otlp_endpoint:
        otlp_exporter = OTLPSpanExporter(
            endpoint=settings.otel_exporter_otlp_endpoint,
            insecure=settings.otel_exporter_otlp_insecure,
        )
        _tracer_provider.add_span_processor(BatchSpanProcessor(otlp_exporter))
        logger.info(f"OTLP exporter configured: {settings.otel_exporter_otlp_endpoint}")
    else:
        _tracer_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
        logger.info("Console span exporter configured (no OTLP endpoint)")

    trace.set_tracer_provider(_tracer_provider)

    # =========================================
    # METRICS SETUP
    # =========================================
    # Prometheus metric reader exposes metrics at /metrics endpoint
    prometheus_reader = PrometheusMetricReader()
    _meter_provider = MeterProvider(resource=resource, metric_readers=[prometheus_reader])
    metrics.set_meter_provider(_meter_provider)

    # Start Prometheus HTTP server on port 9464
    prometheus_port = settings.prometheus_port
    try:
        start_http_server(port=prometheus_port, registry=REGISTRY)
        logger.info(f"Prometheus metrics server started on port {prometheus_port}")
    except OSError as e:
        logger.warning(f"Could not start Prometheus server on port {prometheus_port}: {e}")

    # Create custom metrics
    meter = metrics.get_meter(__name__)

    _request_counter = meter.create_counter(
        name="http_requests_total",
        description="Total number of HTTP requests",
        unit="1",
    )

    _request_duration = meter.create_histogram(
        name="http_request_duration_seconds",
        description="HTTP request duration in seconds",
        unit="s",
    )

    _active_requests = meter.create_up_down_counter(
        name="http_requests_active",
        description="Number of active HTTP requests",
        unit="1",
    )

    _error_counter = meter.create_counter(
        name="http_errors_total",
        description="Total number of HTTP errors",
        unit="1",
    )

    # Instrument system metrics (CPU, memory, etc.)
    SystemMetricsInstrumentor().instrument()
    logger.info("System metrics instrumentation enabled")

    # =========================================
    # LIBRARY INSTRUMENTATION
    # =========================================
    FastAPIInstrumentor.instrument_app(
        app,
        excluded_urls="healthz,readyz,metrics",
        tracer_provider=_tracer_provider,
        meter_provider=_meter_provider,
    )
    AsyncPGInstrumentor().instrument(tracer_provider=_tracer_provider)
    HTTPXClientInstrumentor().instrument(tracer_provider=_tracer_provider)
    RedisInstrumentor().instrument(tracer_provider=_tracer_provider)

    # Inject trace context into logs
    LoggingInstrumentor().instrument(
        set_logging_format=True,
        log_level=logging.INFO,
    )

    logger.info(
        f"OpenTelemetry initialized: service={settings.otel_service_name}, "
        f"env={settings.otel_environment}, metrics_port={prometheus_port}"
    )


async def shutdown_telemetry() -> None:
    """Gracefully shutdown the tracer and meter providers."""
    global _tracer_provider, _meter_provider

    if _tracer_provider:
        _tracer_provider.shutdown()
        _tracer_provider = None
        logger.info("Tracer provider shutdown complete")

    if _meter_provider:
        _meter_provider.shutdown()
        _meter_provider = None
        logger.info("Meter provider shutdown complete")


def get_tracer(name: str) -> trace.Tracer:
    """Get a tracer instance for manual span creation."""
    return trace.get_tracer(name)


def get_meter(name: str) -> metrics.Meter:
    """Get a meter instance for custom metrics."""
    return metrics.get_meter(name)


def get_current_trace_id() -> str | None:
    """Get the current trace ID for request correlation."""
    span = trace.get_current_span()
    if span and span.get_span_context().is_valid:
        return format(span.get_span_context().trace_id, "032x")
    return None


def get_current_span_id() -> str | None:
    """Get the current span ID."""
    span = trace.get_current_span()
    if span and span.get_span_context().is_valid:
        return format(span.get_span_context().span_id, "016x")
    return None


@contextmanager
def create_span(name: str, attributes: dict[str, Any] | None = None):
    """Context manager for creating manual spans."""
    tracer = get_tracer(__name__)
    with tracer.start_as_current_span(name, attributes=attributes) as span:
        yield span


def add_span_attributes(attributes: dict[str, Any]) -> None:
    """Add attributes to the current span."""
    span = trace.get_current_span()
    if span:
        for key, value in attributes.items():
            span.set_attribute(key, value)


def record_exception(exception: Exception) -> None:
    """Record an exception on the current span."""
    span = trace.get_current_span()
    if span:
        span.record_exception(exception)
        span.set_status(trace.Status(trace.StatusCode.ERROR, str(exception)))


# =========================================
# CUSTOM METRICS HELPERS
# =========================================


def record_request(method: str, endpoint: str, status_code: int) -> None:
    """Record a request metric."""
    if _request_counter:
        _request_counter.add(
            1,
            {
                "method": method,
                "endpoint": endpoint,
                "status_code": str(status_code),
            },
        )


def record_request_duration(method: str, endpoint: str, duration: float) -> None:
    """Record request duration in seconds."""
    if _request_duration:
        _request_duration.record(
            duration,
            {
                "method": method,
                "endpoint": endpoint,
            },
        )


def increment_active_requests(method: str, endpoint: str) -> None:
    """Increment active requests counter."""
    if _active_requests:
        _active_requests.add(1, {"method": method, "endpoint": endpoint})


def decrement_active_requests(method: str, endpoint: str) -> None:
    """Decrement active requests counter."""
    if _active_requests:
        _active_requests.add(-1, {"method": method, "endpoint": endpoint})


def record_error(method: str, endpoint: str, error_type: str) -> None:
    """Record an error metric."""
    if _error_counter:
        _error_counter.add(
            1,
            {
                "method": method,
                "endpoint": endpoint,
                "error_type": error_type,
            },
        )
```

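The counters above are module-level helpers, and nothing in this diff wires them into the request path. A middleware along these lines could do that; this is a sketch against a standalone app, not the project's actual middleware.

```python
import time

from fastapi import FastAPI, Request

from app.core import telemetry

app = FastAPI()  # standalone example app


@app.middleware("http")
async def metrics_middleware(request: Request, call_next):
    method, endpoint = request.method, request.url.path
    telemetry.increment_active_requests(method, endpoint)
    start = time.time()
    try:
        response = await call_next(request)
    except Exception as err:
        # Count the failure, then let the exception handlers produce the response.
        telemetry.record_error(method, endpoint, type(err).__name__)
        raise
    finally:
        telemetry.decrement_active_requests(method, endpoint)
    telemetry.record_request(method, endpoint, response.status_code)
    telemetry.record_request_duration(method, endpoint, time.time() - start)
    return response
```
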
@@ -1,74 +0,0 @@
```python
"""Database connection management using asyncpg."""

from collections.abc import AsyncGenerator
from contextlib import asynccontextmanager
from contextvars import ContextVar

import asyncpg
from asyncpg.pool import PoolConnectionProxy


class Database:
    """Manages asyncpg connection pool."""

    pool: asyncpg.Pool | None = None

    async def connect(self, dsn: str) -> None:
        """Create connection pool."""
        self.pool = await asyncpg.create_pool(
            dsn,
            min_size=5,
            max_size=20,
            command_timeout=60,
        )

    async def disconnect(self) -> None:
        """Close connection pool."""
        if self.pool:
            await self.pool.close()

    @asynccontextmanager
    async def connection(self) -> AsyncGenerator[asyncpg.Connection | PoolConnectionProxy, None]:
        """Acquire a connection from the pool."""
        if not self.pool:
            raise RuntimeError("Database not connected")
        async with self.pool.acquire() as conn:
            yield conn

    @asynccontextmanager
    async def transaction(self) -> AsyncGenerator[asyncpg.Connection | PoolConnectionProxy, None]:
        """Acquire a connection with an active transaction."""
        if not self.pool:
            raise RuntimeError("Database not connected")
        async with self.pool.acquire() as conn:
            async with conn.transaction():
                yield conn


# Global instance
db = Database()


_connection_ctx: ContextVar[asyncpg.Connection | PoolConnectionProxy | None] = ContextVar(
    "db_connection",
    default=None,
)


async def get_conn() -> AsyncGenerator[asyncpg.Connection | PoolConnectionProxy, None]:
    """Dependency that reuses the same DB connection within a request context."""

    existing_conn = _connection_ctx.get()
    if existing_conn is not None:
        yield existing_conn
        return

    if not db.pool:
        raise RuntimeError("Database not connected")

    async with db.pool.acquire() as conn:
        token = _connection_ctx.set(conn)
        try:
            yield conn
        finally:
            _connection_ctx.reset(token)
```

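None of the routes in this diff use `get_conn` directly (they go through service and repository classes), but as a FastAPI dependency it would be used roughly like this; the route path and query are illustrative.

```python
import asyncpg
from fastapi import APIRouter, Depends

from app.db import get_conn

router = APIRouter()


@router.get("/services/count")  # hypothetical route for illustration
async def count_services(conn: asyncpg.Connection = Depends(get_conn)) -> dict[str, int]:
    # Dependencies resolved within the same request share one pooled connection,
    # because get_conn parks it in a ContextVar for the duration of the request.
    total = await conn.fetchval("SELECT count(*) FROM services")
    return {"services": int(total)}
```
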
-282
@@ -1,282 +0,0 @@
|
||||
"""FastAPI application entry point."""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from contextlib import asynccontextmanager
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from fastapi import FastAPI, Request, status
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.openapi.utils import get_openapi
|
||||
from fastapi.responses import JSONResponse
|
||||
from starlette.exceptions import HTTPException as StarletteHTTPException
|
||||
|
||||
from app.api.v1 import auth, health, incidents, org
|
||||
from app.config import settings
|
||||
from app.core.logging import setup_logging
|
||||
from app.core.telemetry import (
|
||||
get_current_trace_id,
|
||||
record_exception,
|
||||
setup_telemetry,
|
||||
shutdown_telemetry,
|
||||
)
|
||||
from app.db import db
|
||||
from app.schemas.common import ErrorDetail, ErrorResponse
|
||||
from app.taskqueue import task_queue
|
||||
|
||||
# Initialize logging before anything else
|
||||
setup_logging()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
||||
"""Manage application lifecycle - connect/disconnect resources."""
|
||||
# Startup
|
||||
logger.info("Starting IncidentOps API")
|
||||
await db.connect(settings.database_url)
|
||||
await task_queue.startup()
|
||||
logger.info("Startup complete")
|
||||
yield
|
||||
# Shutdown
|
||||
logger.info("Shutting down IncidentOps API")
|
||||
await task_queue.shutdown()
|
||||
await db.disconnect()
|
||||
await shutdown_telemetry()
|
||||
logger.info("Shutdown complete")
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="IncidentOps",
|
||||
description="Incident management API with multi-tenant org support",
|
||||
version="0.1.0",
|
||||
docs_url="/docs",
|
||||
redoc_url="/redoc",
|
||||
openapi_url="/openapi.json",
|
||||
lifespan=lifespan,
|
||||
)
|
||||
|
||||
# Set up OpenTelemetry instrumentation
|
||||
setup_telemetry(app)
|
||||
|
||||
|
||||
@app.middleware("http")
|
||||
async def request_logging_middleware(request: Request, call_next):
|
||||
start = time.time()
|
||||
response = await call_next(request)
|
||||
duration_ms = (time.time() - start) * 1000
|
||||
logger.info(
|
||||
"request",
|
||||
extra={
|
||||
"method": request.method,
|
||||
"path": request.url.path,
|
||||
"status_code": response.status_code,
|
||||
"duration_ms": round(duration_ms, 2),
|
||||
},
|
||||
)
|
||||
return response
|
||||
|
||||
app.openapi_tags = [
|
||||
{"name": "auth", "description": "Registration, login, token lifecycle"},
|
||||
{"name": "org", "description": "Organization membership, services, and notifications"},
|
||||
{"name": "incidents", "description": "Incident lifecycle and timelines"},
|
||||
{"name": "health", "description": "Service health probes"},
|
||||
]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Global Exception Handlers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _build_error_response(
|
||||
error: str,
|
||||
message: str,
|
||||
status_code: int,
|
||||
details: list[ErrorDetail] | None = None,
|
||||
) -> JSONResponse:
|
||||
"""Build a structured error response with trace context."""
|
||||
response = ErrorResponse(
|
||||
error=error,
|
||||
message=message,
|
||||
details=details,
|
||||
request_id=get_current_trace_id(),
|
||||
)
|
||||
return JSONResponse(
|
||||
status_code=status_code,
|
||||
content=jsonable_encoder(response),
|
||||
)
|
||||
|
||||
|
||||
@app.exception_handler(StarletteHTTPException)
|
||||
async def http_exception_handler(
|
||||
request: Request, exc: StarletteHTTPException
|
||||
) -> JSONResponse:
|
||||
"""Handle HTTP exceptions with structured error responses."""
|
||||
# Map status codes to error type strings
|
||||
error_types = {
|
||||
400: "bad_request",
|
||||
401: "unauthorized",
|
||||
403: "forbidden",
|
||||
404: "not_found",
|
||||
409: "conflict",
|
||||
422: "validation_error",
|
||||
429: "rate_limited",
|
||||
500: "internal_error",
|
||||
502: "bad_gateway",
|
||||
503: "service_unavailable",
|
||||
}
|
||||
error_type = error_types.get(exc.status_code, "error")
|
||||
|
||||
logger.warning(
|
||||
"HTTP exception",
|
||||
extra={
|
||||
"status_code": exc.status_code,
|
||||
"error": error_type,
|
||||
"detail": exc.detail,
|
||||
"path": str(request.url.path),
|
||||
"method": request.method,
|
||||
},
|
||||
)
|
||||
|
||||
return _build_error_response(
|
||||
error=error_type,
|
||||
message=str(exc.detail),
|
||||
status_code=exc.status_code,
|
||||
)
|
||||
|
||||
|
||||
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(
    request: Request, exc: RequestValidationError
) -> JSONResponse:
    """Handle Pydantic validation errors with detailed error responses."""
    details = [
        ErrorDetail(
            loc=[str(loc) for loc in error["loc"]],
            msg=error["msg"],
            type=error["type"],
        )
        for error in exc.errors()
    ]

    logger.warning(
        "Validation error",
        extra={
            "path": str(request.url.path),
            "method": request.method,
            "error_count": len(details),
        },
    )

    return _build_error_response(
        error="validation_error",
        message="Request validation failed",
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
        details=details,
    )


@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Handle unexpected exceptions with logging and safe error response."""
    # Record exception in the current span for tracing
    record_exception(exc)

    logger.exception(
        "Unhandled exception",
        extra={
            "path": str(request.url.path),
            "method": request.method,
            "exception_type": type(exc).__name__,
        },
    )

    # Don't leak internal error details in production
    message = "An unexpected error occurred"
    if settings.debug:
        message = f"{type(exc).__name__}: {exc}"

    return _build_error_response(
        error="internal_error",
        message=message,
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
    )


# ---------------------------------------------------------------------------
# OpenAPI Customization
# ---------------------------------------------------------------------------


def custom_openapi() -> dict:
    """Add JWT bearer security scheme and error responses to OpenAPI schema."""
    if app.openapi_schema:
        return app.openapi_schema

    openapi_schema = get_openapi(
        title=app.title,
        version=app.version,
        description=app.description,
        routes=app.routes,
        tags=app.openapi_tags,
    )

    # Add security schemes
    components = openapi_schema.setdefault("components", {})
    security_schemes = components.setdefault("securitySchemes", {})
    security_schemes["BearerToken"] = {
        "type": "http",
        "scheme": "bearer",
        "bearerFormat": "JWT",
        "description": "Paste the JWT access token returned by /auth endpoints",
    }
    openapi_schema["security"] = [{"BearerToken": []}]

    # Add common error response schemas
    schemas = components.setdefault("schemas", {})
    schemas["ErrorResponse"] = {
        "type": "object",
        "properties": {
            "error": {"type": "string", "description": "Error type identifier"},
            "message": {"type": "string", "description": "Human-readable error message"},
            "details": {
                "type": "array",
                "items": {"$ref": "#/components/schemas/ErrorDetail"},
                "nullable": True,
                "description": "Validation error details",
            },
            "request_id": {
                "type": "string",
                "nullable": True,
                "description": "Trace ID for debugging",
            },
        },
        "required": ["error", "message"],
    }
    schemas["ErrorDetail"] = {
        "type": "object",
        "properties": {
            "loc": {
                "type": "array",
                "items": {"oneOf": [{"type": "string"}, {"type": "integer"}]},
                "description": "Error location path",
            },
            "msg": {"type": "string", "description": "Error message"},
            "type": {"type": "string", "description": "Error type"},
        },
        "required": ["loc", "msg", "type"],
    }

    app.openapi_schema = openapi_schema
    return app.openapi_schema


app.openapi = custom_openapi  # type: ignore[assignment]
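# Because custom_openapi() caches its result on app.openapi_schema and returns the
# cached copy on later calls, the schema is built once and reused for every
# subsequent /openapi.json request.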
|
||||
|
||||
# Include routers
app.include_router(auth.router, prefix=settings.api_v1_prefix)
app.include_router(incidents.router, prefix=settings.api_v1_prefix)
app.include_router(org.router, prefix=settings.api_v1_prefix)
app.include_router(health.router, prefix=settings.api_v1_prefix, tags=["health"])
@@ -1,17 +0,0 @@
|
||||
"""Repository layer for database operations."""
|
||||
|
||||
from app.repositories.incident import IncidentRepository
|
||||
from app.repositories.notification import NotificationRepository
|
||||
from app.repositories.org import OrgRepository
|
||||
from app.repositories.refresh_token import RefreshTokenRepository
|
||||
from app.repositories.service import ServiceRepository
|
||||
from app.repositories.user import UserRepository
|
||||
|
||||
__all__ = [
|
||||
"IncidentRepository",
|
||||
"NotificationRepository",
|
||||
"OrgRepository",
|
||||
"RefreshTokenRepository",
|
||||
"ServiceRepository",
|
||||
"UserRepository",
|
||||
]
|
||||
@@ -1,161 +0,0 @@
|
||||
"""Incident repository for database operations."""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
import asyncpg
|
||||
|
||||
|
||||
class IncidentRepository:
|
||||
"""Database operations for incidents."""
|
||||
|
||||
def __init__(self, conn: asyncpg.Connection) -> None:
|
||||
self.conn = conn
|
||||
|
||||
async def create(
|
||||
self,
|
||||
incident_id: UUID,
|
||||
org_id: UUID,
|
||||
service_id: UUID,
|
||||
title: str,
|
||||
description: str | None,
|
||||
severity: str,
|
||||
) -> dict:
|
||||
"""Create a new incident."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
INSERT INTO incidents (id, org_id, service_id, title, description, status, severity)
|
||||
VALUES ($1, $2, $3, $4, $5, 'triggered', $6)
|
||||
RETURNING id, org_id, service_id, title, description, status, severity,
|
||||
version, created_at, updated_at
|
||||
""",
|
||||
incident_id,
|
||||
org_id,
|
||||
service_id,
|
||||
title,
|
||||
description,
|
||||
severity,
|
||||
)
|
||||
return dict(row)
|
||||
|
||||
async def get_by_id(self, incident_id: UUID) -> dict | None:
|
||||
"""Get incident by ID."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, org_id, service_id, title, description, status, severity,
|
||||
version, created_at, updated_at
|
||||
FROM incidents
|
||||
WHERE id = $1
|
||||
""",
|
||||
incident_id,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def get_by_org(
|
||||
self,
|
||||
org_id: UUID,
|
||||
status: str | None = None,
|
||||
cursor: datetime | None = None,
|
||||
limit: int = 20,
|
||||
) -> list[dict]:
|
||||
"""Get incidents for an organization with optional filtering and pagination."""
|
||||
query = """
|
||||
SELECT id, org_id, service_id, title, description, status, severity,
|
||||
version, created_at, updated_at
|
||||
FROM incidents
|
||||
WHERE org_id = $1
|
||||
"""
|
||||
params: list[Any] = [org_id]
|
||||
param_idx = 2
|
||||
|
||||
if status:
|
||||
query += f" AND status = ${param_idx}"
|
||||
params.append(status)
|
||||
param_idx += 1
|
||||
|
||||
if cursor:
|
||||
query += f" AND created_at < ${param_idx}"
|
||||
params.append(cursor)
|
||||
param_idx += 1
|
||||
|
||||
query += f" ORDER BY created_at DESC LIMIT ${param_idx}"
|
||||
params.append(limit + 1) # Fetch one extra to check if there are more
|
||||
|
||||
rows = await self.conn.fetch(query, *params)
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
    async def update_status(
        self,
        incident_id: UUID,
        new_status: str,
        expected_version: int,
    ) -> dict | None:
        """Update incident status with optimistic locking.

        Returns updated incident if successful, None if version mismatch.
        """
        row = await self.conn.fetchrow(
            """
            UPDATE incidents
            SET status = $2, version = version + 1, updated_at = now()
            WHERE id = $1 AND version = $3
            RETURNING id, org_id, service_id, title, description, status, severity,
                      version, created_at, updated_at
            """,
            incident_id,
            new_status,
            expected_version,
        )
        return dict(row) if row else None
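
    # --- Illustrative sketch (not part of the original file) ---
    # update_status() is a compare-and-swap on the version column: the UPDATE only
    # matches while the stored version still equals the caller's expected version,
    # so a None result means a concurrent writer won. A caller that prefers to retry
    # rather than surface the conflict could loop as below; the method name
    # `transition_with_retry` is hypothetical, and the service layer in this repo
    # instead raises a ConflictError on version mismatch.
    async def transition_with_retry(
        self, incident_id: UUID, new_status: str, attempts: int = 3
    ) -> dict:
        for _ in range(attempts):
            current = await self.get_by_id(incident_id)
            if current is None:
                raise LookupError("Incident not found")
            updated = await self.update_status(incident_id, new_status, current["version"])
            if updated is not None:
                return updated  # version matched; row updated atomically
        raise RuntimeError("Gave up after repeated version conflicts")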
|
||||
|
||||
async def add_event(
|
||||
self,
|
||||
event_id: UUID,
|
||||
incident_id: UUID,
|
||||
event_type: str,
|
||||
actor_user_id: UUID | None,
|
||||
payload: dict[str, Any] | None,
|
||||
) -> dict:
|
||||
"""Add an event to the incident timeline."""
|
||||
import json
|
||||
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
INSERT INTO incident_events (id, incident_id, event_type, actor_user_id, payload)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
RETURNING id, incident_id, event_type, actor_user_id, payload, created_at
|
||||
""",
|
||||
event_id,
|
||||
incident_id,
|
||||
event_type,
|
||||
actor_user_id,
|
||||
json.dumps(payload) if payload else None,
|
||||
)
|
||||
result = dict(row)
|
||||
|
||||
# Parse JSON payload back to dict
|
||||
if result["payload"]:
|
||||
result["payload"] = json.loads(result["payload"])
|
||||
return result
|
||||
|
||||
async def get_events(self, incident_id: UUID) -> list[dict]:
|
||||
"""Get all events for an incident."""
|
||||
import json
|
||||
|
||||
rows = await self.conn.fetch(
|
||||
"""
|
||||
SELECT id, incident_id, event_type, actor_user_id, payload, created_at
|
||||
FROM incident_events
|
||||
WHERE incident_id = $1
|
||||
ORDER BY created_at
|
||||
""",
|
||||
incident_id,
|
||||
)
|
||||
results = []
|
||||
for row in rows:
|
||||
result = dict(row)
|
||||
if result["payload"]:
|
||||
result["payload"] = json.loads(result["payload"])
|
||||
results.append(result)
|
||||
return results
|
||||
@@ -1,199 +0,0 @@
|
||||
"""Notification repository for database operations."""
|
||||
|
||||
from datetime import datetime
|
||||
from uuid import UUID
|
||||
|
||||
import asyncpg
|
||||
|
||||
|
||||
class NotificationRepository:
|
||||
"""Database operations for notification targets and attempts."""
|
||||
|
||||
def __init__(self, conn: asyncpg.Connection) -> None:
|
||||
self.conn = conn
|
||||
|
||||
async def create_target(
|
||||
self,
|
||||
target_id: UUID,
|
||||
org_id: UUID,
|
||||
name: str,
|
||||
target_type: str,
|
||||
webhook_url: str | None = None,
|
||||
enabled: bool = True,
|
||||
) -> dict:
|
||||
"""Create a new notification target."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
INSERT INTO notification_targets (id, org_id, name, target_type, webhook_url, enabled)
|
||||
VALUES ($1, $2, $3, $4, $5, $6)
|
||||
RETURNING id, org_id, name, target_type, webhook_url, enabled, created_at
|
||||
""",
|
||||
target_id,
|
||||
org_id,
|
||||
name,
|
||||
target_type,
|
||||
webhook_url,
|
||||
enabled,
|
||||
)
|
||||
return dict(row)
|
||||
|
||||
async def get_target_by_id(self, target_id: UUID) -> dict | None:
|
||||
"""Get notification target by ID."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, org_id, name, target_type, webhook_url, enabled, created_at
|
||||
FROM notification_targets
|
||||
WHERE id = $1
|
||||
""",
|
||||
target_id,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def get_targets_by_org(
|
||||
self,
|
||||
org_id: UUID,
|
||||
enabled_only: bool = False,
|
||||
) -> list[dict]:
|
||||
"""Get all notification targets for an organization."""
|
||||
query = """
|
||||
SELECT id, org_id, name, target_type, webhook_url, enabled, created_at
|
||||
FROM notification_targets
|
||||
WHERE org_id = $1
|
||||
"""
|
||||
if enabled_only:
|
||||
query += " AND enabled = true"
|
||||
query += " ORDER BY name"
|
||||
|
||||
rows = await self.conn.fetch(query, org_id)
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
async def update_target(
|
||||
self,
|
||||
target_id: UUID,
|
||||
name: str | None = None,
|
||||
webhook_url: str | None = None,
|
||||
enabled: bool | None = None,
|
||||
) -> dict | None:
|
||||
"""Update a notification target."""
|
||||
updates = []
|
||||
params = [target_id]
|
||||
param_idx = 2
|
||||
|
||||
if name is not None:
|
||||
updates.append(f"name = ${param_idx}")
|
||||
params.append(name)
|
||||
param_idx += 1
|
||||
|
||||
if webhook_url is not None:
|
||||
updates.append(f"webhook_url = ${param_idx}")
|
||||
params.append(webhook_url)
|
||||
param_idx += 1
|
||||
|
||||
if enabled is not None:
|
||||
updates.append(f"enabled = ${param_idx}")
|
||||
params.append(enabled)
|
||||
param_idx += 1
|
||||
|
||||
if not updates:
|
||||
return await self.get_target_by_id(target_id)
|
||||
|
||||
query = f"""
|
||||
UPDATE notification_targets
|
||||
SET {", ".join(updates)}
|
||||
WHERE id = $1
|
||||
RETURNING id, org_id, name, target_type, webhook_url, enabled, created_at
|
||||
"""
|
||||
row = await self.conn.fetchrow(query, *params)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def delete_target(self, target_id: UUID) -> bool:
|
||||
"""Delete a notification target. Returns True if deleted."""
|
||||
result = await self.conn.execute(
|
||||
"DELETE FROM notification_targets WHERE id = $1",
|
||||
target_id,
|
||||
)
|
||||
return result == "DELETE 1"
|
||||
|
||||
async def create_attempt(
|
||||
self,
|
||||
attempt_id: UUID,
|
||||
incident_id: UUID,
|
||||
target_id: UUID,
|
||||
) -> dict:
|
||||
"""Create a notification attempt (idempotent via unique constraint)."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
INSERT INTO notification_attempts (id, incident_id, target_id, status)
|
||||
VALUES ($1, $2, $3, 'pending')
|
||||
ON CONFLICT (incident_id, target_id) DO UPDATE SET id = notification_attempts.id
|
||||
RETURNING id, incident_id, target_id, status, error, sent_at, created_at
|
||||
""",
|
||||
attempt_id,
|
||||
incident_id,
|
||||
target_id,
|
||||
)
|
||||
return dict(row)
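# Note: the ON CONFLICT (incident_id, target_id) DO UPDATE SET
# id = notification_attempts.id clause above is a deliberate no-op write; unlike
# DO NOTHING it makes RETURNING yield the existing row, so repeated
# create_attempt() calls for the same incident/target pair are idempotent.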
|
||||
|
||||
async def get_attempt(self, incident_id: UUID, target_id: UUID) -> dict | None:
|
||||
"""Get notification attempt for incident and target."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, incident_id, target_id, status, error, sent_at, created_at
|
||||
FROM notification_attempts
|
||||
WHERE incident_id = $1 AND target_id = $2
|
||||
""",
|
||||
incident_id,
|
||||
target_id,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def update_attempt_success(
|
||||
self,
|
||||
attempt_id: UUID,
|
||||
sent_at: datetime,
|
||||
) -> dict | None:
|
||||
"""Mark notification attempt as successful."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
UPDATE notification_attempts
|
||||
SET status = 'sent', sent_at = $2, error = NULL
|
||||
WHERE id = $1
|
||||
RETURNING id, incident_id, target_id, status, error, sent_at, created_at
|
||||
""",
|
||||
attempt_id,
|
||||
sent_at,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def update_attempt_failure(
|
||||
self,
|
||||
attempt_id: UUID,
|
||||
error: str,
|
||||
) -> dict | None:
|
||||
"""Mark notification attempt as failed."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
UPDATE notification_attempts
|
||||
SET status = 'failed', error = $2
|
||||
WHERE id = $1
|
||||
RETURNING id, incident_id, target_id, status, error, sent_at, created_at
|
||||
""",
|
||||
attempt_id,
|
||||
error,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def get_pending_attempts(self, incident_id: UUID) -> list[dict]:
|
||||
"""Get all pending notification attempts for an incident."""
|
||||
rows = await self.conn.fetch(
|
||||
"""
|
||||
SELECT na.id, na.incident_id, na.target_id, na.status, na.error,
|
||||
na.sent_at, na.created_at,
|
||||
nt.target_type, nt.webhook_url, nt.name as target_name
|
||||
FROM notification_attempts na
|
||||
JOIN notification_targets nt ON nt.id = na.target_id
|
||||
WHERE na.incident_id = $1 AND na.status = 'pending'
|
||||
""",
|
||||
incident_id,
|
||||
)
|
||||
return [dict(row) for row in rows]
|
||||
@@ -1,125 +0,0 @@
|
||||
"""Organization repository for database operations."""
|
||||
|
||||
from uuid import UUID
|
||||
|
||||
import asyncpg
|
||||
|
||||
|
||||
class OrgRepository:
|
||||
"""Database operations for organizations."""
|
||||
|
||||
def __init__(self, conn: asyncpg.Connection) -> None:
|
||||
self.conn = conn
|
||||
|
||||
async def create(
|
||||
self,
|
||||
org_id: UUID,
|
||||
name: str,
|
||||
slug: str,
|
||||
) -> dict:
|
||||
"""Create a new organization."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
INSERT INTO orgs (id, name, slug)
|
||||
VALUES ($1, $2, $3)
|
||||
RETURNING id, name, slug, created_at
|
||||
""",
|
||||
org_id,
|
||||
name,
|
||||
slug,
|
||||
)
|
||||
return dict(row)
|
||||
|
||||
async def get_by_id(self, org_id: UUID) -> dict | None:
|
||||
"""Get organization by ID."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, name, slug, created_at
|
||||
FROM orgs
|
||||
WHERE id = $1
|
||||
""",
|
||||
org_id,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def get_by_slug(self, slug: str) -> dict | None:
|
||||
"""Get organization by slug."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, name, slug, created_at
|
||||
FROM orgs
|
||||
WHERE slug = $1
|
||||
""",
|
||||
slug,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def add_member(
|
||||
self,
|
||||
member_id: UUID,
|
||||
user_id: UUID,
|
||||
org_id: UUID,
|
||||
role: str,
|
||||
) -> dict:
|
||||
"""Add a member to an organization."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
INSERT INTO org_members (id, user_id, org_id, role)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
RETURNING id, user_id, org_id, role, created_at
|
||||
""",
|
||||
member_id,
|
||||
user_id,
|
||||
org_id,
|
||||
role,
|
||||
)
|
||||
return dict(row)
|
||||
|
||||
async def get_member(self, user_id: UUID, org_id: UUID) -> dict | None:
|
||||
"""Get membership for a user in an organization."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT om.id, om.user_id, om.org_id, om.role, om.created_at
|
||||
FROM org_members om
|
||||
WHERE om.user_id = $1 AND om.org_id = $2
|
||||
""",
|
||||
user_id,
|
||||
org_id,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def get_members(self, org_id: UUID) -> list[dict]:
|
||||
"""Get all members of an organization."""
|
||||
rows = await self.conn.fetch(
|
||||
"""
|
||||
SELECT om.id, om.user_id, u.email, om.role, om.created_at
|
||||
FROM org_members om
|
||||
JOIN users u ON u.id = om.user_id
|
||||
WHERE om.org_id = $1
|
||||
ORDER BY om.created_at
|
||||
""",
|
||||
org_id,
|
||||
)
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
async def get_user_orgs(self, user_id: UUID) -> list[dict]:
|
||||
"""Get all organizations a user belongs to."""
|
||||
rows = await self.conn.fetch(
|
||||
"""
|
||||
SELECT o.id, o.name, o.slug, o.created_at, om.role
|
||||
FROM orgs o
|
||||
JOIN org_members om ON om.org_id = o.id
|
||||
WHERE om.user_id = $1
|
||||
ORDER BY o.created_at
|
||||
""",
|
||||
user_id,
|
||||
)
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
async def slug_exists(self, slug: str) -> bool:
|
||||
"""Check if organization slug exists."""
|
||||
result = await self.conn.fetchval(
|
||||
"SELECT EXISTS(SELECT 1 FROM orgs WHERE slug = $1)",
|
||||
slug,
|
||||
)
|
||||
return result
|
||||
@@ -1,396 +0,0 @@
|
||||
"""Refresh token repository for database operations.
|
||||
|
||||
Security considerations implemented:
|
||||
- Atomic rotation using SELECT FOR UPDATE to prevent race conditions
|
||||
- Token chain tracking via rotated_to for reuse/theft detection
|
||||
- Defense-in-depth validation with user_id and active_org_id checks
|
||||
- Uses RETURNING for robust row counting instead of string parsing
|
||||
"""
|
||||
|
||||
from datetime import datetime
|
||||
from uuid import UUID
|
||||
|
||||
import asyncpg
|
||||
|
||||
|
||||
class RefreshTokenRepository:
|
||||
"""Database operations for refresh tokens."""
|
||||
|
||||
def __init__(self, conn: asyncpg.Connection) -> None:
|
||||
self.conn = conn
|
||||
|
||||
async def create(
|
||||
self,
|
||||
token_id: UUID,
|
||||
user_id: UUID,
|
||||
token_hash: str,
|
||||
active_org_id: UUID,
|
||||
expires_at: datetime,
|
||||
) -> dict:
|
||||
"""Create a new refresh token."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
INSERT INTO refresh_tokens (id, user_id, token_hash, active_org_id, expires_at)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
RETURNING id, user_id, token_hash, active_org_id, expires_at,
|
||||
revoked_at, rotated_to, created_at
|
||||
""",
|
||||
token_id,
|
||||
user_id,
|
||||
token_hash,
|
||||
active_org_id,
|
||||
expires_at,
|
||||
)
|
||||
return dict(row)
|
||||
|
||||
async def get_by_hash(self, token_hash: str) -> dict | None:
|
||||
"""Get refresh token by hash (includes revoked/expired for auditing)."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, user_id, token_hash, active_org_id, expires_at,
|
||||
revoked_at, rotated_to, created_at
|
||||
FROM refresh_tokens
|
||||
WHERE token_hash = $1
|
||||
""",
|
||||
token_hash,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def get_valid_by_hash(
|
||||
self,
|
||||
token_hash: str,
|
||||
user_id: UUID | None = None,
|
||||
active_org_id: UUID | None = None,
|
||||
) -> dict | None:
|
||||
"""Get refresh token by hash, only if valid.
|
||||
|
||||
Validates:
|
||||
- Token exists and matches hash
|
||||
- Token is not revoked
|
||||
- Token is not expired
|
||||
- Token has not been rotated (rotated_to is NULL)
|
||||
- Optionally: user_id matches (defense-in-depth)
|
||||
- Optionally: active_org_id matches (defense-in-depth)
|
||||
|
||||
Args:
|
||||
token_hash: The hashed token value
|
||||
user_id: If provided, token must belong to this user
|
||||
active_org_id: If provided, token must be bound to this org
|
||||
|
||||
Returns:
|
||||
Token dict if valid, None otherwise
|
||||
"""
|
||||
query = """
|
||||
SELECT id, user_id, token_hash, active_org_id, expires_at,
|
||||
revoked_at, rotated_to, created_at
|
||||
FROM refresh_tokens
|
||||
WHERE token_hash = $1
|
||||
AND revoked_at IS NULL
|
||||
AND rotated_to IS NULL
|
||||
AND expires_at > clock_timestamp()
|
||||
"""
|
||||
params: list = [token_hash]
|
||||
param_idx = 2
|
||||
|
||||
if user_id is not None:
|
||||
query += f" AND user_id = ${param_idx}"
|
||||
params.append(user_id)
|
||||
param_idx += 1
|
||||
|
||||
if active_org_id is not None:
|
||||
query += f" AND active_org_id = ${param_idx}"
|
||||
params.append(active_org_id)
|
||||
|
||||
row = await self.conn.fetchrow(query, *params)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def get_valid_for_rotation(
|
||||
self,
|
||||
token_hash: str,
|
||||
user_id: UUID | None = None,
|
||||
) -> dict | None:
|
||||
"""Get and lock a valid token for rotation using SELECT FOR UPDATE.
|
||||
|
||||
This acquires a row-level lock to prevent concurrent rotation attempts.
|
||||
Must be called within a transaction.
|
||||
|
||||
Args:
|
||||
token_hash: The hashed token value
|
||||
user_id: If provided, token must belong to this user
|
||||
|
||||
Returns:
|
||||
Token dict if valid and locked, None otherwise
|
||||
"""
|
||||
query = """
|
||||
SELECT id, user_id, token_hash, active_org_id, expires_at,
|
||||
revoked_at, rotated_to, created_at
|
||||
FROM refresh_tokens
|
||||
WHERE token_hash = $1
|
||||
AND revoked_at IS NULL
|
||||
AND rotated_to IS NULL
|
||||
AND expires_at > clock_timestamp()
|
||||
"""
|
||||
params: list = [token_hash]
|
||||
|
||||
if user_id is not None:
|
||||
query += " AND user_id = $2"
|
||||
params.append(user_id)
|
||||
|
||||
query += " FOR UPDATE"
|
||||
|
||||
row = await self.conn.fetchrow(query, *params)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def check_token_reuse(self, token_hash: str) -> dict | None:
|
||||
"""Check if a token has already been rotated (potential theft).
|
||||
|
||||
If a token is presented that has rotated_to set, it means:
|
||||
1. The token was legitimately rotated earlier
|
||||
2. Someone is now trying to use the old token
|
||||
3. This indicates the token may have been stolen
|
||||
|
||||
Returns:
|
||||
Token dict if this is a reused/stolen token, None if not found or not rotated
|
||||
"""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, user_id, token_hash, active_org_id, expires_at,
|
||||
revoked_at, rotated_to, created_at
|
||||
FROM refresh_tokens
|
||||
WHERE token_hash = $1 AND rotated_to IS NOT NULL
|
||||
""",
|
||||
token_hash,
|
||||
)
|
||||
return dict(row) if row else None
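# A hit from check_token_reuse() means an already-rotated token was presented
# again, which is treated as possible theft: the refresh flow (see
# AuthService._handle_invalid_refresh) revokes the whole chain with
# revoke_token_chain() below and rejects the request.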
|
||||
|
||||
async def revoke_token_chain(self, token_id: UUID) -> int:
|
||||
"""Revoke a token and all tokens in its chain (for breach response).
|
||||
|
||||
When token reuse is detected, this revokes:
|
||||
1. The original stolen token
|
||||
2. Any token it was rotated to (and their rotations, recursively)
|
||||
|
||||
Args:
|
||||
token_id: The ID of the compromised token
|
||||
|
||||
Returns:
|
||||
Count of tokens revoked
|
||||
"""
|
||||
# Use recursive CTE to find all tokens in the chain
|
||||
rows = await self.conn.fetch(
|
||||
"""
|
||||
WITH RECURSIVE token_chain AS (
|
||||
-- Start with the given token
|
||||
SELECT id, rotated_to
|
||||
FROM refresh_tokens
|
||||
WHERE id = $1
|
||||
|
||||
UNION ALL
|
||||
|
||||
-- Follow the chain via rotated_to
|
||||
SELECT rt.id, rt.rotated_to
|
||||
FROM refresh_tokens rt
|
||||
INNER JOIN token_chain tc ON rt.id = tc.rotated_to
|
||||
)
|
||||
UPDATE refresh_tokens
|
||||
SET revoked_at = clock_timestamp()
|
||||
WHERE id IN (SELECT id FROM token_chain)
|
||||
AND revoked_at IS NULL
|
||||
RETURNING id
|
||||
""",
|
||||
token_id,
|
||||
)
|
||||
return len(rows)
|
||||
|
||||
async def rotate(
|
||||
self,
|
||||
old_token_hash: str,
|
||||
new_token_id: UUID,
|
||||
new_token_hash: str,
|
||||
new_expires_at: datetime,
|
||||
new_active_org_id: UUID | None = None,
|
||||
expected_user_id: UUID | None = None,
|
||||
) -> dict | None:
|
||||
"""Atomically rotate a refresh token.
|
||||
|
||||
This method:
|
||||
1. Validates the old token (not expired, not revoked, not already rotated)
|
||||
2. Locks the row to prevent concurrent rotation
|
||||
3. Marks old token as rotated (sets rotated_to)
|
||||
4. Creates new token with updated org if specified
|
||||
5. All in a single atomic operation
|
||||
|
||||
Args:
|
||||
old_token_hash: Hash of the token being rotated
|
||||
new_token_id: UUID for the new token
|
||||
new_token_hash: Hash for the new token
|
||||
new_expires_at: Expiry time for the new token
|
||||
new_active_org_id: New org ID (for org-switch), or None to keep current
|
||||
expected_user_id: If provided, validates token belongs to this user
|
||||
|
||||
Returns:
|
||||
New token dict if rotation succeeded, None if old token invalid/expired
|
||||
"""
|
||||
# First, get and lock the old token
|
||||
old_token = await self.get_valid_for_rotation(old_token_hash, expected_user_id)
|
||||
if old_token is None:
|
||||
return None
|
||||
|
||||
# Determine the org for the new token
|
||||
active_org_id = new_active_org_id or old_token["active_org_id"]
|
||||
user_id = old_token["user_id"]
|
||||
|
||||
# Create the new token
|
||||
new_token = await self.conn.fetchrow(
|
||||
"""
|
||||
INSERT INTO refresh_tokens (id, user_id, token_hash, active_org_id, expires_at)
|
||||
VALUES ($1, $2, $3, $4, $5)
|
||||
RETURNING id, user_id, token_hash, active_org_id, expires_at,
|
||||
revoked_at, rotated_to, created_at
|
||||
""",
|
||||
new_token_id,
|
||||
user_id,
|
||||
new_token_hash,
|
||||
active_org_id,
|
||||
new_expires_at,
|
||||
)
|
||||
|
||||
# Mark the old token as rotated (not revoked - for reuse detection)
|
||||
await self.conn.execute(
|
||||
"""
|
||||
UPDATE refresh_tokens
|
||||
SET rotated_to = $2
|
||||
WHERE id = $1
|
||||
""",
|
||||
old_token["id"],
|
||||
new_token_id,
|
||||
)
|
||||
|
||||
return dict(new_token)
|
||||
|
||||
async def revoke(self, token_id: UUID) -> bool:
|
||||
"""Revoke a refresh token by ID.
|
||||
|
||||
Returns:
|
||||
True if token was revoked, False if not found or already revoked
|
||||
"""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
UPDATE refresh_tokens
|
||||
SET revoked_at = clock_timestamp()
|
||||
WHERE id = $1 AND revoked_at IS NULL
|
||||
RETURNING id
|
||||
""",
|
||||
token_id,
|
||||
)
|
||||
return row is not None
|
||||
|
||||
async def revoke_by_hash(self, token_hash: str) -> bool:
|
||||
"""Revoke a refresh token by hash.
|
||||
|
||||
Returns:
|
||||
True if token was revoked, False if not found or already revoked
|
||||
"""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
UPDATE refresh_tokens
|
||||
SET revoked_at = clock_timestamp()
|
||||
WHERE token_hash = $1 AND revoked_at IS NULL
|
||||
RETURNING id
|
||||
""",
|
||||
token_hash,
|
||||
)
|
||||
return row is not None
|
||||
|
||||
async def revoke_all_for_user(self, user_id: UUID) -> int:
|
||||
"""Revoke all active refresh tokens for a user.
|
||||
|
||||
Use this for:
|
||||
- User-initiated logout from all devices
|
||||
- Password change
|
||||
- Account compromise response
|
||||
|
||||
Returns:
|
||||
Count of tokens revoked
|
||||
"""
|
||||
rows = await self.conn.fetch(
|
||||
"""
|
||||
UPDATE refresh_tokens
|
||||
SET revoked_at = clock_timestamp()
|
||||
WHERE user_id = $1 AND revoked_at IS NULL
|
||||
RETURNING id
|
||||
""",
|
||||
user_id,
|
||||
)
|
||||
return len(rows)
|
||||
|
||||
async def revoke_all_for_user_except(self, user_id: UUID, keep_token_id: UUID) -> int:
|
||||
"""Revoke all tokens for a user except one (logout other sessions).
|
||||
|
||||
Args:
|
||||
user_id: The user whose tokens to revoke
|
||||
keep_token_id: The token ID to keep active (current session)
|
||||
|
||||
Returns:
|
||||
Count of tokens revoked
|
||||
"""
|
||||
rows = await self.conn.fetch(
|
||||
"""
|
||||
UPDATE refresh_tokens
|
||||
SET revoked_at = clock_timestamp()
|
||||
WHERE user_id = $1 AND revoked_at IS NULL AND id != $2
|
||||
RETURNING id
|
||||
""",
|
||||
user_id,
|
||||
keep_token_id,
|
||||
)
|
||||
return len(rows)
|
||||
|
||||
async def get_active_tokens_for_user(self, user_id: UUID) -> list[dict]:
|
||||
"""Get all active (non-revoked, non-expired, non-rotated) tokens for a user.
|
||||
|
||||
Useful for:
|
||||
- Showing active sessions
|
||||
- Auditing
|
||||
|
||||
Returns:
|
||||
List of active token records
|
||||
"""
|
||||
rows = await self.conn.fetch(
|
||||
"""
|
||||
SELECT id, user_id, token_hash, active_org_id, expires_at,
|
||||
revoked_at, rotated_to, created_at
|
||||
FROM refresh_tokens
|
||||
WHERE user_id = $1
|
||||
AND revoked_at IS NULL
|
||||
AND rotated_to IS NULL
|
||||
AND expires_at > clock_timestamp()
|
||||
ORDER BY created_at DESC
|
||||
""",
|
||||
user_id,
|
||||
)
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
async def cleanup_expired(self, older_than_days: int = 30) -> int:
|
||||
"""Delete expired tokens older than specified days.
|
||||
|
||||
Note: This performs a hard delete. For audit trails, consider:
- Archiving expired tokens to a separate table first
- Using partitioning with retention policies
- Only calling this for tokens well past their expiry
|
||||
|
||||
Args:
|
||||
older_than_days: Only delete tokens expired more than this many days ago
|
||||
|
||||
Returns:
|
||||
Count of tokens deleted
|
||||
"""
|
||||
rows = await self.conn.fetch(
|
||||
"""
|
||||
DELETE FROM refresh_tokens
|
||||
WHERE expires_at < clock_timestamp() - interval '1 day' * $1
|
||||
RETURNING id
|
||||
""",
|
||||
older_than_days,
|
||||
)
|
||||
return len(rows)
|
||||
@@ -1,80 +0,0 @@
|
||||
"""Service repository for database operations."""
|
||||
|
||||
from uuid import UUID
|
||||
|
||||
import asyncpg
|
||||
|
||||
|
||||
class ServiceRepository:
|
||||
"""Database operations for services."""
|
||||
|
||||
def __init__(self, conn: asyncpg.Connection) -> None:
|
||||
self.conn = conn
|
||||
|
||||
async def create(
|
||||
self,
|
||||
service_id: UUID,
|
||||
org_id: UUID,
|
||||
name: str,
|
||||
slug: str,
|
||||
) -> dict:
|
||||
"""Create a new service."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
INSERT INTO services (id, org_id, name, slug)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
RETURNING id, org_id, name, slug, created_at
|
||||
""",
|
||||
service_id,
|
||||
org_id,
|
||||
name,
|
||||
slug,
|
||||
)
|
||||
return dict(row)
|
||||
|
||||
async def get_by_id(self, service_id: UUID) -> dict | None:
|
||||
"""Get service by ID."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, org_id, name, slug, created_at
|
||||
FROM services
|
||||
WHERE id = $1
|
||||
""",
|
||||
service_id,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def get_by_org(self, org_id: UUID) -> list[dict]:
|
||||
"""Get all services for an organization."""
|
||||
rows = await self.conn.fetch(
|
||||
"""
|
||||
SELECT id, org_id, name, slug, created_at
|
||||
FROM services
|
||||
WHERE org_id = $1
|
||||
ORDER BY name
|
||||
""",
|
||||
org_id,
|
||||
)
|
||||
return [dict(row) for row in rows]
|
||||
|
||||
async def get_by_slug(self, org_id: UUID, slug: str) -> dict | None:
|
||||
"""Get service by org and slug."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, org_id, name, slug, created_at
|
||||
FROM services
|
||||
WHERE org_id = $1 AND slug = $2
|
||||
""",
|
||||
org_id,
|
||||
slug,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def slug_exists(self, org_id: UUID, slug: str) -> bool:
|
||||
"""Check if service slug exists in organization."""
|
||||
result = await self.conn.fetchval(
|
||||
"SELECT EXISTS(SELECT 1 FROM services WHERE org_id = $1 AND slug = $2)",
|
||||
org_id,
|
||||
slug,
|
||||
)
|
||||
return result
|
||||
@@ -1,63 +0,0 @@
|
||||
"""User repository for database operations."""
|
||||
|
||||
from uuid import UUID
|
||||
|
||||
import asyncpg
|
||||
|
||||
|
||||
class UserRepository:
|
||||
"""Database operations for users."""
|
||||
|
||||
def __init__(self, conn: asyncpg.Connection) -> None:
|
||||
self.conn = conn
|
||||
|
||||
async def create(
|
||||
self,
|
||||
user_id: UUID,
|
||||
email: str,
|
||||
password_hash: str,
|
||||
) -> dict:
|
||||
"""Create a new user."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
INSERT INTO users (id, email, password_hash)
|
||||
VALUES ($1, $2, $3)
|
||||
RETURNING id, email, created_at
|
||||
""",
|
||||
user_id,
|
||||
email,
|
||||
password_hash,
|
||||
)
|
||||
return dict(row)
|
||||
|
||||
async def get_by_id(self, user_id: UUID) -> dict | None:
|
||||
"""Get user by ID."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, email, password_hash, created_at
|
||||
FROM users
|
||||
WHERE id = $1
|
||||
""",
|
||||
user_id,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def get_by_email(self, email: str) -> dict | None:
|
||||
"""Get user by email."""
|
||||
row = await self.conn.fetchrow(
|
||||
"""
|
||||
SELECT id, email, password_hash, created_at
|
||||
FROM users
|
||||
WHERE email = $1
|
||||
""",
|
||||
email,
|
||||
)
|
||||
return dict(row) if row else None
|
||||
|
||||
async def exists_by_email(self, email: str) -> bool:
|
||||
"""Check if user exists by email."""
|
||||
result = await self.conn.fetchval(
|
||||
"SELECT EXISTS(SELECT 1 FROM users WHERE email = $1)",
|
||||
email,
|
||||
)
|
||||
return result
|
||||
@@ -1,54 +0,0 @@
|
||||
"""Pydantic schemas for request/response models."""
|
||||
|
||||
from app.schemas.auth import (
|
||||
LoginRequest,
|
||||
LogoutRequest,
|
||||
RefreshRequest,
|
||||
RegisterRequest,
|
||||
SwitchOrgRequest,
|
||||
TokenResponse,
|
||||
)
|
||||
from app.schemas.common import CursorParams, ErrorDetail, ErrorResponse, PaginatedResponse
|
||||
from app.schemas.incident import (
|
||||
CommentRequest,
|
||||
IncidentCreate,
|
||||
IncidentEventResponse,
|
||||
IncidentResponse,
|
||||
TransitionRequest,
|
||||
)
|
||||
from app.schemas.org import (
|
||||
MemberResponse,
|
||||
NotificationTargetCreate,
|
||||
NotificationTargetResponse,
|
||||
OrgResponse,
|
||||
ServiceCreate,
|
||||
ServiceResponse,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
# Auth
|
||||
"LoginRequest",
|
||||
"LogoutRequest",
|
||||
"RefreshRequest",
|
||||
"RegisterRequest",
|
||||
"SwitchOrgRequest",
|
||||
"TokenResponse",
|
||||
# Common
|
||||
"CursorParams",
|
||||
"ErrorDetail",
|
||||
"ErrorResponse",
|
||||
"PaginatedResponse",
|
||||
# Incident
|
||||
"CommentRequest",
|
||||
"IncidentCreate",
|
||||
"IncidentEventResponse",
|
||||
"IncidentResponse",
|
||||
"TransitionRequest",
|
||||
# Org
|
||||
"MemberResponse",
|
||||
"NotificationTargetCreate",
|
||||
"NotificationTargetResponse",
|
||||
"OrgResponse",
|
||||
"ServiceCreate",
|
||||
"ServiceResponse",
|
||||
]
|
||||
@@ -1,48 +0,0 @@
|
||||
"""Authentication schemas."""
|
||||
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel, EmailStr, Field
|
||||
|
||||
|
||||
class RegisterRequest(BaseModel):
|
||||
"""Request body for user registration."""
|
||||
|
||||
email: EmailStr
|
||||
password: str = Field(min_length=8, max_length=128)
|
||||
org_name: str = Field(min_length=1, max_length=100, description="Name for the default org")
|
||||
|
||||
|
||||
class LoginRequest(BaseModel):
|
||||
"""Request body for user login."""
|
||||
|
||||
email: EmailStr
|
||||
password: str
|
||||
|
||||
|
||||
class RefreshRequest(BaseModel):
|
||||
"""Request body for token refresh."""
|
||||
|
||||
refresh_token: str
|
||||
|
||||
|
||||
class SwitchOrgRequest(BaseModel):
|
||||
"""Request body for switching active organization."""
|
||||
|
||||
org_id: UUID
|
||||
refresh_token: str
|
||||
|
||||
|
||||
class LogoutRequest(BaseModel):
|
||||
"""Request body for logging out and revoking a refresh token."""
|
||||
|
||||
refresh_token: str
|
||||
|
||||
|
||||
class TokenResponse(BaseModel):
|
||||
"""Response containing access and refresh tokens."""
|
||||
|
||||
access_token: str
|
||||
refresh_token: str
|
||||
token_type: str = "bearer"
|
||||
expires_in: int = Field(description="Access token expiry in seconds")
|
||||
@@ -1,61 +0,0 @@
|
||||
"""Common schemas used across the API."""
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ErrorDetail(BaseModel):
|
||||
"""Individual error detail for validation errors."""
|
||||
|
||||
loc: list[str | int] = Field(description="Location of the error (field path)")
|
||||
msg: str = Field(description="Error message")
|
||||
type: str = Field(description="Error type identifier")
|
||||
|
||||
|
||||
class ErrorResponse(BaseModel):
|
||||
"""Structured error response returned by all error handlers."""
|
||||
|
||||
error: str = Field(description="Error type (e.g., 'not_found', 'validation_error')")
|
||||
message: str = Field(description="Human-readable error message")
|
||||
details: list[ErrorDetail] | None = Field(
|
||||
default=None, description="Additional error details for validation errors"
|
||||
)
|
||||
request_id: str | None = Field(
|
||||
default=None, description="Request trace ID for debugging"
|
||||
)
|
||||
|
||||
model_config = {
|
||||
"json_schema_extra": {
|
||||
"examples": [
|
||||
{
|
||||
"error": "not_found",
|
||||
"message": "Incident not found",
|
||||
"request_id": "abc123def456",
|
||||
},
|
||||
{
|
||||
"error": "validation_error",
|
||||
"message": "Request validation failed",
|
||||
"details": [
|
||||
{"loc": ["body", "title"], "msg": "Field required", "type": "missing"}
|
||||
],
|
||||
"request_id": "abc123def456",
|
||||
},
|
||||
]
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
class CursorParams(BaseModel):
|
||||
"""Pagination parameters using cursor-based pagination."""
|
||||
|
||||
cursor: str | None = Field(default=None, description="Cursor for pagination")
|
||||
limit: int = Field(default=20, ge=1, le=100, description="Number of items per page")
|
||||
|
||||
|
||||
class PaginatedResponse[T](BaseModel):
    """Generic paginated response wrapper."""

    items: list[T]
    next_cursor: str | None = Field(
        default=None, description="Cursor for next page, null if no more items"
    )
    has_more: bool = Field(description="Whether there are more items")
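
# Note: the `class PaginatedResponse[T](BaseModel)` type-parameter syntax is
# PEP 695 and requires Python 3.12+; on older interpreters the equivalent is a
# TypeVar plus `class PaginatedResponse(BaseModel, Generic[T])`.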
|
||||
@@ -1,57 +0,0 @@
|
||||
"""Incident-related schemas."""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Literal
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
IncidentStatus = Literal["triggered", "acknowledged", "mitigated", "resolved"]
|
||||
IncidentSeverity = Literal["critical", "high", "medium", "low"]
|
||||
|
||||
|
||||
class IncidentCreate(BaseModel):
|
||||
"""Request body for creating an incident."""
|
||||
|
||||
title: str = Field(min_length=1, max_length=200)
|
||||
description: str | None = Field(default=None, max_length=5000)
|
||||
severity: IncidentSeverity = "medium"
|
||||
|
||||
|
||||
class IncidentResponse(BaseModel):
|
||||
"""Incident response."""
|
||||
|
||||
id: UUID
|
||||
service_id: UUID
|
||||
title: str
|
||||
description: str | None
|
||||
status: IncidentStatus
|
||||
severity: IncidentSeverity
|
||||
version: int
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
|
||||
|
||||
class IncidentEventResponse(BaseModel):
|
||||
"""Incident event response."""
|
||||
|
||||
id: UUID
|
||||
incident_id: UUID
|
||||
event_type: str
|
||||
actor_user_id: UUID | None
|
||||
payload: dict[str, Any] | None
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class TransitionRequest(BaseModel):
|
||||
"""Request body for transitioning incident status."""
|
||||
|
||||
to_status: IncidentStatus
|
||||
version: int = Field(description="Current version for optimistic locking")
|
||||
note: str | None = Field(default=None, max_length=1000)
|
||||
|
||||
|
||||
class CommentRequest(BaseModel):
|
||||
"""Request body for adding a comment to an incident."""
|
||||
|
||||
content: str = Field(min_length=1, max_length=5000)
|
||||
@@ -1,69 +0,0 @@
|
||||
"""Organization-related schemas."""
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Literal
|
||||
from uuid import UUID
|
||||
|
||||
from pydantic import BaseModel, Field, HttpUrl
|
||||
|
||||
|
||||
class OrgResponse(BaseModel):
|
||||
"""Organization summary response."""
|
||||
|
||||
id: UUID
|
||||
name: str
|
||||
slug: str
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class MemberResponse(BaseModel):
|
||||
"""Organization member response."""
|
||||
|
||||
id: UUID
|
||||
user_id: UUID
|
||||
email: str
|
||||
role: Literal["admin", "member", "viewer"]
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class ServiceCreate(BaseModel):
|
||||
"""Request body for creating a service."""
|
||||
|
||||
name: str = Field(min_length=1, max_length=100)
|
||||
slug: str = Field(
|
||||
min_length=1,
|
||||
max_length=50,
|
||||
pattern=r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
|
||||
description="URL-friendly identifier (lowercase, hyphens allowed)",
|
||||
)
|
||||
|
||||
|
||||
class ServiceResponse(BaseModel):
|
||||
"""Service response."""
|
||||
|
||||
id: UUID
|
||||
name: str
|
||||
slug: str
|
||||
created_at: datetime
|
||||
|
||||
|
||||
class NotificationTargetCreate(BaseModel):
|
||||
"""Request body for creating a notification target."""
|
||||
|
||||
name: str = Field(min_length=1, max_length=100)
|
||||
target_type: Literal["webhook", "email", "slack"]
|
||||
webhook_url: HttpUrl | None = Field(
|
||||
default=None, description="Required for webhook type"
|
||||
)
|
||||
enabled: bool = True
|
||||
|
||||
|
||||
class NotificationTargetResponse(BaseModel):
|
||||
"""Notification target response."""
|
||||
|
||||
id: UUID
|
||||
name: str
|
||||
target_type: Literal["webhook", "email", "slack"]
|
||||
webhook_url: str | None
|
||||
enabled: bool
|
||||
created_at: datetime
|
||||
@@ -1,7 +0,0 @@
|
||||
"""Service layer entrypoints."""
|
||||
|
||||
from app.services.auth import AuthService
|
||||
from app.services.incident import IncidentService
|
||||
from app.services.org import OrgService
|
||||
|
||||
__all__ = ["AuthService", "OrgService", "IncidentService"]
|
||||
@@ -1,269 +0,0 @@
|
||||
"""Authentication service providing business logic for auth flows."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
from typing import cast
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
import asyncpg
|
||||
from asyncpg.pool import PoolConnectionProxy
|
||||
|
||||
from app.api.deps import CurrentUser
|
||||
from app.config import settings
|
||||
from app.core import exceptions as exc, security
|
||||
from app.db import Database, db
|
||||
from app.repositories import OrgRepository, RefreshTokenRepository, UserRepository
|
||||
from app.schemas.auth import (
|
||||
LoginRequest,
|
||||
LogoutRequest,
|
||||
RefreshRequest,
|
||||
RegisterRequest,
|
||||
SwitchOrgRequest,
|
||||
TokenResponse,
|
||||
)
|
||||
|
||||
|
||||
_SLUG_PATTERN = re.compile(r"[^a-z0-9]+")
|
||||
|
||||
|
||||
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
|
||||
"""Helper to satisfy typing when a pool proxy is returned."""
|
||||
|
||||
return cast(asyncpg.Connection, conn)
|
||||
|
||||
|
||||
class AuthService:
|
||||
"""Encapsulates authentication workflows (register/login/refresh/logout)."""
|
||||
|
||||
def __init__(self, database: Database | None = None) -> None:
|
||||
self.db = database or db
|
||||
self._access_token_expires_in = settings.access_token_expire_minutes * 60
|
||||
|
||||
async def register_user(self, data: RegisterRequest) -> TokenResponse:
|
||||
"""Create a new user, default org, membership, and token pair."""
|
||||
|
||||
async with self.db.transaction() as conn:
|
||||
db_conn = _as_conn(conn)
|
||||
user_repo = UserRepository(db_conn)
|
||||
org_repo = OrgRepository(db_conn)
|
||||
refresh_repo = RefreshTokenRepository(db_conn)
|
||||
|
||||
if await user_repo.exists_by_email(data.email):
|
||||
raise exc.ConflictError("Email already registered")
|
||||
|
||||
user_id = uuid4()
|
||||
org_id = uuid4()
|
||||
member_id = uuid4()
|
||||
password_hash = security.hash_password(data.password)
|
||||
|
||||
await user_repo.create(user_id, data.email, password_hash)
|
||||
slug = await self._generate_unique_org_slug(org_repo, data.org_name)
|
||||
await org_repo.create(org_id, data.org_name, slug)
|
||||
await org_repo.add_member(member_id, user_id, org_id, "admin")
|
||||
|
||||
return await self._issue_token_pair(
|
||||
refresh_repo,
|
||||
user_id=user_id,
|
||||
org_id=org_id,
|
||||
role="admin",
|
||||
)
|
||||
|
||||
async def login_user(self, data: LoginRequest) -> TokenResponse:
|
||||
"""Authenticate a user and issue tokens for their first organization."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
db_conn = _as_conn(conn)
|
||||
user_repo = UserRepository(db_conn)
|
||||
org_repo = OrgRepository(db_conn)
|
||||
refresh_repo = RefreshTokenRepository(db_conn)
|
||||
|
||||
user = await user_repo.get_by_email(data.email)
|
||||
if not user or not security.verify_password(data.password, user["password_hash"]):
|
||||
raise exc.UnauthorizedError("Invalid email or password")
|
||||
|
||||
orgs = await org_repo.get_user_orgs(user["id"])
|
||||
if not orgs:
|
||||
raise exc.ForbiddenError("User does not belong to any organization")
|
||||
|
||||
active_org = orgs[0]
|
||||
return await self._issue_token_pair(
|
||||
refresh_repo,
|
||||
user_id=user["id"],
|
||||
org_id=active_org["id"],
|
||||
role=active_org["role"],
|
||||
)
|
||||
|
||||
async def refresh_tokens(self, data: RefreshRequest) -> TokenResponse:
|
||||
"""Rotate refresh token and mint a new access token."""
|
||||
|
||||
old_hash = security.hash_token(data.refresh_token)
|
||||
new_refresh_token = security.generate_refresh_token()
|
||||
new_refresh_hash = security.hash_token(new_refresh_token)
|
||||
new_refresh_id = uuid4()
|
||||
new_refresh_expiry = security.get_refresh_token_expiry()
|
||||
|
||||
rotated: dict | None = None
|
||||
membership: dict | None = None
|
||||
|
||||
async with self.db.transaction() as conn:
|
||||
db_conn = _as_conn(conn)
|
||||
refresh_repo = RefreshTokenRepository(db_conn)
|
||||
rotated = await refresh_repo.rotate(
|
||||
old_token_hash=old_hash,
|
||||
new_token_id=new_refresh_id,
|
||||
new_token_hash=new_refresh_hash,
|
||||
new_expires_at=new_refresh_expiry,
|
||||
)
|
||||
|
||||
if rotated is not None:
|
||||
org_repo = OrgRepository(db_conn)
|
||||
membership = await org_repo.get_member(rotated["user_id"], rotated["active_org_id"])
|
||||
if membership is None:
|
||||
raise exc.UnauthorizedError("Invalid refresh token")
|
||||
|
||||
if rotated is None or membership is None:
|
||||
await self._handle_invalid_refresh(old_hash)
|
||||
|
||||
assert rotated is not None and membership is not None
|
||||
access_token = security.create_access_token(
|
||||
sub=str(rotated["user_id"]),
|
||||
org_id=str(rotated["active_org_id"]),
|
||||
org_role=membership["role"],
|
||||
)
|
||||
|
||||
return TokenResponse(
|
||||
access_token=access_token,
|
||||
refresh_token=new_refresh_token,
|
||||
expires_in=self._access_token_expires_in,
|
||||
)
|
||||
|
||||
async def switch_org(
|
||||
self,
|
||||
current_user: CurrentUser,
|
||||
data: SwitchOrgRequest,
|
||||
) -> TokenResponse:
|
||||
"""Switch active organization (rotates refresh token + issues new JWT)."""
|
||||
|
||||
target_org_id = data.org_id
|
||||
old_hash = security.hash_token(data.refresh_token)
|
||||
new_refresh_token = security.generate_refresh_token()
|
||||
new_refresh_hash = security.hash_token(new_refresh_token)
|
||||
new_refresh_expiry = security.get_refresh_token_expiry()
|
||||
|
||||
rotated: dict | None = None
|
||||
membership: dict | None = None
|
||||
|
||||
async with self.db.transaction() as conn:
|
||||
db_conn = _as_conn(conn)
|
||||
org_repo = OrgRepository(db_conn)
|
||||
membership = await org_repo.get_member(current_user.user_id, target_org_id)
|
||||
if membership is None:
|
||||
raise exc.ForbiddenError("Not a member of the requested organization")
|
||||
|
||||
refresh_repo = RefreshTokenRepository(db_conn)
|
||||
rotated = await refresh_repo.rotate(
|
||||
old_token_hash=old_hash,
|
||||
new_token_id=uuid4(),
|
||||
new_token_hash=new_refresh_hash,
|
||||
new_expires_at=new_refresh_expiry,
|
||||
new_active_org_id=target_org_id,
|
||||
expected_user_id=current_user.user_id,
|
||||
)
|
||||
|
||||
if rotated is None:
|
||||
await self._handle_invalid_refresh(old_hash)
|
||||
|
||||
access_token = security.create_access_token(
|
||||
sub=str(current_user.user_id),
|
||||
org_id=str(target_org_id),
|
||||
org_role=membership["role"],
|
||||
)
|
||||
|
||||
return TokenResponse(
|
||||
access_token=access_token,
|
||||
refresh_token=new_refresh_token,
|
||||
expires_in=self._access_token_expires_in,
|
||||
)
|
||||
|
||||
async def logout(self, current_user: CurrentUser, data: LogoutRequest) -> None:
|
||||
"""Revoke the provided refresh token for the current session."""
|
||||
|
||||
token_hash = security.hash_token(data.refresh_token)
|
||||
|
||||
async with self.db.transaction() as conn:
|
||||
refresh_repo = RefreshTokenRepository(_as_conn(conn))
|
||||
token = await refresh_repo.get_by_hash(token_hash)
|
||||
if token and token["user_id"] != current_user.user_id:
|
||||
raise exc.ForbiddenError("Refresh token does not belong to this user")
|
||||
|
||||
if not token:
|
||||
return
|
||||
|
||||
await refresh_repo.revoke(token["id"])
|
||||
|
||||
async def _issue_token_pair(
|
||||
self,
|
||||
refresh_repo: RefreshTokenRepository,
|
||||
*,
|
||||
user_id: UUID,
|
||||
org_id: UUID,
|
||||
role: str,
|
||||
) -> TokenResponse:
|
||||
"""Create access/refresh tokens and persist the refresh token."""
|
||||
|
||||
access_token = security.create_access_token(
|
||||
sub=str(user_id),
|
||||
org_id=str(org_id),
|
||||
org_role=role,
|
||||
)
|
||||
|
||||
refresh_token = security.generate_refresh_token()
|
||||
await refresh_repo.create(
|
||||
token_id=uuid4(),
|
||||
user_id=user_id,
|
||||
token_hash=security.hash_token(refresh_token),
|
||||
active_org_id=org_id,
|
||||
expires_at=security.get_refresh_token_expiry(),
|
||||
)
|
||||
|
||||
return TokenResponse(
|
||||
access_token=access_token,
|
||||
refresh_token=refresh_token,
|
||||
expires_in=self._access_token_expires_in,
|
||||
)
|
||||
|
||||
async def _handle_invalid_refresh(self, token_hash: str) -> None:
|
||||
"""Raise appropriate errors for invalid/compromised refresh tokens."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
refresh_repo = RefreshTokenRepository(_as_conn(conn))
|
||||
reused = await refresh_repo.check_token_reuse(token_hash)
|
||||
if reused:
|
||||
await refresh_repo.revoke_token_chain(reused["id"])
|
||||
raise exc.UnauthorizedError("Refresh token reuse detected")
|
||||
|
||||
raise exc.UnauthorizedError("Invalid refresh token")
|
||||
|
||||
async def _generate_unique_org_slug(
|
||||
self,
|
||||
org_repo: OrgRepository,
|
||||
org_name: str,
|
||||
) -> str:
|
||||
"""Slugify the org name and append a counter until unique."""
|
||||
|
||||
base_slug = self._slugify(org_name)
|
||||
candidate = base_slug
|
||||
counter = 1
|
||||
while await org_repo.slug_exists(candidate):
|
||||
suffix = f"-{counter}"
|
||||
max_base_len = 50 - len(suffix)
|
||||
candidate = f"{base_slug[:max_base_len]}{suffix}"
|
||||
counter += 1
|
||||
return candidate
|
||||
|
||||
    def _slugify(self, value: str) -> str:
        """Convert arbitrary text into a URL-friendly slug."""

        slug = _SLUG_PATTERN.sub("-", value.strip().lower()).strip("-")
        return slug[:50] or "org"
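
    # Example (illustrative): _slugify("Acme, Inc.") returns "acme-inc"; if that slug
    # is already taken, _generate_unique_org_slug() tries "acme-inc-1", "acme-inc-2", ...,
    # trimming the base so the result stays within the 50-character limit.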
|
||||
@@ -1,247 +0,0 @@
|
||||
"""Incident service implementing incident lifecycle operations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import cast
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
import asyncpg
|
||||
from asyncpg.pool import PoolConnectionProxy
|
||||
|
||||
from app.api.deps import CurrentUser, ensure_org_access
|
||||
from app.config import settings
|
||||
from app.core import exceptions as exc
|
||||
from app.db import Database, db
|
||||
from app.repositories import IncidentRepository, ServiceRepository
|
||||
from app.schemas.common import PaginatedResponse
|
||||
from app.schemas.incident import (
|
||||
CommentRequest,
|
||||
IncidentCreate,
|
||||
IncidentEventResponse,
|
||||
IncidentResponse,
|
||||
TransitionRequest,
|
||||
)
|
||||
from app.taskqueue import TaskQueue
|
||||
from app.taskqueue import task_queue as default_task_queue
|
||||
|
||||
_ALLOWED_TRANSITIONS: dict[str, set[str]] = {
    "triggered": {"acknowledged"},
    "acknowledged": {"mitigated"},
    "mitigated": {"resolved"},
    "resolved": set(),
}
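
# --- Illustrative sketch (not part of the original module) ---
# _ALLOWED_TRANSITIONS encodes the incident state machine as a forward-only chain:
# triggered -> acknowledged -> mitigated -> resolved. A check built on it could look
# like the hypothetical helper below; the service's own _validate_transition()
# (not shown in this hunk) is assumed to perform the equivalent validation before
# update_status() is attempted.
def _is_allowed_transition(current: str, target: str) -> bool:
    """Return True when `target` is a legal next status for `current`."""
    return target in _ALLOWED_TRANSITIONS.get(current, set())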
|
||||
|
||||
|
||||
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
|
||||
"""Helper to satisfy typing when a pool proxy is returned."""
|
||||
|
||||
return cast(asyncpg.Connection, conn)
|
||||
|
||||
|
||||
class IncidentService:
|
||||
"""Encapsulates incident lifecycle operations within an org context."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
database: Database | None = None,
|
||||
task_queue: TaskQueue | None = None,
|
||||
escalation_delay_seconds: int | None = None,
|
||||
) -> None:
|
||||
self.db = database or db
|
||||
self.task_queue = task_queue or default_task_queue
|
||||
self.escalation_delay_seconds = (
|
||||
escalation_delay_seconds
|
||||
if escalation_delay_seconds is not None
|
||||
else settings.notification_escalation_delay_seconds
|
||||
)
|
||||
|
||||
async def create_incident(
|
||||
self,
|
||||
current_user: CurrentUser,
|
||||
service_id: UUID,
|
||||
data: IncidentCreate,
|
||||
) -> IncidentResponse:
|
||||
"""Create an incident for a service in the active org and record the creation event."""
|
||||
|
||||
async with self.db.transaction() as conn:
|
||||
db_conn = _as_conn(conn)
|
||||
service_repo = ServiceRepository(db_conn)
|
||||
incident_repo = IncidentRepository(db_conn)
|
||||
|
||||
service = await service_repo.get_by_id(service_id)
|
||||
if service is None:
|
||||
raise exc.NotFoundError("Service not found")
|
||||
ensure_org_access(service["org_id"], current_user)
|
||||
|
||||
incident_id = uuid4()
|
||||
incident = await incident_repo.create(
|
||||
incident_id=incident_id,
|
||||
org_id=current_user.org_id,
|
||||
service_id=service_id,
|
||||
title=data.title,
|
||||
description=data.description,
|
||||
severity=data.severity,
|
||||
)
|
||||
|
||||
await incident_repo.add_event(
|
||||
uuid4(),
|
||||
incident_id,
|
||||
"created",
|
||||
actor_user_id=current_user.user_id,
|
||||
payload={
|
||||
"title": data.title,
|
||||
"severity": data.severity,
|
||||
"description": data.description,
|
||||
},
|
||||
)
|
||||
|
||||
incident_response = IncidentResponse(**incident)
|
||||
|
||||
self.task_queue.incident_triggered(
|
||||
incident_id=incident_response.id,
|
||||
org_id=current_user.org_id,
|
||||
triggered_by=current_user.user_id,
|
||||
)
|
||||
|
||||
if self.escalation_delay_seconds > 0:
|
||||
self.task_queue.schedule_escalation_check(
|
||||
incident_id=incident_response.id,
|
||||
org_id=current_user.org_id,
|
||||
delay_seconds=self.escalation_delay_seconds,
|
||||
)
|
||||
|
||||
return incident_response
|
||||
|
||||
async def get_incidents(
|
||||
self,
|
||||
current_user: CurrentUser,
|
||||
*,
|
||||
status: str | None = None,
|
||||
cursor: datetime | None = None,
|
||||
limit: int = 20,
|
||||
) -> PaginatedResponse[IncidentResponse]:
|
||||
"""Return paginated incidents for the active organization."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
incident_repo = IncidentRepository(_as_conn(conn))
|
||||
rows = await incident_repo.get_by_org(
|
||||
org_id=current_user.org_id,
|
||||
status=status,
|
||||
cursor=cursor,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
has_more = len(rows) > limit
|
||||
items = rows[:limit]
|
||||
next_cursor = items[-1]["created_at"].isoformat() if has_more and items else None
|
||||
|
||||
incidents = [IncidentResponse(**row) for row in items]
|
||||
return PaginatedResponse[IncidentResponse](
|
||||
items=incidents,
|
||||
next_cursor=next_cursor,
|
||||
has_more=has_more,
|
||||
)
|
||||
|
||||
async def get_incident(self, current_user: CurrentUser, incident_id: UUID) -> IncidentResponse:
|
||||
"""Return a single incident, ensuring it belongs to the active org."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
incident_repo = IncidentRepository(_as_conn(conn))
|
||||
incident = await incident_repo.get_by_id(incident_id)
|
||||
if incident is None:
|
||||
raise exc.NotFoundError("Incident not found")
|
||||
ensure_org_access(incident["org_id"], current_user)
|
||||
return IncidentResponse(**incident)
|
||||
|
||||
async def get_incident_events(
|
||||
self, current_user: CurrentUser, incident_id: UUID
|
||||
) -> list[IncidentEventResponse]:
|
||||
"""Return the timeline events for an incident in the active org."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
incident_repo = IncidentRepository(_as_conn(conn))
|
||||
incident = await incident_repo.get_by_id(incident_id)
|
||||
if incident is None:
|
||||
raise exc.NotFoundError("Incident not found")
|
||||
ensure_org_access(incident["org_id"], current_user)
|
||||
|
||||
events = await incident_repo.get_events(incident_id)
|
||||
return [IncidentEventResponse(**event) for event in events]
|
||||
|
||||
async def transition_incident(
|
||||
self,
|
||||
current_user: CurrentUser,
|
||||
incident_id: UUID,
|
||||
data: TransitionRequest,
|
||||
) -> IncidentResponse:
|
||||
"""Transition an incident status with optimistic locking and event recording."""
|
||||
|
||||
async with self.db.transaction() as conn:
|
||||
db_conn = _as_conn(conn)
|
||||
incident_repo = IncidentRepository(db_conn)
|
||||
|
||||
incident = await incident_repo.get_by_id(incident_id)
|
||||
if incident is None:
|
||||
raise exc.NotFoundError("Incident not found")
|
||||
ensure_org_access(incident["org_id"], current_user)
|
||||
self._validate_transition(incident["status"], data.to_status)
|
||||
|
||||
updated = await incident_repo.update_status(
|
||||
incident_id,
|
||||
data.to_status,
|
||||
data.version,
|
||||
)
|
||||
if updated is None:
|
||||
raise exc.ConflictError("Incident version mismatch")
|
||||
|
||||
payload = {"from": incident["status"], "to": data.to_status}
|
||||
if data.note:
|
||||
payload["note"] = data.note
|
||||
|
||||
await incident_repo.add_event(
|
||||
uuid4(),
|
||||
incident_id,
|
||||
"status_changed",
|
||||
actor_user_id=current_user.user_id,
|
||||
payload=payload,
|
||||
)
|
||||
|
||||
return IncidentResponse(**updated)
|
||||
|
||||
async def add_comment(
|
||||
self,
|
||||
current_user: CurrentUser,
|
||||
incident_id: UUID,
|
||||
data: CommentRequest,
|
||||
) -> IncidentEventResponse:
|
||||
"""Add a comment event to the incident timeline."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
incident_repo = IncidentRepository(_as_conn(conn))
|
||||
incident = await incident_repo.get_by_id(incident_id)
|
||||
if incident is None:
|
||||
raise exc.NotFoundError("Incident not found")
|
||||
ensure_org_access(incident["org_id"], current_user)
|
||||
|
||||
event = await incident_repo.add_event(
|
||||
uuid4(),
|
||||
incident_id,
|
||||
"comment_added",
|
||||
actor_user_id=current_user.user_id,
|
||||
payload={"content": data.content},
|
||||
)
|
||||
return IncidentEventResponse(**event)
|
||||
|
||||
def _validate_transition(self, current_status: str, to_status: str) -> None:
|
||||
"""Validate a requested status transition against the allowed state machine."""
|
||||
|
||||
if current_status == to_status:
|
||||
raise exc.BadRequestError("Incident is already in the requested status")
|
||||
|
||||
allowed = _ALLOWED_TRANSITIONS.get(current_status, set())
|
||||
if to_status not in allowed:
|
||||
raise exc.BadRequestError("Invalid incident status transition")
|
||||
|
||||
|
||||
__all__ = ["IncidentService"]
|
||||
@@ -1,115 +0,0 @@
|
||||
"""Organization service providing org-scoped operations."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import cast
|
||||
from uuid import UUID, uuid4
|
||||
|
||||
import asyncpg
|
||||
from asyncpg.pool import PoolConnectionProxy
|
||||
|
||||
from app.api.deps import CurrentUser
|
||||
from app.core import exceptions as exc
|
||||
from app.db import Database, db
|
||||
from app.repositories import NotificationRepository, OrgRepository, ServiceRepository
|
||||
from app.schemas.org import (
|
||||
MemberResponse,
|
||||
NotificationTargetCreate,
|
||||
NotificationTargetResponse,
|
||||
OrgResponse,
|
||||
ServiceCreate,
|
||||
ServiceResponse,
|
||||
)
|
||||
|
||||
|
||||
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
|
||||
"""Helper to satisfy typing when a pool proxy is returned."""
|
||||
|
||||
return cast(asyncpg.Connection, conn)
|
||||
|
||||
|
||||
class OrgService:
|
||||
"""Encapsulates organization-level operations within the active org context."""
|
||||
|
||||
def __init__(self, database: Database | None = None) -> None:
|
||||
self.db = database or db
|
||||
|
||||
async def get_current_org(self, current_user: CurrentUser) -> OrgResponse:
|
||||
"""Return the active organization summary for the current user."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
org_repo = OrgRepository(_as_conn(conn))
|
||||
org = await org_repo.get_by_id(current_user.org_id)
|
||||
if org is None:
|
||||
raise exc.NotFoundError("Organization not found")
|
||||
return OrgResponse(**org)
|
||||
|
||||
async def get_members(self, current_user: CurrentUser) -> list[MemberResponse]:
|
||||
"""List members of the active organization."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
org_repo = OrgRepository(_as_conn(conn))
|
||||
members = await org_repo.get_members(current_user.org_id)
|
||||
return [MemberResponse(**member) for member in members]
|
||||
|
||||
async def create_service(self, current_user: CurrentUser, data: ServiceCreate) -> ServiceResponse:
|
||||
"""Create a new service within the active organization."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
service_repo = ServiceRepository(_as_conn(conn))
|
||||
|
||||
if await service_repo.slug_exists(current_user.org_id, data.slug):
|
||||
raise exc.ConflictError("Service slug already exists in this organization")
|
||||
|
||||
try:
|
||||
service = await service_repo.create(
|
||||
service_id=uuid4(),
|
||||
org_id=current_user.org_id,
|
||||
name=data.name,
|
||||
slug=data.slug,
|
||||
)
|
||||
except asyncpg.UniqueViolationError as err: # pragma: no cover - race protection
|
||||
raise exc.ConflictError("Service slug already exists in this organization") from err
|
||||
|
||||
return ServiceResponse(**service)
|
||||
|
||||
async def get_services(self, current_user: CurrentUser) -> list[ServiceResponse]:
|
||||
"""List services for the active organization."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
service_repo = ServiceRepository(_as_conn(conn))
|
||||
services = await service_repo.get_by_org(current_user.org_id)
|
||||
return [ServiceResponse(**svc) for svc in services]
|
||||
|
||||
async def create_notification_target(
|
||||
self,
|
||||
current_user: CurrentUser,
|
||||
data: NotificationTargetCreate,
|
||||
) -> NotificationTargetResponse:
|
||||
"""Create a notification target for the active organization."""
|
||||
|
||||
if data.target_type == "webhook" and data.webhook_url is None:
|
||||
raise exc.BadRequestError("webhook_url is required for webhook targets")
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
notification_repo = NotificationRepository(_as_conn(conn))
|
||||
target = await notification_repo.create_target(
|
||||
target_id=uuid4(),
|
||||
org_id=current_user.org_id,
|
||||
name=data.name,
|
||||
target_type=data.target_type,
|
||||
webhook_url=str(data.webhook_url) if data.webhook_url else None,
|
||||
enabled=data.enabled,
|
||||
)
|
||||
return NotificationTargetResponse(**target)
|
||||
|
||||
async def get_notification_targets(self, current_user: CurrentUser) -> list[NotificationTargetResponse]:
|
||||
"""List notification targets for the active organization."""
|
||||
|
||||
async with self.db.connection() as conn:
|
||||
notification_repo = NotificationRepository(_as_conn(conn))
|
||||
targets = await notification_repo.get_targets_by_org(current_user.org_id)
|
||||
return [NotificationTargetResponse(**target) for target in targets]
|
||||
|
||||
|
||||
__all__ = ["OrgService"]
|
||||
@@ -1,188 +0,0 @@
|
||||
"""Task queue abstractions for scheduling background work."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from typing import Any
|
||||
from uuid import UUID
|
||||
|
||||
from app.config import settings
|
||||
|
||||
try:
|
||||
from worker.celery_app import celery_app
|
||||
except Exception: # pragma: no cover - celery app may not import during docs builds
|
||||
celery_app = None # type: ignore[assignment]
|
||||
|
||||
|
||||
class TaskQueue(ABC):
|
||||
"""Interface for enqueueing background work."""
|
||||
|
||||
async def startup(self) -> None: # pragma: no cover - default no-op
|
||||
"""Hook for queue initialization."""
|
||||
|
||||
async def shutdown(self) -> None: # pragma: no cover - default no-op
|
||||
"""Hook for queue teardown."""
|
||||
|
||||
async def ping(self) -> bool:
|
||||
"""Check if the queue backend is reachable."""
|
||||
|
||||
return True
|
||||
|
||||
def reset(self) -> None: # pragma: no cover - optional for in-memory impls
|
||||
"""Reset any in-memory state (used in tests)."""
|
||||
|
||||
@abstractmethod
|
||||
def incident_triggered(
|
||||
self,
|
||||
*,
|
||||
incident_id: UUID,
|
||||
org_id: UUID,
|
||||
triggered_by: UUID | None,
|
||||
) -> None:
|
||||
"""Fan out an incident triggered notification."""
|
||||
|
||||
@abstractmethod
|
||||
def schedule_escalation_check(
|
||||
self,
|
||||
*,
|
||||
incident_id: UUID,
|
||||
org_id: UUID,
|
||||
delay_seconds: int,
|
||||
) -> None:
|
||||
"""Schedule a delayed escalation check."""
|
||||
|
||||
|
||||
class CeleryTaskQueue(TaskQueue):
|
||||
"""Celery-backed task queue that can use Redis or SQS brokers."""
|
||||
|
||||
def __init__(self, default_queue: str, critical_queue: str) -> None:
|
||||
if celery_app is None: # pragma: no cover - guarded by try/except
|
||||
raise RuntimeError("Celery application is unavailable")
|
||||
self._celery = celery_app
|
||||
self._default_queue = default_queue
|
||||
self._critical_queue = critical_queue
|
||||
|
||||
def incident_triggered(
|
||||
self,
|
||||
*,
|
||||
incident_id: UUID,
|
||||
org_id: UUID,
|
||||
triggered_by: UUID | None,
|
||||
) -> None:
|
||||
self._celery.send_task(
|
||||
"worker.tasks.notifications.incident_triggered",
|
||||
kwargs={
|
||||
"incident_id": str(incident_id),
|
||||
"org_id": str(org_id),
|
||||
"triggered_by": str(triggered_by) if triggered_by else None,
|
||||
},
|
||||
queue=self._default_queue,
|
||||
)
|
||||
|
||||
def schedule_escalation_check(
|
||||
self,
|
||||
*,
|
||||
incident_id: UUID,
|
||||
org_id: UUID,
|
||||
delay_seconds: int,
|
||||
) -> None:
|
||||
self._celery.send_task(
|
||||
"worker.tasks.notifications.escalate_if_unacked",
|
||||
kwargs={
|
||||
"incident_id": str(incident_id),
|
||||
"org_id": str(org_id),
|
||||
},
|
||||
countdown=max(delay_seconds, 0),
|
||||
queue=self._critical_queue,
|
||||
)
|
||||
|
||||
async def ping(self) -> bool:
|
||||
loop = asyncio.get_running_loop()
|
||||
return await loop.run_in_executor(None, self._ping_sync)
|
||||
|
||||
def _ping_sync(self) -> bool:
|
||||
connection = self._celery.connection()
|
||||
try:
|
||||
connection.connect()
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
finally:
|
||||
try:
|
||||
connection.release()
|
||||
except Exception: # pragma: no cover - release best effort
|
||||
pass
|
||||
|
||||
|
||||
@dataclass
|
||||
class InMemoryTaskQueue(TaskQueue):
|
||||
"""Test-friendly queue that records dispatched tasks in memory."""
|
||||
|
||||
dispatched: list[tuple[str, dict[str, Any]]] | None = None
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
if self.dispatched is None:
|
||||
self.dispatched = []
|
||||
|
||||
def incident_triggered(
|
||||
self,
|
||||
*,
|
||||
incident_id: UUID,
|
||||
org_id: UUID,
|
||||
triggered_by: UUID | None,
|
||||
) -> None:
|
||||
self.dispatched.append(
|
||||
(
|
||||
"incident_triggered",
|
||||
{
|
||||
"incident_id": incident_id,
|
||||
"org_id": org_id,
|
||||
"triggered_by": triggered_by,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
def schedule_escalation_check(
|
||||
self,
|
||||
*,
|
||||
incident_id: UUID,
|
||||
org_id: UUID,
|
||||
delay_seconds: int,
|
||||
) -> None:
|
||||
self.dispatched.append(
|
||||
(
|
||||
"escalate_if_unacked",
|
||||
{
|
||||
"incident_id": incident_id,
|
||||
"org_id": org_id,
|
||||
"delay_seconds": delay_seconds,
|
||||
},
|
||||
)
|
||||
)
|
||||
|
||||
def reset(self) -> None:
|
||||
if self.dispatched is not None:
|
||||
self.dispatched.clear()
|
||||
|
||||
|
||||
def _build_task_queue() -> TaskQueue:
|
||||
if settings.task_queue_driver == "inmemory":
|
||||
return InMemoryTaskQueue()
|
||||
|
||||
return CeleryTaskQueue(
|
||||
default_queue=settings.task_queue_default_queue,
|
||||
critical_queue=settings.task_queue_critical_queue,
|
||||
)
|
||||
|
||||
|
||||
task_queue = _build_task_queue()
|
||||
|
||||
|
||||
__all__ = [
|
||||
"CeleryTaskQueue",
|
||||
"InMemoryTaskQueue",
|
||||
"TaskQueue",
|
||||
"task_queue",
|
||||
]
|
||||
@@ -1,230 +0,0 @@
|
||||
version: "3.8"
|
||||
|
||||
services:
|
||||
postgres:
|
||||
image: postgres:16-alpine
|
||||
container_name: incidentops-postgres
|
||||
environment:
|
||||
POSTGRES_USER: incidentops
|
||||
POSTGRES_PASSWORD: incidentops
|
||||
POSTGRES_DB: incidentops
|
||||
ports:
|
||||
- "5432:5432"
|
||||
volumes:
|
||||
- postgres_data:/var/lib/postgresql/data
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U incidentops"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
# For Celery broker
|
||||
redis:
|
||||
image: redis:7-alpine
|
||||
container_name: incidentops-redis
|
||||
ports:
|
||||
- "6379:6379"
|
||||
volumes:
|
||||
- redis_data:/data
|
||||
healthcheck:
|
||||
test: ["CMD", "redis-cli", "ping"]
|
||||
interval: 10s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
|
||||
# api services
|
||||
api:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
target: api
|
||||
container_name: incidentops-api
|
||||
ports:
|
||||
- "8000:8000"
|
||||
- "9464:9464" # Prometheus metrics
|
||||
environment:
|
||||
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
|
||||
REDIS_URL: redis://redis:6379/0
|
||||
JWT_SECRET_KEY: dev-secret-key-change-in-production
|
||||
JWT_ALGORITHM: HS256
|
||||
ACCESS_TOKEN_EXPIRE_MINUTES: 30
|
||||
REFRESH_TOKEN_EXPIRE_DAYS: 30
|
||||
# OpenTelemetry
|
||||
OTEL_ENABLED: "true"
|
||||
OTEL_SERVICE_NAME: incidentops-api
|
||||
OTEL_ENVIRONMENT: development
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
|
||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||
OTEL_LOG_LEVEL: INFO
|
||||
# Metrics
|
||||
PROMETHEUS_PORT: "9464"
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
otel-collector:
|
||||
condition: service_started
|
||||
prometheus:
|
||||
condition: service_started
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8000/v1/healthz"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 10s
|
||||
|
||||
# Worker service (Celery)
|
||||
worker:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
target: worker
|
||||
container_name: incidentops-worker
|
||||
environment:
|
||||
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
|
||||
REDIS_URL: redis://redis:6379/0
|
||||
CELERY_BROKER_URL: redis://redis:6379/0
|
||||
CELERY_RESULT_BACKEND: redis://redis:6379/1
|
||||
# OpenTelemetry
|
||||
OTEL_ENABLED: "true"
|
||||
OTEL_SERVICE_NAME: incidentops-worker
|
||||
OTEL_ENVIRONMENT: development
|
||||
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
|
||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
redis:
|
||||
condition: service_healthy
|
||||
|
||||
# Web frontend (Next.js)
|
||||
web:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile.web
|
||||
container_name: incidentops-web
|
||||
ports:
|
||||
- "3000:3000"
|
||||
environment:
|
||||
NEXT_PUBLIC_API_URL: http://localhost:8000
|
||||
depends_on:
|
||||
- api
|
||||
|
||||
# Database migrations (run once)
|
||||
migrate:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
target: api
|
||||
container_name: incidentops-migrate
|
||||
command: python migrations/migrate.py apply
|
||||
environment:
|
||||
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
|
||||
depends_on:
|
||||
postgres:
|
||||
condition: service_healthy
|
||||
profiles:
|
||||
- migrate
|
||||
|
||||
# Flower for Celery monitoring (dev only)
|
||||
flower:
|
||||
image: mher/flower:2.0
|
||||
container_name: incidentops-flower
|
||||
ports:
|
||||
- "5555:5555"
|
||||
environment:
|
||||
CELERY_BROKER_URL: redis://redis:6379/0
|
||||
FLOWER_BASIC_AUTH: admin:admin
|
||||
depends_on:
|
||||
- redis
|
||||
profiles:
|
||||
- monitoring
|
||||
|
||||
# ============================================
|
||||
# Observability Stack
|
||||
# ============================================
|
||||
|
||||
# OpenTelemetry Collector - receives traces/logs from apps
|
||||
otel-collector:
|
||||
image: otel/opentelemetry-collector-contrib:0.96.0
|
||||
container_name: incidentops-otel-collector
|
||||
command: ["--config=/etc/otel-collector/config.yaml"]
|
||||
volumes:
|
||||
- ./observability/otel-collector/config.yaml:/etc/otel-collector/config.yaml:ro
|
||||
ports:
|
||||
- "4317:4317" # OTLP gRPC
|
||||
- "4318:4318" # OTLP HTTP
|
||||
depends_on:
|
||||
- tempo
|
||||
- loki
|
||||
|
||||
# Tempo - distributed tracing backend
|
||||
tempo:
|
||||
image: grafana/tempo:2.4.1
|
||||
container_name: incidentops-tempo
|
||||
command: ["-config.file=/etc/tempo/config.yaml"]
|
||||
volumes:
|
||||
- ./observability/tempo/config.yaml:/etc/tempo/config.yaml:ro
|
||||
- tempo_data:/var/tempo
|
||||
ports:
|
||||
- "3200:3200" # Tempo HTTP
|
||||
- "4320:4317" # Tempo OTLP gRPC (different host port to avoid conflict)
|
||||
|
||||
# Loki - log aggregation
|
||||
loki:
|
||||
image: grafana/loki:2.9.6
|
||||
container_name: incidentops-loki
|
||||
command: ["-config.file=/etc/loki/config.yaml"]
|
||||
volumes:
|
||||
- ./observability/loki/config.yaml:/etc/loki/config.yaml:ro
|
||||
- loki_data:/loki
|
||||
ports:
|
||||
- "3100:3100" # Loki HTTP
|
||||
|
||||
# Prometheus - metrics storage
|
||||
prometheus:
|
||||
image: prom/prometheus:v2.51.0
|
||||
container_name: incidentops-prometheus
|
||||
command:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- "--storage.tsdb.path=/prometheus"
|
||||
- "--web.enable-lifecycle"
|
||||
volumes:
|
||||
- ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- prometheus_data:/prometheus
|
||||
ports:
|
||||
- "9090:9090" # Prometheus UI
|
||||
|
||||
# Grafana - visualization
|
||||
grafana:
|
||||
image: grafana/grafana:10.4.1
|
||||
container_name: incidentops-grafana
|
||||
environment:
|
||||
GF_SECURITY_ADMIN_USER: admin
|
||||
GF_SECURITY_ADMIN_PASSWORD: admin
|
||||
GF_USERS_ALLOW_SIGN_UP: "false"
|
||||
GF_EXPLORE_ENABLED: "true"
|
||||
GF_FEATURE_TOGGLES_ENABLE: traceqlEditor tempoSearch tempoBackendSearch tempoApmTable
|
||||
volumes:
|
||||
- ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
|
||||
- ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
|
||||
- grafana_data:/var/lib/grafana
|
||||
ports:
|
||||
- "3001:3000" # Grafana UI (3001 to avoid conflict with web frontend)
|
||||
depends_on:
|
||||
- tempo
|
||||
- loki
|
||||
- prometheus
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
redis_data:
|
||||
tempo_data:
|
||||
loki_data:
|
||||
prometheus_data:
|
||||
grafana_data:
|
||||
|
||||
networks:
|
||||
default:
|
||||
name: incidentops-network
|
||||
+657
@@ -0,0 +1,657 @@
|
||||
# IncidentOps Specification
|
||||
|
||||
A multi-tenant incident management system with implicit active-org context from JWT.
|
||||
|
||||
---
|
||||
|
||||
## Project Structure
|
||||
|
||||
```
|
||||
incidentops/
|
||||
├── IncidentOps.sln
|
||||
├── docker-compose.yml
|
||||
├── skaffold.yaml
|
||||
├── .gitignore
|
||||
│
|
||||
├── src/
|
||||
│ ├── IncidentOps.Api/ # ASP.NET Core REST API
|
||||
│ │ ├── Auth/
|
||||
│ │ │ ├── ClaimsPrincipalExtensions.cs
|
||||
│ │ │ ├── RequestContext.cs
|
||||
│ │ │ └── RoleRequirement.cs
|
||||
│ │ ├── Controllers/
|
||||
│ │ │ ├── AuthController.cs
|
||||
│ │ │ ├── HealthController.cs
|
||||
│ │ │ ├── IncidentsController.cs
|
||||
│ │ │ └── OrgController.cs
|
||||
│ │ ├── Dockerfile
|
||||
│ │ ├── Program.cs
|
||||
│ │ ├── appsettings.json
|
||||
│ │ └── appsettings.Development.json
|
||||
│ │
|
||||
│ ├── IncidentOps.Worker/ # Hangfire Worker Service
|
||||
│ │ ├── Jobs/
|
||||
│ │ │ ├── EscalateIfUnackedJob.cs
|
||||
│ │ │ ├── IncidentTriggeredJob.cs
|
||||
│ │ │ └── SendWebhookNotificationJob.cs
|
||||
│ │ ├── Dockerfile
|
||||
│ │ ├── Program.cs
|
||||
│ │ └── appsettings.json
|
||||
│ │
|
||||
│ ├── IncidentOps.Domain/ # Domain Entities & Enums
|
||||
│ │ ├── Entities/
|
||||
│ │ │ ├── Incident.cs
|
||||
│ │ │ ├── IncidentEvent.cs
|
||||
│ │ │ ├── NotificationAttempt.cs
|
||||
│ │ │ ├── NotificationTarget.cs
|
||||
│ │ │ ├── Org.cs
|
||||
│ │ │ ├── OrgMember.cs
|
||||
│ │ │ ├── RefreshToken.cs
|
||||
│ │ │ ├── Service.cs
|
||||
│ │ │ └── User.cs
|
||||
│ │ └── Enums/
|
||||
│ │ ├── IncidentEventType.cs
|
||||
│ │ ├── IncidentStatus.cs
|
||||
│ │ ├── NotificationTargetType.cs
|
||||
│ │ └── OrgRole.cs
|
||||
│ │
|
||||
│ ├── IncidentOps.Infrastructure/ # Data Access & Services
|
||||
│ │ ├── Auth/
|
||||
│ │ │ ├── IPasswordService.cs
|
||||
│ │ │ ├── ITokenService.cs
|
||||
│ │ │ └── JwtSettings.cs
|
||||
│ │ ├── Data/
|
||||
│ │ │ ├── DbConnectionFactory.cs
|
||||
│ │ │ └── Repositories/
|
||||
│ │ │ ├── IIncidentEventRepository.cs
|
||||
│ │ │ ├── IIncidentRepository.cs
|
||||
│ │ │ ├── INotificationTargetRepository.cs
|
||||
│ │ │ ├── IOrgMemberRepository.cs
|
||||
│ │ │ ├── IOrgRepository.cs
|
||||
│ │ │ ├── IRefreshTokenRepository.cs
|
||||
│ │ │ ├── IServiceRepository.cs
|
||||
│ │ │ └── IUserRepository.cs
|
||||
│ │ ├── Jobs/
|
||||
│ │ │ ├── IEscalateIfUnackedJob.cs
|
||||
│ │ │ ├── IIncidentTriggeredJob.cs
|
||||
│ │ │ └── ISendWebhookNotificationJob.cs
|
||||
│ │ ├── Migrations/
|
||||
│ │ │ ├── Migration0001_InitialSchema.cs
|
||||
│ │ │ ├── Migration0002_RefreshTokens.cs
|
||||
│ │ │ └── Migration0003_NotificationTargets.cs
|
||||
│ │ └── ServiceCollectionExtensions.cs
|
||||
│ │
|
||||
│ └── IncidentOps.Contracts/ # DTOs / API Contracts
|
||||
│ ├── Auth/
|
||||
│ │ ├── AuthResponse.cs
|
||||
│ │ ├── LoginRequest.cs
|
||||
│ │ ├── LogoutRequest.cs
|
||||
│ │ ├── MeResponse.cs
|
||||
│ │ ├── RefreshRequest.cs
|
||||
│ │ ├── RegisterRequest.cs
|
||||
│ │ └── SwitchOrgRequest.cs
|
||||
│ ├── Incidents/
|
||||
│ │ ├── CommentRequest.cs
|
||||
│ │ ├── CreateIncidentRequest.cs
|
||||
│ │ ├── IncidentDto.cs
|
||||
│ │ ├── IncidentEventDto.cs
|
||||
│ │ ├── IncidentListResponse.cs
|
||||
│ │ └── TransitionRequest.cs
|
||||
│ ├── Orgs/
|
||||
│ │ ├── CreateNotificationTargetRequest.cs
|
||||
│ │ ├── NotificationTargetDto.cs
|
||||
│ │ ├── OrgDto.cs
|
||||
│ │ └── OrgMemberDto.cs
|
||||
│ └── Services/
|
||||
│ ├── CreateServiceRequest.cs
|
||||
│ └── ServiceDto.cs
|
||||
│
|
||||
├── web/ # Next.js Frontend
|
||||
│ ├── app/
|
||||
│ │ ├── dashboard/page.tsx
|
||||
│ │ ├── login/page.tsx
|
||||
│ │ ├── register/page.tsx
|
||||
│ │ ├── layout.tsx
|
||||
│ │ ├── page.tsx
|
||||
│ │ └── globals.css
|
||||
│ ├── lib/
|
||||
│ │ └── api.ts
|
||||
│ ├── types/
|
||||
│ │ └── index.ts
|
||||
│ ├── Dockerfile
|
||||
│ ├── package.json
|
||||
│ ├── tsconfig.json
|
||||
│ └── next.config.js
|
||||
│
|
||||
├── helm/incidentops/ # Helm Chart
|
||||
│ ├── Chart.yaml
|
||||
│ ├── values.yaml
|
||||
│ └── templates/
|
||||
│ ├── _helpers.tpl
|
||||
│ ├── api-deployment.yaml
|
||||
│ ├── api-service.yaml
|
||||
│ ├── worker-deployment.yaml
|
||||
│ ├── web-deployment.yaml
|
||||
│ ├── web-service.yaml
|
||||
│ ├── ingress.yaml
|
||||
│ └── secrets.yaml
|
||||
│
|
||||
└── docs/
|
||||
└── specs.md
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 1. Architecture (microservices-lite)
|
||||
|
||||
### Deployables
|
||||
|
||||
1. **api-service** (.NET 10, ASP.NET Core)
|
||||
- REST API (implicit org scope from JWT)
|
||||
- JWT access + refresh (both returned in JSON)
|
||||
- RBAC enforced using `org_role` claim + DB ownership checks
|
||||
- Writes incidents + timeline events
|
||||
- Enqueues background jobs to Hangfire
|
||||
|
||||
2. **worker-service** (.NET 10 Worker Service)
|
||||
- Runs **Hangfire Server** using Redis storage
|
||||
- Executes jobs: notification send, escalation checks, rollups
|
||||
- Writes notification attempts and system events
|
||||
|
||||
3. **web** (Next.js 14 + TypeScript)
|
||||
- Auth pages + dashboard + incident detail
|
||||
|
||||
### Dependencies (in kind via Helm)
|
||||
- PostgreSQL (Bitnami)
|
||||
- Redis (Bitnami) - Hangfire storage
|
||||
- ingress-nginx
|
||||
- (later) Prometheus/Grafana/OTel
|
||||
|
||||
---
|
||||
|
||||
## 2. Auth Model (active org in JWT, implicit org scope)
|
||||
|
||||
### JWT Access Token Claims
|
||||
| Claim | Description |
|
||||
|-------|-------------|
|
||||
| `sub` | userId (uuid) |
|
||||
| `org_id` | activeOrgId (uuid) |
|
||||
| `org_role` | `admin\|member\|viewer` |
|
||||
| `iss` | Issuer |
|
||||
| `aud` | Audience |
|
||||
| `iat` | Issued at |
|
||||
| `exp` | Expiration |
|
||||
| `jti` | (optional) Token ID |
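
A minimal sketch of issuing an access token carrying these claims, assuming the standard `System.IdentityModel.Tokens.Jwt` package (the helper name and parameters are illustrative, not code from this repo):

```csharp
using System.IdentityModel.Tokens.Jwt;
using System.Security.Claims;
using System.Text;
using Microsoft.IdentityModel.Tokens;

public static class AccessTokenFactory
{
    // Illustrative helper: issues a token with the claims listed in the table above.
    public static string Create(Guid userId, Guid activeOrgId, string orgRole,
        string issuer, string audience, string signingKey, TimeSpan lifetime)
    {
        var now = DateTime.UtcNow;
        var claims = new[]
        {
            new Claim(JwtRegisteredClaimNames.Sub, userId.ToString()),
            new Claim("org_id", activeOrgId.ToString()),
            new Claim("org_role", orgRole), // admin | member | viewer
            new Claim(JwtRegisteredClaimNames.Jti, Guid.NewGuid().ToString()),
            new Claim(JwtRegisteredClaimNames.Iat,
                new DateTimeOffset(now).ToUnixTimeSeconds().ToString(),
                ClaimValueTypes.Integer64),
        };

        var credentials = new SigningCredentials(
            new SymmetricSecurityKey(Encoding.UTF8.GetBytes(signingKey)),
            SecurityAlgorithms.HmacSha256);

        var token = new JwtSecurityToken(
            issuer: issuer,
            audience: audience,
            claims: claims,
            notBefore: now,
            expires: now.Add(lifetime), // exp
            signingCredentials: credentials);

        return new JwtSecurityTokenHandler().WriteToken(token);
    }
}
```
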
|
||||
|
||||
### Refresh Token Model (JSON, not cookie)
|
||||
- Random opaque token returned in JSON
|
||||
- Stored hashed in the DB (see the sketch after the schema below)
|
||||
- Rotated on refresh and switch-org
|
||||
- Refresh token row stores `active_org_id` (per-session org selection)
|
||||
|
||||
### DB: `refresh_tokens`
|
||||
```sql
|
||||
id uuid PRIMARY KEY
|
||||
user_id uuid NOT NULL
|
||||
token_hash text NOT NULL UNIQUE
|
||||
active_org_id uuid NOT NULL
|
||||
expires_at timestamptz NOT NULL
|
||||
revoked_at timestamptz NULL
|
||||
created_at timestamptz NOT NULL
|
||||
```
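
A minimal sketch of generating the opaque token and the hash that lands in `token_hash` (the factory name is an assumption; only the hash is ever persisted, the raw value goes back to the client in JSON):

```csharp
using System.Security.Cryptography;
using System.Text;

public static class RefreshTokenFactory
{
    // Returns the raw token (sent to the client) and its SHA-256 hash (stored in refresh_tokens.token_hash).
    public static (string RawToken, string TokenHash) Create()
    {
        var raw = Convert.ToBase64String(RandomNumberGenerator.GetBytes(64));
        var hash = Convert.ToHexString(SHA256.HashData(Encoding.UTF8.GetBytes(raw)));
        return (raw, hash);
    }
}
```
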
|
||||
|
||||
### Auth Endpoints
|
||||
| Method | Endpoint | Description |
|
||||
|--------|----------|-------------|
|
||||
| POST | `/v1/auth/register` | Create user + default org |
|
||||
| POST | `/v1/auth/login` | Authenticate, return tokens |
|
||||
| POST | `/v1/auth/refresh` | Rotate refresh token |
|
||||
| POST | `/v1/auth/switch-org` | Switch active org context |
|
||||
| POST | `/v1/auth/logout` | Revoke refresh token |
|
||||
|
||||
#### Registration Flow
|
||||
On `POST /v1/auth/register { email, password, displayName }`:
|
||||
1. Create user record
|
||||
2. Create a default org automatically (e.g., "John's Org")
|
||||
3. Create org_member with role=Admin
|
||||
4. Return access + refresh tokens
|
||||
|
||||
---
|
||||
|
||||
## 3. Authorization Rules (implicit org scope)
|
||||
|
||||
### Request Context
|
||||
Middleware extracts from JWT:
|
||||
- `UserId` from `sub`
|
||||
- `OrgId` from `org_id`
|
||||
- `Role` from `org_role`
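
One possible shape for that claim extraction (the real code lives in `ClaimsPrincipalExtensions.cs`; the names below are assumptions):

```csharp
using System.Security.Claims;

public static class ClaimsPrincipalExtensions
{
    // Depending on JWT claim-type mapping, "sub" may surface as ClaimTypes.NameIdentifier.
    public static Guid GetUserId(this ClaimsPrincipal user) =>
        Guid.Parse(user.FindFirst("sub")?.Value
                   ?? user.FindFirst(ClaimTypes.NameIdentifier)?.Value
                   ?? throw new InvalidOperationException("Missing sub claim"));

    public static Guid GetOrgId(this ClaimsPrincipal user) =>
        Guid.Parse(user.FindFirst("org_id")?.Value
                   ?? throw new InvalidOperationException("Missing org_id claim"));

    public static string GetOrgRole(this ClaimsPrincipal user) =>
        user.FindFirst("org_role")?.Value ?? "viewer";
}
```
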
|
||||
|
||||
### Authorization Approach
|
||||
- **Role check**: enforce viewer/member/admin by claim
|
||||
- **Ownership check**: for any resource ID in the path, load its `org_id` from the DB and require that it equals the token's `org_id`
|
||||
- Prevents cross-tenant IDOR (insecure direct object references) even though the org isn't in the URL
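
A small sketch of the ownership guard applied after loading any org-scoped resource (the helper name and the 404 mapping are assumptions about how the API surfaces it):

```csharp
public static class OrgAccess
{
    // Compare the resource's org_id with the org_id claim from the token.
    // Failing as "not found" avoids leaking that the resource exists in another org.
    public static void EnsureOrgAccess(Guid resourceOrgId, Guid tokenOrgId)
    {
        if (resourceOrgId != tokenOrgId)
            throw new KeyNotFoundException("Resource not found"); // mapped to 404 by the API's error handling
    }
}
```
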
|
||||
|
||||
### Role Permissions
|
||||
| Role | Permissions |
|
||||
|------|-------------|
|
||||
| viewer | Read-only access |
|
||||
| member | Create incidents, transitions, comments |
|
||||
| admin | Manage members, notification targets, on-call schedules |
|
||||
|
||||
---
|
||||
|
||||
## 4. API Surface (implicit org in JWT)
|
||||
|
||||
All routes under `/v1`. Unless noted, routes require auth.
|
||||
|
||||
### Auth
|
||||
| Method | Endpoint | Auth | Description |
|
||||
|--------|----------|------|-------------|
|
||||
| POST | `/auth/register` | No | Register new user |
|
||||
| POST | `/auth/login` | No | Login |
|
||||
| POST | `/auth/refresh` | No | Refresh tokens |
|
||||
| POST | `/auth/switch-org` | No | Switch org context |
|
||||
| POST | `/auth/logout` | No | Logout |
|
||||
| GET | `/me` | Yes | Get current user info |
|
||||
|
||||
### Org (current org context)
|
||||
| Method | Endpoint | Role | Description |
|
||||
|--------|----------|------|-------------|
|
||||
| GET | `/org` | viewer+ | Current org summary + role |
|
||||
| GET | `/org/members` | admin | List org members |
|
||||
| POST | `/org/members` | admin | Invite/add member (stretch) |
|
||||
| GET | `/org/services` | viewer+ | List services |
|
||||
| POST | `/org/services` | member+ | Create service |
|
||||
| GET | `/org/notification-targets` | admin | List notification targets |
|
||||
| POST | `/org/notification-targets` | admin | Create notification target |
|
||||
|
||||
### Incidents
|
||||
| Method | Endpoint | Role | Description |
|
||||
|--------|----------|------|-------------|
|
||||
| GET | `/incidents` | viewer+ | List incidents (cursor pagination) |
|
||||
| POST | `/services/{serviceId}/incidents` | member+ | Create incident |
|
||||
| GET | `/incidents/{incidentId}` | viewer+ | Get incident detail |
|
||||
| GET | `/incidents/{incidentId}/events` | viewer+ | Get incident timeline |
|
||||
| POST | `/incidents/{incidentId}/transition` | member+ | Transition incident state |
|
||||
| POST | `/incidents/{incidentId}/comment` | member+ | Add comment |
|
||||
|
||||
### Health
|
||||
| Method | Endpoint | Description |
|
||||
|--------|----------|-------------|
|
||||
| GET | `/healthz` | Liveness probe |
|
||||
| GET | `/readyz` | Readiness probe (checks Postgres + Redis) |
|
||||
|
||||
---
|
||||
|
||||
## 5. Domain Workflows
|
||||
|
||||
### Incident State Machine
|
||||
```
|
||||
Triggered → Acknowledged → Mitigated → Resolved
|
||||
```
|
||||
|
||||
### Enforcement
|
||||
- Application-level validation of allowed transitions (see the sketch below)
|
||||
- DB optimistic concurrency using `incidents.version`
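
A sketch of the application-level validation, mirroring the state machine above (class and member names are illustrative):

```csharp
public static class IncidentTransitions
{
    private static readonly Dictionary<string, string[]> Allowed = new()
    {
        ["triggered"]    = new[] { "acknowledged" },
        ["acknowledged"] = new[] { "mitigated" },
        ["mitigated"]    = new[] { "resolved" },
        ["resolved"]     = Array.Empty<string>(),
    };

    public static bool CanTransition(string from, string to) =>
        Allowed.TryGetValue(from, out var next) && next.Contains(to);
}
```
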
|
||||
|
||||
### Transition Write Pattern
|
||||
```sql
|
||||
UPDATE incidents
|
||||
SET status = @newStatus, version = version + 1, updated_at = NOW()
|
||||
WHERE id = @id AND org_id = @orgId AND version = @expectedVersion
|
||||
```
|
||||
- If 0 rows updated → `409 Conflict` (stale client) or `404` if not found in org
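
A Dapper sketch of that write (method and parameter names are assumptions):

```csharp
using System.Data;
using Dapper;

public static class IncidentWrites
{
    // Returns false when no row matched: either a stale version (409) or an id
    // that does not exist in the caller's org (404).
    public static async Task<bool> TryTransitionAsync(
        IDbConnection connection, Guid id, Guid orgId, string newStatus, int expectedVersion)
    {
        const string sql = @"
            UPDATE incidents
            SET status = @NewStatus, version = version + 1, updated_at = NOW()
            WHERE id = @Id AND org_id = @OrgId AND version = @ExpectedVersion";

        var rows = await connection.ExecuteAsync(sql,
            new { NewStatus = newStatus, Id = id, OrgId = orgId, ExpectedVersion = expectedVersion });
        return rows == 1;
    }
}
```
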
|
||||
|
||||
### Timeline Model
|
||||
Append-only `incident_events` records for:
|
||||
- Incident created
|
||||
- Transitions (ack, mitigate, resolve)
|
||||
- Comments
|
||||
- Notifications sent/failed
|
||||
- Escalations triggered
|
||||
|
||||
`actor_user_id` is null for system/worker actions.
|
||||
|
||||
---
|
||||
|
||||
## 6. PostgreSQL Schema (core tables)
|
||||
|
||||
### Users
|
||||
```sql
|
||||
CREATE TABLE users (
|
||||
id uuid PRIMARY KEY,
|
||||
email text NOT NULL UNIQUE,
|
||||
password_hash text NOT NULL,
|
||||
display_name text NOT NULL,
|
||||
created_at timestamptz NOT NULL DEFAULT NOW()
|
||||
);
|
||||
```
|
||||
|
||||
### Orgs
|
||||
```sql
|
||||
CREATE TABLE orgs (
|
||||
id uuid PRIMARY KEY,
|
||||
name text NOT NULL,
|
||||
slug text NOT NULL UNIQUE,
|
||||
created_at timestamptz NOT NULL DEFAULT NOW()
|
||||
);
|
||||
```
|
||||
|
||||
### Org Members
|
||||
```sql
|
||||
CREATE TABLE org_members (
|
||||
id uuid PRIMARY KEY,
|
||||
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
|
||||
user_id uuid NOT NULL REFERENCES users(id) ON DELETE CASCADE,
|
||||
role text NOT NULL CHECK (role IN ('admin', 'member', 'viewer')),
|
||||
created_at timestamptz NOT NULL DEFAULT NOW(),
|
||||
UNIQUE(org_id, user_id)
|
||||
);
|
||||
```
|
||||
|
||||
### Services
|
||||
```sql
|
||||
CREATE TABLE services (
|
||||
id uuid PRIMARY KEY,
|
||||
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
|
||||
name text NOT NULL,
|
||||
slug text NOT NULL,
|
||||
description text,
|
||||
created_at timestamptz NOT NULL DEFAULT NOW(),
|
||||
UNIQUE(org_id, slug)
|
||||
);
|
||||
```
|
||||
|
||||
### Incidents
|
||||
```sql
|
||||
CREATE TABLE incidents (
|
||||
id uuid PRIMARY KEY,
|
||||
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
|
||||
service_id uuid NOT NULL REFERENCES services(id) ON DELETE CASCADE,
|
||||
title text NOT NULL,
|
||||
description text,
|
||||
status text NOT NULL DEFAULT 'triggered'
|
||||
CHECK (status IN ('triggered', 'acknowledged', 'mitigated', 'resolved')),
|
||||
severity text NOT NULL DEFAULT 'sev3'
|
||||
CHECK (severity IN ('sev1', 'sev2', 'sev3', 'sev4')),
|
||||
version integer NOT NULL DEFAULT 1,
|
||||
created_at timestamptz NOT NULL DEFAULT NOW(),
|
||||
updated_at timestamptz
|
||||
);
|
||||
CREATE INDEX idx_incidents_org_status ON incidents(org_id, status);
|
||||
```
|
||||
|
||||
### Incident Events
|
||||
```sql
|
||||
CREATE TABLE incident_events (
|
||||
id uuid PRIMARY KEY,
|
||||
incident_id uuid NOT NULL REFERENCES incidents(id) ON DELETE CASCADE,
|
||||
event_type text NOT NULL,
|
||||
actor_user_id uuid REFERENCES users(id),
|
||||
payload jsonb,
|
||||
created_at timestamptz NOT NULL DEFAULT NOW()
|
||||
);
|
||||
CREATE INDEX idx_incident_events_incident ON incident_events(incident_id, created_at);
|
||||
```
|
||||
|
||||
### Notification Targets
|
||||
```sql
|
||||
CREATE TABLE notification_targets (
|
||||
id uuid PRIMARY KEY,
|
||||
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
|
||||
name text NOT NULL,
|
||||
target_type text NOT NULL CHECK (target_type IN ('webhook', 'email', 'slack')),
|
||||
configuration text NOT NULL,
|
||||
is_enabled boolean NOT NULL DEFAULT true,
|
||||
created_at timestamptz NOT NULL DEFAULT NOW(),
|
||||
updated_at timestamptz
|
||||
);
|
||||
```
|
||||
|
||||
### Notification Attempts
|
||||
```sql
|
||||
CREATE TABLE notification_attempts (
|
||||
id uuid PRIMARY KEY,
|
||||
incident_id uuid NOT NULL REFERENCES incidents(id) ON DELETE CASCADE,
|
||||
target_id uuid NOT NULL REFERENCES notification_targets(id) ON DELETE CASCADE,
|
||||
success boolean NOT NULL,
|
||||
error_message text,
|
||||
attempt_number integer NOT NULL DEFAULT 1,
|
||||
created_at timestamptz NOT NULL DEFAULT NOW(),
|
||||
UNIQUE(incident_id, target_id)
|
||||
);
|
||||
```
|
||||
|
||||
### Refresh Tokens
|
||||
```sql
|
||||
CREATE TABLE refresh_tokens (
|
||||
id uuid PRIMARY KEY,
|
||||
user_id uuid NOT NULL REFERENCES users(id) ON DELETE CASCADE,
|
||||
token_hash text NOT NULL UNIQUE,
|
||||
active_org_id uuid NOT NULL REFERENCES orgs(id),
|
||||
expires_at timestamptz NOT NULL,
|
||||
revoked_at timestamptz,
|
||||
created_at timestamptz NOT NULL DEFAULT NOW()
|
||||
);
|
||||
CREATE INDEX idx_refresh_tokens_user ON refresh_tokens(user_id);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. Data Access (Dapper) and Migrations (FluentMigrator)
|
||||
|
||||
### Dapper Conventions
|
||||
- Repositories receive `OrgId` as an explicit parameter and include it in WHERE clauses
|
||||
- Keep SQL close to repositories (or separate `.sql` files)
|
||||
- Use `NpgsqlConnection` + `IDbTransaction` for multi-statement operations
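
A sketch of a repository following these conventions (the class, method, and DTO mapping are illustrative, not the repo's actual code):

```csharp
using Dapper;
using Npgsql;

public sealed class ServiceRepository
{
    private readonly NpgsqlConnection _connection;

    public ServiceRepository(NpgsqlConnection connection) => _connection = connection;

    // OrgId is passed explicitly and always appears in the WHERE clause.
    public Task<IEnumerable<ServiceDto>> GetByOrgAsync(Guid orgId) =>
        _connection.QueryAsync<ServiceDto>(
            "SELECT id, org_id, name, slug, description, created_at FROM services WHERE org_id = @OrgId ORDER BY name",
            new { OrgId = orgId });
}
```
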
|
||||
|
||||
### FluentMigrator
|
||||
| Migration | Tables |
|
||||
|-----------|--------|
|
||||
| 0001 | users, orgs, org_members, services, incidents, incident_events |
|
||||
| 0002 | refresh_tokens |
|
||||
| 0003 | notification_targets, notification_attempts |
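
For illustration, migration 0002 might look roughly like this in FluentMigrator (column shapes follow the schema in section 6; the real class is `Migration0002_RefreshTokens.cs`):

```csharp
using FluentMigrator;

[Migration(2)]
public sealed class Migration0002_RefreshTokens : Migration
{
    public override void Up()
    {
        Create.Table("refresh_tokens")
            .WithColumn("id").AsGuid().PrimaryKey()
            .WithColumn("user_id").AsGuid().NotNullable().ForeignKey("users", "id")
            .WithColumn("token_hash").AsString().NotNullable().Unique()
            .WithColumn("active_org_id").AsGuid().NotNullable().ForeignKey("orgs", "id")
            .WithColumn("expires_at").AsDateTimeOffset().NotNullable()
            .WithColumn("revoked_at").AsDateTimeOffset().Nullable()
            .WithColumn("created_at").AsDateTimeOffset().NotNullable()
                .WithDefault(SystemMethods.CurrentUTCDateTime);
    }

    public override void Down() => Delete.Table("refresh_tokens");
}
```
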
|
||||
|
||||
---
|
||||
|
||||
## 8. Hangfire Job Design (Redis storage)
|
||||
|
||||
### Setup
|
||||
- API configures Hangfire Client (enqueue)
|
||||
- Worker hosts Hangfire Server (process)
|
||||
|
||||
### Queues
|
||||
| Queue | Purpose |
|
||||
|-------|---------|
|
||||
| critical | Escalations |
|
||||
| default | Notifications |
|
||||
| low | Rollups |
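
A configuration sketch of the client/server split and the queues above, assuming the `Hangfire.Redis.StackExchange` storage package (the job interfaces and method names are assumptions; `builder`, `backgroundJobClient`, and `redisConnectionString` come from the surrounding host setup):

```csharp
// API (Program.cs): registers Hangfire as an enqueue-only client.
builder.Services.AddHangfire(config => config.UseRedisStorage(redisConnectionString));

// Worker (Program.cs): same storage plus the server that processes the queues.
builder.Services.AddHangfire(config => config.UseRedisStorage(redisConnectionString));
builder.Services.AddHangfireServer(options =>
    options.Queues = new[] { "critical", "default", "low" });

// Enqueueing from the API when an incident is created; jobs are routed to a
// queue via the [Queue("...")] attribute on the job method.
backgroundJobClient.Enqueue<IIncidentTriggeredJob>(job => job.ExecuteAsync(incidentId));

// Delayed escalation check (stretch): only acts if the incident is still unacked when it fires.
backgroundJobClient.Schedule<IEscalateIfUnackedJob>(
    job => job.ExecuteAsync(incidentId, 1), TimeSpan.FromMinutes(5));
```
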
|
||||
|
||||
### Jobs
|
||||
|
||||
#### 1. IncidentTriggeredJob(incidentId)
|
||||
- Reads the incident (org scope is taken from the `org_id` stored on the incident row)
|
||||
- Loads enabled notification targets for the org
|
||||
- Inserts `notification_attempts` rows (idempotent)
|
||||
- Enqueues per-target send jobs
|
||||
|
||||
#### 2. SendWebhookNotificationJob(incidentId, targetId)
|
||||
- Attempts HTTP POST with incident summary payload
|
||||
- Updates attempt status + writes `incident_event` of type `system.notification_sent` or `system.notification_failed`
|
||||
- Throws on transient failures to trigger retry; safe due to DB idempotency
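
A sketch of that job (the class name, payload, and webhook URL handling are illustrative):

```csharp
using System.Net.Http.Json;
using Hangfire;

public sealed class SendWebhookNotificationJob
{
    private readonly HttpClient _httpClient;

    public SendWebhookNotificationJob(HttpClient httpClient) => _httpClient = httpClient;

    [Queue("default")]
    [AutomaticRetry(Attempts = 5)]
    public async Task ExecuteAsync(Guid incidentId, Guid targetId)
    {
        // Load the incident and target, and read the webhook URL from
        // notification_targets.configuration (data access omitted here).
        var webhookUrl = "https://example.invalid/hooks/incidentops"; // placeholder
        var payload = new { incidentId, status = "triggered" };

        var response = await _httpClient.PostAsJsonAsync(webhookUrl, payload);
        response.EnsureSuccessStatusCode(); // throwing lets Hangfire schedule a retry

        // Then mark the notification_attempts row and append a
        // system.notification_sent / system.notification_failed incident_event.
    }
}
```
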
|
||||
|
||||
#### 3. EscalateIfUnackedJob(incidentId, step) (stretch)
|
||||
- Runs delayed
|
||||
- Checks status; if still Triggered, sends secondary notifications
|
||||
|
||||
### Operational Note
|
||||
- Expose the Hangfire Dashboard **only in local development** and protect it (basic auth or require a dev token)
|
||||
|
||||
---
|
||||
|
||||
## 9. Kubernetes (kind) + Helm + Skaffold (local-only)
|
||||
|
||||
### Helm Umbrella Chart Deploys
|
||||
- bitnami/postgresql
|
||||
- bitnami/redis
|
||||
- api Deployment/Service
|
||||
- worker Deployment
|
||||
- web Deployment/Service
|
||||
- Ingress with host `incidentops.local`:
|
||||
- `/api`, `/v1`, `/healthz`, `/readyz` → api-service
|
||||
- `/` → web
|
||||
|
||||
### Configuration via Environment
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `ConnectionStrings__Postgres` | PostgreSQL connection string |
|
||||
| `Redis__ConnectionString` | Redis connection string |
|
||||
| `Jwt__Issuer` | JWT issuer |
|
||||
| `Jwt__Audience` | JWT audience |
|
||||
| `Jwt__SigningKey` | JWT signing key (secret) |
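
For reference, the double-underscore variables above map onto .NET configuration sections roughly like this (a sketch; `JwtSettings` is the Infrastructure settings class, whose exact shape is not shown here):

```csharp
var builder = WebApplication.CreateBuilder(args);

// ConnectionStrings__Postgres  ->  ConnectionStrings:Postgres
var postgres = builder.Configuration.GetConnectionString("Postgres")
               ?? throw new InvalidOperationException("Missing Postgres connection string");

// Redis__ConnectionString      ->  Redis:ConnectionString
var redis = builder.Configuration["Redis:ConnectionString"];

// Jwt__Issuer / Jwt__Audience / Jwt__SigningKey  ->  the Jwt section
builder.Services.Configure<JwtSettings>(builder.Configuration.GetSection("Jwt"));
```
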
|
||||
|
||||
### Readiness
|
||||
- API checks Postgres + Redis
|
||||
- Worker checks Postgres + Redis at startup
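
A readiness sketch, assuming the community `AspNetCore.HealthChecks.NpgSql` and `AspNetCore.HealthChecks.Redis` packages (endpoint paths follow the Health table in section 4; connection string variables come from configuration):

```csharp
builder.Services.AddHealthChecks()
    .AddNpgSql(postgresConnectionString, name: "postgres")
    .AddRedis(redisConnectionString, name: "redis");

var app = builder.Build();

// /v1/healthz stays a cheap liveness probe; /v1/readyz runs the registered checks.
app.MapHealthChecks("/v1/readyz");
app.MapGet("/v1/healthz", () => Results.Ok(new { status = "ok" }));
```
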
|
||||
|
||||
### Skaffold
|
||||
- Builds three images (api, worker, web)
|
||||
- `helm upgrade --install` on changes
|
||||
|
||||
---
|
||||
|
||||
## 10. Frontend UX Requirements (implicit org)
|
||||
|
||||
- On login, display the `activeOrg` returned in the response
|
||||
- Org switcher calls `/v1/auth/switch-org` and replaces tokens
|
||||
- All subsequent API calls use only `Authorization` header; no orgId params
|
||||
- Store tokens in localStorage or a secure cookie
|
||||
- Handle 401 by attempting token refresh
|
||||
|
||||
---
|
||||
|
||||
## 11. Key Highlights (README/Resume)
|
||||
|
||||
- "Multi-tenant org context embedded in JWT; org switching re-issues tokens."
|
||||
- "DB ownership checks prevent cross-tenant resource access."
|
||||
- "Optimistic concurrency for incident transitions."
|
||||
- "Background jobs with retries + idempotent notification attempts."
|
||||
- "Deployed locally to Kubernetes via Helm + Skaffold."
|
||||
|
||||
---
|
||||
|
||||
## 12. Technology Stack
|
||||
|
||||
| Layer | Technology |
|
||||
|-------|------------|
|
||||
| Runtime | .NET 10 |
|
||||
| API Framework | ASP.NET Core |
|
||||
| Worker | .NET Worker Service |
|
||||
| Background Jobs | Hangfire with Redis |
|
||||
| Database | PostgreSQL |
|
||||
| ORM | Dapper |
|
||||
| Migrations | FluentMigrator |
|
||||
| Auth | JWT Bearer + BCrypt |
|
||||
| Frontend | Next.js 14 + TypeScript |
|
||||
| Container | Docker |
|
||||
| Orchestration | Kubernetes (kind) |
|
||||
| Deployment | Helm + Skaffold |
|
||||
|
||||
---
|
||||
|
||||
## 13. Local Development
|
||||
|
||||
### Prerequisites
|
||||
- .NET 10 SDK
|
||||
- Node.js 20+
|
||||
- Docker
|
||||
- kind (Kubernetes in Docker)
|
||||
- Helm
|
||||
- Skaffold
|
||||
|
||||
### Quick Start
|
||||
|
||||
```bash
|
||||
# With Docker Compose (simplest)
|
||||
docker-compose up -d
|
||||
|
||||
# Run API
|
||||
cd src/IncidentOps.Api
|
||||
dotnet run
|
||||
|
||||
# Run Worker (separate terminal)
|
||||
cd src/IncidentOps.Worker
|
||||
dotnet run
|
||||
|
||||
# Run Web (separate terminal)
|
||||
cd web
|
||||
npm install
|
||||
npm run dev
|
||||
```
|
||||
|
||||
### With Kubernetes (kind)
|
||||
|
||||
```bash
|
||||
# Create cluster
|
||||
kind create cluster --name incidentops
|
||||
|
||||
# Deploy with Skaffold
|
||||
skaffold dev
|
||||
|
||||
# Access at http://incidentops.local (add to /etc/hosts)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 14. API Request/Response Examples
|
||||
|
||||
### Register
|
||||
```http
|
||||
POST /v1/auth/register
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"email": "user@example.com",
|
||||
"password": "SecurePass123!",
|
||||
"displayName": "John Doe"
|
||||
}
|
||||
```
|
||||
|
||||
Response:
|
||||
```json
|
||||
{
|
||||
"accessToken": "eyJhbG...",
|
||||
"refreshToken": "a1b2c3d4...",
|
||||
"activeOrg": {
|
||||
"id": "uuid",
|
||||
"name": "John Doe's Org",
|
||||
"slug": "org-abc123",
|
||||
"role": "admin"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Create Incident
|
||||
```http
|
||||
POST /v1/services/{serviceId}/incidents
|
||||
Authorization: Bearer {accessToken}
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"title": "Database connection timeout",
|
||||
"description": "Users experiencing slow queries",
|
||||
"severity": "sev2"
|
||||
}
|
||||
```
|
||||
|
||||
### Transition Incident
|
||||
```http
|
||||
POST /v1/incidents/{incidentId}/transition
|
||||
Authorization: Bearer {accessToken}
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"action": "ack",
|
||||
"expectedVersion": 1
|
||||
}
|
||||
```
|
||||
@@ -1,15 +0,0 @@
|
||||
apiVersion: v2
|
||||
name: incidentops
|
||||
description: A Helm chart for IncidentOps - Incident Management Platform
|
||||
type: application
|
||||
version: 0.1.0
|
||||
appVersion: "0.1.0"
|
||||
|
||||
keywords:
|
||||
- incidentops
|
||||
- incident-management
|
||||
- on-call
|
||||
- alerting
|
||||
|
||||
maintainers:
|
||||
- name: IncidentOps Team
|
||||
@@ -1,33 +0,0 @@
|
||||
IncidentOps has been deployed!
|
||||
|
||||
{{- if .Values.ingress.enabled }}
|
||||
|
||||
Access the application at:
|
||||
http{{ if $.Values.ingress.tls }}s{{ end }}://{{ .Values.ingress.host }}
|
||||
|
||||
{{- else }}
|
||||
|
||||
To access the application, run:
|
||||
|
||||
API:
|
||||
kubectl port-forward svc/{{ include "incidentops.fullname" . }}-api {{ .Values.api.service.port }}:{{ .Values.api.service.port }} -n {{ .Release.Namespace }}
|
||||
Then open: http://localhost:{{ .Values.api.service.port }}
|
||||
|
||||
Web:
|
||||
kubectl port-forward svc/{{ include "incidentops.fullname" . }}-web {{ .Values.web.service.port }}:{{ .Values.web.service.port }} -n {{ .Release.Namespace }}
|
||||
Then open: http://localhost:{{ .Values.web.service.port }}
|
||||
|
||||
{{- end }}
|
||||
|
||||
To check the status of your deployment:
|
||||
kubectl get pods -n {{ .Release.Namespace }} -l "app.kubernetes.io/instance={{ .Release.Name }}"
|
||||
|
||||
{{- if .Values.migration.enabled }}
|
||||
|
||||
Database migrations will run automatically as a Helm hook.
|
||||
Check migration status:
|
||||
kubectl get jobs -n {{ .Release.Namespace }} -l "app.kubernetes.io/component=migration"
|
||||
|
||||
{{- end }}
|
||||
|
||||
For more information, visit the documentation.
|
||||
@@ -1,218 +0,0 @@
|
||||
{{/*
|
||||
Expand the name of the chart.
|
||||
*/}}
|
||||
{{- define "incidentops.name" -}}
|
||||
{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Create a default fully qualified app name.
|
||||
*/}}
|
||||
{{- define "incidentops.fullname" -}}
|
||||
{{- if .Values.fullnameOverride }}
|
||||
{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }}
|
||||
{{- else }}
|
||||
{{- $name := default .Chart.Name .Values.nameOverride }}
|
||||
{{- if contains $name .Release.Name }}
|
||||
{{- .Release.Name | trunc 63 | trimSuffix "-" }}
|
||||
{{- else }}
|
||||
{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Create chart name and version as used by the chart label.
|
||||
*/}}
|
||||
{{- define "incidentops.chart" -}}
|
||||
{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Common labels
|
||||
*/}}
|
||||
{{- define "incidentops.labels" -}}
|
||||
helm.sh/chart: {{ include "incidentops.chart" . }}
|
||||
{{ include "incidentops.selectorLabels" . }}
|
||||
{{- if .Chart.AppVersion }}
|
||||
app.kubernetes.io/version: {{ .Chart.AppVersion | quote }}
|
||||
{{- end }}
|
||||
app.kubernetes.io/managed-by: {{ .Release.Service }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Selector labels
|
||||
*/}}
|
||||
{{- define "incidentops.selectorLabels" -}}
|
||||
app.kubernetes.io/name: {{ include "incidentops.name" . }}
|
||||
app.kubernetes.io/instance: {{ .Release.Name }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
API labels
|
||||
*/}}
|
||||
{{- define "incidentops.api.labels" -}}
|
||||
{{ include "incidentops.labels" . }}
|
||||
app.kubernetes.io/component: api
|
||||
{{- end }}
|
||||
|
||||
{{- define "incidentops.api.selectorLabels" -}}
|
||||
{{ include "incidentops.selectorLabels" . }}
|
||||
app.kubernetes.io/component: api
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Worker labels
|
||||
*/}}
|
||||
{{- define "incidentops.worker.labels" -}}
|
||||
{{ include "incidentops.labels" . }}
|
||||
app.kubernetes.io/component: worker
|
||||
{{- end }}
|
||||
|
||||
{{- define "incidentops.worker.selectorLabels" -}}
|
||||
{{ include "incidentops.selectorLabels" . }}
|
||||
app.kubernetes.io/component: worker
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Web labels
|
||||
*/}}
|
||||
{{- define "incidentops.web.labels" -}}
|
||||
{{ include "incidentops.labels" . }}
|
||||
app.kubernetes.io/component: web
|
||||
{{- end }}
|
||||
|
||||
{{- define "incidentops.web.selectorLabels" -}}
|
||||
{{ include "incidentops.selectorLabels" . }}
|
||||
app.kubernetes.io/component: web
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Create the name of the service account to use
|
||||
*/}}
|
||||
{{- define "incidentops.serviceAccountName" -}}
|
||||
{{- if .Values.serviceAccount.create }}
|
||||
{{- default (include "incidentops.fullname" .) .Values.serviceAccount.name }}
|
||||
{{- else }}
|
||||
{{- default "default" .Values.serviceAccount.name }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
PostgreSQL host
|
||||
*/}}
|
||||
{{- define "incidentops.postgresql.host" -}}
|
||||
{{- if .Values.postgresql.enabled }}
|
||||
{{- printf "%s-postgresql" (include "incidentops.fullname" .) }}
|
||||
{{- else }}
|
||||
{{- .Values.externalDatabase.host }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
PostgreSQL port
|
||||
*/}}
|
||||
{{- define "incidentops.postgresql.port" -}}
|
||||
{{- if .Values.postgresql.enabled }}
|
||||
{{- printf "5432" }}
|
||||
{{- else }}
|
||||
{{- .Values.externalDatabase.port | default "5432" }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Database URL
|
||||
*/}}
|
||||
{{- define "incidentops.databaseUrl" -}}
|
||||
{{- $host := include "incidentops.postgresql.host" . }}
|
||||
{{- $port := include "incidentops.postgresql.port" . }}
|
||||
{{- if .Values.postgresql.enabled }}
|
||||
{{- printf "postgresql://%s:%s@%s:%s/%s" .Values.postgresql.auth.username .Values.postgresql.auth.password $host $port .Values.postgresql.auth.database }}
|
||||
{{- else }}
|
||||
{{- printf "postgresql://%s:%s@%s:%s/%s" .Values.externalDatabase.user .Values.externalDatabase.password $host $port .Values.externalDatabase.database }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Redis host
|
||||
*/}}
|
||||
{{- define "incidentops.redis.host" -}}
|
||||
{{- if .Values.redis.enabled }}
|
||||
{{- printf "%s-redis" (include "incidentops.fullname" .) }}
|
||||
{{- else }}
|
||||
{{- .Values.externalRedis.host }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Redis URL
|
||||
*/}}
|
||||
{{- define "incidentops.redisUrl" -}}
|
||||
{{- $host := include "incidentops.redis.host" . }}
|
||||
{{- if .Values.redis.enabled }}
|
||||
{{- printf "redis://%s:6379/0" $host }}
|
||||
{{- else }}
|
||||
{{- printf "redis://%s:%s/%s" $host (.Values.externalRedis.port | default "6379") (.Values.externalRedis.database | default "0") }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Celery broker URL
|
||||
*/}}
|
||||
{{- define "incidentops.celeryBrokerUrl" -}}
|
||||
{{ include "incidentops.redisUrl" . }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Celery result backend URL
|
||||
*/}}
|
||||
{{- define "incidentops.celeryResultBackend" -}}
|
||||
{{- $host := include "incidentops.redis.host" . }}
|
||||
{{- if .Values.redis.enabled }}
|
||||
{{- printf "redis://%s:6379/1" $host }}
|
||||
{{- else }}
|
||||
{{- printf "redis://%s:%s/%s" $host (.Values.externalRedis.port | default "6379") (add (.Values.externalRedis.database | default 0) 1) }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
API image
|
||||
*/}}
|
||||
{{- define "incidentops.api.image" -}}
|
||||
{{- $registry := .Values.global.imageRegistry | default "" }}
|
||||
{{- $repository := .Values.api.image.repository }}
|
||||
{{- $tag := .Values.api.image.tag | default .Chart.AppVersion }}
|
||||
{{- if $registry }}
|
||||
{{- printf "%s/%s:%s" $registry $repository $tag }}
|
||||
{{- else }}
|
||||
{{- printf "%s:%s" $repository $tag }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Worker image
|
||||
*/}}
|
||||
{{- define "incidentops.worker.image" -}}
|
||||
{{- $registry := .Values.global.imageRegistry | default "" }}
|
||||
{{- $repository := .Values.worker.image.repository }}
|
||||
{{- $tag := .Values.worker.image.tag | default .Chart.AppVersion }}
|
||||
{{- if $registry }}
|
||||
{{- printf "%s/%s:%s" $registry $repository $tag }}
|
||||
{{- else }}
|
||||
{{- printf "%s:%s" $repository $tag }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
||||
{{/*
|
||||
Web image
|
||||
*/}}
|
||||
{{- define "incidentops.web.image" -}}
|
||||
{{- $registry := .Values.global.imageRegistry | default "" }}
|
||||
{{- $repository := .Values.web.image.repository }}
|
||||
{{- $tag := .Values.web.image.tag | default .Chart.AppVersion }}
|
||||
{{- if $registry }}
|
||||
{{- printf "%s/%s:%s" $registry $repository $tag }}
|
||||
{{- else }}
|
||||
{{- printf "%s:%s" $repository $tag }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@@ -1,104 +0,0 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-api
|
||||
labels:
|
||||
{{- include "incidentops.api.labels" . | nindent 4 }}
|
||||
spec:
|
||||
{{- if not .Values.api.autoscaling.enabled }}
|
||||
replicas: {{ .Values.api.replicaCount }}
|
||||
{{- end }}
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.api.selectorLabels" . | nindent 6 }}
|
||||
template:
|
||||
metadata:
|
||||
annotations:
|
||||
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
|
||||
checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }}
|
||||
{{- with .Values.api.podAnnotations }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
labels:
|
||||
{{- include "incidentops.api.selectorLabels" . | nindent 8 }}
|
||||
spec:
|
||||
{{- with .Values.global.imagePullSecrets }}
|
||||
imagePullSecrets:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
|
||||
securityContext:
|
||||
{{- toYaml .Values.podSecurityContext | nindent 8 }}
|
||||
initContainers:
|
||||
- name: wait-for-postgres
|
||||
image: busybox:1.36
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- |
|
||||
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
|
||||
echo "Waiting for PostgreSQL..."
|
||||
sleep 2
|
||||
done
|
||||
echo "PostgreSQL is ready"
|
||||
- name: wait-for-redis
|
||||
image: busybox:1.36
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- |
|
||||
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
|
||||
echo "Waiting for Redis..."
|
||||
sleep 2
|
||||
done
|
||||
echo "Redis is ready"
|
||||
containers:
|
||||
- name: api
|
||||
securityContext:
|
||||
{{- toYaml .Values.securityContext | nindent 12 }}
|
||||
image: {{ include "incidentops.api.image" . }}
|
||||
imagePullPolicy: {{ .Values.api.image.pullPolicy }}
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 8000
|
||||
protocol: TCP
|
||||
{{- if .Values.metrics.enabled }}
|
||||
- name: metrics
|
||||
containerPort: {{ .Values.metrics.port }}
|
||||
protocol: TCP
|
||||
{{- end }}
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: {{ include "incidentops.fullname" . }}-config
|
||||
- secretRef:
|
||||
name: {{ include "incidentops.fullname" . }}-secret
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /v1/healthz
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /v1/readyz
|
||||
port: http
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 3
|
||||
resources:
|
||||
{{- toYaml .Values.api.resources | nindent 12 }}
|
||||
{{- with .Values.api.nodeSelector }}
|
||||
nodeSelector:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.api.affinity }}
|
||||
affinity:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.api.tolerations }}
|
||||
tolerations:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
@@ -1,22 +0,0 @@
|
||||
{{- if .Values.api.autoscaling.enabled }}
|
||||
apiVersion: autoscaling/v2
|
||||
kind: HorizontalPodAutoscaler
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-api
|
||||
labels:
|
||||
{{- include "incidentops.api.labels" . | nindent 4 }}
|
||||
spec:
|
||||
scaleTargetRef:
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
name: {{ include "incidentops.fullname" . }}-api
|
||||
minReplicas: {{ .Values.api.autoscaling.minReplicas }}
|
||||
maxReplicas: {{ .Values.api.autoscaling.maxReplicas }}
|
||||
metrics:
|
||||
- type: Resource
|
||||
resource:
|
||||
name: cpu
|
||||
target:
|
||||
type: Utilization
|
||||
averageUtilization: {{ .Values.api.autoscaling.targetCPUUtilizationPercentage }}
|
||||
{{- end }}
|
||||
@@ -1,21 +0,0 @@
apiVersion: v1
kind: Service
metadata:
  name: {{ include "incidentops.fullname" . }}-api
  labels:
    {{- include "incidentops.api.labels" . | nindent 4 }}
spec:
  type: {{ .Values.api.service.type }}
  ports:
    - port: {{ .Values.api.service.port }}
      targetPort: http
      protocol: TCP
      name: http
    {{- if .Values.metrics.enabled }}
    - port: {{ .Values.metrics.port }}
      targetPort: metrics
      protocol: TCP
      name: metrics
    {{- end }}
  selector:
    {{- include "incidentops.api.selectorLabels" . | nindent 4 }}
@@ -1,23 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
  name: {{ include "incidentops.fullname" . }}-config
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
data:
  JWT_ALGORITHM: {{ .Values.config.jwtAlgorithm | quote }}
  ACCESS_TOKEN_EXPIRE_MINUTES: {{ .Values.config.accessTokenExpireMinutes | quote }}
  REFRESH_TOKEN_EXPIRE_DAYS: {{ .Values.config.refreshTokenExpireDays | quote }}
  # OpenTelemetry configuration
  OTEL_ENABLED: {{ .Values.observability.enabled | quote }}
  OTEL_SERVICE_NAME: "incidentops-api"
  OTEL_ENVIRONMENT: {{ .Values.config.environment | default "production" | quote }}
  {{- if .Values.observability.enabled }}
  OTEL_EXPORTER_OTLP_ENDPOINT: "http://{{ include "incidentops.fullname" . }}-otel-collector:4317"
  {{- end }}
  OTEL_EXPORTER_OTLP_INSECURE: "true"
  OTEL_LOG_LEVEL: {{ .Values.config.logLevel | default "INFO" | quote }}
  # Metrics configuration
  {{- if .Values.metrics.enabled }}
  PROMETHEUS_PORT: {{ .Values.metrics.port | quote }}
  {{- end }}
@@ -1,387 +0,0 @@
|
||||
{{- if .Values.observability.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-datasources
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
data:
|
||||
datasources.yaml: |
|
||||
apiVersion: 1
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
uid: prometheus
|
||||
url: http://{{ include "incidentops.fullname" . }}-prometheus:9090
|
||||
access: proxy
|
||||
isDefault: false
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
exemplarTraceIdDestinations:
|
||||
- name: trace_id
|
||||
datasourceUid: tempo
|
||||
|
||||
- name: Tempo
|
||||
type: tempo
|
||||
uid: tempo
|
||||
url: http://{{ include "incidentops.fullname" . }}-tempo:3200
|
||||
access: proxy
|
||||
isDefault: false
|
||||
jsonData:
|
||||
tracesToLogsV2:
|
||||
datasourceUid: loki
|
||||
spanStartTimeShift: '-1h'
|
||||
spanEndTimeShift: '1h'
|
||||
filterByTraceID: true
|
||||
filterBySpanID: true
|
||||
tracesToMetrics:
|
||||
datasourceUid: prometheus
|
||||
spanStartTimeShift: '-1h'
|
||||
spanEndTimeShift: '1h'
|
||||
serviceMap:
|
||||
datasourceUid: prometheus
|
||||
nodeGraph:
|
||||
enabled: true
|
||||
lokiSearch:
|
||||
datasourceUid: loki
|
||||
|
||||
- name: Loki
|
||||
type: loki
|
||||
uid: loki
|
||||
url: http://{{ include "incidentops.fullname" . }}-loki:3100
|
||||
access: proxy
|
||||
isDefault: true
|
||||
jsonData:
|
||||
derivedFields:
|
||||
- datasourceUid: tempo
|
||||
matcherRegex: '"trace_id":"([a-f0-9]+)"'
|
||||
name: TraceID
|
||||
url: '$${__value.raw}'
|
||||
urlDisplayLabel: 'View Trace'
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
data:
|
||||
dashboards.yaml: |
|
||||
apiVersion: 1
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
folder: 'IncidentOps'
|
||||
folderUid: 'incidentops'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
data:
|
||||
api-overview.json: |
|
||||
{
|
||||
"title": "IncidentOps API Overview",
|
||||
"uid": "incidentops-api",
|
||||
"tags": ["incidentops", "api"],
|
||||
"timezone": "browser",
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Request Rate",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "Requests/sec",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Request Duration (p50, p95, p99)",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "s"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Error Rate",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
|
||||
"legendFormat": "Error %",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Requests by Status Code",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "{{ "{{" }}http_status_code{{ "}}" }}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Requests by Endpoint",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "{{ "{{" }}http_route{{ "}}" }}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "Recent Logs",
|
||||
"type": "logs",
|
||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"expr": "{service_name=\"incidentops-api\"} | json",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Recent Traces",
|
||||
"type": "traces",
|
||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 26},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "tempo", "uid": "tempo"},
|
||||
"queryType": "traceqlSearch",
|
||||
"filters": [
|
||||
{
|
||||
"id": "service-name",
|
||||
"operator": "=",
|
||||
"scope": "resource",
|
||||
"tag": "service.name",
|
||||
"value": ["incidentops-api"]
|
||||
}
|
||||
],
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"schemaVersion": 38,
|
||||
"version": 2
|
||||
}
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
annotations:
|
||||
checksum/datasources: {{ .Values.observability.grafana.image.tag | sha256sum }}
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 472
|
||||
runAsUser: 472
|
||||
containers:
|
||||
- name: grafana
|
||||
image: "{{ .Values.observability.grafana.image.repository }}:{{ .Values.observability.grafana.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.grafana.image.pullPolicy }}
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 3000
|
||||
protocol: TCP
|
||||
env:
|
||||
- name: GF_SECURITY_ADMIN_USER
|
||||
value: {{ .Values.observability.grafana.adminUser | quote }}
|
||||
- name: GF_SECURITY_ADMIN_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
key: admin-password
|
||||
- name: GF_USERS_ALLOW_SIGN_UP
|
||||
value: "false"
|
||||
- name: GF_EXPLORE_ENABLED
|
||||
value: "true"
|
||||
- name: GF_FEATURE_TOGGLES_ENABLE
|
||||
value: "traceqlEditor tempoSearch tempoBackendSearch tempoApmTable"
|
||||
volumeMounts:
|
||||
- name: datasources
|
||||
mountPath: /etc/grafana/provisioning/datasources
|
||||
- name: dashboards-provider
|
||||
mountPath: /etc/grafana/provisioning/dashboards
|
||||
- name: dashboards
|
||||
mountPath: /var/lib/grafana/dashboards
|
||||
- name: data
|
||||
mountPath: /var/lib/grafana
|
||||
resources:
|
||||
{{- toYaml .Values.observability.grafana.resources | nindent 12 }}
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /api/health
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /api/health
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
volumes:
|
||||
- name: datasources
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-datasources
|
||||
- name: dashboards-provider
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
|
||||
- name: dashboards
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
|
||||
- name: data
|
||||
{{- if .Values.observability.grafana.persistence.enabled }}
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ include "incidentops.fullname" . }}-grafana
|
||||
{{- else }}
|
||||
emptyDir: {}
|
||||
{{- end }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Secret
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
type: Opaque
|
||||
data:
|
||||
admin-password: {{ .Values.observability.grafana.adminPassword | b64enc | quote }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
spec:
|
||||
type: {{ .Values.observability.grafana.service.type }}
|
||||
ports:
|
||||
- name: http
|
||||
port: 80
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
{{- if .Values.observability.grafana.persistence.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-grafana
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: grafana
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.observability.grafana.persistence.size }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@@ -1,38 +0,0 @@
{{- if and .Values.observability.enabled .Values.observability.grafana.ingress.enabled -}}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: {{ include "incidentops.fullname" . }}-grafana
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: grafana
  {{- with .Values.observability.grafana.ingress.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  {{- if .Values.ingress.className }}
  ingressClassName: {{ .Values.ingress.className }}
  {{- end }}
  {{- if .Values.observability.grafana.ingress.tls }}
  tls:
    {{- range .Values.observability.grafana.ingress.tls }}
    - hosts:
        {{- range .hosts }}
        - {{ . | quote }}
        {{- end }}
      secretName: {{ .secretName }}
    {{- end }}
  {{- end }}
  rules:
    - host: {{ .Values.observability.grafana.ingress.host | quote }}
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: {{ include "incidentops.fullname" . }}-grafana
                port:
                  number: 80
{{- end }}
@@ -1,51 +0,0 @@
{{- if .Values.ingress.enabled -}}
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: {{ include "incidentops.fullname" . }}
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
  {{- with .Values.ingress.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  {{- if .Values.ingress.className }}
  ingressClassName: {{ .Values.ingress.className }}
  {{- end }}
  {{- if .Values.ingress.tls }}
  tls:
    {{- range .Values.ingress.tls }}
    - hosts:
        {{- range .hosts }}
        - {{ . | quote }}
        {{- end }}
      secretName: {{ .secretName }}
    {{- end }}
  {{- end }}
  rules:
    - host: {{ .Values.ingress.host | quote }}
      http:
        paths:
          - path: /api
            pathType: Prefix
            backend:
              service:
                name: {{ include "incidentops.fullname" . }}-api
                port:
                  number: {{ .Values.api.service.port }}
          - path: /v1
            pathType: Prefix
            backend:
              service:
                name: {{ include "incidentops.fullname" . }}-api
                port:
                  number: {{ .Values.api.service.port }}
          - path: /
            pathType: Prefix
            backend:
              service:
                name: {{ include "incidentops.fullname" . }}-web
                port:
                  number: {{ .Values.web.service.port }}
{{- end }}
@@ -1,155 +0,0 @@
|
||||
{{- if .Values.observability.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-loki-config
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: loki
|
||||
data:
|
||||
loki.yaml: |
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
path_prefix: /loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /loki/chunks
|
||||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_range:
|
||||
results_cache:
|
||||
cache:
|
||||
embedded_cache:
|
||||
enabled: true
|
||||
max_size_mb: 100
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: "2020-10-24"
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://localhost:9093
|
||||
|
||||
limits_config:
|
||||
retention_period: {{ .Values.observability.loki.retention }}
|
||||
allow_structured_metadata: true
|
||||
volume_enabled: true
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-loki
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: loki
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: loki
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: loki
|
||||
annotations:
|
||||
checksum/config: {{ .Values.observability.loki.image.tag | sha256sum }}
|
||||
spec:
|
||||
containers:
|
||||
- name: loki
|
||||
image: "{{ .Values.observability.loki.image.repository }}:{{ .Values.observability.loki.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.loki.image.pullPolicy }}
|
||||
args:
|
||||
- -config.file=/etc/loki/loki.yaml
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 3100
|
||||
protocol: TCP
|
||||
- name: grpc
|
||||
containerPort: 9096
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/loki
|
||||
- name: data
|
||||
mountPath: /loki
|
||||
resources:
|
||||
{{- toYaml .Values.observability.loki.resources | nindent 12 }}
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-loki-config
|
||||
- name: data
|
||||
{{- if .Values.observability.loki.persistence.enabled }}
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ include "incidentops.fullname" . }}-loki
|
||||
{{- else }}
|
||||
emptyDir: {}
|
||||
{{- end }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-loki
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: loki
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: http
|
||||
port: 3100
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
- name: grpc
|
||||
port: 9096
|
||||
targetPort: grpc
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: loki
|
||||
{{- if .Values.observability.loki.persistence.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-loki
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: loki
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.observability.loki.persistence.size }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@@ -1,51 +0,0 @@
{{- if .Values.migration.enabled }}
apiVersion: batch/v1
kind: Job
metadata:
  name: {{ include "incidentops.fullname" . }}-migrate
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: migration
  annotations:
    "helm.sh/hook": post-install,post-upgrade
    "helm.sh/hook-weight": "-5"
    "helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
spec:
  backoffLimit: {{ .Values.migration.backoffLimit }}
  template:
    metadata:
      labels:
        {{- include "incidentops.selectorLabels" . | nindent 8 }}
        app.kubernetes.io/component: migration
    spec:
      {{- with .Values.global.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      restartPolicy: Never
      containers:
        - name: migrate
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: "{{ .Values.migration.image.repository }}:{{ .Values.migration.image.tag }}"
          imagePullPolicy: {{ .Values.migration.image.pullPolicy }}
          command:
            - uv
            - run
            - python
            - migrations/migrate.py
            - apply
          envFrom:
            - secretRef:
                name: {{ include "incidentops.fullname" . }}-secret
          resources:
            requests:
              cpu: 50m
              memory: 128Mi
            limits:
              cpu: 200m
              memory: 256Mi
{{- end }}
@@ -1,132 +0,0 @@
|
||||
{{- if .Values.observability.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-otel-collector-config
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
data:
|
||||
otel-collector-config.yaml: |
|
||||
extensions:
|
||||
health_check:
|
||||
endpoint: 0.0.0.0:13133
|
||||
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
processors:
|
||||
batch:
|
||||
timeout: 1s
|
||||
send_batch_size: 1024
|
||||
memory_limiter:
|
||||
check_interval: 1s
|
||||
limit_mib: 512
|
||||
spike_limit_mib: 128
|
||||
|
||||
exporters:
|
||||
otlp/tempo:
|
||||
endpoint: {{ include "incidentops.fullname" . }}-tempo:4317
|
||||
tls:
|
||||
insecure: true
|
||||
loki:
|
||||
endpoint: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
|
||||
default_labels_enabled:
|
||||
exporter: true
|
||||
job: true
|
||||
|
||||
service:
|
||||
extensions: [health_check]
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [otlp/tempo]
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [loki]
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-otel-collector
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
spec:
|
||||
replicas: {{ .Values.observability.otelCollector.replicaCount }}
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
annotations:
|
||||
checksum/config: {{ .Values.observability.otelCollector.image.tag | sha256sum }}
|
||||
spec:
|
||||
containers:
|
||||
- name: otel-collector
|
||||
image: "{{ .Values.observability.otelCollector.image.repository }}:{{ .Values.observability.otelCollector.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.otelCollector.image.pullPolicy }}
|
||||
args:
|
||||
- --config=/etc/otel-collector/otel-collector-config.yaml
|
||||
ports:
|
||||
- name: otlp-grpc
|
||||
containerPort: 4317
|
||||
protocol: TCP
|
||||
- name: otlp-http
|
||||
containerPort: 4318
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/otel-collector
|
||||
resources:
|
||||
{{- toYaml .Values.observability.otelCollector.resources | nindent 12 }}
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: 13133
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 30
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
port: 13133
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-otel-collector-config
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-otel-collector
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: otlp-grpc
|
||||
port: 4317
|
||||
targetPort: otlp-grpc
|
||||
protocol: TCP
|
||||
- name: otlp-http
|
||||
port: 4318
|
||||
targetPort: otlp-http
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: otel-collector
|
||||
{{- end }}
|
||||
@@ -1,91 +0,0 @@
|
||||
{{- if .Values.postgresql.enabled }}
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-postgresql
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: postgresql
|
||||
spec:
|
||||
serviceName: {{ include "incidentops.fullname" . }}-postgresql
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: postgresql
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: postgresql
|
||||
spec:
|
||||
containers:
|
||||
- name: postgresql
|
||||
image: "{{ .Values.postgresql.image.repository }}:{{ .Values.postgresql.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.postgresql.image.pullPolicy }}
|
||||
ports:
|
||||
- name: postgresql
|
||||
containerPort: 5432
|
||||
protocol: TCP
|
||||
env:
|
||||
- name: POSTGRES_USER
|
||||
value: {{ .Values.postgresql.auth.username | quote }}
|
||||
- name: POSTGRES_PASSWORD
|
||||
value: {{ .Values.postgresql.auth.password | quote }}
|
||||
- name: POSTGRES_DB
|
||||
value: {{ .Values.postgresql.auth.database | quote }}
|
||||
- name: PGDATA
|
||||
value: /var/lib/postgresql/data/pgdata
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /var/lib/postgresql/data
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- pg_isready
|
||||
- -U
|
||||
- {{ .Values.postgresql.auth.username }}
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 6
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- pg_isready
|
||||
- -U
|
||||
- {{ .Values.postgresql.auth.username }}
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 6
|
||||
resources:
|
||||
{{- toYaml .Values.postgresql.resources | nindent 12 }}
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: data
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.postgresql.persistence.size }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-postgresql
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: postgresql
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- port: 5432
|
||||
targetPort: postgresql
|
||||
protocol: TCP
|
||||
name: postgresql
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: postgresql
|
||||
{{- end }}
|
||||
@@ -1,163 +0,0 @@
|
||||
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
data:
|
||||
prometheus.yml: |
|
||||
global:
|
||||
scrape_interval: {{ .Values.observability.prometheus.scrapeInterval | default "15s" }}
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
|
||||
- job_name: "incidentops-api"
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names:
|
||||
- {{ .Release.Namespace }}
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
|
||||
action: keep
|
||||
regex: api
|
||||
- source_labels: [__meta_kubernetes_pod_container_port_name]
|
||||
action: keep
|
||||
regex: metrics
|
||||
- source_labels: [__meta_kubernetes_namespace]
|
||||
target_label: namespace
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 10s
|
||||
|
||||
- job_name: "incidentops-worker"
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names:
|
||||
- {{ .Release.Namespace }}
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
|
||||
action: keep
|
||||
regex: worker
|
||||
- source_labels: [__meta_kubernetes_pod_container_port_name]
|
||||
action: keep
|
||||
regex: metrics
|
||||
- source_labels: [__meta_kubernetes_namespace]
|
||||
target_label: namespace
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 10s
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
annotations:
|
||||
checksum/config: {{ .Values.observability.prometheus.image.tag | sha256sum }}
|
||||
spec:
|
||||
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
|
||||
securityContext:
|
||||
fsGroup: 65534
|
||||
runAsUser: 65534
|
||||
runAsNonRoot: true
|
||||
containers:
|
||||
- name: prometheus
|
||||
image: "{{ .Values.observability.prometheus.image.repository }}:{{ .Values.observability.prometheus.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.prometheus.image.pullPolicy }}
|
||||
args:
|
||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
||||
- "--storage.tsdb.path=/prometheus"
|
||||
- "--storage.tsdb.retention.time={{ .Values.observability.prometheus.retention }}"
|
||||
- "--web.enable-lifecycle"
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 9090
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/prometheus
|
||||
- name: data
|
||||
mountPath: /prometheus
|
||||
resources:
|
||||
{{- toYaml .Values.observability.prometheus.resources | nindent 12 }}
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /-/ready
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /-/healthy
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
- name: data
|
||||
{{- if .Values.observability.prometheus.persistence.enabled }}
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ include "incidentops.fullname" . }}-prometheus
|
||||
{{- else }}
|
||||
emptyDir: {}
|
||||
{{- end }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: http
|
||||
port: 9090
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
{{- if .Values.observability.prometheus.persistence.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: prometheus
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.observability.prometheus.persistence.size }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@@ -1,29 +0,0 @@
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
rules:
  - apiGroups: [""]
    resources: ["pods", "endpoints", "services"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: {{ include "incidentops.fullname" . }}-prometheus
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
    app.kubernetes.io/component: prometheus
subjects:
  - kind: ServiceAccount
    name: {{ include "incidentops.serviceAccountName" . }}
    namespace: {{ .Release.Namespace }}
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: {{ include "incidentops.fullname" . }}-prometheus
{{- end }}
@@ -1,169 +0,0 @@
|
||||
{{- if and .Values.observability.enabled .Values.observability.promtail.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail-config
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
data:
|
||||
promtail.yaml: |
|
||||
server:
|
||||
http_listen_port: 3101
|
||||
grpc_listen_port: 0
|
||||
|
||||
positions:
|
||||
filename: /run/promtail/positions.yaml
|
||||
|
||||
clients:
|
||||
- url: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
|
||||
|
||||
scrape_configs:
|
||||
- job_name: kubernetes-pods
|
||||
pipeline_stages:
|
||||
- cri: {}
|
||||
kubernetes_sd_configs:
|
||||
- role: pod
|
||||
namespaces:
|
||||
names: [{{ .Release.Namespace }}]
|
||||
relabel_configs:
|
||||
- source_labels: [__meta_kubernetes_pod_container_init]
|
||||
regex: "true"
|
||||
action: drop
|
||||
- source_labels: [__meta_kubernetes_pod_phase]
|
||||
regex: Pending|Failed|Succeeded
|
||||
action: drop
|
||||
- source_labels: [__meta_kubernetes_pod_name, __meta_kubernetes_pod_namespace, __meta_kubernetes_pod_container_name]
|
||||
target_label: __path__
|
||||
replacement: /var/log/containers/$1_$2_$3-*.log
|
||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
|
||||
regex: (.*)
|
||||
target_label: service_name
|
||||
replacement: {{ include "incidentops.fullname" . }}-$1
|
||||
- source_labels: [__meta_kubernetes_pod_namespace]
|
||||
target_label: namespace
|
||||
- source_labels: [__meta_kubernetes_pod_name]
|
||||
target_label: pod
|
||||
- source_labels: [__meta_kubernetes_pod_container_name]
|
||||
target_label: container
|
||||
- source_labels: [__meta_kubernetes_pod_uid]
|
||||
target_label: pod_uid
|
||||
- target_label: cluster
|
||||
replacement: {{ .Release.Namespace }}
|
||||
|
||||
- job_name: containers-fallback
|
||||
pipeline_stages:
|
||||
- cri: {}
|
||||
static_configs:
|
||||
- labels:
|
||||
job: containers
|
||||
namespace: {{ .Release.Namespace }}
|
||||
service_name: incidentops-api
|
||||
__path__: /var/log/containers/incidentops-api-*_incidentops_api-*.log
|
||||
- labels:
|
||||
job: containers
|
||||
namespace: {{ .Release.Namespace }}
|
||||
service_name: incidentops-worker
|
||||
__path__: /var/log/containers/incidentops-worker-*_incidentops_worker-*.log
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["pods", "pods/log", "namespaces", "services", "endpoints", "nodes"]
|
||||
verbs: ["get", "list", "watch"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
namespace: {{ .Release.Namespace }}
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: promtail
|
||||
annotations:
|
||||
checksum/config: {{ .Values.observability.promtail.image.tag | sha256sum }}
|
||||
spec:
|
||||
serviceAccountName: {{ include "incidentops.fullname" . }}-promtail
|
||||
securityContext:
|
||||
runAsUser: 0
|
||||
containers:
|
||||
- name: promtail
|
||||
image: "{{ .Values.observability.promtail.image.repository }}:{{ .Values.observability.promtail.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.promtail.image.pullPolicy }}
|
||||
args:
|
||||
- -config.file=/etc/promtail/promtail.yaml
|
||||
ports:
|
||||
- name: http-metrics
|
||||
containerPort: 3101
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/promtail
|
||||
- name: positions
|
||||
mountPath: /run/promtail
|
||||
- name: varlog
|
||||
mountPath: /var/log
|
||||
readOnly: true
|
||||
- name: varlogpods
|
||||
mountPath: /var/log/pods
|
||||
readOnly: true
|
||||
- name: varlogcontainers
|
||||
mountPath: /var/log/containers
|
||||
readOnly: true
|
||||
resources:
|
||||
{{- toYaml .Values.observability.promtail.resources | nindent 12 }}
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-promtail-config
|
||||
- name: positions
|
||||
emptyDir: {}
|
||||
- name: varlog
|
||||
hostPath:
|
||||
path: /var/log
|
||||
- name: varlogpods
|
||||
hostPath:
|
||||
path: /var/log/pods
|
||||
- name: varlogcontainers
|
||||
hostPath:
|
||||
path: /var/log/containers
|
||||
{{- end }}
|
||||
@@ -1,80 +0,0 @@
|
||||
{{- if .Values.redis.enabled }}
|
||||
apiVersion: apps/v1
|
||||
kind: StatefulSet
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-redis
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: redis
|
||||
spec:
|
||||
serviceName: {{ include "incidentops.fullname" . }}-redis
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: redis
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: redis
|
||||
spec:
|
||||
containers:
|
||||
- name: redis
|
||||
image: "{{ .Values.redis.image.repository }}:{{ .Values.redis.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.redis.image.pullPolicy }}
|
||||
ports:
|
||||
- name: redis
|
||||
containerPort: 6379
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: data
|
||||
mountPath: /data
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- redis-cli
|
||||
- ping
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 6
|
||||
readinessProbe:
|
||||
exec:
|
||||
command:
|
||||
- redis-cli
|
||||
- ping
|
||||
initialDelaySeconds: 5
|
||||
periodSeconds: 10
|
||||
timeoutSeconds: 5
|
||||
failureThreshold: 6
|
||||
resources:
|
||||
{{- toYaml .Values.redis.resources | nindent 12 }}
|
||||
volumeClaimTemplates:
|
||||
- metadata:
|
||||
name: data
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.redis.persistence.size }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-redis
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: redis
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- port: 6379
|
||||
targetPort: redis
|
||||
protocol: TCP
|
||||
name: redis
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: redis
|
||||
{{- end }}
|
||||
@@ -1,13 +0,0 @@
apiVersion: v1
kind: Secret
metadata:
  name: {{ include "incidentops.fullname" . }}-secret
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
type: Opaque
stringData:
  DATABASE_URL: {{ include "incidentops.databaseUrl" . | quote }}
  REDIS_URL: {{ include "incidentops.redisUrl" . | quote }}
  CELERY_BROKER_URL: {{ include "incidentops.celeryBrokerUrl" . | quote }}
  CELERY_RESULT_BACKEND: {{ include "incidentops.celeryResultBackend" . | quote }}
  JWT_SECRET_KEY: {{ .Values.secrets.jwtSecretKey | quote }}
@@ -1,12 +0,0 @@
{{- if .Values.serviceAccount.create -}}
apiVersion: v1
kind: ServiceAccount
metadata:
  name: {{ include "incidentops.serviceAccountName" . }}
  labels:
    {{- include "incidentops.labels" . | nindent 4 }}
  {{- with .Values.serviceAccount.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
{{- end }}
@@ -1,153 +0,0 @@
|
||||
{{- if .Values.observability.enabled }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-tempo-config
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
data:
|
||||
tempo.yaml: |
|
||||
server:
|
||||
http_listen_port: 3200
|
||||
|
||||
distributor:
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
ingester:
|
||||
trace_idle_period: 10s
|
||||
max_block_bytes: 1048576
|
||||
max_block_duration: 5m
|
||||
|
||||
compactor:
|
||||
compaction:
|
||||
block_retention: {{ .Values.observability.tempo.retention }}
|
||||
|
||||
storage:
|
||||
trace:
|
||||
backend: local
|
||||
local:
|
||||
path: /var/tempo/traces
|
||||
wal:
|
||||
path: /var/tempo/wal
|
||||
|
||||
querier:
|
||||
search:
|
||||
query_timeout: 30s
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-tempo
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
annotations:
|
||||
checksum/config: {{ .Values.observability.tempo.image.tag | sha256sum }}
|
||||
spec:
|
||||
containers:
|
||||
- name: tempo
|
||||
image: "{{ .Values.observability.tempo.image.repository }}:{{ .Values.observability.tempo.image.tag }}"
|
||||
imagePullPolicy: {{ .Values.observability.tempo.image.pullPolicy }}
|
||||
args:
|
||||
- -config.file=/etc/tempo/tempo.yaml
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 3200
|
||||
protocol: TCP
|
||||
- name: otlp-grpc
|
||||
containerPort: 4317
|
||||
protocol: TCP
|
||||
- name: otlp-http
|
||||
containerPort: 4318
|
||||
protocol: TCP
|
||||
volumeMounts:
|
||||
- name: config
|
||||
mountPath: /etc/tempo
|
||||
- name: data
|
||||
mountPath: /var/tempo
|
||||
resources:
|
||||
{{- toYaml .Values.observability.tempo.resources | nindent 12 }}
|
||||
readinessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: http
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /ready
|
||||
port: http
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 30
|
||||
volumes:
|
||||
- name: config
|
||||
configMap:
|
||||
name: {{ include "incidentops.fullname" . }}-tempo-config
|
||||
- name: data
|
||||
{{- if .Values.observability.tempo.persistence.enabled }}
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ include "incidentops.fullname" . }}-tempo
|
||||
{{- else }}
|
||||
emptyDir: {}
|
||||
{{- end }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-tempo
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
spec:
|
||||
type: ClusterIP
|
||||
ports:
|
||||
- name: http
|
||||
port: 3200
|
||||
targetPort: http
|
||||
protocol: TCP
|
||||
- name: otlp-grpc
|
||||
port: 4317
|
||||
targetPort: otlp-grpc
|
||||
protocol: TCP
|
||||
- name: otlp-http
|
||||
port: 4318
|
||||
targetPort: otlp-http
|
||||
protocol: TCP
|
||||
selector:
|
||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
{{- if .Values.observability.tempo.persistence.enabled }}
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: PersistentVolumeClaim
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-tempo
|
||||
labels:
|
||||
{{- include "incidentops.labels" . | nindent 4 }}
|
||||
app.kubernetes.io/component: tempo
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
resources:
|
||||
requests:
|
||||
storage: {{ .Values.observability.tempo.persistence.size }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
@@ -1,72 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "incidentops.fullname" . }}-web
  labels:
    {{- include "incidentops.web.labels" . | nindent 4 }}
spec:
  {{- if not .Values.web.autoscaling.enabled }}
  replicas: {{ .Values.web.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "incidentops.web.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      {{- with .Values.web.podAnnotations }}
      annotations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      labels:
        {{- include "incidentops.web.selectorLabels" . | nindent 8 }}
    spec:
      {{- with .Values.global.imagePullSecrets }}
      imagePullSecrets:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
      securityContext:
        {{- toYaml .Values.podSecurityContext | nindent 8 }}
      containers:
        - name: web
          securityContext:
            {{- toYaml .Values.securityContext | nindent 12 }}
          image: {{ include "incidentops.web.image" . }}
          imagePullPolicy: {{ .Values.web.image.pullPolicy }}
          ports:
            - name: http
              containerPort: 3000
              protocol: TCP
          env:
            - name: NEXT_PUBLIC_API_URL
              value: "http://{{ include "incidentops.fullname" . }}-api:{{ .Values.api.service.port }}"
          livenessProbe:
            httpGet:
              path: /
              port: http
            initialDelaySeconds: 10
            periodSeconds: 30
            timeoutSeconds: 5
            failureThreshold: 3
          readinessProbe:
            httpGet:
              path: /
              port: http
            initialDelaySeconds: 5
            periodSeconds: 10
            timeoutSeconds: 5
            failureThreshold: 3
          resources:
            {{- toYaml .Values.web.resources | nindent 12 }}
      {{- with .Values.web.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.web.affinity }}
      affinity:
        {{- toYaml . | nindent 8 }}
      {{- end }}
      {{- with .Values.web.tolerations }}
      tolerations:
        {{- toYaml . | nindent 8 }}
      {{- end }}
@@ -1,22 +0,0 @@
{{- if .Values.web.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "incidentops.fullname" . }}-web
  labels:
    {{- include "incidentops.web.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "incidentops.fullname" . }}-web
  minReplicas: {{ .Values.web.autoscaling.minReplicas }}
  maxReplicas: {{ .Values.web.autoscaling.maxReplicas }}
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: {{ .Values.web.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
@@ -1,15 +0,0 @@
apiVersion: v1
kind: Service
metadata:
  name: {{ include "incidentops.fullname" . }}-web
  labels:
    {{- include "incidentops.web.labels" . | nindent 4 }}
spec:
  type: {{ .Values.web.service.type }}
  ports:
    - port: {{ .Values.web.service.port }}
      targetPort: http
      protocol: TCP
      name: http
  selector:
    {{- include "incidentops.web.selectorLabels" . | nindent 4 }}
@@ -1,106 +0,0 @@
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {{ include "incidentops.fullname" . }}-worker
|
||||
labels:
|
||||
{{- include "incidentops.worker.labels" . | nindent 4 }}
|
||||
spec:
|
||||
{{- if not .Values.worker.autoscaling.enabled }}
|
||||
replicas: {{ .Values.worker.replicaCount }}
|
||||
{{- end }}
|
||||
selector:
|
||||
matchLabels:
|
||||
{{- include "incidentops.worker.selectorLabels" . | nindent 6 }}
|
||||
template:
|
||||
metadata:
|
||||
annotations:
|
||||
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
|
||||
checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }}
|
||||
{{- with .Values.worker.podAnnotations }}
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
labels:
|
||||
{{- include "incidentops.worker.selectorLabels" . | nindent 8 }}
|
||||
spec:
|
||||
{{- with .Values.global.imagePullSecrets }}
|
||||
imagePullSecrets:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
|
||||
securityContext:
|
||||
{{- toYaml .Values.podSecurityContext | nindent 8 }}
|
||||
initContainers:
|
||||
- name: wait-for-postgres
|
||||
image: busybox:1.36
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- |
|
||||
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
|
||||
echo "Waiting for PostgreSQL..."
|
||||
sleep 2
|
||||
done
|
||||
echo "PostgreSQL is ready"
|
||||
- name: wait-for-redis
|
||||
image: busybox:1.36
|
||||
command:
|
||||
- sh
|
||||
- -c
|
||||
- |
|
||||
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
|
||||
echo "Waiting for Redis..."
|
||||
sleep 2
|
||||
done
|
||||
echo "Redis is ready"
|
||||
containers:
|
||||
- name: worker
|
||||
securityContext:
|
||||
{{- toYaml .Values.securityContext | nindent 12 }}
|
||||
image: {{ include "incidentops.worker.image" . }}
|
||||
imagePullPolicy: {{ .Values.worker.image.pullPolicy }}
|
||||
command:
|
||||
- uv
|
||||
- run
|
||||
- celery
|
||||
- -A
|
||||
- worker.celery_app
|
||||
- worker
|
||||
- --loglevel=info
|
||||
- -Q
|
||||
- {{ .Values.worker.queues }}
|
||||
- --concurrency={{ .Values.worker.concurrency }}
|
||||
envFrom:
|
||||
- configMapRef:
|
||||
name: {{ include "incidentops.fullname" . }}-config
|
||||
- secretRef:
|
||||
name: {{ include "incidentops.fullname" . }}-secret
|
||||
livenessProbe:
|
||||
exec:
|
||||
command:
|
||||
- uv
|
||||
- run
|
||||
- celery
|
||||
- -A
|
||||
- worker.celery_app
|
||||
- inspect
|
||||
- ping
|
||||
- -d
|
||||
- celery@$HOSTNAME
|
||||
initialDelaySeconds: 30
|
||||
periodSeconds: 60
|
||||
timeoutSeconds: 10
|
||||
failureThreshold: 3
|
||||
resources:
|
||||
{{- toYaml .Values.worker.resources | nindent 12 }}
|
||||
{{- with .Values.worker.nodeSelector }}
|
||||
nodeSelector:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.worker.affinity }}
|
||||
affinity:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- with .Values.worker.tolerations }}
|
||||
tolerations:
|
||||
{{- toYaml . | nindent 8 }}
|
||||
{{- end }}
|
||||
@@ -1,22 +0,0 @@
{{- if .Values.worker.autoscaling.enabled }}
apiVersion: autoscaling/v2
kind: HorizontalPodAutoscaler
metadata:
  name: {{ include "incidentops.fullname" . }}-worker
  labels:
    {{- include "incidentops.worker.labels" . | nindent 4 }}
spec:
  scaleTargetRef:
    apiVersion: apps/v1
    kind: Deployment
    name: {{ include "incidentops.fullname" . }}-worker
  minReplicas: {{ .Values.worker.autoscaling.minReplicas }}
  maxReplicas: {{ .Values.worker.autoscaling.maxReplicas }}
  metrics:
    - type: Resource
      resource:
        name: cpu
        target:
          type: Utilization
          averageUtilization: {{ .Values.worker.autoscaling.targetCPUUtilizationPercentage }}
{{- end }}
@@ -1,142 +0,0 @@
# Production values for incidentops
# Use external secrets management in production

api:
  replicaCount: 3
  autoscaling:
    enabled: true
    minReplicas: 3
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

worker:
  replicaCount: 3
  autoscaling:
    enabled: true
    minReplicas: 3
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
  concurrency: 8
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

web:
  replicaCount: 3
  autoscaling:
    enabled: true
    minReplicas: 3
    maxReplicas: 10
    targetCPUUtilizationPercentage: 70
  resources:
    requests:
      cpu: 100m
      memory: 256Mi
    limits:
      cpu: 500m
      memory: 512Mi

ingress:
  enabled: true
  className: nginx
  annotations:
    nginx.ingress.kubernetes.io/proxy-body-size: "10m"
    cert-manager.io/cluster-issuer: letsencrypt-prod
  host: incidentops.example.com
  tls:
    - secretName: incidentops-tls
      hosts:
        - incidentops.example.com

postgresql:
  persistence:
    size: 50Gi
  resources:
    requests:
      cpu: 500m
      memory: 1Gi
    limits:
      cpu: 2000m
      memory: 4Gi

redis:
  persistence:
    size: 10Gi
  resources:
    requests:
      cpu: 250m
      memory: 512Mi
    limits:
      cpu: 1000m
      memory: 1Gi

# Application configuration
config:
  environment: production
  logLevel: INFO

# Observability Stack - Production settings
observability:
  enabled: true

  otelCollector:
    replicaCount: 2
    resources:
      requests:
        cpu: 100m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi

  tempo:
    retention: "720h" # 30 days
    persistence:
      enabled: true
      size: 50Gi
    resources:
      requests:
        cpu: 250m
        memory: 512Mi
      limits:
        cpu: 1000m
        memory: 2Gi

  loki:
    retention: "720h" # 30 days
    persistence:
      enabled: true
      size: 100Gi
    resources:
      requests:
        cpu: 250m
        memory: 512Mi
      limits:
        cpu: 1000m
        memory: 2Gi

  grafana:
    adminPassword: "" # Set via external secret in production
    service:
      type: ClusterIP
    persistence:
      enabled: true
      size: 5Gi
    resources:
      requests:
        cpu: 100m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi
@@ -1,279 +0,0 @@
|
||||
# Default values for incidentops
|
||||
|
||||
global:
|
||||
imageRegistry: ""
|
||||
imagePullSecrets: []
|
||||
|
||||
api:
|
||||
replicaCount: 2
|
||||
image:
|
||||
repository: incidentops/api
|
||||
tag: latest
|
||||
pullPolicy: IfNotPresent
|
||||
service:
|
||||
type: ClusterIP
|
||||
port: 8000
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
autoscaling:
|
||||
enabled: false
|
||||
minReplicas: 2
|
||||
maxReplicas: 10
|
||||
targetCPUUtilizationPercentage: 80
|
||||
podAnnotations: {}
|
||||
nodeSelector: {}
|
||||
tolerations: []
|
||||
affinity: {}
|
||||
|
||||
# Worker Service (Celery)
|
||||
worker:
|
||||
replicaCount: 2
|
||||
image:
|
||||
repository: incidentops/worker
|
||||
tag: latest
|
||||
pullPolicy: IfNotPresent
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
autoscaling:
|
||||
enabled: false
|
||||
minReplicas: 2
|
||||
maxReplicas: 10
|
||||
targetCPUUtilizationPercentage: 80
|
||||
queues: "critical,default,low"
|
||||
concurrency: 4
|
||||
podAnnotations: {}
|
||||
nodeSelector: {}
|
||||
tolerations: []
|
||||
affinity: {}
|
||||
|
||||
# Web Frontend (Next.js)
|
||||
web:
|
||||
replicaCount: 2
|
||||
image:
|
||||
repository: incidentops/web
|
||||
tag: latest
|
||||
pullPolicy: IfNotPresent
|
||||
service:
|
||||
type: ClusterIP
|
||||
port: 3000
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
autoscaling:
|
||||
enabled: false
|
||||
minReplicas: 2
|
||||
maxReplicas: 10
|
||||
targetCPUUtilizationPercentage: 80
|
||||
podAnnotations: {}
|
||||
nodeSelector: {}
|
||||
tolerations: []
|
||||
affinity: {}
|
||||
|
||||
# Ingress configuration
|
||||
ingress:
|
||||
enabled: true
|
||||
className: nginx
|
||||
annotations:
|
||||
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
|
||||
host: incidentops.local
|
||||
tls: []
|
||||
|
||||
# Database migration job
|
||||
migration:
|
||||
enabled: true
|
||||
image:
|
||||
repository: incidentops/api
|
||||
tag: latest
|
||||
pullPolicy: IfNotPresent
|
||||
backoffLimit: 3
|
||||
|
||||
# Application configuration
|
||||
config:
|
||||
jwtAlgorithm: HS256
|
||||
accessTokenExpireMinutes: 30
|
||||
refreshTokenExpireDays: 30
|
||||
environment: development
|
||||
logLevel: INFO
|
||||
|
||||
# Secrets (use external secrets in production)
|
||||
secrets:
|
||||
jwtSecretKey: "change-me-in-production"
|
||||
|
||||
# PostgreSQL configuration (using official postgres image)
|
||||
postgresql:
|
||||
enabled: true
|
||||
image:
|
||||
repository: postgres
|
||||
tag: "16-alpine"
|
||||
pullPolicy: IfNotPresent
|
||||
auth:
|
||||
username: incidentops
|
||||
password: incidentops
|
||||
database: incidentops
|
||||
persistence:
|
||||
size: 8Gi
|
||||
resources:
|
||||
requests:
|
||||
cpu: 100m
|
||||
memory: 256Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
|
||||
redis:
|
||||
enabled: true
|
||||
image:
|
||||
repository: redis
|
||||
tag: "7-alpine"
|
||||
pullPolicy: IfNotPresent
|
||||
persistence:
|
||||
size: 2Gi
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
|
||||
# Service Account
|
||||
serviceAccount:
|
||||
create: true
|
||||
annotations: {}
|
||||
name: ""
|
||||
|
||||
# Pod Security Context
|
||||
podSecurityContext:
|
||||
fsGroup: 1000
|
||||
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 1000
|
||||
|
||||
# Observability Stack (Grafana + Loki + Tempo + OpenTelemetry Collector)
|
||||
observability:
|
||||
enabled: true
|
||||
|
||||
otelCollector:
|
||||
replicaCount: 1
|
||||
image:
|
||||
repository: otel/opentelemetry-collector-contrib
|
||||
tag: "0.96.0"
|
||||
pullPolicy: IfNotPresent
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
|
||||
tempo:
|
||||
image:
|
||||
repository: grafana/tempo
|
||||
tag: "2.4.1"
|
||||
pullPolicy: IfNotPresent
|
||||
retention: "168h" # 7 days
|
||||
persistence:
|
||||
enabled: false
|
||||
size: 10Gi
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
|
||||
loki:
|
||||
image:
|
||||
repository: grafana/loki
|
||||
tag: "2.9.6"
|
||||
pullPolicy: IfNotPresent
|
||||
retention: "168h" # 7 days
|
||||
persistence:
|
||||
enabled: false
|
||||
size: 10Gi
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
|
||||
prometheus:
|
||||
image:
|
||||
repository: prom/prometheus
|
||||
tag: "v2.51.0"
|
||||
pullPolicy: IfNotPresent
|
||||
retention: "15d"
|
||||
scrapeInterval: "15s"
|
||||
persistence:
|
||||
enabled: false
|
||||
size: 10Gi
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 500m
|
||||
memory: 512Mi
|
||||
|
||||
grafana:
|
||||
image:
|
||||
repository: grafana/grafana
|
||||
tag: "10.4.1"
|
||||
pullPolicy: IfNotPresent
|
||||
adminUser: admin
|
||||
adminPassword: "admin" # Change in production!
|
||||
service:
|
||||
type: ClusterIP
|
||||
ingress:
|
||||
enabled: false
|
||||
host: grafana.incidentops.local
|
||||
annotations: {}
|
||||
tls: []
|
||||
persistence:
|
||||
enabled: false
|
||||
size: 1Gi
|
||||
resources:
|
||||
requests:
|
||||
cpu: 50m
|
||||
memory: 128Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
|
||||
promtail:
|
||||
enabled: true
|
||||
image:
|
||||
repository: grafana/promtail
|
||||
tag: "2.9.6"
|
||||
pullPolicy: IfNotPresent
|
||||
resources:
|
||||
requests:
|
||||
cpu: 25m
|
||||
memory: 64Mi
|
||||
limits:
|
||||
cpu: 200m
|
||||
memory: 256Mi
|
||||
|
||||
# Metrics configuration
|
||||
metrics:
|
||||
enabled: true
|
||||
port: 9464
|
||||
@@ -1,6 +0,0 @@
|
||||
def main():
|
||||
print("Hello from incidentops!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,61 +0,0 @@
|
||||
-- Initial schema for IncidentOps
|
||||
-- Creates core tables: users, orgs, org_members, services, incidents, incident_events
|
||||
|
||||
CREATE TABLE users (
|
||||
id UUID PRIMARY KEY,
|
||||
email TEXT NOT NULL UNIQUE,
|
||||
password_hash TEXT NOT NULL,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE TABLE orgs (
|
||||
id UUID PRIMARY KEY,
|
||||
name TEXT NOT NULL,
|
||||
slug TEXT NOT NULL UNIQUE,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE TABLE org_members (
|
||||
id UUID PRIMARY KEY,
|
||||
user_id UUID NOT NULL REFERENCES users(id),
|
||||
org_id UUID NOT NULL REFERENCES orgs(id),
|
||||
role TEXT NOT NULL CHECK (role IN ('admin', 'member', 'viewer')),
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
UNIQUE (user_id, org_id)
|
||||
);
|
||||
|
||||
CREATE TABLE services (
|
||||
id UUID PRIMARY KEY,
|
||||
org_id UUID NOT NULL REFERENCES orgs(id),
|
||||
name TEXT NOT NULL,
|
||||
slug TEXT NOT NULL,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
UNIQUE (org_id, slug)
|
||||
);
|
||||
|
||||
CREATE TABLE incidents (
|
||||
id UUID PRIMARY KEY,
|
||||
org_id UUID NOT NULL REFERENCES orgs(id),
|
||||
service_id UUID NOT NULL REFERENCES services(id),
|
||||
title TEXT NOT NULL,
|
||||
description TEXT,
|
||||
status TEXT NOT NULL CHECK (status IN ('triggered', 'acknowledged', 'mitigated', 'resolved')),
|
||||
severity TEXT NOT NULL CHECK (severity IN ('critical', 'high', 'medium', 'low')),
|
||||
version INTEGER NOT NULL DEFAULT 1,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_incidents_org_status ON incidents(org_id, status);
|
||||
CREATE INDEX idx_incidents_org_created ON incidents(org_id, created_at DESC);
|
||||
|
||||
CREATE TABLE incident_events (
|
||||
id UUID PRIMARY KEY,
|
||||
incident_id UUID NOT NULL REFERENCES incidents(id),
|
||||
event_type TEXT NOT NULL,
|
||||
actor_user_id UUID REFERENCES users(id),
|
||||
payload JSONB,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_incident_events_incident ON incident_events(incident_id, created_at);
|
||||
@@ -1,15 +0,0 @@
|
||||
-- Refresh tokens table for JWT token rotation
|
||||
-- Stores hashed refresh tokens with active org context
|
||||
|
||||
CREATE TABLE refresh_tokens (
|
||||
id UUID PRIMARY KEY,
|
||||
user_id UUID NOT NULL REFERENCES users(id),
|
||||
token_hash TEXT NOT NULL UNIQUE,
|
||||
active_org_id UUID NOT NULL REFERENCES orgs(id),
|
||||
expires_at TIMESTAMPTZ NOT NULL,
|
||||
revoked_at TIMESTAMPTZ,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_refresh_tokens_user ON refresh_tokens(user_id);
|
||||
CREATE INDEX idx_refresh_tokens_hash ON refresh_tokens(token_hash);
|
||||
@@ -1,25 +0,0 @@
|
||||
-- Notification system tables
|
||||
-- Stores notification targets and delivery attempts
|
||||
|
||||
CREATE TABLE notification_targets (
|
||||
id UUID PRIMARY KEY,
|
||||
org_id UUID NOT NULL REFERENCES orgs(id),
|
||||
name TEXT NOT NULL,
|
||||
target_type TEXT NOT NULL CHECK (target_type IN ('webhook', 'email', 'slack')),
|
||||
webhook_url TEXT,
|
||||
enabled BOOLEAN NOT NULL DEFAULT true,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
);
|
||||
|
||||
CREATE INDEX idx_notification_targets_org ON notification_targets(org_id);
|
||||
|
||||
CREATE TABLE notification_attempts (
|
||||
id UUID PRIMARY KEY,
|
||||
incident_id UUID NOT NULL REFERENCES incidents(id),
|
||||
target_id UUID NOT NULL REFERENCES notification_targets(id),
|
||||
status TEXT NOT NULL CHECK (status IN ('pending', 'sent', 'failed')),
|
||||
error TEXT,
|
||||
sent_at TIMESTAMPTZ,
|
||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
||||
UNIQUE (incident_id, target_id)
|
||||
);
|
||||
@@ -1,18 +0,0 @@
|
||||
-- Enhance refresh tokens for secure rotation and reuse detection
|
||||
-- Adds rotated_to column to track token chains and detect stolen token reuse
|
||||
|
||||
-- Add rotated_to column to track which token this was rotated into
|
||||
-- When a token is rotated, we store the ID of the new token here
|
||||
-- If a token with rotated_to set is used again, it indicates token theft
|
||||
ALTER TABLE refresh_tokens ADD COLUMN rotated_to UUID REFERENCES refresh_tokens(id);
|
||||
|
||||
-- Index for efficient cleanup queries on expires_at
|
||||
CREATE INDEX idx_refresh_tokens_expires ON refresh_tokens(expires_at);
|
||||
|
||||
-- Index for finding active tokens per user (for revoke_all and listing)
|
||||
CREATE INDEX idx_refresh_tokens_user_active ON refresh_tokens(user_id, revoked_at)
|
||||
WHERE revoked_at IS NULL;
|
||||
|
||||
-- Index for reuse detection queries
|
||||
CREATE INDEX idx_refresh_tokens_rotated ON refresh_tokens(rotated_to)
|
||||
WHERE rotated_to IS NOT NULL;
|
||||
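The comments above describe the rotation chain: a presented refresh token whose rotated_to is already set has been used twice, which is the signature of a stolen token. Below is a minimal C# sketch of how a refresh endpoint could act on that signal; the RotatedTo and RevokedAt properties and the RevokeAllForUserAsync helper are assumptions added for illustration, while ITokenService, IRefreshTokenRepository and GetByHashAsync do appear later in this diff.

```csharp
using IncidentOps.Domain.Entities;
using IncidentOps.Infrastructure.Auth;
using IncidentOps.Infrastructure.Data.Repositories;

public static class RefreshTokenReuseCheck
{
    // Sketch only: RotatedTo, RevokedAt and RevokeAllForUserAsync are hypothetical here,
    // mirroring the rotated_to / revoked_at columns and the "revoke the whole chain on reuse" idea.
    public static async Task<RefreshToken?> ValidateAsync(
        string presentedToken,
        ITokenService tokenService,
        IRefreshTokenRepository refreshTokens)
    {
        var hash = tokenService.HashToken(presentedToken);
        var token = await refreshTokens.GetByHashAsync(hash);
        if (token == null)
            return null;                     // unknown token

        if (token.RotatedTo != null || token.RevokedAt != null)
        {
            // The token was already rotated or revoked; seeing it again implies theft,
            // so invalidate every refresh token the user still holds.
            await refreshTokens.RevokeAllForUserAsync(token.UserId);
            return null;
        }

        if (token.ExpiresAt <= DateTime.UtcNow)
            return null;                     // expired

        return token;                        // safe to rotate into a new token
    }
}
```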
@@ -1,119 +0,0 @@
|
||||
"""
|
||||
Simple migration runner using asyncpg.
|
||||
Tracks applied migrations in a _migrations table.
|
||||
|
||||
Usage:
|
||||
DATABASE_URL=postgresql://user:pass@localhost/db uv run python migrations/migrate.py apply
|
||||
DATABASE_URL=postgresql://user:pass@localhost/db uv run python migrations/migrate.py status
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import asyncpg
|
||||
|
||||
MIGRATIONS_DIR = Path(__file__).parent
|
||||
|
||||
|
||||
async def ensure_migrations_table(conn: asyncpg.Connection) -> None:
|
||||
"""Create the migrations tracking table if it doesn't exist."""
|
||||
await conn.execute("""
|
||||
CREATE TABLE IF NOT EXISTS _migrations (
|
||||
id SERIAL PRIMARY KEY,
|
||||
name TEXT NOT NULL UNIQUE,
|
||||
applied_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
||||
)
|
||||
""")
|
||||
|
||||
|
||||
async def get_applied_migrations(conn: asyncpg.Connection) -> set[str]:
|
||||
"""Get the set of already applied migration names."""
|
||||
rows = await conn.fetch("SELECT name FROM _migrations")
|
||||
return {row["name"] for row in rows}
|
||||
|
||||
|
||||
async def get_pending_migrations(conn: asyncpg.Connection) -> list[Path]:
|
||||
"""Get list of migration files that haven't been applied yet."""
|
||||
applied = await get_applied_migrations(conn)
|
||||
sql_files = sorted(MIGRATIONS_DIR.glob("*.sql"))
|
||||
return [f for f in sql_files if f.name not in applied]
|
||||
|
||||
|
||||
async def apply_migration(conn: asyncpg.Connection, migration_file: Path) -> None:
|
||||
"""Apply a single migration file within a transaction."""
|
||||
sql = migration_file.read_text()
|
||||
async with conn.transaction():
|
||||
await conn.execute(sql)
|
||||
await conn.execute(
|
||||
"INSERT INTO _migrations (name) VALUES ($1)",
|
||||
migration_file.name
|
||||
)
|
||||
print(f"Applied: {migration_file.name}")
|
||||
|
||||
|
||||
async def migrate(database_url: str) -> None:
|
||||
"""Apply all pending migrations."""
|
||||
conn = await asyncpg.connect(database_url)
|
||||
try:
|
||||
await ensure_migrations_table(conn)
|
||||
pending = await get_pending_migrations(conn)
|
||||
|
||||
if not pending:
|
||||
print("No pending migrations.")
|
||||
return
|
||||
|
||||
for migration_file in pending:
|
||||
await apply_migration(conn, migration_file)
|
||||
|
||||
print(f"Applied {len(pending)} migration(s).")
|
||||
finally:
|
||||
await conn.close()
|
||||
|
||||
|
||||
async def status(database_url: str) -> None:
|
||||
"""Show migration status."""
|
||||
conn = await asyncpg.connect(database_url)
|
||||
try:
|
||||
await ensure_migrations_table(conn)
|
||||
applied = await get_applied_migrations(conn)
|
||||
pending = await get_pending_migrations(conn)
|
||||
|
||||
print("Applied migrations:")
|
||||
for name in sorted(applied):
|
||||
print(f" [x] {name}")
|
||||
|
||||
print("\nPending migrations:")
|
||||
for f in pending:
|
||||
print(f" [ ] {f.name}")
|
||||
|
||||
if not applied and not pending:
|
||||
print(" (none)")
|
||||
finally:
|
||||
await conn.close()
|
||||
|
||||
|
||||
def main() -> None:
|
||||
database_url = os.environ.get("DATABASE_URL")
|
||||
if not database_url:
|
||||
print("Error: DATABASE_URL environment variable is required")
|
||||
sys.exit(1)
|
||||
|
||||
if len(sys.argv) < 2:
|
||||
print("Usage: python migrate.py [apply|status]")
|
||||
sys.exit(1)
|
||||
|
||||
command = sys.argv[1]
|
||||
if command == "apply":
|
||||
asyncio.run(migrate(database_url))
|
||||
elif command == "status":
|
||||
asyncio.run(status(database_url))
|
||||
else:
|
||||
print(f"Unknown command: {command}")
|
||||
print("Usage: python migrate.py [apply|status]")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,294 +0,0 @@
|
||||
{
|
||||
"title": "IncidentOps API Overview",
|
||||
"uid": "incidentops-api",
|
||||
"tags": ["incidentops", "api"],
|
||||
"timezone": "browser",
|
||||
"editable": true,
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"title": "Request Rate",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "Requests/sec",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"title": "Request Duration (p50, p95, p99)",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p50",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p95",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
||||
"legendFormat": "p99",
|
||||
"refId": "C"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"unit": "s"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"title": "Error Rate",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
|
||||
"legendFormat": "Error %",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"fixedColor": "red", "mode": "fixed"},
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"title": "Requests by Status Code",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "{{http_status_code}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"title": "Requests by Endpoint",
|
||||
"type": "timeseries",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
||||
"legendFormat": "{{http_route}}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "palette-classic"},
|
||||
"unit": "reqps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 6,
|
||||
"title": "System CPU Usage",
|
||||
"type": "gauge",
|
||||
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "avg(system_cpu_utilization{job=\"incidentops-api\"}) * 100",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 60},
|
||||
{"color": "red", "value": 80}
|
||||
]
|
||||
},
|
||||
"unit": "percent",
|
||||
"min": 0,
|
||||
"max": 100
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 7,
|
||||
"title": "Memory Usage",
|
||||
"type": "gauge",
|
||||
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "process_runtime_cpython_memory_bytes{job=\"incidentops-api\", type=\"rss\"} / 1024 / 1024",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 256},
|
||||
{"color": "red", "value": 512}
|
||||
]
|
||||
},
|
||||
"unit": "decmbytes"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 8,
|
||||
"title": "Active Threads",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 6, "w": 6, "x": 12, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "process_runtime_cpython_thread_count{job=\"incidentops-api\"}",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "yellow", "value": 50},
|
||||
{"color": "red", "value": 100}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 9,
|
||||
"title": "GC Collections",
|
||||
"type": "stat",
|
||||
"gridPos": {"h": 6, "w": 6, "x": 18, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
||||
"expr": "sum(rate(process_runtime_cpython_gc_count{job=\"incidentops-api\"}[5m]))",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {"mode": "thresholds"},
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null}
|
||||
]
|
||||
},
|
||||
"unit": "cps"
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 10,
|
||||
"title": "Recent Logs",
|
||||
"type": "logs",
|
||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 22},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"expr": "{service_name=\"incidentops-api\"} | json",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 11,
|
||||
"title": "Error Logs",
|
||||
"type": "logs",
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 32},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "loki", "uid": "loki"},
|
||||
"expr": "{service_name=\"incidentops-api\"} |= \"ERROR\" | json",
|
||||
"refId": "A"
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showTime": true,
|
||||
"showLabels": true,
|
||||
"wrapLogMessage": true,
|
||||
"enableLogDetails": true,
|
||||
"sortOrder": "Descending"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": 12,
|
||||
"title": "Recent Traces",
|
||||
"type": "traces",
|
||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 40},
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {"type": "tempo", "uid": "tempo"},
|
||||
"queryType": "traceqlSearch",
|
||||
"filters": [
|
||||
{
|
||||
"id": "service-name",
|
||||
"operator": "=",
|
||||
"scope": "resource",
|
||||
"tag": "service.name",
|
||||
"value": ["incidentops-api"]
|
||||
}
|
||||
],
|
||||
"refId": "A"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"schemaVersion": 38,
|
||||
"version": 2
|
||||
}
|
||||
@@ -1,12 +0,0 @@
|
||||
apiVersion: 1
|
||||
|
||||
providers:
|
||||
- name: 'default'
|
||||
orgId: 1
|
||||
folder: 'IncidentOps'
|
||||
folderUid: 'incidentops'
|
||||
type: file
|
||||
disableDeletion: false
|
||||
editable: true
|
||||
options:
|
||||
path: /var/lib/grafana/dashboards
|
||||
@@ -1,48 +0,0 @@
|
||||
apiVersion: 1
|
||||
|
||||
datasources:
|
||||
- name: Prometheus
|
||||
type: prometheus
|
||||
uid: prometheus
|
||||
url: http://prometheus:9090
|
||||
access: proxy
|
||||
isDefault: false
|
||||
jsonData:
|
||||
httpMethod: POST
|
||||
exemplarTraceIdDestinations:
|
||||
- name: trace_id
|
||||
datasourceUid: tempo
|
||||
|
||||
- name: Tempo
|
||||
type: tempo
|
||||
uid: tempo
|
||||
url: http://tempo:3200
|
||||
access: proxy
|
||||
isDefault: false
|
||||
jsonData:
|
||||
tracesToLogsV2:
|
||||
datasourceUid: loki
|
||||
spanStartTimeShift: '-1h'
|
||||
spanEndTimeShift: '1h'
|
||||
filterByTraceID: true
|
||||
filterBySpanID: true
|
||||
tracesToMetrics:
|
||||
datasourceUid: prometheus
|
||||
nodeGraph:
|
||||
enabled: true
|
||||
lokiSearch:
|
||||
datasourceUid: loki
|
||||
|
||||
- name: Loki
|
||||
type: loki
|
||||
uid: loki
|
||||
url: http://loki:3100
|
||||
access: proxy
|
||||
isDefault: true
|
||||
jsonData:
|
||||
derivedFields:
|
||||
- datasourceUid: tempo
|
||||
matcherRegex: '"trace_id":"([a-f0-9]+)"'
|
||||
name: TraceID
|
||||
url: '$${__value.raw}'
|
||||
urlDisplayLabel: 'View Trace'
|
||||
@@ -1,41 +0,0 @@
|
||||
auth_enabled: false
|
||||
|
||||
server:
|
||||
http_listen_port: 3100
|
||||
grpc_listen_port: 9096
|
||||
|
||||
common:
|
||||
path_prefix: /loki
|
||||
storage:
|
||||
filesystem:
|
||||
chunks_directory: /loki/chunks
|
||||
rules_directory: /loki/rules
|
||||
replication_factor: 1
|
||||
ring:
|
||||
kvstore:
|
||||
store: inmemory
|
||||
|
||||
query_range:
|
||||
results_cache:
|
||||
cache:
|
||||
embedded_cache:
|
||||
enabled: true
|
||||
max_size_mb: 100
|
||||
|
||||
schema_config:
|
||||
configs:
|
||||
- from: "2020-10-24"
|
||||
store: tsdb
|
||||
object_store: filesystem
|
||||
schema: v13
|
||||
index:
|
||||
prefix: index_
|
||||
period: 24h
|
||||
|
||||
ruler:
|
||||
alertmanager_url: http://localhost:9093
|
||||
|
||||
limits_config:
|
||||
retention_period: 168h # 7 days
|
||||
allow_structured_metadata: true
|
||||
volume_enabled: true
|
||||
@@ -1,38 +0,0 @@
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
processors:
|
||||
batch:
|
||||
timeout: 1s
|
||||
send_batch_size: 1024
|
||||
memory_limiter:
|
||||
check_interval: 1s
|
||||
limit_mib: 256
|
||||
spike_limit_mib: 64
|
||||
|
||||
exporters:
|
||||
otlp/tempo:
|
||||
endpoint: tempo:4317
|
||||
tls:
|
||||
insecure: true
|
||||
loki:
|
||||
endpoint: http://loki:3100/loki/api/v1/push
|
||||
default_labels_enabled:
|
||||
exporter: true
|
||||
job: true
|
||||
|
||||
service:
|
||||
pipelines:
|
||||
traces:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [otlp/tempo]
|
||||
logs:
|
||||
receivers: [otlp]
|
||||
processors: [memory_limiter, batch]
|
||||
exporters: [loki]
|
||||
@@ -1,23 +0,0 @@
|
||||
global:
|
||||
scrape_interval: 15s
|
||||
evaluation_interval: 15s
|
||||
|
||||
scrape_configs:
|
||||
# Scrape Prometheus itself
|
||||
- job_name: "prometheus"
|
||||
static_configs:
|
||||
- targets: ["localhost:9090"]
|
||||
|
||||
# Scrape IncidentOps API metrics
|
||||
- job_name: "incidentops-api"
|
||||
static_configs:
|
||||
- targets: ["api:9464"]
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 10s
|
||||
|
||||
# Scrape IncidentOps Worker metrics (when metrics are enabled)
|
||||
- job_name: "incidentops-worker"
|
||||
static_configs:
|
||||
- targets: ["worker:9464"]
|
||||
metrics_path: /metrics
|
||||
scrape_interval: 10s
|
||||
@@ -1,32 +0,0 @@
|
||||
server:
|
||||
http_listen_port: 3200
|
||||
|
||||
distributor:
|
||||
receivers:
|
||||
otlp:
|
||||
protocols:
|
||||
grpc:
|
||||
endpoint: 0.0.0.0:4317
|
||||
http:
|
||||
endpoint: 0.0.0.0:4318
|
||||
|
||||
ingester:
|
||||
trace_idle_period: 10s
|
||||
max_block_bytes: 1048576
|
||||
max_block_duration: 5m
|
||||
|
||||
compactor:
|
||||
compaction:
|
||||
block_retention: 168h # 7 days
|
||||
|
||||
storage:
|
||||
trace:
|
||||
backend: local
|
||||
local:
|
||||
path: /var/tempo/traces
|
||||
wal:
|
||||
path: /var/tempo/wal
|
||||
|
||||
querier:
|
||||
search:
|
||||
query_timeout: 30s
|
||||
@@ -1,58 +0,0 @@
|
||||
[project]
|
||||
name = "incidentops"
|
||||
version = "0.1.0"
|
||||
description = "Incident management API with multi-tenant org support"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.14"
|
||||
dependencies = [
|
||||
"fastapi>=0.115.0",
|
||||
"uvicorn[standard]>=0.32.0",
|
||||
"asyncpg>=0.30.0",
|
||||
"pydantic[email]>=2.0.0",
|
||||
"pydantic-settings>=2.0.0",
|
||||
"python-jose[cryptography]>=3.3.0",
|
||||
"bcrypt>=4.0.0",
|
||||
"celery[redis]>=5.4.0",
|
||||
"redis>=5.0.0",
|
||||
"httpx>=0.28.0",
|
||||
# OpenTelemetry
|
||||
"opentelemetry-api>=1.27.0",
|
||||
"opentelemetry-sdk>=1.27.0",
|
||||
"opentelemetry-exporter-otlp>=1.27.0",
|
||||
"opentelemetry-exporter-prometheus>=0.48b0",
|
||||
"opentelemetry-instrumentation-fastapi>=0.48b0",
|
||||
"opentelemetry-instrumentation-asyncpg>=0.48b0",
|
||||
"opentelemetry-instrumentation-httpx>=0.48b0",
|
||||
"opentelemetry-instrumentation-redis>=0.48b0",
|
||||
"opentelemetry-instrumentation-logging>=0.48b0",
|
||||
"opentelemetry-instrumentation-system-metrics>=0.48b0",
|
||||
"prometheus-client>=0.20.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
"pytest>=8.0.0",
|
||||
"pytest-asyncio>=0.24.0",
|
||||
"ruff>=0.8.0",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["app", "migrations", "worker"]
|
||||
|
||||
[tool.ruff]
|
||||
line-length = 100
|
||||
target-version = "py314"
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = ["E", "F", "I", "N", "W", "UP"]
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"tests/**/*.py" = ["E501"] # Allow longer lines in tests for descriptive method names
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
asyncio_mode = "auto"
|
||||
testpaths = ["tests"]
|
||||
-150
@@ -1,150 +0,0 @@
|
||||
apiVersion: skaffold/v4beta11
|
||||
kind: Config
|
||||
metadata:
|
||||
name: incidentops
|
||||
|
||||
build:
|
||||
artifacts:
|
||||
- image: incidentops/api
|
||||
docker:
|
||||
dockerfile: Dockerfile
|
||||
target: api
|
||||
sync:
|
||||
manual:
|
||||
- src: "app/**/*.py"
|
||||
dest: /app
|
||||
- src: "worker/**/*.py"
|
||||
dest: /app
|
||||
|
||||
- image: incidentops/worker
|
||||
docker:
|
||||
dockerfile: Dockerfile
|
||||
target: worker
|
||||
sync:
|
||||
manual:
|
||||
- src: "app/**/*.py"
|
||||
dest: /app
|
||||
- src: "worker/**/*.py"
|
||||
dest: /app
|
||||
|
||||
# Web frontend disabled until implemented
|
||||
# - image: incidentops/web
|
||||
# docker:
|
||||
# dockerfile: Dockerfile.web
|
||||
# context: .
|
||||
# sync:
|
||||
# manual:
|
||||
# - src: "web/src/**/*"
|
||||
# dest: /app
|
||||
|
||||
local:
|
||||
push: false
|
||||
useBuildkit: true
|
||||
|
||||
deploy:
|
||||
helm:
|
||||
releases:
|
||||
- name: incidentops
|
||||
chartPath: helm/incidentops
|
||||
valuesFiles:
|
||||
- helm/incidentops/values.yaml
|
||||
setValues:
|
||||
web.replicaCount: 0 # Disabled until frontend is implemented
|
||||
migration.enabled: true
|
||||
setValueTemplates:
|
||||
api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
||||
api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
||||
worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
|
||||
worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
|
||||
migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
||||
migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
||||
createNamespace: true
|
||||
namespace: incidentops
|
||||
|
||||
profiles:
|
||||
- name: dev
|
||||
activation:
|
||||
- command: dev
|
||||
build:
|
||||
local:
|
||||
push: false
|
||||
deploy:
|
||||
helm:
|
||||
releases:
|
||||
- name: incidentops
|
||||
chartPath: helm/incidentops
|
||||
valuesFiles:
|
||||
- helm/incidentops/values.yaml
|
||||
setValues:
|
||||
api.replicaCount: 1
|
||||
worker.replicaCount: 1
|
||||
web.replicaCount: 0 # Disabled until frontend is implemented
|
||||
migration.enabled: true
|
||||
setValueTemplates:
|
||||
api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
||||
api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
||||
worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
|
||||
worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
|
||||
migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
||||
migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
||||
createNamespace: true
|
||||
namespace: incidentops
|
||||
|
||||
- name: production
|
||||
activation:
|
||||
- env: SKAFFOLD_PROFILE=production
|
||||
build:
|
||||
local:
|
||||
push: true
|
||||
deploy:
|
||||
helm:
|
||||
releases:
|
||||
- name: incidentops
|
||||
chartPath: helm/incidentops
|
||||
valuesFiles:
|
||||
- helm/incidentops/values.yaml
|
||||
- helm/incidentops/values-production.yaml
|
||||
createNamespace: true
|
||||
namespace: incidentops-prod
|
||||
|
||||
- name: kind
|
||||
activation:
|
||||
- kubeContext: kind-.*
|
||||
patches:
|
||||
- op: add
|
||||
path: /build/local/push
|
||||
value: false
|
||||
|
||||
portForward:
|
||||
- resourceType: service
|
||||
resourceName: incidentops-api
|
||||
namespace: incidentops
|
||||
port: 8000
|
||||
localPort: 8000
|
||||
# Web frontend disabled until implemented
|
||||
# - resourceType: service
|
||||
# resourceName: incidentops-web
|
||||
# namespace: incidentops
|
||||
# port: 3000
|
||||
# localPort: 3000
|
||||
# Observability
|
||||
- resourceType: service
|
||||
resourceName: incidentops-grafana
|
||||
namespace: incidentops
|
||||
port: 80
|
||||
localPort: 3001
|
||||
- resourceType: service
|
||||
resourceName: incidentops-prometheus
|
||||
namespace: incidentops
|
||||
port: 9090
|
||||
localPort: 9090
|
||||
- resourceType: service
|
||||
resourceName: incidentops-tempo
|
||||
namespace: incidentops
|
||||
port: 3200
|
||||
localPort: 3200
|
||||
- resourceType: service
|
||||
resourceName: incidentops-loki
|
||||
namespace: incidentops
|
||||
port: 3100
|
||||
localPort: 3100
|
||||
@@ -0,0 +1,22 @@
|
||||
using System.Security.Claims;
|
||||
using IncidentOps.Domain.Enums;
|
||||
|
||||
namespace IncidentOps.Api.Auth;
|
||||
|
||||
public static class ClaimsPrincipalExtensions
|
||||
{
|
||||
public static RequestContext GetRequestContext(this ClaimsPrincipal principal)
|
||||
{
|
||||
var userId = Guid.Parse(principal.FindFirstValue("sub") ?? throw new InvalidOperationException("Missing sub claim"));
|
||||
var orgId = Guid.Parse(principal.FindFirstValue("org_id") ?? throw new InvalidOperationException("Missing org_id claim"));
|
||||
var roleStr = principal.FindFirstValue("org_role") ?? throw new InvalidOperationException("Missing org_role claim");
|
||||
var role = Enum.Parse<OrgRole>(roleStr, ignoreCase: true);
|
||||
|
||||
return new RequestContext
|
||||
{
|
||||
UserId = userId,
|
||||
OrgId = orgId,
|
||||
Role = role
|
||||
};
|
||||
}
|
||||
}
|
||||
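GetRequestContext above reads the raw "sub", "org_id" and "org_role" claim names. By default the ASP.NET Core JWT bearer handler remaps "sub" to ClaimTypes.NameIdentifier, so FindFirstValue("sub") would come back null unless inbound claim mapping is switched off. The Program.cs wiring is not part of this diff, so the fragment below is only a sketch of the assumed configuration (the "Jwt:SecretKey" configuration key is likewise an assumption):

```csharp
using System.Text;
using Microsoft.AspNetCore.Authentication.JwtBearer;
using Microsoft.IdentityModel.Tokens;

var builder = WebApplication.CreateBuilder(args);

builder.Services
    .AddAuthentication(JwtBearerDefaults.AuthenticationScheme)
    .AddJwtBearer(options =>
    {
        // Keep "sub", "org_id" and "org_role" exactly as issued,
        // instead of remapping them to the legacy ClaimTypes URIs.
        options.MapInboundClaims = false;

        options.TokenValidationParameters = new TokenValidationParameters
        {
            ValidateIssuerSigningKey = true,
            IssuerSigningKey = new SymmetricSecurityKey(
                Encoding.UTF8.GetBytes(builder.Configuration["Jwt:SecretKey"]!)),
            ValidateIssuer = false,    // tighten these to match the real issuer/audience settings
            ValidateAudience = false
        };
    });
```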
@@ -0,0 +1,10 @@
|
||||
using IncidentOps.Domain.Enums;
|
||||
|
||||
namespace IncidentOps.Api.Auth;
|
||||
|
||||
public class RequestContext
|
||||
{
|
||||
public Guid UserId { get; set; }
|
||||
public Guid OrgId { get; set; }
|
||||
public OrgRole Role { get; set; }
|
||||
}
|
||||
@@ -0,0 +1,38 @@
|
||||
using IncidentOps.Domain.Enums;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
|
||||
namespace IncidentOps.Api.Auth;
|
||||
|
||||
public class RoleRequirement : IAuthorizationRequirement
|
||||
{
|
||||
public OrgRole MinimumRole { get; }
|
||||
|
||||
public RoleRequirement(OrgRole minimumRole)
|
||||
{
|
||||
MinimumRole = minimumRole;
|
||||
}
|
||||
}
|
||||
|
||||
public class RoleRequirementHandler : AuthorizationHandler<RoleRequirement>
|
||||
{
|
||||
protected override Task HandleRequirementAsync(AuthorizationHandlerContext context, RoleRequirement requirement)
|
||||
{
|
||||
var roleClaim = context.User.FindFirst("org_role")?.Value;
|
||||
if (roleClaim == null)
|
||||
{
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
if (!Enum.TryParse<OrgRole>(roleClaim, ignoreCase: true, out var userRole))
|
||||
{
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
|
||||
if (userRole >= requirement.MinimumRole)
|
||||
{
|
||||
context.Succeed(requirement);
|
||||
}
|
||||
|
||||
return Task.CompletedTask;
|
||||
}
|
||||
}
|
||||
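The handler compares enum values numerically, so it assumes OrgRole is declared in ascending order of privilege (for example Viewer < Member < Admin). The controllers later in this diff authorize with [Authorize(Policy = "Member")] and [Authorize(Policy = "Admin")], but the registration that ties those policy names to RoleRequirement is not shown; the following is a sketch of one plausible Program.cs wiring:

```csharp
using IncidentOps.Api.Auth;
using IncidentOps.Domain.Enums;
using Microsoft.AspNetCore.Authorization;

var builder = WebApplication.CreateBuilder(args);

// Register the handler once; it serves every RoleRequirement-based policy.
builder.Services.AddSingleton<IAuthorizationHandler, RoleRequirementHandler>();

builder.Services.AddAuthorization(options =>
{
    // "Member" admits members and admins, "Admin" admits admins only,
    // because the handler succeeds whenever the user's role >= the required minimum.
    options.AddPolicy("Member", policy => policy.Requirements.Add(new RoleRequirement(OrgRole.Member)));
    options.AddPolicy("Admin", policy => policy.Requirements.Add(new RoleRequirement(OrgRole.Admin)));
});
```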
@@ -0,0 +1,226 @@
|
||||
using IncidentOps.Api.Auth;
|
||||
using IncidentOps.Contracts.Auth;
|
||||
using IncidentOps.Domain.Entities;
|
||||
using IncidentOps.Domain.Enums;
|
||||
using IncidentOps.Infrastructure.Auth;
|
||||
using IncidentOps.Infrastructure.Data.Repositories;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using OrgEntity = IncidentOps.Domain.Entities.Org;
|
||||
|
||||
namespace IncidentOps.Api.Controllers;
|
||||
|
||||
[ApiController]
|
||||
[Route("v1/auth")]
|
||||
public class AuthController : ControllerBase
|
||||
{
|
||||
private readonly IUserRepository _userRepository;
|
||||
private readonly IOrgRepository _orgRepository;
|
||||
private readonly IOrgMemberRepository _orgMemberRepository;
|
||||
private readonly IRefreshTokenRepository _refreshTokenRepository;
|
||||
private readonly ITokenService _tokenService;
|
||||
private readonly IPasswordService _passwordService;
|
||||
private readonly JwtSettings _jwtSettings;
|
||||
|
||||
public AuthController(
|
||||
IUserRepository userRepository,
|
||||
IOrgRepository orgRepository,
|
||||
IOrgMemberRepository orgMemberRepository,
|
||||
IRefreshTokenRepository refreshTokenRepository,
|
||||
ITokenService tokenService,
|
||||
IPasswordService passwordService,
|
||||
JwtSettings jwtSettings)
|
||||
{
|
||||
_userRepository = userRepository;
|
||||
_orgRepository = orgRepository;
|
||||
_orgMemberRepository = orgMemberRepository;
|
||||
_refreshTokenRepository = refreshTokenRepository;
|
||||
_tokenService = tokenService;
|
||||
_passwordService = passwordService;
|
||||
_jwtSettings = jwtSettings;
|
||||
}
|
||||
|
||||
[HttpPost("register")]
|
||||
public async Task<ActionResult<AuthResponse>> Register([FromBody] RegisterRequest request)
|
||||
{
|
||||
var existingUser = await _userRepository.GetByEmailAsync(request.Email.ToLowerInvariant());
|
||||
if (existingUser != null)
|
||||
return Conflict(new { message = "Email already registered" });
|
||||
|
||||
var user = new User
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
Email = request.Email.ToLowerInvariant(),
|
||||
PasswordHash = _passwordService.HashPassword(request.Password),
|
||||
DisplayName = request.DisplayName,
|
||||
CreatedAt = DateTime.UtcNow
|
||||
};
|
||||
await _userRepository.CreateAsync(user);
|
||||
|
||||
// Create a default org for the user
|
||||
var org = new OrgEntity
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
Name = $"{request.DisplayName}'s Org",
|
||||
Slug = $"org-{Guid.NewGuid():N}".Substring(0, 20),
|
||||
CreatedAt = DateTime.UtcNow
|
||||
};
|
||||
await _orgRepository.CreateAsync(org);
|
||||
|
||||
var member = new OrgMember
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
OrgId = org.Id,
|
||||
UserId = user.Id,
|
||||
Role = OrgRole.Admin,
|
||||
CreatedAt = DateTime.UtcNow
|
||||
};
|
||||
await _orgMemberRepository.CreateAsync(member);
|
||||
|
||||
return await GenerateAuthResponse(user, org, member.Role);
|
||||
}
|
||||
|
||||
[HttpPost("login")]
|
||||
public async Task<ActionResult<AuthResponse>> Login([FromBody] LoginRequest request)
|
||||
{
|
||||
var user = await _userRepository.GetByEmailAsync(request.Email.ToLowerInvariant());
|
||||
if (user == null || !_passwordService.VerifyPassword(request.Password, user.PasswordHash))
|
||||
return Unauthorized(new { message = "Invalid credentials" });
|
||||
|
||||
var orgs = await _orgRepository.GetByUserIdAsync(user.Id);
|
||||
if (orgs.Count == 0)
|
||||
return Unauthorized(new { message = "User has no organizations" });
|
||||
|
||||
OrgEntity activeOrg;
|
||||
if (request.OrgId.HasValue)
|
||||
{
|
||||
activeOrg = orgs.FirstOrDefault(o => o.Id == request.OrgId.Value)
|
||||
?? throw new InvalidOperationException("User is not a member of the specified organization");
|
||||
}
|
||||
else
|
||||
{
|
||||
activeOrg = orgs.First();
|
||||
}
|
||||
|
||||
var member = await _orgMemberRepository.GetByUserAndOrgAsync(user.Id, activeOrg.Id);
|
||||
if (member == null)
|
||||
return Unauthorized(new { message = "User is not a member of the organization" });
|
||||
|
||||
return await GenerateAuthResponse(user, activeOrg, member.Role);
|
||||
}
|
||||
|
||||
[HttpPost("refresh")]
|
||||
public async Task<ActionResult<AuthResponse>> Refresh([FromBody] RefreshRequest request)
|
||||
{
|
||||
var tokenHash = _tokenService.HashToken(request.RefreshToken);
|
||||
var refreshToken = await _refreshTokenRepository.GetByHashAsync(tokenHash);
|
||||
if (refreshToken == null)
|
||||
return Unauthorized(new { message = "Invalid refresh token" });
|
||||
|
||||
var user = await _userRepository.GetByIdAsync(refreshToken.UserId);
|
||||
if (user == null)
|
||||
return Unauthorized(new { message = "User not found" });
|
||||
|
||||
var org = await _orgRepository.GetByIdAsync(refreshToken.ActiveOrgId);
|
||||
if (org == null)
|
||||
return Unauthorized(new { message = "Organization not found" });
|
||||
|
||||
var member = await _orgMemberRepository.GetByUserAndOrgAsync(user.Id, org.Id);
|
||||
if (member == null)
|
||||
return Unauthorized(new { message = "User is not a member of the organization" });
|
||||
|
||||
// Rotate refresh token
|
||||
await _refreshTokenRepository.RevokeAsync(refreshToken.Id);
|
||||
|
||||
return await GenerateAuthResponse(user, org, member.Role);
|
||||
}
|
||||
|
||||
[HttpPost("switch-org")]
|
||||
public async Task<ActionResult<AuthResponse>> SwitchOrg([FromBody] SwitchOrgRequest request)
|
||||
{
|
||||
var tokenHash = _tokenService.HashToken(request.RefreshToken);
|
||||
var refreshToken = await _refreshTokenRepository.GetByHashAsync(tokenHash);
|
||||
if (refreshToken == null)
|
||||
return Unauthorized(new { message = "Invalid refresh token" });
|
||||
|
||||
var user = await _userRepository.GetByIdAsync(refreshToken.UserId);
|
||||
if (user == null)
|
||||
return Unauthorized(new { message = "User not found" });
|
||||
|
||||
var org = await _orgRepository.GetByIdAsync(request.OrgId);
|
||||
if (org == null)
|
||||
return NotFound(new { message = "Organization not found" });
|
||||
|
||||
var member = await _orgMemberRepository.GetByUserAndOrgAsync(user.Id, org.Id);
|
||||
if (member == null)
|
||||
return Forbidden("User is not a member of the organization");
|
||||
|
||||
// Rotate refresh token with new org
|
||||
await _refreshTokenRepository.RevokeAsync(refreshToken.Id);
|
||||
|
||||
return await GenerateAuthResponse(user, org, member.Role);
|
||||
}
|
||||
|
||||
[HttpPost("logout")]
|
||||
public async Task<IActionResult> Logout([FromBody] LogoutRequest request)
|
||||
{
|
||||
var tokenHash = _tokenService.HashToken(request.RefreshToken);
|
||||
var refreshToken = await _refreshTokenRepository.GetByHashAsync(tokenHash);
|
||||
if (refreshToken != null)
|
||||
{
|
||||
await _refreshTokenRepository.RevokeAsync(refreshToken.Id);
|
||||
}
|
||||
|
||||
return NoContent();
|
||||
}
|
||||
|
||||
[Authorize]
|
||||
[HttpGet("/v1/me")]
|
||||
public async Task<ActionResult<MeResponse>> Me()
|
||||
{
|
||||
var ctx = User.GetRequestContext();
|
||||
var user = await _userRepository.GetByIdAsync(ctx.UserId);
|
||||
if (user == null)
|
||||
return NotFound();
|
||||
|
||||
var org = await _orgRepository.GetByIdAsync(ctx.OrgId);
|
||||
if (org == null)
|
||||
return NotFound();
|
||||
|
||||
return new MeResponse(
|
||||
user.Id,
|
||||
user.Email,
|
||||
user.DisplayName,
|
||||
new ActiveOrgDto(org.Id, org.Name, org.Slug, ctx.Role.ToString().ToLowerInvariant())
|
||||
);
|
||||
}
|
||||
|
||||
private async Task<ActionResult<AuthResponse>> GenerateAuthResponse(User user, OrgEntity org, OrgRole role)
|
||||
{
|
||||
var accessToken = _tokenService.GenerateAccessToken(user.Id, org.Id, role);
|
||||
var refreshTokenValue = _tokenService.GenerateRefreshToken();
|
||||
var refreshTokenHash = _tokenService.HashToken(refreshTokenValue);
|
||||
|
||||
var refreshToken = new RefreshToken
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
UserId = user.Id,
|
||||
TokenHash = refreshTokenHash,
|
||||
ActiveOrgId = org.Id,
|
||||
ExpiresAt = DateTime.UtcNow.AddDays(_jwtSettings.RefreshTokenExpirationDays),
|
||||
CreatedAt = DateTime.UtcNow
|
||||
};
|
||||
await _refreshTokenRepository.CreateAsync(refreshToken);
|
||||
|
||||
return new AuthResponse(
|
||||
accessToken,
|
||||
refreshTokenValue,
|
||||
new ActiveOrgDto(org.Id, org.Name, org.Slug, role.ToString().ToLowerInvariant())
|
||||
);
|
||||
}
|
||||
|
||||
private ObjectResult Forbidden(string message)
|
||||
{
|
||||
return StatusCode(403, new { message });
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
using Npgsql;
|
||||
using StackExchange.Redis;
|
||||
|
||||
namespace IncidentOps.Api.Controllers;
|
||||
|
||||
[ApiController]
|
||||
public class HealthController : ControllerBase
|
||||
{
|
||||
private readonly IConfiguration _configuration;
|
||||
|
||||
public HealthController(IConfiguration configuration)
|
||||
{
|
||||
_configuration = configuration;
|
||||
}
|
||||
|
||||
[HttpGet("healthz")]
|
||||
public IActionResult Healthz()
|
||||
{
|
||||
return Ok(new { status = "healthy" });
|
||||
}
|
||||
|
||||
[HttpGet("readyz")]
|
||||
public async Task<IActionResult> Readyz()
|
||||
{
|
||||
var checks = new Dictionary<string, string>();
|
||||
|
||||
// Check PostgreSQL
|
||||
try
|
||||
{
|
||||
var connectionString = _configuration.GetConnectionString("Postgres");
|
||||
await using var connection = new NpgsqlConnection(connectionString);
|
||||
await connection.OpenAsync();
|
||||
checks["postgres"] = "healthy";
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
checks["postgres"] = $"unhealthy: {ex.Message}";
|
||||
}
|
||||
|
||||
// Check Redis
|
||||
try
|
||||
{
|
||||
var redisConnectionString = _configuration["Redis:ConnectionString"];
|
||||
using var redis = await ConnectionMultiplexer.ConnectAsync(redisConnectionString!);
|
||||
var db = redis.GetDatabase();
|
||||
await db.PingAsync();
|
||||
checks["redis"] = "healthy";
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
checks["redis"] = $"unhealthy: {ex.Message}";
|
||||
}
|
||||
|
||||
var allHealthy = checks.Values.All(v => v == "healthy");
|
||||
return allHealthy
|
||||
? Ok(new { status = "ready", checks })
|
||||
: StatusCode(503, new { status = "not ready", checks });
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,290 @@
|
||||
using Hangfire;
|
||||
using IncidentOps.Api.Auth;
|
||||
using IncidentOps.Contracts.Incidents;
|
||||
using IncidentOps.Domain.Entities;
|
||||
using IncidentOps.Domain.Enums;
|
||||
using IncidentOps.Infrastructure.Data.Repositories;
|
||||
using IncidentOps.Infrastructure.Jobs;
|
||||
using Microsoft.AspNetCore.Authorization;
|
||||
using Microsoft.AspNetCore.Mvc;
|
||||
|
||||
namespace IncidentOps.Api.Controllers;
|
||||
|
||||
[ApiController]
|
||||
[Authorize]
|
||||
public class IncidentsController : ControllerBase
|
||||
{
|
||||
private readonly IIncidentRepository _incidentRepository;
|
||||
private readonly IIncidentEventRepository _incidentEventRepository;
|
||||
private readonly IServiceRepository _serviceRepository;
|
||||
private readonly IUserRepository _userRepository;
|
||||
private readonly IBackgroundJobClient _backgroundJobClient;
|
||||
|
||||
public IncidentsController(
|
||||
IIncidentRepository incidentRepository,
|
||||
IIncidentEventRepository incidentEventRepository,
|
||||
IServiceRepository serviceRepository,
|
||||
IUserRepository userRepository,
|
||||
IBackgroundJobClient backgroundJobClient)
|
||||
{
|
||||
_incidentRepository = incidentRepository;
|
||||
_incidentEventRepository = incidentEventRepository;
|
||||
_serviceRepository = serviceRepository;
|
||||
_userRepository = userRepository;
|
||||
_backgroundJobClient = backgroundJobClient;
|
||||
}
|
||||
|
||||
[HttpGet("v1/incidents")]
|
||||
public async Task<ActionResult<IncidentListResponse>> GetIncidents(
|
||||
[FromQuery] string? status = null,
|
||||
[FromQuery] string? cursor = null,
|
||||
[FromQuery] int limit = 20)
|
||||
{
|
||||
var ctx = User.GetRequestContext();
|
||||
|
||||
IncidentStatus? statusFilter = null;
|
||||
if (!string.IsNullOrEmpty(status) && Enum.TryParse<IncidentStatus>(status, ignoreCase: true, out var parsed))
|
||||
{
|
||||
statusFilter = parsed;
|
||||
}
|
||||
|
||||
var incidents = await _incidentRepository.GetByOrgIdAsync(ctx.OrgId, statusFilter, limit + 1, cursor);
|
||||
var hasMore = incidents.Count > limit;
|
||||
var items = incidents.Take(limit).ToList();
|
||||
|
||||
var dtos = new List<IncidentDto>();
|
||||
foreach (var incident in items)
|
||||
{
|
||||
var service = await _serviceRepository.GetByIdAsync(incident.ServiceId, ctx.OrgId);
|
||||
var assignedUser = incident.AssignedToUserId.HasValue
|
||||
? await _userRepository.GetByIdAsync(incident.AssignedToUserId.Value)
|
||||
: null;
|
||||
|
||||
dtos.Add(new IncidentDto(
|
||||
incident.Id,
|
||||
incident.ServiceId,
|
||||
service?.Name ?? "Unknown",
|
||||
incident.Title,
|
||||
incident.Description,
|
||||
incident.Status.ToString().ToLowerInvariant(),
|
||||
incident.Version,
|
||||
incident.AssignedToUserId,
|
||||
assignedUser?.DisplayName,
|
||||
incident.CreatedAt,
|
||||
incident.AcknowledgedAt,
|
||||
incident.MitigatedAt,
|
||||
incident.ResolvedAt
|
||||
));
|
||||
}
|
||||
|
||||
var nextCursor = hasMore ? items.Last().CreatedAt.ToString("O") : null;
|
||||
return new IncidentListResponse(dtos, nextCursor);
|
||||
}
|
||||
|
||||
[HttpPost("v1/services/{serviceId}/incidents")]
|
||||
[Authorize(Policy = "Member")]
|
||||
public async Task<ActionResult<IncidentDto>> CreateIncident(Guid serviceId, [FromBody] CreateIncidentRequest request)
|
||||
{
|
||||
var ctx = User.GetRequestContext();
|
||||
|
||||
var service = await _serviceRepository.GetByIdAsync(serviceId, ctx.OrgId);
|
||||
if (service == null)
|
||||
return NotFound(new { message = "Service not found" });
|
||||
|
||||
var incident = new Incident
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
OrgId = ctx.OrgId,
|
||||
ServiceId = serviceId,
|
||||
Title = request.Title,
|
||||
Description = request.Description,
|
||||
Status = IncidentStatus.Triggered,
|
||||
Version = 1,
|
||||
CreatedAt = DateTime.UtcNow
|
||||
};
|
||||
await _incidentRepository.CreateAsync(incident);
|
||||
|
||||
var incidentEvent = new IncidentEvent
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
IncidentId = incident.Id,
|
||||
EventType = IncidentEventType.Created,
|
||||
ActorUserId = ctx.UserId,
|
||||
CreatedAt = DateTime.UtcNow
|
||||
};
|
||||
await _incidentEventRepository.CreateAsync(incidentEvent);
|
||||
|
||||
// Enqueue notification job
|
||||
_backgroundJobClient.Enqueue<IIncidentTriggeredJob>(j => j.ExecuteAsync(incident.Id));
|
||||
|
||||
return CreatedAtAction(nameof(GetIncident), new { incidentId = incident.Id }, new IncidentDto(
|
||||
incident.Id,
|
||||
incident.ServiceId,
|
||||
service.Name,
|
||||
incident.Title,
|
||||
incident.Description,
|
||||
incident.Status.ToString().ToLowerInvariant(),
|
||||
incident.Version,
|
||||
null,
|
||||
null,
|
||||
incident.CreatedAt,
|
||||
null,
|
||||
null,
|
||||
null
|
||||
));
|
||||
}
|
||||
|
||||
[HttpGet("v1/incidents/{incidentId}")]
|
||||
public async Task<ActionResult<IncidentDto>> GetIncident(Guid incidentId)
|
||||
{
|
||||
var ctx = User.GetRequestContext();
|
||||
|
||||
var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
|
||||
if (incident == null)
|
||||
return NotFound();
|
||||
|
||||
var service = await _serviceRepository.GetByIdAsync(incident.ServiceId, ctx.OrgId);
|
||||
var assignedUser = incident.AssignedToUserId.HasValue
|
||||
? await _userRepository.GetByIdAsync(incident.AssignedToUserId.Value)
|
||||
: null;
|
||||
|
||||
return new IncidentDto(
|
||||
incident.Id,
|
||||
incident.ServiceId,
|
||||
service?.Name ?? "Unknown",
|
||||
incident.Title,
|
||||
incident.Description,
|
||||
incident.Status.ToString().ToLowerInvariant(),
|
||||
incident.Version,
|
||||
incident.AssignedToUserId,
|
||||
assignedUser?.DisplayName,
|
||||
incident.CreatedAt,
|
||||
incident.AcknowledgedAt,
|
||||
incident.MitigatedAt,
|
||||
incident.ResolvedAt
|
||||
);
|
||||
}
|
||||
|
||||
[HttpGet("v1/incidents/{incidentId}/events")]
|
||||
public async Task<ActionResult<IReadOnlyList<IncidentEventDto>>> GetIncidentEvents(Guid incidentId)
|
||||
{
|
||||
var ctx = User.GetRequestContext();
|
||||
|
||||
var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
|
||||
if (incident == null)
|
||||
return NotFound();
|
||||
|
||||
var events = await _incidentEventRepository.GetByIncidentIdAsync(incidentId);
|
||||
|
||||
var dtos = new List<IncidentEventDto>();
|
||||
foreach (var evt in events)
|
||||
{
|
||||
var actor = evt.ActorUserId.HasValue
|
||||
? await _userRepository.GetByIdAsync(evt.ActorUserId.Value)
|
||||
: null;
|
||||
|
||||
dtos.Add(new IncidentEventDto(
|
||||
evt.Id,
|
||||
evt.EventType.ToString().ToLowerInvariant(),
|
||||
evt.ActorUserId,
|
||||
actor?.DisplayName,
|
||||
evt.Payload,
|
||||
evt.CreatedAt
|
||||
));
|
||||
}
|
||||
|
||||
return dtos;
|
||||
}
|
||||
|
||||
[HttpPost("v1/incidents/{incidentId}/transition")]
|
||||
[Authorize(Policy = "Member")]
|
||||
public async Task<ActionResult<IncidentDto>> TransitionIncident(Guid incidentId, [FromBody] TransitionRequest request)
|
||||
{
|
||||
var ctx = User.GetRequestContext();
|
||||
|
||||
var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
|
||||
if (incident == null)
|
||||
return NotFound();
|
||||
|
||||
var newStatus = request.Action.ToLowerInvariant() switch
|
||||
{
|
||||
"ack" or "acknowledge" => IncidentStatus.Acknowledged,
|
||||
"mitigate" => IncidentStatus.Mitigated,
|
||||
"resolve" => IncidentStatus.Resolved,
|
||||
_ => (IncidentStatus?)null
|
||||
};
|
||||
|
||||
if (newStatus == null)
|
||||
return BadRequest(new { message = "Invalid action" });
|
||||
|
||||
// Validate transition
|
||||
var validTransitions = new Dictionary<IncidentStatus, IncidentStatus[]>
|
||||
{
|
||||
{ IncidentStatus.Triggered, new[] { IncidentStatus.Acknowledged } },
|
||||
{ IncidentStatus.Acknowledged, new[] { IncidentStatus.Mitigated } },
|
||||
{ IncidentStatus.Mitigated, new[] { IncidentStatus.Resolved } }
|
||||
};
|
||||
|
||||
if (!validTransitions.TryGetValue(incident.Status, out var allowedStatuses) || !allowedStatuses.Contains(newStatus.Value))
|
||||
{
|
||||
return BadRequest(new { message = $"Cannot transition from {incident.Status} to {newStatus}" });
|
||||
}
|
||||
|
||||
var timestamp = DateTime.UtcNow;
|
||||
var success = await _incidentRepository.TransitionAsync(incidentId, ctx.OrgId, request.ExpectedVersion, newStatus.Value, timestamp);
|
||||
if (!success)
|
||||
return Conflict(new { message = "Concurrent modification detected. Please refresh and try again." });
|
||||
|
||||
var eventType = newStatus.Value switch
|
||||
{
|
||||
IncidentStatus.Acknowledged => IncidentEventType.Acknowledged,
|
||||
IncidentStatus.Mitigated => IncidentEventType.Mitigated,
|
||||
IncidentStatus.Resolved => IncidentEventType.Resolved,
|
||||
_ => throw new InvalidOperationException()
|
||||
};
|
||||
|
||||
await _incidentEventRepository.CreateAsync(new IncidentEvent
|
||||
{
|
||||
Id = Guid.NewGuid(),
|
||||
IncidentId = incidentId,
|
||||
EventType = eventType,
|
||||
ActorUserId = ctx.UserId,
|
||||
CreatedAt = timestamp
|
||||
});
|
||||
|
||||
return await GetIncident(incidentId);
|
||||
}
|
||||
|
||||
    [HttpPost("v1/incidents/{incidentId}/comment")]
    [Authorize(Policy = "Member")]
    public async Task<ActionResult<IncidentEventDto>> AddComment(Guid incidentId, [FromBody] CommentRequest request)
    {
        var ctx = User.GetRequestContext();

        var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
        if (incident == null)
            return NotFound();

        var incidentEvent = new IncidentEvent
        {
            Id = Guid.NewGuid(),
            IncidentId = incidentId,
            EventType = IncidentEventType.Comment,
            ActorUserId = ctx.UserId,
            Payload = request.Content,
            CreatedAt = DateTime.UtcNow
        };
        await _incidentEventRepository.CreateAsync(incidentEvent);

        var user = await _userRepository.GetByIdAsync(ctx.UserId);

        return CreatedAtAction(nameof(GetIncidentEvents), new { incidentId }, new IncidentEventDto(
            incidentEvent.Id,
            incidentEvent.EventType.ToString().ToLowerInvariant(),
            ctx.UserId,
            user?.DisplayName,
            incidentEvent.Payload,
            incidentEvent.CreatedAt
        ));
    }
}
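Note on the transition endpoint above: it enforces a Triggered to Acknowledged to Mitigated to Resolved state machine and uses the request's ExpectedVersion for optimistic concurrency, returning 409 Conflict when another writer wins the race. A minimal client-side sketch of that retry pattern, assuming hypothetical TransitionRequestModel and IncidentVersionView shapes with a Version field (the real contracts live in IncidentOps.Contracts and are not shown in this diff):

using System.Net;
using System.Net.Http.Json;

// Illustrative shapes only; names and fields are assumptions.
public record TransitionRequestModel(string Action, int ExpectedVersion);
public record IncidentVersionView(Guid Id, string Status, int Version);

public static class IncidentTransitionClient
{
    public static async Task<bool> AcknowledgeAsync(HttpClient http, Guid incidentId)
    {
        for (var attempt = 0; attempt < 2; attempt++)
        {
            // Read the current version, then submit it as ExpectedVersion.
            var current = await http.GetFromJsonAsync<IncidentVersionView>($"v1/incidents/{incidentId}");
            if (current is null)
                return false;

            var response = await http.PostAsJsonAsync(
                $"v1/incidents/{incidentId}/transition",
                new TransitionRequestModel("ack", current.Version));

            if (response.StatusCode != HttpStatusCode.Conflict)
                return response.IsSuccessStatusCode;

            // 409 Conflict: another writer transitioned first; re-read and retry once.
        }
        return false;
    }
}

The single retry mirrors the server behaviour: a 409 means the stored version no longer matches ExpectedVersion, so the client must re-read before trying again.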
@@ -0,0 +1,151 @@
using IncidentOps.Api.Auth;
using IncidentOps.Contracts.Orgs;
using IncidentOps.Contracts.Services;
using IncidentOps.Domain.Entities;
using IncidentOps.Domain.Enums;
using IncidentOps.Infrastructure.Data.Repositories;
using Microsoft.AspNetCore.Authorization;
using Microsoft.AspNetCore.Mvc;

namespace IncidentOps.Api.Controllers;

[ApiController]
[Route("v1/org")]
[Authorize]
public class OrgController : ControllerBase
{
    private readonly IOrgRepository _orgRepository;
    private readonly IOrgMemberRepository _orgMemberRepository;
    private readonly IUserRepository _userRepository;
    private readonly IServiceRepository _serviceRepository;
    private readonly INotificationTargetRepository _notificationTargetRepository;

    public OrgController(
        IOrgRepository orgRepository,
        IOrgMemberRepository orgMemberRepository,
        IUserRepository userRepository,
        IServiceRepository serviceRepository,
        INotificationTargetRepository notificationTargetRepository)
    {
        _orgRepository = orgRepository;
        _orgMemberRepository = orgMemberRepository;
        _userRepository = userRepository;
        _serviceRepository = serviceRepository;
        _notificationTargetRepository = notificationTargetRepository;
    }

    [HttpGet]
    public async Task<ActionResult<OrgDto>> GetCurrentOrg()
    {
        var ctx = User.GetRequestContext();
        var org = await _orgRepository.GetByIdAsync(ctx.OrgId);
        if (org == null)
            return NotFound();

        return new OrgDto(org.Id, org.Name, org.Slug, ctx.Role.ToString().ToLowerInvariant());
    }

    [HttpGet("members")]
    [Authorize(Policy = "Admin")]
    public async Task<ActionResult<IReadOnlyList<OrgMemberDto>>> GetMembers()
    {
        var ctx = User.GetRequestContext();
        var members = await _orgMemberRepository.GetByOrgIdAsync(ctx.OrgId);

        var result = new List<OrgMemberDto>();
        foreach (var member in members)
        {
            var user = await _userRepository.GetByIdAsync(member.UserId);
            if (user != null)
            {
                result.Add(new OrgMemberDto(
                    member.Id,
                    user.Id,
                    user.Email,
                    user.DisplayName,
                    member.Role.ToString().ToLowerInvariant(),
                    member.CreatedAt
                ));
            }
        }

        return result;
    }

    [HttpGet("services")]
    public async Task<ActionResult<IReadOnlyList<ServiceDto>>> GetServices()
    {
        var ctx = User.GetRequestContext();
        var services = await _serviceRepository.GetByOrgIdAsync(ctx.OrgId);

        return services.Select(s => new ServiceDto(s.Id, s.Name, s.Slug, s.Description, s.CreatedAt)).ToList();
    }

    [HttpPost("services")]
    [Authorize(Policy = "Member")]
    public async Task<ActionResult<ServiceDto>> CreateService([FromBody] CreateServiceRequest request)
    {
        var ctx = User.GetRequestContext();

        var service = new Service
        {
            Id = Guid.NewGuid(),
            OrgId = ctx.OrgId,
            Name = request.Name,
            Slug = request.Slug,
            Description = request.Description,
            CreatedAt = DateTime.UtcNow
        };
        await _serviceRepository.CreateAsync(service);

        return CreatedAtAction(nameof(GetServices), new ServiceDto(service.Id, service.Name, service.Slug, service.Description, service.CreatedAt));
    }

    [HttpGet("notification-targets")]
    [Authorize(Policy = "Admin")]
    public async Task<ActionResult<IReadOnlyList<NotificationTargetDto>>> GetNotificationTargets()
    {
        var ctx = User.GetRequestContext();
        var targets = await _notificationTargetRepository.GetByOrgIdAsync(ctx.OrgId);

        return targets.Select(t => new NotificationTargetDto(
            t.Id,
            t.Name,
            t.TargetType.ToString().ToLowerInvariant(),
            t.Configuration,
            t.IsEnabled,
            t.CreatedAt
        )).ToList();
    }

    [HttpPost("notification-targets")]
    [Authorize(Policy = "Admin")]
    public async Task<ActionResult<NotificationTargetDto>> CreateNotificationTarget([FromBody] CreateNotificationTargetRequest request)
    {
        var ctx = User.GetRequestContext();

        if (!Enum.TryParse<NotificationTargetType>(request.TargetType, ignoreCase: true, out var targetType))
            return BadRequest(new { message = "Invalid target type" });

        var target = new NotificationTarget
        {
            Id = Guid.NewGuid(),
            OrgId = ctx.OrgId,
            Name = request.Name,
            TargetType = targetType,
            Configuration = request.Configuration,
            IsEnabled = request.IsEnabled,
            CreatedAt = DateTime.UtcNow
        };
        await _notificationTargetRepository.CreateAsync(target);

        return CreatedAtAction(nameof(GetNotificationTargets), new NotificationTargetDto(
            target.Id,
            target.Name,
            target.TargetType.ToString().ToLowerInvariant(),
            target.Configuration,
            target.IsEnabled,
            target.CreatedAt
        ));
    }
}
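Both controllers resolve the caller's org and role through User.GetRequestContext() from IncidentOps.Api.Auth, which is not included in this diff. A minimal sketch of what such an extension could look like, assuming the access token carries the user id plus org_id and role claims (the claim names and the RequestContext shape are assumptions, not taken from the repository):

using System.Security.Claims;
using IncidentOps.Domain.Enums;

namespace IncidentOps.Api.Auth;

// Assumed shape; the real RequestContext may differ.
public record RequestContext(Guid UserId, Guid OrgId, OrgRole Role);

public static class ClaimsPrincipalExtensions
{
    public static RequestContext GetRequestContext(this ClaimsPrincipal user)
    {
        // Reads the caller's identity and active organization from JWT claims.
        var userId = Guid.Parse(user.FindFirstValue(ClaimTypes.NameIdentifier)
                                ?? throw new InvalidOperationException("Missing user id claim"));
        var orgId = Guid.Parse(user.FindFirstValue("org_id")
                               ?? throw new InvalidOperationException("Missing org_id claim"));
        var role = Enum.Parse<OrgRole>(user.FindFirstValue("role") ?? "Viewer", ignoreCase: true);

        return new RequestContext(userId, orgId, role);
    }
}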
@@ -0,0 +1,23 @@
FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build
WORKDIR /src

# Copy csproj files and restore
COPY src/IncidentOps.Contracts/IncidentOps.Contracts.csproj src/IncidentOps.Contracts/
COPY src/IncidentOps.Domain/IncidentOps.Domain.csproj src/IncidentOps.Domain/
COPY src/IncidentOps.Infrastructure/IncidentOps.Infrastructure.csproj src/IncidentOps.Infrastructure/
COPY src/IncidentOps.Api/IncidentOps.Api.csproj src/IncidentOps.Api/
RUN dotnet restore src/IncidentOps.Api/IncidentOps.Api.csproj

# Copy source and build
COPY src/ src/
WORKDIR /src/src/IncidentOps.Api
RUN dotnet publish -c Release -o /app --no-restore

FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS runtime
WORKDIR /app
COPY --from=build /app .

ENV ASPNETCORE_URLS=http://+:8080
EXPOSE 8080

ENTRYPOINT ["dotnet", "IncidentOps.Api.dll"]
@@ -0,0 +1,28 @@
<Project Sdk="Microsoft.NET.Sdk.Web">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <AllowMissingPrunePackageData>true</AllowMissingPrunePackageData>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="FluentMigrator.Runner" Version="7.2.0" />
    <PackageReference Include="FluentMigrator.Runner.Postgres" Version="7.2.0" />
    <PackageReference Include="Hangfire.AspNetCore" Version="1.8.22" />
    <PackageReference Include="Hangfire.Core" Version="1.8.22" />
    <PackageReference Include="Hangfire.Redis.StackExchange" Version="1.12.0" />
    <PackageReference Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="10.0.1" />
    <PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="10.0.0" />
    <PackageReference Include="Npgsql" Version="10.0.1" />
    <PackageReference Include="StackExchange.Redis" Version="2.10.1" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\IncidentOps.Infrastructure\IncidentOps.Infrastructure.csproj" />
    <ProjectReference Include="..\IncidentOps.Domain\IncidentOps.Domain.csproj" />
    <ProjectReference Include="..\IncidentOps.Contracts\IncidentOps.Contracts.csproj" />
  </ItemGroup>

</Project>
@@ -0,0 +1,108 @@
using System.Text;
using FluentMigrator.Runner;
using Hangfire;
using Hangfire.Redis.StackExchange;
using IncidentOps.Api.Auth;
using IncidentOps.Infrastructure;
using IncidentOps.Infrastructure.Auth;
using IncidentOps.Infrastructure.Migrations;
using Microsoft.AspNetCore.Authentication.JwtBearer;
using Microsoft.AspNetCore.Authorization;
using Microsoft.IdentityModel.Tokens;
using StackExchange.Redis;

var builder = WebApplication.CreateBuilder(args);

// Add controllers
builder.Services.AddControllers();
builder.Services.AddEndpointsApiExplorer();
builder.Services.AddOpenApi();

// Configure JWT settings
var jwtSettings = new JwtSettings
{
    Issuer = builder.Configuration["Jwt:Issuer"] ?? "incidentops",
    Audience = builder.Configuration["Jwt:Audience"] ?? "incidentops",
    SigningKey = builder.Configuration["Jwt:SigningKey"] ?? throw new InvalidOperationException("JWT signing key not configured"),
    AccessTokenExpirationMinutes = builder.Configuration.GetValue<int>("Jwt:AccessTokenExpirationMinutes", 15),
    RefreshTokenExpirationDays = builder.Configuration.GetValue<int>("Jwt:RefreshTokenExpirationDays", 7)
};

// Configure Infrastructure
var connectionString = builder.Configuration.GetConnectionString("Postgres")
    ?? throw new InvalidOperationException("Postgres connection string not configured");
builder.Services.AddInfrastructure(connectionString, jwtSettings);

// Configure FluentMigrator
builder.Services.AddFluentMigratorCore()
    .ConfigureRunner(rb => rb
        .AddPostgres()
        .WithGlobalConnectionString(connectionString)
        .ScanIn(typeof(Migration0001_InitialSchema).Assembly).For.Migrations())
    .AddLogging(lb => lb.AddFluentMigratorConsole());

// Configure JWT Authentication
builder.Services.AddAuthentication(JwtBearerDefaults.AuthenticationScheme)
    .AddJwtBearer(options =>
    {
        options.TokenValidationParameters = new TokenValidationParameters
        {
            ValidateIssuer = true,
            ValidateAudience = true,
            ValidateLifetime = true,
            ValidateIssuerSigningKey = true,
            ValidIssuer = jwtSettings.Issuer,
            ValidAudience = jwtSettings.Audience,
            IssuerSigningKey = new SymmetricSecurityKey(Encoding.UTF8.GetBytes(jwtSettings.SigningKey))
        };
    });

// Configure Authorization
builder.Services.AddSingleton<IAuthorizationHandler, RoleRequirementHandler>();
builder.Services.AddAuthorizationBuilder()
    .AddPolicy("Viewer", policy => policy.Requirements.Add(new RoleRequirement(IncidentOps.Domain.Enums.OrgRole.Viewer)))
    .AddPolicy("Member", policy => policy.Requirements.Add(new RoleRequirement(IncidentOps.Domain.Enums.OrgRole.Member)))
    .AddPolicy("Admin", policy => policy.Requirements.Add(new RoleRequirement(IncidentOps.Domain.Enums.OrgRole.Admin)));

// Configure Hangfire (client only - server runs in Worker)
var redisConnectionString = builder.Configuration["Redis:ConnectionString"]
    ?? throw new InvalidOperationException("Redis connection string not configured");
builder.Services.AddHangfire(configuration => configuration
    .SetDataCompatibilityLevel(CompatibilityLevel.Version_180)
    .UseSimpleAssemblyNameTypeSerializer()
    .UseRecommendedSerializerSettings()
    .UseRedisStorage(ConnectionMultiplexer.Connect(redisConnectionString)));

// Add CORS
builder.Services.AddCors(options =>
{
    options.AddDefaultPolicy(policy =>
    {
        policy.WithOrigins(builder.Configuration.GetSection("Cors:Origins").Get<string[]>() ?? ["http://localhost:3000"])
            .AllowAnyHeader()
            .AllowAnyMethod()
            .AllowCredentials();
    });
});

var app = builder.Build();

// Run migrations
using (var scope = app.Services.CreateScope())
{
    var runner = scope.ServiceProvider.GetRequiredService<IMigrationRunner>();
    runner.MigrateUp();
}

// Configure the HTTP request pipeline
if (app.Environment.IsDevelopment())
{
    app.MapOpenApi();
}

app.UseCors();
app.UseAuthentication();
app.UseAuthorization();
app.MapControllers();

app.Run();
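Program.cs registers RoleRequirement and RoleRequirementHandler for the Viewer, Member and Admin policies, but the handler itself is not part of this diff. A plausible sketch, assuming the roles are hierarchical (Admin over Member over Viewer) and that the requirement simply carries the minimum OrgRole; the type names follow the registration above, the body is an assumption:

using IncidentOps.Domain.Enums;
using Microsoft.AspNetCore.Authorization;

namespace IncidentOps.Api.Auth;

public class RoleRequirement : IAuthorizationRequirement
{
    public RoleRequirement(OrgRole minimumRole) => MinimumRole = minimumRole;
    public OrgRole MinimumRole { get; }
}

public class RoleRequirementHandler : AuthorizationHandler<RoleRequirement>
{
    protected override Task HandleRequirementAsync(AuthorizationHandlerContext context, RoleRequirement requirement)
    {
        // Assumes OrgRole is ordered Viewer < Member < Admin and the JWT carries
        // a "role" claim; both are assumptions for this sketch.
        var roleClaim = context.User.FindFirst("role")?.Value;
        if (roleClaim is not null
            && Enum.TryParse<OrgRole>(roleClaim, ignoreCase: true, out var role)
            && role >= requirement.MinimumRole)
        {
            context.Succeed(requirement);
        }

        return Task.CompletedTask;
    }
}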
@@ -0,0 +1,5 @@
namespace IncidentOps.Contracts.Auth;

public record AuthResponse(string AccessToken, string RefreshToken, ActiveOrgDto ActiveOrg);

public record ActiveOrgDto(Guid Id, string Name, string Slug, string Role);
@@ -0,0 +1,3 @@
namespace IncidentOps.Contracts.Auth;

public record LoginRequest(string Email, string Password, Guid? OrgId = null);
@@ -0,0 +1,3 @@
namespace IncidentOps.Contracts.Auth;

public record LogoutRequest(string RefreshToken);
@@ -0,0 +1,3 @@
namespace IncidentOps.Contracts.Auth;

public record MeResponse(Guid Id, string Email, string DisplayName, ActiveOrgDto ActiveOrg);
@@ -0,0 +1,3 @@
namespace IncidentOps.Contracts.Auth;

public record RefreshRequest(string RefreshToken);
@@ -0,0 +1,3 @@
namespace IncidentOps.Contracts.Auth;

public record RegisterRequest(string Email, string Password, string DisplayName);
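The auth contracts above define the request and response shapes for the login flow. A short usage sketch with System.Text.Json-based helpers, posting against hypothetical /v1/auth/login and /v1/auth/refresh endpoints (the paths and credentials are assumptions; only the record shapes come from this diff):

using System.Net.Http.Json;
using IncidentOps.Contracts.Auth;

// Hypothetical flow: log in, then swap the refresh token for a new access token.
using var http = new HttpClient { BaseAddress = new Uri("http://localhost:8080") };

var login = await http.PostAsJsonAsync("v1/auth/login",
    new LoginRequest("alice@example.com", "s3cret!"));
var auth = await login.Content.ReadFromJsonAsync<AuthResponse>()
    ?? throw new InvalidOperationException("Empty auth response");

Console.WriteLine($"Active org: {auth.ActiveOrg.Name} ({auth.ActiveOrg.Role})");

// Later, refresh the access token without re-sending credentials.
var refresh = await http.PostAsJsonAsync("v1/auth/refresh", new RefreshRequest(auth.RefreshToken));
var renewed = await refresh.Content.ReadFromJsonAsync<AuthResponse>();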
Some files were not shown because too many files have changed in this diff.