Compare commits
38 Commits
master
..
f1115497c4
| Author | SHA1 | Date | |
|---|---|---|---|
|
f1115497c4
|
|||
|
672b4ae893
|
|||
|
58daa46912
|
|||
|
51d9aa09f0
|
|||
|
1b9ab0f9e6
|
|||
|
ae037b8ae9
|
|||
|
f61eb6a79b
|
|||
|
cda843a80e
|
|||
|
885c288283
|
|||
|
112a6eeba6
|
|||
|
3abbd4a9aa
|
|||
|
f17fa5eb76
|
|||
|
8ada5d1946
|
|||
|
53418cf41c
|
|||
|
f635386b4d
|
|||
|
d6ac0ddd3a
|
|||
|
a0e9fd71e6
|
|||
|
03bc133e2c
|
|||
|
1d3ef9ef90
|
|||
|
1d98cd5a73
|
|||
|
1a5e1d6c38
|
|||
|
8cac9b4377
|
|||
|
06db4231cf
|
|||
|
8ac4d814ee
|
|||
|
9e73887efc
|
|||
|
4db3e56811
|
|||
|
d4c5f257af
|
|||
|
929327eca3
|
|||
|
97905f9e19
|
|||
|
0aac1b6dc7
|
|||
|
a6d5a696a6
|
|||
|
3e70ba560b
|
|||
|
92f9ed001c
|
|||
|
38aa3fb12e
|
|||
|
370408af95
|
|||
|
7a09f8e2f6
|
|||
|
9357cbe026
|
|||
|
49ec9cd997
|
+64
-10
@@ -1,11 +1,65 @@
|
|||||||
# Python-generated files
|
# .NET
|
||||||
__pycache__/
|
bin/
|
||||||
*.py[oc]
|
obj/
|
||||||
build/
|
*.user
|
||||||
dist/
|
*.suo
|
||||||
wheels/
|
*.userosscache
|
||||||
*.egg-info
|
*.sln.docstates
|
||||||
|
*.userprefs
|
||||||
|
.vs/
|
||||||
|
|
||||||
# Virtual environments
|
# Build results
|
||||||
.venv
|
[Dd]ebug/
|
||||||
.pytest_cache/
|
[Rr]elease/
|
||||||
|
x64/
|
||||||
|
x86/
|
||||||
|
[Aa][Rr][Mm]/
|
||||||
|
[Aa][Rr][Mm]64/
|
||||||
|
bld/
|
||||||
|
[Bb]in/
|
||||||
|
[Oo]bj/
|
||||||
|
[Ll]og/
|
||||||
|
[Ll]ogs/
|
||||||
|
|
||||||
|
# NuGet
|
||||||
|
*.nupkg
|
||||||
|
*.snupkg
|
||||||
|
.nuget/
|
||||||
|
packages/
|
||||||
|
|
||||||
|
# Node.js
|
||||||
|
node_modules/
|
||||||
|
.next/
|
||||||
|
out/
|
||||||
|
.npm/
|
||||||
|
|
||||||
|
# IDE
|
||||||
|
.idea/
|
||||||
|
*.swp
|
||||||
|
*.swo
|
||||||
|
*~
|
||||||
|
|
||||||
|
# OS
|
||||||
|
.DS_Store
|
||||||
|
Thumbs.db
|
||||||
|
|
||||||
|
# Environment
|
||||||
|
.env
|
||||||
|
.env.local
|
||||||
|
.env.*.local
|
||||||
|
appsettings.Local.json
|
||||||
|
appsettings.*.Local.json
|
||||||
|
appsettings*.json
|
||||||
|
|
||||||
|
# Project artifacts
|
||||||
|
**/Properties/
|
||||||
|
*.http
|
||||||
|
|
||||||
|
# Helm
|
||||||
|
helm/incidentops/charts/
|
||||||
|
|
||||||
|
# Docker
|
||||||
|
.docker/
|
||||||
|
|
||||||
|
# Kubernetes
|
||||||
|
*.kubeconfig
|
||||||
|
|||||||
@@ -1 +0,0 @@
|
|||||||
3.14
|
|
||||||
-38
@@ -1,38 +0,0 @@
|
|||||||
# Multi-stage Dockerfile for API and Worker services
|
|
||||||
FROM python:3.14-slim AS base
|
|
||||||
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
# Install uv
|
|
||||||
COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
|
|
||||||
|
|
||||||
# Install Python dependencies
|
|
||||||
COPY pyproject.toml uv.lock README.md ./
|
|
||||||
RUN uv sync --no-cache --no-dev
|
|
||||||
|
|
||||||
# Copy application code
|
|
||||||
COPY app/ ./app/
|
|
||||||
COPY worker/ ./worker/
|
|
||||||
COPY migrations/ ./migrations/
|
|
||||||
|
|
||||||
# Set up non-root user and cache directory
|
|
||||||
RUN useradd -m -u 1000 appuser && \
|
|
||||||
mkdir -p /app/.cache && \
|
|
||||||
chown -R appuser:appuser /app
|
|
||||||
|
|
||||||
ENV UV_CACHE_DIR=/app/.cache
|
|
||||||
|
|
||||||
# API service target
|
|
||||||
FROM base AS api
|
|
||||||
|
|
||||||
USER appuser
|
|
||||||
EXPOSE 8000
|
|
||||||
|
|
||||||
CMD ["uv", "run", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
||||||
|
|
||||||
# Worker service target
|
|
||||||
FROM base AS worker
|
|
||||||
|
|
||||||
USER appuser
|
|
||||||
|
|
||||||
CMD ["uv", "run", "celery", "-A", "worker.celery_app", "worker", "--loglevel=info", "-Q", "critical,default,low"]
|
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
<Solution>
|
||||||
|
<Folder Name="/src/">
|
||||||
|
<Project Path="src/IncidentOps.Api/IncidentOps.Api.csproj" />
|
||||||
|
<Project Path="src/IncidentOps.Contracts/IncidentOps.Contracts.csproj" />
|
||||||
|
<Project Path="src/IncidentOps.Domain/IncidentOps.Domain.csproj" />
|
||||||
|
<Project Path="src/IncidentOps.Infrastructure/IncidentOps.Infrastructure.csproj" />
|
||||||
|
<Project Path="src/IncidentOps.Worker/IncidentOps.Worker.csproj" />
|
||||||
|
</Folder>
|
||||||
|
</Solution>
|
||||||
@@ -1,86 +0,0 @@
|
|||||||
# IncidentOps
|
|
||||||
|
|
||||||
A fullstack on-call & incident management platform
|
|
||||||
|
|
||||||
## Environment Configuration
|
|
||||||
|
|
||||||
| Variable | Description | Default |
|
|
||||||
|----------|-------------|---------|
|
|
||||||
| `DATABASE_URL` | Postgres connection string | — |
|
|
||||||
| `REDIS_URL` | Legacy redis endpoint, also used if no broker override is supplied | `redis://localhost:6379/0` |
|
|
||||||
| `TASK_QUEUE_DRIVER` | Task queue implementation (`celery` or `inmemory`) | `celery` |
|
|
||||||
| `TASK_QUEUE_BROKER_URL` | Celery broker URL (falls back to `REDIS_URL` when unset) | `None` |
|
|
||||||
| `TASK_QUEUE_BACKEND` | Celery transport semantics (`redis` or `sqs`) | `redis` |
|
|
||||||
| `TASK_QUEUE_DEFAULT_QUEUE` | Queue used for fan-out + notification deliveries | `default` |
|
|
||||||
| `TASK_QUEUE_CRITICAL_QUEUE` | Queue used for escalation + delayed work | `critical` |
|
|
||||||
| `TASK_QUEUE_VISIBILITY_TIMEOUT` | Visibility timeout passed to `sqs` transport | `600` |
|
|
||||||
| `TASK_QUEUE_POLLING_INTERVAL` | Polling interval for `sqs` transport (seconds) | `1.0` |
|
|
||||||
| `NOTIFICATION_ESCALATION_DELAY_SECONDS` | Delay before re-checking unacknowledged incidents | `900` |
|
|
||||||
| `AWS_REGION` | Region used when `TASK_QUEUE_BACKEND=sqs` | `None` |
|
|
||||||
| `JWT_SECRET_KEY` | Symmetric JWT signing key | — |
|
|
||||||
| `JWT_ALGORITHM` | JWT algorithm | `HS256` |
|
|
||||||
| `JWT_ISSUER` | JWT issuer claim | `incidentops` |
|
|
||||||
| `JWT_AUDIENCE` | JWT audience claim | `incidentops-api` |
|
|
||||||
|
|
||||||
### Task Queue Modes
|
|
||||||
|
|
||||||
- **Development / Tests** – Set `TASK_QUEUE_DRIVER=inmemory` to bypass Celery entirely (default for local pytest). The API will enqueue events into an in-memory recorder while the worker code remains importable.
|
|
||||||
- **Celery + Redis** – Set `TASK_QUEUE_DRIVER=celery` and either leave `TASK_QUEUE_BROKER_URL` unset (and rely on `REDIS_URL`) or point it to another Redis endpoint. This is the default production-style configuration.
|
|
||||||
- **Celery + Amazon SQS** – Provide `TASK_QUEUE_BROKER_URL=sqs://` (Celery automatically discovers credentials), set `TASK_QUEUE_BACKEND=sqs`, and configure `AWS_REGION`. Optional tuning is available via the visibility timeout and polling interval variables above.
|
|
||||||
|
|
||||||
### Running the Worker
|
|
||||||
|
|
||||||
The worker automatically discovers tasks under `worker/tasks`. Use the same environment variables as the API:
|
|
||||||
|
|
||||||
```
|
|
||||||
uv run celery -A worker.celery_app worker --loglevel=info
|
|
||||||
```
|
|
||||||
|
|
||||||
## Setup
|
|
||||||
|
|
||||||
### Docker Compose
|
|
||||||
|
|
||||||
```
|
|
||||||
docker compose up --build -d
|
|
||||||
```
|
|
||||||
|
|
||||||
### K8S with Skaffold and Helm
|
|
||||||
|
|
||||||
```
|
|
||||||
# Install with infrastructure only (for testing)
|
|
||||||
helm install incidentops helm/incidentops -n incidentops --create-namespace \
|
|
||||||
--set migration.enabled=false \
|
|
||||||
--set api.replicaCount=0 \
|
|
||||||
--set worker.replicaCount=0 \
|
|
||||||
--set web.replicaCount=0
|
|
||||||
|
|
||||||
# Full install (requires building app images first)
|
|
||||||
helm install incidentops helm/incidentops -n incidentops --create-namespace
|
|
||||||
|
|
||||||
# Create a cluster
|
|
||||||
kind create cluster --name incidentops
|
|
||||||
|
|
||||||
# We then deploy
|
|
||||||
skaffold dev
|
|
||||||
|
|
||||||
# One-time deployment
|
|
||||||
skaffold run
|
|
||||||
|
|
||||||
# Production deployment
|
|
||||||
skaffold run -p production
|
|
||||||
```
|
|
||||||
|
|
||||||
### Accessing Dashboards
|
|
||||||
|
|
||||||
When running with `skaffold dev`, the following dashboards are port-forwarded automatically:
|
|
||||||
|
|
||||||
| Dashboard | URL | Description |
|
|
||||||
|-----------|-----|-------------|
|
|
||||||
| **OpenAPI (Swagger)** | http://localhost:8000/docs | Interactive API documentation |
|
|
||||||
| **OpenAPI (ReDoc)** | http://localhost:8000/redoc | Alternative API docs |
|
|
||||||
| **Grafana** | http://localhost:3001 | Metrics, logs, and traces |
|
|
||||||
| **Prometheus** | http://localhost:9090 | Raw metrics queries |
|
|
||||||
| **Tempo** | http://localhost:3200 | Distributed tracing backend |
|
|
||||||
| **Loki** | http://localhost:3100 | Log aggregation backend |
|
|
||||||
|
|
||||||
Grafana comes pre-configured with datasources for Prometheus, Loki, and Tempo.
|
|
||||||
@@ -1,163 +0,0 @@
|
|||||||
# IncidentOps Specification
|
|
||||||
|
|
||||||
Multi-tenant incident management API. Org context embedded in JWT — no `orgId` in URLs.
|
|
||||||
|
|
||||||
## Architecture
|
|
||||||
|
|
||||||
| Service | Stack | Purpose |
|
|
||||||
|---------|-------|---------|
|
|
||||||
| **api** | FastAPI, asyncpg | REST API, JWT auth, RBAC |
|
|
||||||
| **worker** | Celery, Redis | Notifications, escalations |
|
|
||||||
| **web** | Next.js | Dashboard (future) |
|
|
||||||
|
|
||||||
**Infrastructure:** PostgreSQL, Redis, ingress-nginx, Helm/Skaffold
|
|
||||||
|
|
||||||
## Auth
|
|
||||||
|
|
||||||
### JWT Access Token Claims
|
|
||||||
- `sub`: user_id (uuid)
|
|
||||||
- `org_id`: active org (uuid)
|
|
||||||
- `org_role`: `admin | member | viewer`
|
|
||||||
- `iss`: issuer (configurable, default: `incidentops`)
|
|
||||||
- `aud`: audience (configurable, default: `incidentops-api`)
|
|
||||||
- `jti`: unique token ID (uuid)
|
|
||||||
- `iat`: issued at (unix timestamp)
|
|
||||||
- `exp`: expiration (unix timestamp)
|
|
||||||
|
|
||||||
### Refresh Token
|
|
||||||
- Opaque token returned in JSON (not cookie)
|
|
||||||
- Stored hashed in DB with `active_org_id`
|
|
||||||
- Rotated on refresh and org-switch
|
|
||||||
|
|
||||||
### Endpoints
|
|
||||||
| Endpoint | Description |
|
|
||||||
|----------|-------------|
|
|
||||||
| `POST /v1/auth/register` | Create user + default org, return tokens |
|
|
||||||
| `POST /v1/auth/login` | Authenticate, return tokens |
|
|
||||||
| `POST /v1/auth/refresh` | Rotate refresh token, mint new access token |
|
|
||||||
| `POST /v1/auth/switch-org` | Change active org, rotate tokens |
|
|
||||||
| `POST /v1/auth/logout` | Revoke refresh token |
|
|
||||||
|
|
||||||
## Authorization
|
|
||||||
|
|
||||||
### Roles
|
|
||||||
| Role | Permissions |
|
|
||||||
|------|-------------|
|
|
||||||
| viewer | Read-only |
|
|
||||||
| member | + create incidents, transitions, comments |
|
|
||||||
| admin | + manage members, notification targets |
|
|
||||||
|
|
||||||
### Enforcement
|
|
||||||
- Role check via dependency injection
|
|
||||||
- Ownership check: resource `org_id` must match JWT `org_id`
|
|
||||||
|
|
||||||
## API Routes
|
|
||||||
|
|
||||||
All under `/v1`. Auth required unless noted.
|
|
||||||
|
|
||||||
### Org (implicit from JWT)
|
|
||||||
- `GET /org` — current org summary
|
|
||||||
- `GET /org/members` (admin)
|
|
||||||
- `GET /org/services`
|
|
||||||
- `POST /org/services` (member+)
|
|
||||||
- `GET /org/notification-targets` (admin)
|
|
||||||
- `POST /org/notification-targets` (admin)
|
|
||||||
|
|
||||||
### Incidents
|
|
||||||
- `GET /incidents?status=&cursor=&limit=`
|
|
||||||
- `POST /services/{serviceId}/incidents` (member+)
|
|
||||||
- `GET /incidents/{incidentId}`
|
|
||||||
- `GET /incidents/{incidentId}/events`
|
|
||||||
- `POST /incidents/{incidentId}/transition` (member+)
|
|
||||||
- `POST /incidents/{incidentId}/comment` (member+)
|
|
||||||
|
|
||||||
### Health
|
|
||||||
- `GET /healthz` — liveness
|
|
||||||
- `GET /readyz` — readiness (postgres + redis)
|
|
||||||
|
|
||||||
## Incident State Machine
|
|
||||||
|
|
||||||
```
|
|
||||||
Triggered → Acknowledged → Mitigated → Resolved
|
|
||||||
```
|
|
||||||
|
|
||||||
- Transitions validated at application level
|
|
||||||
- Optimistic locking via `version` column
|
|
||||||
- All changes recorded in `incident_events`
|
|
||||||
|
|
||||||
## Database Schema
|
|
||||||
|
|
||||||
| Table | Purpose |
|
|
||||||
|-------|---------|
|
|
||||||
| `users` | User accounts |
|
|
||||||
| `orgs` | Organizations |
|
|
||||||
| `org_members` | User-org membership + role |
|
|
||||||
| `services` | Org-scoped services |
|
|
||||||
| `incidents` | Org-scoped incidents with version |
|
|
||||||
| `incident_events` | Append-only timeline |
|
|
||||||
| `refresh_tokens` | Token rotation + active org |
|
|
||||||
| `notification_targets` | Webhook/email/slack configs |
|
|
||||||
| `notification_attempts` | Delivery tracking (idempotent) |
|
|
||||||
|
|
||||||
## Background Jobs (Celery)
|
|
||||||
|
|
||||||
| Task | Queue | Purpose |
|
|
||||||
|------|-------|---------|
|
|
||||||
| `incident_triggered` | default | Fan-out to notification targets |
|
|
||||||
| `send_webhook` | default | HTTP POST with retry |
|
|
||||||
| `escalate_if_unacked` | critical | Delayed escalation (stretch) |
|
|
||||||
|
|
||||||
## Config (Environment)
|
|
||||||
|
|
||||||
| Variable | Required | Default |
|
|
||||||
|----------|----------|---------|
|
|
||||||
| `DATABASE_URL` | Yes | — |
|
|
||||||
| `REDIS_URL` | No | `redis://localhost:6379/0` |
|
|
||||||
| `JWT_SECRET_KEY` | Yes | — |
|
|
||||||
| `JWT_ALGORITHM` | No | `HS256` |
|
|
||||||
| `JWT_ISSUER` | No | `incidentops` |
|
|
||||||
| `JWT_AUDIENCE` | No | `incidentops-api` |
|
|
||||||
| `ACCESS_TOKEN_EXPIRE_MINUTES` | No | `15` |
|
|
||||||
| `REFRESH_TOKEN_EXPIRE_DAYS` | No | `30` |
|
|
||||||
|
|
||||||
## Development
|
|
||||||
|
|
||||||
Use `uv` for all Python operations:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Install dependencies
|
|
||||||
uv sync
|
|
||||||
|
|
||||||
# Run tests
|
|
||||||
uv run pytest tests/
|
|
||||||
|
|
||||||
# Run the API server
|
|
||||||
uv run uvicorn app.main:app --reload
|
|
||||||
|
|
||||||
# Run migrations
|
|
||||||
uv run python migrations/migrate.py
|
|
||||||
```
|
|
||||||
|
|
||||||
## Project Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
incidentops/
|
|
||||||
├── app/
|
|
||||||
│ ├── main.py # FastAPI entry
|
|
||||||
│ ├── config.py # pydantic-settings
|
|
||||||
│ ├── db.py # asyncpg pool
|
|
||||||
│ ├── core/ # security, exceptions
|
|
||||||
│ ├── api/v1/ # route handlers
|
|
||||||
│ ├── schemas/ # pydantic models
|
|
||||||
│ ├── repositories/ # data access
|
|
||||||
│ └── services/ # business logic
|
|
||||||
├── worker/
|
|
||||||
│ ├── celery_app.py
|
|
||||||
│ └── tasks/
|
|
||||||
├── migrations/
|
|
||||||
│ └── *.sql + migrate.py
|
|
||||||
├── helm/
|
|
||||||
├── Dockerfile
|
|
||||||
├── docker-compose.yml
|
|
||||||
└── pyproject.toml
|
|
||||||
```
|
|
||||||
-101
@@ -1,101 +0,0 @@
|
|||||||
"""Shared FastAPI dependencies (auth, RBAC, ownership)."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Callable
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
from fastapi import Depends
|
|
||||||
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
|
||||||
|
|
||||||
from app.core import exceptions as exc, security
|
|
||||||
from app.db import db
|
|
||||||
from app.repositories import OrgRepository, UserRepository
|
|
||||||
|
|
||||||
|
|
||||||
bearer_scheme = HTTPBearer(auto_error=False)
|
|
||||||
|
|
||||||
ROLE_RANKS: dict[str, int] = {"viewer": 0, "member": 1, "admin": 2}
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass(slots=True)
|
|
||||||
class CurrentUser:
|
|
||||||
"""Authenticated user context derived from the access token."""
|
|
||||||
|
|
||||||
user_id: UUID
|
|
||||||
email: str
|
|
||||||
org_id: UUID
|
|
||||||
org_role: str
|
|
||||||
token: str
|
|
||||||
|
|
||||||
|
|
||||||
async def get_current_user(
|
|
||||||
credentials: HTTPAuthorizationCredentials | None = Depends(bearer_scheme),
|
|
||||||
) -> CurrentUser:
|
|
||||||
"""Extract and validate the current user from the Authorization header."""
|
|
||||||
|
|
||||||
if credentials is None or credentials.scheme.lower() != "bearer":
|
|
||||||
raise exc.UnauthorizedError("Missing bearer token")
|
|
||||||
|
|
||||||
try:
|
|
||||||
payload = security.TokenPayload(security.decode_access_token(credentials.credentials))
|
|
||||||
except security.JWTError as err: # pragma: no cover - jose error types
|
|
||||||
raise exc.UnauthorizedError("Invalid access token") from err
|
|
||||||
|
|
||||||
async with db.connection() as conn:
|
|
||||||
user_repo = UserRepository(conn)
|
|
||||||
user = await user_repo.get_by_id(payload.user_id)
|
|
||||||
if user is None:
|
|
||||||
raise exc.UnauthorizedError("User not found")
|
|
||||||
|
|
||||||
org_repo = OrgRepository(conn)
|
|
||||||
membership = await org_repo.get_member(payload.user_id, payload.org_id)
|
|
||||||
if membership is None:
|
|
||||||
raise exc.ForbiddenError("Organization access denied")
|
|
||||||
|
|
||||||
return CurrentUser(
|
|
||||||
user_id=payload.user_id,
|
|
||||||
email=user["email"],
|
|
||||||
org_id=payload.org_id,
|
|
||||||
org_role=membership["role"],
|
|
||||||
token=credentials.credentials,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class RoleChecker:
|
|
||||||
"""Dependency that enforces a minimum organization role."""
|
|
||||||
|
|
||||||
def __init__(self, minimum_role: str) -> None:
|
|
||||||
if minimum_role not in ROLE_RANKS:
|
|
||||||
raise ValueError(f"Unknown role '{minimum_role}'")
|
|
||||||
self.minimum_role = minimum_role
|
|
||||||
|
|
||||||
def __call__(self, current_user: CurrentUser = Depends(get_current_user)) -> CurrentUser:
|
|
||||||
if ROLE_RANKS[current_user.org_role] < ROLE_RANKS[self.minimum_role]:
|
|
||||||
raise exc.ForbiddenError("Insufficient role for this operation")
|
|
||||||
return current_user
|
|
||||||
|
|
||||||
|
|
||||||
def require_role(min_role: str) -> Callable[[CurrentUser], CurrentUser]:
|
|
||||||
"""Factory that returns a dependency enforcing the specified role."""
|
|
||||||
|
|
||||||
return RoleChecker(min_role)
|
|
||||||
|
|
||||||
|
|
||||||
def ensure_org_access(resource_org_id: UUID, current_user: CurrentUser) -> None:
|
|
||||||
"""Verify that the resource belongs to the active org in the token."""
|
|
||||||
|
|
||||||
if resource_org_id != current_user.org_id:
|
|
||||||
raise exc.ForbiddenError("Resource does not belong to the active organization")
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"CurrentUser",
|
|
||||||
"ROLE_RANKS",
|
|
||||||
"RoleChecker",
|
|
||||||
"bearer_scheme",
|
|
||||||
"ensure_org_access",
|
|
||||||
"get_current_user",
|
|
||||||
"require_role",
|
|
||||||
]
|
|
||||||
@@ -1,59 +0,0 @@
|
|||||||
"""Authentication API endpoints."""
|
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, status
|
|
||||||
|
|
||||||
from app.api.deps import CurrentUser, get_current_user
|
|
||||||
from app.schemas.auth import (
|
|
||||||
LoginRequest,
|
|
||||||
LogoutRequest,
|
|
||||||
RefreshRequest,
|
|
||||||
RegisterRequest,
|
|
||||||
SwitchOrgRequest,
|
|
||||||
TokenResponse,
|
|
||||||
)
|
|
||||||
from app.services import AuthService
|
|
||||||
|
|
||||||
|
|
||||||
router = APIRouter(prefix="/auth", tags=["auth"])
|
|
||||||
auth_service = AuthService()
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/register", response_model=TokenResponse, status_code=status.HTTP_201_CREATED)
|
|
||||||
async def register_user(payload: RegisterRequest) -> TokenResponse:
|
|
||||||
"""Register a new user and default org, returning auth tokens."""
|
|
||||||
|
|
||||||
return await auth_service.register_user(payload)
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/login", response_model=TokenResponse)
|
|
||||||
async def login_user(payload: LoginRequest) -> TokenResponse:
|
|
||||||
"""Authenticate an existing user and issue tokens."""
|
|
||||||
|
|
||||||
return await auth_service.login_user(payload)
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/refresh", response_model=TokenResponse)
|
|
||||||
async def refresh_tokens(payload: RefreshRequest) -> TokenResponse:
|
|
||||||
"""Rotate refresh token and mint a new access token."""
|
|
||||||
|
|
||||||
return await auth_service.refresh_tokens(payload)
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/switch-org", response_model=TokenResponse)
|
|
||||||
async def switch_org(
|
|
||||||
payload: SwitchOrgRequest,
|
|
||||||
current_user: CurrentUser = Depends(get_current_user),
|
|
||||||
) -> TokenResponse:
|
|
||||||
"""Switch the active organization for the authenticated user."""
|
|
||||||
|
|
||||||
return await auth_service.switch_org(current_user, payload)
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/logout", status_code=status.HTTP_204_NO_CONTENT)
|
|
||||||
async def logout(
|
|
||||||
payload: LogoutRequest,
|
|
||||||
current_user: CurrentUser = Depends(get_current_user),
|
|
||||||
) -> None:
|
|
||||||
"""Revoke the provided refresh token for the current session."""
|
|
||||||
|
|
||||||
await auth_service.logout(current_user, payload)
|
|
||||||
@@ -1,47 +0,0 @@
|
|||||||
"""Health check endpoints."""
|
|
||||||
|
|
||||||
from fastapi import APIRouter, Response, status
|
|
||||||
|
|
||||||
from app.db import db
|
|
||||||
from app.taskqueue import task_queue
|
|
||||||
|
|
||||||
router = APIRouter()
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/healthz")
|
|
||||||
async def healthz() -> dict[str, str]:
|
|
||||||
"""Liveness probe - returns 200 if the service is running."""
|
|
||||||
return {"status": "ok"}
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/readyz")
|
|
||||||
async def readyz(response: Response) -> dict[str, str | dict[str, bool]]:
|
|
||||||
"""
|
|
||||||
Readiness probe - checks database and task queue connectivity.
|
|
||||||
- Check Postgres status
|
|
||||||
- Check configured task queue backend
|
|
||||||
- Return overall healthiness
|
|
||||||
"""
|
|
||||||
checks = {
|
|
||||||
"postgres": False,
|
|
||||||
"task_queue": False,
|
|
||||||
}
|
|
||||||
|
|
||||||
try:
|
|
||||||
if db.pool:
|
|
||||||
async with db.connection() as conn:
|
|
||||||
await conn.fetchval("SELECT 1")
|
|
||||||
checks["postgres"] = True
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
checks["task_queue"] = await task_queue.ping()
|
|
||||||
|
|
||||||
all_healthy = all(checks.values())
|
|
||||||
if not all_healthy:
|
|
||||||
response.status_code = status.HTTP_503_SERVICE_UNAVAILABLE
|
|
||||||
|
|
||||||
return {
|
|
||||||
"status": "ok" if all_healthy else "degraded",
|
|
||||||
"checks": checks,
|
|
||||||
}
|
|
||||||
@@ -1,103 +0,0 @@
|
|||||||
"""Incident API endpoints."""
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, Query, status
|
|
||||||
|
|
||||||
from app.api.deps import CurrentUser, get_current_user, require_role
|
|
||||||
from app.schemas.common import PaginatedResponse
|
|
||||||
from app.schemas.incident import (
|
|
||||||
CommentRequest,
|
|
||||||
IncidentEventResponse,
|
|
||||||
IncidentResponse,
|
|
||||||
IncidentStatus,
|
|
||||||
TransitionRequest,
|
|
||||||
IncidentCreate,
|
|
||||||
)
|
|
||||||
from app.services import IncidentService
|
|
||||||
|
|
||||||
|
|
||||||
router = APIRouter(tags=["incidents"])
|
|
||||||
incident_service = IncidentService()
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/incidents", response_model=PaginatedResponse[IncidentResponse])
|
|
||||||
async def list_incidents(
|
|
||||||
status: IncidentStatus | None = Query(default=None),
|
|
||||||
cursor: datetime | None = Query(default=None, description="Cursor (created_at)"),
|
|
||||||
limit: int = Query(default=20, ge=1, le=100),
|
|
||||||
current_user: CurrentUser = Depends(get_current_user),
|
|
||||||
) -> PaginatedResponse[IncidentResponse]:
|
|
||||||
"""List incidents for the active organization."""
|
|
||||||
|
|
||||||
return await incident_service.get_incidents(
|
|
||||||
current_user,
|
|
||||||
status=status,
|
|
||||||
cursor=cursor,
|
|
||||||
limit=limit,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@router.post(
|
|
||||||
"/services/{service_id}/incidents",
|
|
||||||
response_model=IncidentResponse,
|
|
||||||
status_code=status.HTTP_201_CREATED,
|
|
||||||
)
|
|
||||||
async def create_incident(
|
|
||||||
service_id: UUID,
|
|
||||||
payload: IncidentCreate,
|
|
||||||
current_user: CurrentUser = Depends(require_role("member")),
|
|
||||||
) -> IncidentResponse:
|
|
||||||
"""Create a new incident for the given service (member+)."""
|
|
||||||
|
|
||||||
return await incident_service.create_incident(current_user, service_id, payload)
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/incidents/{incident_id}", response_model=IncidentResponse)
|
|
||||||
async def get_incident(
|
|
||||||
incident_id: UUID,
|
|
||||||
current_user: CurrentUser = Depends(get_current_user),
|
|
||||||
) -> IncidentResponse:
|
|
||||||
"""Fetch a single incident by ID."""
|
|
||||||
|
|
||||||
return await incident_service.get_incident(current_user, incident_id)
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/incidents/{incident_id}/events", response_model=list[IncidentEventResponse])
|
|
||||||
async def get_incident_events(
|
|
||||||
incident_id: UUID,
|
|
||||||
current_user: CurrentUser = Depends(get_current_user),
|
|
||||||
) -> list[IncidentEventResponse]:
|
|
||||||
"""Get the event timeline for an incident."""
|
|
||||||
|
|
||||||
return await incident_service.get_incident_events(current_user, incident_id)
|
|
||||||
|
|
||||||
|
|
||||||
@router.post(
|
|
||||||
"/incidents/{incident_id}/transition",
|
|
||||||
response_model=IncidentResponse,
|
|
||||||
)
|
|
||||||
async def transition_incident(
|
|
||||||
incident_id: UUID,
|
|
||||||
payload: TransitionRequest,
|
|
||||||
current_user: CurrentUser = Depends(require_role("member")),
|
|
||||||
) -> IncidentResponse:
|
|
||||||
"""Transition an incident status (member+)."""
|
|
||||||
|
|
||||||
return await incident_service.transition_incident(current_user, incident_id, payload)
|
|
||||||
|
|
||||||
|
|
||||||
@router.post(
|
|
||||||
"/incidents/{incident_id}/comment",
|
|
||||||
response_model=IncidentEventResponse,
|
|
||||||
status_code=status.HTTP_201_CREATED,
|
|
||||||
)
|
|
||||||
async def add_comment(
|
|
||||||
incident_id: UUID,
|
|
||||||
payload: CommentRequest,
|
|
||||||
current_user: CurrentUser = Depends(require_role("member")),
|
|
||||||
) -> IncidentEventResponse:
|
|
||||||
"""Add a comment to the incident timeline (member+)."""
|
|
||||||
|
|
||||||
return await incident_service.add_comment(current_user, incident_id, payload)
|
|
||||||
@@ -1,72 +0,0 @@
|
|||||||
"""Organization API endpoints."""
|
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, status
|
|
||||||
|
|
||||||
from app.api.deps import CurrentUser, get_current_user, require_role
|
|
||||||
from app.schemas.org import (
|
|
||||||
MemberResponse,
|
|
||||||
NotificationTargetCreate,
|
|
||||||
NotificationTargetResponse,
|
|
||||||
OrgResponse,
|
|
||||||
ServiceCreate,
|
|
||||||
ServiceResponse,
|
|
||||||
)
|
|
||||||
from app.services import OrgService
|
|
||||||
|
|
||||||
|
|
||||||
router = APIRouter(prefix="/org", tags=["org"])
|
|
||||||
org_service = OrgService()
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("", response_model=OrgResponse)
|
|
||||||
async def get_org(current_user: CurrentUser = Depends(get_current_user)) -> OrgResponse:
|
|
||||||
"""Return the active organization summary for the authenticated user."""
|
|
||||||
|
|
||||||
return await org_service.get_current_org(current_user)
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/members", response_model=list[MemberResponse])
|
|
||||||
async def list_members(current_user: CurrentUser = Depends(require_role("admin"))) -> list[MemberResponse]:
|
|
||||||
"""List members of the current organization (admin only)."""
|
|
||||||
|
|
||||||
return await org_service.get_members(current_user)
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/services", response_model=list[ServiceResponse])
|
|
||||||
async def list_services(current_user: CurrentUser = Depends(get_current_user)) -> list[ServiceResponse]:
|
|
||||||
"""List services for the current organization."""
|
|
||||||
|
|
||||||
return await org_service.get_services(current_user)
|
|
||||||
|
|
||||||
|
|
||||||
@router.post("/services", response_model=ServiceResponse, status_code=status.HTTP_201_CREATED)
|
|
||||||
async def create_service(
|
|
||||||
payload: ServiceCreate,
|
|
||||||
current_user: CurrentUser = Depends(require_role("member")),
|
|
||||||
) -> ServiceResponse:
|
|
||||||
"""Create a new service within the current organization (member+)."""
|
|
||||||
|
|
||||||
return await org_service.create_service(current_user, payload)
|
|
||||||
|
|
||||||
|
|
||||||
@router.get("/notification-targets", response_model=list[NotificationTargetResponse])
|
|
||||||
async def list_notification_targets(
|
|
||||||
current_user: CurrentUser = Depends(require_role("admin")),
|
|
||||||
) -> list[NotificationTargetResponse]:
|
|
||||||
"""List notification targets for the current organization (admin only)."""
|
|
||||||
|
|
||||||
return await org_service.get_notification_targets(current_user)
|
|
||||||
|
|
||||||
|
|
||||||
@router.post(
|
|
||||||
"/notification-targets",
|
|
||||||
response_model=NotificationTargetResponse,
|
|
||||||
status_code=status.HTTP_201_CREATED,
|
|
||||||
)
|
|
||||||
async def create_notification_target(
|
|
||||||
payload: NotificationTargetCreate,
|
|
||||||
current_user: CurrentUser = Depends(require_role("admin")),
|
|
||||||
) -> NotificationTargetResponse:
|
|
||||||
"""Create a notification target for the current organization (admin only)."""
|
|
||||||
|
|
||||||
return await org_service.create_notification_target(current_user, payload)
|
|
||||||
@@ -1,66 +0,0 @@
|
|||||||
"""Application configuration via pydantic-settings."""
|
|
||||||
|
|
||||||
from typing import Literal
|
|
||||||
|
|
||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
||||||
|
|
||||||
|
|
||||||
class Settings(BaseSettings):
    """Application settings loaded from environment variables."""

    # pydantic-settings config: values come from the process environment,
    # falling back to a local ".env" file; names match case-insensitively.
    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        case_sensitive=False,
    )

    # Database
    # Required (no default): startup fails fast if DATABASE_URL is unset.
    database_url: str

    # Redis (legacy default for Celery broker)
    redis_url: str = "redis://localhost:6379/0"

    # Task queue
    task_queue_driver: Literal["celery", "inmemory"] = "celery"
    # When None, resolved_task_queue_broker_url falls back to redis_url.
    task_queue_broker_url: str | None = None
    task_queue_backend: Literal["redis", "sqs"] = "redis"
    task_queue_default_queue: str = "default"
    task_queue_critical_queue: str = "critical"
    # Seconds a reserved task stays invisible to other consumers.
    task_queue_visibility_timeout: int = 600
    task_queue_polling_interval: float = 1.0
    notification_escalation_delay_seconds: int = 900

    # AWS (used when task_queue_backend="sqs")
    aws_region: str | None = None

    # JWT
    # Required (no default): signing key must be provided via environment.
    jwt_secret_key: str
    jwt_algorithm: str = "HS256"
    jwt_issuer: str = "incidentops"
    jwt_audience: str = "incidentops-api"
    access_token_expire_minutes: int = 15
    refresh_token_expire_days: int = 30

    # Application
    debug: bool = False
    api_v1_prefix: str = "/v1"

    # OpenTelemetry
    otel_enabled: bool = True
    otel_service_name: str = "incidentops-api"
    otel_environment: str = "development"
    otel_exporter_otlp_endpoint: str | None = None  # e.g., "http://tempo:4317"
    otel_exporter_otlp_insecure: bool = True
    otel_log_level: str = "INFO"

    # Metrics
    prometheus_port: int = 9464  # Port for Prometheus metrics endpoint

    @property
    def resolved_task_queue_broker_url(self) -> str:
        """Return the broker URL with redis fallback for backwards compatibility."""
        return self.task_queue_broker_url or self.redis_url
|
|
||||||
|
|
||||||
|
|
||||||
# Module-level singleton; required fields are filled from the environment
# at import time (the ignore silences the "missing arguments" false positive).
settings = Settings()  # type: ignore[call-arg]
|
|
||||||
@@ -1,59 +0,0 @@
|
|||||||
"""Custom HTTP exceptions for the API."""
|
|
||||||
|
|
||||||
from fastapi import HTTPException, status
|
|
||||||
|
|
||||||
|
|
||||||
class NotFoundError(HTTPException):
    """HTTP 404: the requested resource does not exist."""

    def __init__(self, detail: str = "Resource not found") -> None:
        super().__init__(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=detail,
        )
|
|
||||||
|
|
||||||
|
|
||||||
class ConflictError(HTTPException):
    """HTTP 409: request conflicts with current state (e.g., version mismatch)."""

    def __init__(self, detail: str = "Conflict with current state") -> None:
        super().__init__(
            status_code=status.HTTP_409_CONFLICT,
            detail=detail,
        )
|
|
||||||
|
|
||||||
|
|
||||||
class UnauthorizedError(HTTPException):
    """HTTP 401: authentication required or failed.

    Includes the WWW-Authenticate challenge header expected by bearer clients.
    """

    def __init__(self, detail: str = "Not authenticated") -> None:
        headers = {"WWW-Authenticate": "Bearer"}
        super().__init__(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail=detail,
            headers=headers,
        )
|
|
||||||
|
|
||||||
|
|
||||||
class ForbiddenError(HTTPException):
    """HTTP 403: authenticated but lacking the required permissions."""

    def __init__(self, detail: str = "Insufficient permissions") -> None:
        super().__init__(
            status_code=status.HTTP_403_FORBIDDEN,
            detail=detail,
        )
|
|
||||||
|
|
||||||
|
|
||||||
class BadRequestError(HTTPException):
    """HTTP 400: malformed or invalid request data."""

    def __init__(self, detail: str = "Invalid request") -> None:
        super().__init__(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=detail,
        )
|
|
||||||
|
|
||||||
|
|
||||||
class ValidationError(HTTPException):
    """HTTP 422: request was well-formed but failed validation."""

    def __init__(self, detail: str = "Validation failed") -> None:
        super().__init__(
            status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
            detail=detail,
        )
|
|
||||||
|
|
||||||
|
|
||||||
# Public export surface of this module, kept alphabetical.
__all__ = [
    "BadRequestError",
    "ConflictError",
    "ForbiddenError",
    "NotFoundError",
    "UnauthorizedError",
    "ValidationError",
]
|
|
||||||
@@ -1,164 +0,0 @@
|
|||||||
"""Structured JSON logging configuration with OpenTelemetry integration."""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import sys
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from app.config import settings
|
|
||||||
|
|
||||||
|
|
||||||
class JSONFormatter(logging.Formatter):
    """
    JSON log formatter that outputs structured logs with trace context.

    Log format includes:
    - timestamp: ISO 8601 format, taken from the record's creation time
    - level: Log level name
    - message: Log message
    - logger: Logger name
    - trace_id: OpenTelemetry trace ID (if available)
    - span_id: OpenTelemetry span ID (if available)
    - Extra fields from log record
    """

    def format(self, record: logging.LogRecord) -> str:
        log_data: dict[str, Any] = {
            # Fix: use record.created rather than datetime.now() — formatting
            # can run later than emission (e.g. queued handlers), and the
            # timestamp should reflect when the event actually occurred.
            "timestamp": datetime.fromtimestamp(record.created, timezone.utc).isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "logger": record.name,
        }

        # Add trace context if available (injected by OpenTelemetry
        # LoggingInstrumentor); "0" means no active span, so omit it.
        if hasattr(record, "otelTraceID") and record.otelTraceID != "0":
            log_data["trace_id"] = record.otelTraceID
        if hasattr(record, "otelSpanID") and record.otelSpanID != "0":
            log_data["span_id"] = record.otelSpanID

        # Add exception info if present
        if record.exc_info:
            log_data["exception"] = self.formatException(record.exc_info)

        # Copy user-supplied `extra` fields, skipping LogRecord's built-in
        # attributes and the OTel-injected ones already handled above.
        standard_attrs = {
            "name",
            "msg",
            "args",
            "created",
            "filename",
            "funcName",
            "levelname",
            "levelno",
            "lineno",
            "module",
            "msecs",
            "pathname",
            "process",
            "processName",
            "relativeCreated",
            "stack_info",
            "exc_info",
            "exc_text",
            "thread",
            "threadName",
            "taskName",
            "message",
            "otelTraceID",
            "otelSpanID",
            "otelTraceSampled",
            "otelServiceName",
        }
        for key, value in record.__dict__.items():
            if key not in standard_attrs and not key.startswith("_"):
                log_data[key] = value

        # default=str lets arbitrary extra values (UUIDs, datetimes) serialize.
        return json.dumps(log_data, default=str)
|
|
||||||
|
|
||||||
|
|
||||||
class DevelopmentFormatter(logging.Formatter):
    """
    Human-readable formatter for development with color support.

    Format: [TIME] LEVEL logger - message [trace_id]
    """

    # ANSI escape per level; unknown levels render uncolored.
    COLORS = {
        "DEBUG": "\033[36m",  # Cyan
        "INFO": "\033[32m",  # Green
        "WARNING": "\033[33m",  # Yellow
        "ERROR": "\033[31m",  # Red
        "CRITICAL": "\033[35m",  # Magenta
    }
    RESET = "\033[0m"

    def format(self, record: logging.LogRecord) -> str:
        color = self.COLORS.get(record.levelname, "")
        reset = self.RESET

        # Fix: derive the printed time from record.created (not datetime.now())
        # so it matches when the event was emitted; trim microseconds to ms.
        created = datetime.fromtimestamp(record.created, timezone.utc)
        timestamp = created.strftime("%H:%M:%S.%f")[:-3]

        # Build message
        msg = f"[{timestamp}] {color}{record.levelname:8}{reset} {record.name} - {record.getMessage()}"

        # Abbreviated trace id for correlating with the tracing backend.
        if hasattr(record, "otelTraceID") and record.otelTraceID != "0":
            msg += f" [{record.otelTraceID[:8]}...]"

        # Add exception if present
        if record.exc_info:
            msg += f"\n{self.formatException(record.exc_info)}"

        return msg
|
|
||||||
|
|
||||||
|
|
||||||
def setup_logging() -> None:
    """
    Configure application logging.

    - JSON format in production (OTEL enabled)
    - Human-readable format in development
    - Integrates with OpenTelemetry trace context
    """
    # Resolve the configured level name, defaulting to INFO on bad values.
    level = getattr(logging, settings.otel_log_level.upper(), logging.INFO)

    # JSON in production (OTEL on, debug off); colorized text otherwise.
    use_json = settings.otel_enabled and not settings.debug
    formatter: logging.Formatter = JSONFormatter() if use_json else DevelopmentFormatter()

    root = logging.getLogger()
    root.setLevel(level)

    # Drop any handlers installed before us (copy first, then mutate).
    for existing in list(root.handlers):
        root.removeHandler(existing)

    # Single stdout handler carrying the chosen formatter.
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setFormatter(formatter)
    root.addHandler(stream_handler)

    # Keep uvicorn access logs at INFO so requests are still logged;
    # quiet the chattier third-party libraries.
    logging.getLogger("uvicorn.access").setLevel(logging.INFO)
    for noisy in ("asyncpg", "httpx", "httpcore"):
        logging.getLogger(noisy).setLevel(logging.WARNING)

    logging.info(
        "Logging configured",
        extra={
            "log_level": settings.otel_log_level,
            "format": "json" if use_json else "dev",
        },
    )
|
|
||||||
|
|
||||||
|
|
||||||
def get_logger(name: str) -> logging.Logger:
    """Thin convenience wrapper around logging.getLogger for app modules."""
    return logging.getLogger(name)
|
|
||||||
@@ -1,106 +0,0 @@
|
|||||||
"""Security utilities for JWT and password hashing."""
|
|
||||||
|
|
||||||
import hashlib
|
|
||||||
import secrets
|
|
||||||
from datetime import UTC, datetime, timedelta
|
|
||||||
from typing import Any
|
|
||||||
from uuid import UUID, uuid4
|
|
||||||
|
|
||||||
import bcrypt
|
|
||||||
from jose import JWTError, jwt
|
|
||||||
|
|
||||||
from app.config import settings
|
|
||||||
|
|
||||||
|
|
||||||
def hash_password(password: str) -> str:
    """Hash a plaintext password with bcrypt (fresh salt per call)."""
    salt = bcrypt.gensalt()
    digest = bcrypt.hashpw(password.encode(), salt)
    return digest.decode()
|
|
||||||
|
|
||||||
|
|
||||||
def verify_password(plain_password: str, hashed_password: str) -> bool:
    """Check a plaintext password against a stored bcrypt hash."""
    candidate = plain_password.encode()
    stored = hashed_password.encode()
    return bcrypt.checkpw(candidate, stored)
|
|
||||||
|
|
||||||
|
|
||||||
def create_access_token(
    sub: str,
    org_id: str,
    org_role: str,
    expires_delta: timedelta | None = None,
) -> str:
    """Create a JWT access token with org context.

    Args:
        sub: Subject (user id) the token is issued for.
        org_id: Organization the token is scoped to.
        org_role: Caller's role within that organization.
        expires_delta: Custom lifetime; defaults to the configured
            access-token expiry when None.
    """
    # `is None` (not truthiness) so an explicit zero delta is honored.
    if expires_delta is None:
        expires_delta = timedelta(minutes=settings.access_token_expire_minutes)

    issued_at = datetime.now(UTC)

    claims = {
        "sub": sub,
        "org_id": org_id,
        "org_role": org_role,
        "iss": settings.jwt_issuer,
        "aud": settings.jwt_audience,
        "jti": str(uuid4()),  # unique per-token id
        "iat": issued_at,
        "exp": issued_at + expires_delta,
    }

    return jwt.encode(claims, settings.jwt_secret_key, algorithm=settings.jwt_algorithm)
|
|
||||||
|
|
||||||
|
|
||||||
def decode_access_token(token: str) -> dict[str, Any]:
    """Decode and validate a JWT access token.

    Signature, issuer, and audience are all checked against settings.

    Raises:
        JWTError: If token is invalid or expired.
    """
    decoded: dict[str, Any] = jwt.decode(
        token,
        settings.jwt_secret_key,
        algorithms=[settings.jwt_algorithm],
        audience=settings.jwt_audience,
        issuer=settings.jwt_issuer,
    )
    return decoded
|
|
||||||
|
|
||||||
|
|
||||||
def generate_refresh_token() -> str:
    """Generate a cryptographically secure, URL-safe refresh token.

    Returns:
        A 43-character base64url string backed by 32 bytes of entropy.
    """
    return secrets.token_urlsafe(32)
|
|
||||||
|
|
||||||
|
|
||||||
def hash_token(token: str) -> str:
    """Return the hex SHA-256 digest of a refresh token for at-rest storage."""
    digest = hashlib.sha256(token.encode())
    return digest.hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
def get_refresh_token_expiry() -> datetime:
    """Compute the UTC expiry timestamp for a refresh token issued now."""
    lifetime = timedelta(days=settings.refresh_token_expire_days)
    return datetime.now(UTC) + lifetime
|
|
||||||
|
|
||||||
|
|
||||||
class TokenPayload:
    """Typed view over a decoded JWT payload (as returned by decode_access_token)."""

    def __init__(self, payload: dict[str, Any]) -> None:
        # UUID-valued claims.
        self.user_id = UUID(payload["sub"])
        self.org_id = UUID(payload["org_id"])
        self.jti = UUID(payload["jti"])
        # Plain claims, kept as-is.
        self.org_role = payload["org_role"]
        self.issuer = payload["iss"]
        self.audience = payload["aud"]
        self.issued_at = payload["iat"]
        self.expires_at = payload["exp"]
|
|
||||||
|
|
||||||
|
|
||||||
# Public export surface of this module, kept alphabetical after the
# re-exported JWTError (so callers need not import jose directly).
__all__ = [
    "JWTError",
    "TokenPayload",
    "create_access_token",
    "decode_access_token",
    "generate_refresh_token",
    "get_refresh_token_expiry",
    "hash_password",
    "hash_token",
    "verify_password",
]
|
|
||||||
@@ -1,271 +0,0 @@
|
|||||||
"""OpenTelemetry instrumentation for tracing, metrics, and logging."""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from contextlib import contextmanager
|
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from opentelemetry import metrics, trace
|
|
||||||
from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter
|
|
||||||
from opentelemetry.exporter.prometheus import PrometheusMetricReader
|
|
||||||
from opentelemetry.instrumentation.asyncpg import AsyncPGInstrumentor
|
|
||||||
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
|
|
||||||
from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor
|
|
||||||
from opentelemetry.instrumentation.logging import LoggingInstrumentor
|
|
||||||
from opentelemetry.instrumentation.redis import RedisInstrumentor
|
|
||||||
from opentelemetry.instrumentation.system_metrics import SystemMetricsInstrumentor
|
|
||||||
from opentelemetry.sdk.metrics import MeterProvider
|
|
||||||
from opentelemetry.sdk.resources import Resource
|
|
||||||
from opentelemetry.sdk.trace import TracerProvider
|
|
||||||
from opentelemetry.sdk.trace.export import BatchSpanProcessor, ConsoleSpanExporter
|
|
||||||
from opentelemetry.semconv.resource import ResourceAttributes
|
|
||||||
from prometheus_client import REGISTRY, start_http_server
|
|
||||||
|
|
||||||
from app.config import settings
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)

# Provider handles kept at module scope so shutdown_telemetry() can
# flush and release them later.
_tracer_provider: TracerProvider | None = None
_meter_provider: MeterProvider | None = None

# Custom metrics
# Populated by setup_telemetry(); the record_* helpers below silently
# no-op while these are still None.
_request_counter = None
_request_duration = None
_active_requests = None
_error_counter = None
|
|
||||||
|
|
||||||
|
|
||||||
def setup_telemetry(app: Any) -> None:
    """
    Initialize OpenTelemetry with tracing, metrics, and logging instrumentation.

    Configures:
    - OTLP exporter for traces (to Tempo/Jaeger)
    - Prometheus exporter for metrics (scraped by Prometheus)
    - Auto-instrumentation for FastAPI, asyncpg, httpx, redis
    - System metrics (CPU, memory, etc.)
    - Logging instrumentation for trace context injection
    """
    global _tracer_provider, _meter_provider
    global _request_counter, _request_duration, _active_requests, _error_counter

    # Respect the kill switch: leave all globals as None so the record_*
    # helpers below stay no-ops.
    if not settings.otel_enabled:
        logger.info("OpenTelemetry disabled")
        return

    # Create resource with service info
    resource = Resource.create(
        {
            ResourceAttributes.SERVICE_NAME: settings.otel_service_name,
            ResourceAttributes.SERVICE_VERSION: "0.1.0",
            ResourceAttributes.DEPLOYMENT_ENVIRONMENT: settings.otel_environment,
        }
    )

    # =========================================
    # TRACING SETUP
    # =========================================
    _tracer_provider = TracerProvider(resource=resource)

    if settings.otel_exporter_otlp_endpoint:
        otlp_exporter = OTLPSpanExporter(
            endpoint=settings.otel_exporter_otlp_endpoint,
            insecure=settings.otel_exporter_otlp_insecure,
        )
        _tracer_provider.add_span_processor(BatchSpanProcessor(otlp_exporter))
        logger.info(f"OTLP exporter configured: {settings.otel_exporter_otlp_endpoint}")
    else:
        # No collector configured: dump spans to stdout so traces are
        # still visible during local development.
        _tracer_provider.add_span_processor(BatchSpanProcessor(ConsoleSpanExporter()))
        logger.info("Console span exporter configured (no OTLP endpoint)")

    trace.set_tracer_provider(_tracer_provider)

    # =========================================
    # METRICS SETUP
    # =========================================
    # Prometheus metric reader exposes metrics at /metrics endpoint
    prometheus_reader = PrometheusMetricReader()
    _meter_provider = MeterProvider(resource=resource, metric_readers=[prometheus_reader])
    metrics.set_meter_provider(_meter_provider)

    # Start Prometheus HTTP server on port 9464
    prometheus_port = settings.prometheus_port
    try:
        start_http_server(port=prometheus_port, registry=REGISTRY)
        logger.info(f"Prometheus metrics server started on port {prometheus_port}")
    except OSError as e:
        # Bind failure (likely port already in use -- TODO confirm) is
        # logged but does not abort application startup.
        logger.warning(f"Could not start Prometheus server on port {prometheus_port}: {e}")

    # Create custom metrics
    meter = metrics.get_meter(__name__)

    _request_counter = meter.create_counter(
        name="http_requests_total",
        description="Total number of HTTP requests",
        unit="1",
    )

    _request_duration = meter.create_histogram(
        name="http_request_duration_seconds",
        description="HTTP request duration in seconds",
        unit="s",
    )

    _active_requests = meter.create_up_down_counter(
        name="http_requests_active",
        description="Number of active HTTP requests",
        unit="1",
    )

    _error_counter = meter.create_counter(
        name="http_errors_total",
        description="Total number of HTTP errors",
        unit="1",
    )

    # Instrument system metrics (CPU, memory, etc.)
    SystemMetricsInstrumentor().instrument()
    logger.info("System metrics instrumentation enabled")

    # =========================================
    # LIBRARY INSTRUMENTATION
    # =========================================
    # Health probes and the metrics endpoint are excluded from tracing.
    FastAPIInstrumentor.instrument_app(
        app,
        excluded_urls="healthz,readyz,metrics",
        tracer_provider=_tracer_provider,
        meter_provider=_meter_provider,
    )
    AsyncPGInstrumentor().instrument(tracer_provider=_tracer_provider)
    HTTPXClientInstrumentor().instrument(tracer_provider=_tracer_provider)
    RedisInstrumentor().instrument(tracer_provider=_tracer_provider)

    # Inject trace context into logs
    LoggingInstrumentor().instrument(
        set_logging_format=True,
        log_level=logging.INFO,
    )

    logger.info(
        f"OpenTelemetry initialized: service={settings.otel_service_name}, "
        f"env={settings.otel_environment}, metrics_port={prometheus_port}"
    )
|
|
||||||
|
|
||||||
|
|
||||||
async def shutdown_telemetry() -> None:
    """Flush and release the global tracer/meter providers (safe to call twice)."""
    global _tracer_provider, _meter_provider

    if _tracer_provider is not None:
        _tracer_provider.shutdown()
        _tracer_provider = None
        logger.info("Tracer provider shutdown complete")

    if _meter_provider is not None:
        _meter_provider.shutdown()
        _meter_provider = None
        logger.info("Meter provider shutdown complete")
|
|
||||||
|
|
||||||
|
|
||||||
def get_tracer(name: str) -> trace.Tracer:
    """Return a named tracer for manual span creation."""
    return trace.get_tracer(name)
|
|
||||||
|
|
||||||
|
|
||||||
def get_meter(name: str) -> metrics.Meter:
    """Return a named meter for defining custom metrics."""
    return metrics.get_meter(name)
|
|
||||||
|
|
||||||
|
|
||||||
def get_current_trace_id() -> str | None:
    """Return the active span's trace ID as 32 hex chars, or None if no valid span."""
    current = trace.get_current_span()
    if not current:
        return None
    ctx = current.get_span_context()
    if not ctx.is_valid:
        return None
    return f"{ctx.trace_id:032x}"
|
|
||||||
|
|
||||||
|
|
||||||
def get_current_span_id() -> str | None:
    """Return the active span's ID as 16 hex chars, or None if no valid span."""
    current = trace.get_current_span()
    if not current:
        return None
    ctx = current.get_span_context()
    if not ctx.is_valid:
        return None
    return f"{ctx.span_id:016x}"
|
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
def create_span(name: str, attributes: dict[str, Any] | None = None):
    """Context manager that opens a manual span and yields it to the caller."""
    with get_tracer(__name__).start_as_current_span(name, attributes=attributes) as span:
        yield span
|
|
||||||
|
|
||||||
|
|
||||||
def add_span_attributes(attributes: dict[str, Any]) -> None:
    """Attach each key/value pair to the currently active span, if any."""
    current = trace.get_current_span()
    if not current:
        return
    for attr_name, attr_value in attributes.items():
        current.set_attribute(attr_name, attr_value)
|
|
||||||
|
|
||||||
|
|
||||||
def record_exception(exception: Exception) -> None:
    """Record an exception on the active span and mark the span as errored."""
    current = trace.get_current_span()
    if not current:
        return
    current.record_exception(exception)
    current.set_status(trace.Status(trace.StatusCode.ERROR, str(exception)))
|
|
||||||
|
|
||||||
|
|
||||||
# =========================================
|
|
||||||
# CUSTOM METRICS HELPERS
|
|
||||||
# =========================================
|
|
||||||
|
|
||||||
|
|
||||||
def record_request(method: str, endpoint: str, status_code: int) -> None:
    """Count one HTTP request, labelled by method, endpoint, and status code.

    No-op until setup_telemetry() has created the counter.
    """
    if _request_counter is None:
        return
    labels = {
        "method": method,
        "endpoint": endpoint,
        "status_code": str(status_code),
    }
    _request_counter.add(1, labels)
|
|
||||||
|
|
||||||
|
|
||||||
def record_request_duration(method: str, endpoint: str, duration: float) -> None:
    """Record one latency observation (seconds) for method/endpoint.

    No-op until setup_telemetry() has created the histogram.
    """
    if _request_duration is None:
        return
    _request_duration.record(duration, {"method": method, "endpoint": endpoint})
|
|
||||||
|
|
||||||
|
|
||||||
def increment_active_requests(method: str, endpoint: str) -> None:
    """Bump the in-flight request gauge for this method/endpoint."""
    if _active_requests is None:
        return
    _active_requests.add(1, {"method": method, "endpoint": endpoint})
|
|
||||||
|
|
||||||
|
|
||||||
def decrement_active_requests(method: str, endpoint: str) -> None:
    """Drop the in-flight request gauge for this method/endpoint."""
    if _active_requests is None:
        return
    _active_requests.add(-1, {"method": method, "endpoint": endpoint})
|
|
||||||
|
|
||||||
|
|
||||||
def record_error(method: str, endpoint: str, error_type: str) -> None:
    """Count one failed request, labelled by method, endpoint, and error type.

    No-op until setup_telemetry() has created the counter.
    """
    if _error_counter is None:
        return
    labels = {
        "method": method,
        "endpoint": endpoint,
        "error_type": error_type,
    }
    _error_counter.add(1, labels)
|
|
||||||
@@ -1,74 +0,0 @@
|
|||||||
"""Database connection management using asyncpg."""
|
|
||||||
|
|
||||||
from collections.abc import AsyncGenerator
|
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
from contextvars import ContextVar
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
from asyncpg.pool import PoolConnectionProxy
|
|
||||||
|
|
||||||
|
|
||||||
class Database:
    """Owns the asyncpg connection pool and hands out connections."""

    # None until connect() succeeds.
    pool: asyncpg.Pool | None = None

    async def connect(self, dsn: str) -> None:
        """Open the pool (5-20 connections, 60s command timeout)."""
        self.pool = await asyncpg.create_pool(
            dsn,
            min_size=5,
            max_size=20,
            command_timeout=60,
        )

    async def disconnect(self) -> None:
        """Close the pool if it was ever opened."""
        if self.pool is not None:
            await self.pool.close()

    @asynccontextmanager
    async def connection(self) -> AsyncGenerator[asyncpg.Connection | PoolConnectionProxy, None]:
        """Yield a pooled connection, returning it to the pool on exit."""
        if self.pool is None:
            raise RuntimeError("Database not connected")
        async with self.pool.acquire() as conn:
            yield conn

    @asynccontextmanager
    async def transaction(self) -> AsyncGenerator[asyncpg.Connection | PoolConnectionProxy, None]:
        """Yield a pooled connection wrapped in an open transaction."""
        if self.pool is None:
            raise RuntimeError("Database not connected")
        async with self.pool.acquire() as conn, conn.transaction():
            yield conn
|
|
||||||
|
|
||||||
|
|
||||||
# Global instance shared by the whole application.
db = Database()


# Per-async-context holder for the checked-out connection, so nested
# get_conn() dependencies share one connection instead of acquiring more.
_connection_ctx: ContextVar[asyncpg.Connection | PoolConnectionProxy | None] = ContextVar(
    "db_connection",
    default=None,
)
|
|
||||||
|
|
||||||
|
|
||||||
async def get_conn() -> AsyncGenerator[asyncpg.Connection | PoolConnectionProxy, None]:
    """Dependency that reuses the same DB connection within a request context."""

    # If an outer dependency in this context already checked out a
    # connection, hand back the same one instead of acquiring a second.
    existing_conn = _connection_ctx.get()
    if existing_conn is not None:
        yield existing_conn
        return

    if not db.pool:
        raise RuntimeError("Database not connected")

    async with db.pool.acquire() as conn:
        # Publish the connection for nested get_conn() calls; the token
        # lets us restore the previous ContextVar state afterwards.
        token = _connection_ctx.set(conn)
        try:
            yield conn
        finally:
            # Always reset before the connection goes back to the pool so
            # no later code sees a stale (returned) connection.
            _connection_ctx.reset(token)
|
|
||||||
-282
@@ -1,282 +0,0 @@
|
|||||||
"""FastAPI application entry point."""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import time
|
|
||||||
from contextlib import asynccontextmanager
|
|
||||||
from typing import AsyncGenerator
|
|
||||||
|
|
||||||
from fastapi import FastAPI, Request, status
|
|
||||||
from fastapi.encoders import jsonable_encoder
|
|
||||||
from fastapi.exceptions import RequestValidationError
|
|
||||||
from fastapi.openapi.utils import get_openapi
|
|
||||||
from fastapi.responses import JSONResponse
|
|
||||||
from starlette.exceptions import HTTPException as StarletteHTTPException
|
|
||||||
|
|
||||||
from app.api.v1 import auth, health, incidents, org
|
|
||||||
from app.config import settings
|
|
||||||
from app.core.logging import setup_logging
|
|
||||||
from app.core.telemetry import (
|
|
||||||
get_current_trace_id,
|
|
||||||
record_exception,
|
|
||||||
setup_telemetry,
|
|
||||||
shutdown_telemetry,
|
|
||||||
)
|
|
||||||
from app.db import db
|
|
||||||
from app.schemas.common import ErrorDetail, ErrorResponse
|
|
||||||
from app.taskqueue import task_queue
|
|
||||||
|
|
||||||
# Initialize logging before anything else
|
|
||||||
setup_logging()
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Manage application lifecycle - connect/disconnect resources."""
    # Startup: DB pool first, then the task queue.
    logger.info("Starting IncidentOps API")
    await db.connect(settings.database_url)
    await task_queue.startup()
    logger.info("Startup complete")
    yield
    # Shutdown: reverse order of startup, with telemetry flushed last.
    logger.info("Shutting down IncidentOps API")
    await task_queue.shutdown()
    await db.disconnect()
    await shutdown_telemetry()
    logger.info("Shutdown complete")
|
|
||||||
|
|
||||||
|
|
||||||
# Application instance; `lifespan` wires DB/task-queue startup and shutdown.
app = FastAPI(
    title="IncidentOps",
    description="Incident management API with multi-tenant org support",
    version="0.1.0",
    docs_url="/docs",
    redoc_url="/redoc",
    openapi_url="/openapi.json",
    lifespan=lifespan,
)

# Set up OpenTelemetry instrumentation
# (returns immediately when settings.otel_enabled is False).
setup_telemetry(app)
|
|
||||||
|
|
||||||
|
|
||||||
@app.middleware("http")
async def request_logging_middleware(request: Request, call_next):
    """Log one structured line per request: method, path, status, duration."""
    # Fix: perf_counter is monotonic, so measured durations are immune to
    # wall-clock adjustments (NTP/DST) that affect time.time().
    start = time.perf_counter()
    response = await call_next(request)
    duration_ms = (time.perf_counter() - start) * 1000
    logger.info(
        "request",
        extra={
            "method": request.method,
            "path": request.url.path,
            "status_code": response.status_code,
            "duration_ms": round(duration_ms, 2),
        },
    )
    return response
|
|
||||||
|
|
||||||
# Tag metadata shown in the generated OpenAPI docs.
app.openapi_tags = [
    {"name": "auth", "description": "Registration, login, token lifecycle"},
    {"name": "org", "description": "Organization membership, services, and notifications"},
    {"name": "incidents", "description": "Incident lifecycle and timelines"},
    {"name": "health", "description": "Service health probes"},
]
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Global Exception Handlers
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def _build_error_response(
    error: str,
    message: str,
    status_code: int,
    details: list[ErrorDetail] | None = None,
) -> JSONResponse:
    """Serialize an ErrorResponse envelope (tagged with the current trace id)."""
    body = ErrorResponse(
        error=error,
        message=message,
        details=details,
        request_id=get_current_trace_id(),
    )
    return JSONResponse(status_code=status_code, content=jsonable_encoder(body))
|
|
||||||
|
|
||||||
|
|
||||||
@app.exception_handler(StarletteHTTPException)
async def http_exception_handler(
    request: Request, exc: StarletteHTTPException
) -> JSONResponse:
    """Translate HTTPExceptions into the structured error envelope."""
    # Status code -> machine-readable error kind; unknown codes map to "error".
    status_to_error = {
        400: "bad_request",
        401: "unauthorized",
        403: "forbidden",
        404: "not_found",
        409: "conflict",
        422: "validation_error",
        429: "rate_limited",
        500: "internal_error",
        502: "bad_gateway",
        503: "service_unavailable",
    }
    error_kind = status_to_error.get(exc.status_code, "error")

    logger.warning(
        "HTTP exception",
        extra={
            "status_code": exc.status_code,
            "error": error_kind,
            "detail": exc.detail,
            "path": str(request.url.path),
            "method": request.method,
        },
    )

    return _build_error_response(
        error=error_kind,
        message=str(exc.detail),
        status_code=exc.status_code,
    )
|
|
||||||
|
|
||||||
|
|
||||||
@app.exception_handler(RequestValidationError)
async def validation_exception_handler(
    request: Request, exc: RequestValidationError
) -> JSONResponse:
    """Render Pydantic validation failures with per-field detail entries."""
    details: list[ErrorDetail] = []
    for err in exc.errors():
        details.append(
            ErrorDetail(
                loc=[str(part) for part in err["loc"]],
                msg=err["msg"],
                type=err["type"],
            )
        )

    logger.warning(
        "Validation error",
        extra={
            "path": str(request.url.path),
            "method": request.method,
            "error_count": len(details),
        },
    )

    return _build_error_response(
        error="validation_error",
        message="Request validation failed",
        status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
        details=details,
    )
|
|
||||||
|
|
||||||
|
|
||||||
@app.exception_handler(Exception)
async def unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Handle unexpected exceptions with logging and safe error response."""
    # Attach the exception to the current tracing span before anything else.
    record_exception(exc)

    exc_name = type(exc).__name__
    logger.exception(
        "Unhandled exception",
        extra={
            "path": str(request.url.path),
            "method": request.method,
            "exception_type": exc_name,
        },
    )

    # Only expose the real exception text in debug mode; production clients
    # get a generic message so internals never leak.
    message = f"{exc_name}: {exc}" if settings.debug else "An unexpected error occurred"

    return _build_error_response(
        error="internal_error",
        message=message,
        status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
    )
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# OpenAPI Customization
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
def custom_openapi() -> dict:
    """Add JWT bearer security scheme and error responses to OpenAPI schema."""
    # Serve the cached schema if it was already built.
    if app.openapi_schema:
        return app.openapi_schema

    schema = get_openapi(
        title=app.title,
        version=app.version,
        description=app.description,
        routes=app.routes,
        tags=app.openapi_tags,
    )

    components = schema.setdefault("components", {})

    # Register the JWT bearer auth scheme and require it globally.
    components.setdefault("securitySchemes", {})["BearerToken"] = {
        "type": "http",
        "scheme": "bearer",
        "bearerFormat": "JWT",
        "description": "Paste the JWT access token returned by /auth endpoints",
    }
    schema["security"] = [{"BearerToken": []}]

    # Shared error response shapes referenced by endpoint documentation.
    error_response_schema = {
        "type": "object",
        "properties": {
            "error": {"type": "string", "description": "Error type identifier"},
            "message": {"type": "string", "description": "Human-readable error message"},
            "details": {
                "type": "array",
                "items": {"$ref": "#/components/schemas/ErrorDetail"},
                "nullable": True,
                "description": "Validation error details",
            },
            "request_id": {
                "type": "string",
                "nullable": True,
                "description": "Trace ID for debugging",
            },
        },
        "required": ["error", "message"],
    }
    error_detail_schema = {
        "type": "object",
        "properties": {
            "loc": {
                "type": "array",
                "items": {"oneOf": [{"type": "string"}, {"type": "integer"}]},
                "description": "Error location path",
            },
            "msg": {"type": "string", "description": "Error message"},
            "type": {"type": "string", "description": "Error type"},
        },
        "required": ["loc", "msg", "type"],
    }
    schemas = components.setdefault("schemas", {})
    schemas["ErrorResponse"] = error_response_schema
    schemas["ErrorDetail"] = error_detail_schema

    # Cache so subsequent calls return the same object without rebuilding.
    app.openapi_schema = schema
    return app.openapi_schema
|
|
||||||
|
|
||||||
|
|
||||||
# Install the custom schema builder in place of FastAPI's default generator.
app.openapi = custom_openapi  # type: ignore[assignment]

# Include routers
# All routers are mounted under the versioned API prefix from settings.
app.include_router(auth.router, prefix=settings.api_v1_prefix)
app.include_router(incidents.router, prefix=settings.api_v1_prefix)
app.include_router(org.router, prefix=settings.api_v1_prefix)
app.include_router(health.router, prefix=settings.api_v1_prefix, tags=["health"])
|
|
||||||
@@ -1,17 +0,0 @@
|
|||||||
"""Repository layer for database operations."""
|
|
||||||
|
|
||||||
from app.repositories.incident import IncidentRepository
|
|
||||||
from app.repositories.notification import NotificationRepository
|
|
||||||
from app.repositories.org import OrgRepository
|
|
||||||
from app.repositories.refresh_token import RefreshTokenRepository
|
|
||||||
from app.repositories.service import ServiceRepository
|
|
||||||
from app.repositories.user import UserRepository
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"IncidentRepository",
|
|
||||||
"NotificationRepository",
|
|
||||||
"OrgRepository",
|
|
||||||
"RefreshTokenRepository",
|
|
||||||
"ServiceRepository",
|
|
||||||
"UserRepository",
|
|
||||||
]
|
|
||||||
@@ -1,161 +0,0 @@
|
|||||||
"""Incident repository for database operations."""
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
from typing import Any
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
|
|
||||||
|
|
||||||
class IncidentRepository:
    """Database operations for incidents.

    All methods go through the asyncpg connection supplied at construction
    time and return plain dicts built from the fetched rows.
    """

    def __init__(self, conn: asyncpg.Connection) -> None:
        # The caller owns the connection lifecycle (and any transaction).
        self.conn = conn

    async def create(
        self,
        incident_id: UUID,
        org_id: UUID,
        service_id: UUID,
        title: str,
        description: str | None,
        severity: str,
    ) -> dict:
        """Create a new incident.

        New incidents always start in the 'triggered' status.

        Returns:
            The inserted incident row as a dict.
        """
        row = await self.conn.fetchrow(
            """
            INSERT INTO incidents (id, org_id, service_id, title, description, status, severity)
            VALUES ($1, $2, $3, $4, $5, 'triggered', $6)
            RETURNING id, org_id, service_id, title, description, status, severity,
                      version, created_at, updated_at
            """,
            incident_id,
            org_id,
            service_id,
            title,
            description,
            severity,
        )
        return dict(row)

    async def get_by_id(self, incident_id: UUID) -> dict | None:
        """Get incident by ID, or None if it does not exist."""
        row = await self.conn.fetchrow(
            """
            SELECT id, org_id, service_id, title, description, status, severity,
                   version, created_at, updated_at
            FROM incidents
            WHERE id = $1
            """,
            incident_id,
        )
        return dict(row) if row else None

    async def get_by_org(
        self,
        org_id: UUID,
        status: str | None = None,
        cursor: datetime | None = None,
        limit: int = 20,
    ) -> list[dict]:
        """Get incidents for an organization with optional filtering and pagination.

        Args:
            org_id: Organization to query.
            status: If given, only incidents with this status.
            cursor: If given, only incidents created strictly before this
                timestamp (keyset pagination over created_at DESC).
            limit: Page size; one extra row is fetched so the caller can
                detect whether a further page exists.

        Returns:
            Up to limit + 1 incident dicts, newest first.
        """
        query = """
            SELECT id, org_id, service_id, title, description, status, severity,
                   version, created_at, updated_at
            FROM incidents
            WHERE org_id = $1
        """
        params: list[Any] = [org_id]
        param_idx = 2

        # Only static SQL fragments are appended below; all values travel
        # through `params` as numbered placeholders (no injection risk).
        if status:
            query += f" AND status = ${param_idx}"
            params.append(status)
            param_idx += 1

        if cursor:
            query += f" AND created_at < ${param_idx}"
            params.append(cursor)
            param_idx += 1

        query += f" ORDER BY created_at DESC LIMIT ${param_idx}"
        params.append(limit + 1)  # Fetch one extra to check if there are more

        rows = await self.conn.fetch(query, *params)
        return [dict(row) for row in rows]

    async def update_status(
        self,
        incident_id: UUID,
        new_status: str,
        expected_version: int,
    ) -> dict | None:
        """Update incident status with optimistic locking.

        The UPDATE only applies while the stored version still matches
        expected_version; the version counter is bumped atomically.

        Returns:
            Updated incident if successful, None if version mismatch.
        """
        row = await self.conn.fetchrow(
            """
            UPDATE incidents
            SET status = $2, version = version + 1, updated_at = now()
            WHERE id = $1 AND version = $3
            RETURNING id, org_id, service_id, title, description, status, severity,
                      version, created_at, updated_at
            """,
            incident_id,
            new_status,
            expected_version,
        )
        return dict(row) if row else None

    async def add_event(
        self,
        event_id: UUID,
        incident_id: UUID,
        event_type: str,
        actor_user_id: UUID | None,
        payload: dict[str, Any] | None,
    ) -> dict:
        """Add an event to the incident timeline.

        The payload dict is serialized to JSON text for storage and decoded
        back to a dict on the returned row.
        """
        import json

        row = await self.conn.fetchrow(
            """
            INSERT INTO incident_events (id, incident_id, event_type, actor_user_id, payload)
            VALUES ($1, $2, $3, $4, $5)
            RETURNING id, incident_id, event_type, actor_user_id, payload, created_at
            """,
            event_id,
            incident_id,
            event_type,
            actor_user_id,
            # Bug fix: compare against None (not truthiness) so an explicit
            # empty payload {} is stored as '{}' instead of collapsing to NULL.
            json.dumps(payload) if payload is not None else None,
        )
        result = dict(row)

        # Parse JSON payload back to dict (NULL columns stay None).
        if result["payload"] is not None:
            result["payload"] = json.loads(result["payload"])
        return result

    async def get_events(self, incident_id: UUID) -> list[dict]:
        """Get all events for an incident, oldest first."""
        import json

        rows = await self.conn.fetch(
            """
            SELECT id, incident_id, event_type, actor_user_id, payload, created_at
            FROM incident_events
            WHERE incident_id = $1
            ORDER BY created_at
            """,
            incident_id,
        )
        results = []
        for row in rows:
            result = dict(row)
            # Decode stored JSON text; NULL payloads stay None.
            if result["payload"] is not None:
                result["payload"] = json.loads(result["payload"])
            results.append(result)
        return results
|
|
||||||
@@ -1,199 +0,0 @@
|
|||||||
"""Notification repository for database operations."""
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
|
|
||||||
|
|
||||||
class NotificationRepository:
    """Database operations for notification targets and attempts.

    All methods go through the asyncpg connection supplied at construction
    time and return plain dicts built from the fetched rows.
    """

    def __init__(self, conn: asyncpg.Connection) -> None:
        # The caller owns the connection lifecycle (and any transaction).
        self.conn = conn

    async def create_target(
        self,
        target_id: UUID,
        org_id: UUID,
        name: str,
        target_type: str,
        webhook_url: str | None = None,
        enabled: bool = True,
    ) -> dict:
        """Create a new notification target.

        Returns:
            The inserted target row as a dict.
        """
        row = await self.conn.fetchrow(
            """
            INSERT INTO notification_targets (id, org_id, name, target_type, webhook_url, enabled)
            VALUES ($1, $2, $3, $4, $5, $6)
            RETURNING id, org_id, name, target_type, webhook_url, enabled, created_at
            """,
            target_id,
            org_id,
            name,
            target_type,
            webhook_url,
            enabled,
        )
        return dict(row)

    async def get_target_by_id(self, target_id: UUID) -> dict | None:
        """Get notification target by ID, or None if not found."""
        row = await self.conn.fetchrow(
            """
            SELECT id, org_id, name, target_type, webhook_url, enabled, created_at
            FROM notification_targets
            WHERE id = $1
            """,
            target_id,
        )
        return dict(row) if row else None

    async def get_targets_by_org(
        self,
        org_id: UUID,
        enabled_only: bool = False,
    ) -> list[dict]:
        """Get all notification targets for an organization.

        Args:
            org_id: Organization to query.
            enabled_only: If True, only return targets with enabled = true.

        Returns:
            Target dicts ordered by name.
        """
        query = """
            SELECT id, org_id, name, target_type, webhook_url, enabled, created_at
            FROM notification_targets
            WHERE org_id = $1
        """
        # Only static SQL fragments are appended; values stay parameterized.
        if enabled_only:
            query += " AND enabled = true"
        query += " ORDER BY name"

        rows = await self.conn.fetch(query, org_id)
        return [dict(row) for row in rows]

    async def update_target(
        self,
        target_id: UUID,
        name: str | None = None,
        webhook_url: str | None = None,
        enabled: bool | None = None,
    ) -> dict | None:
        """Update a notification target.

        Only the fields passed as non-None are updated. When no fields are
        supplied, the current row is returned unchanged.

        Returns:
            Updated target dict, or None if the target does not exist.
        """
        updates = []
        params: list = [target_id]
        param_idx = 2

        if name is not None:
            updates.append(f"name = ${param_idx}")
            params.append(name)
            param_idx += 1

        if webhook_url is not None:
            updates.append(f"webhook_url = ${param_idx}")
            params.append(webhook_url)
            param_idx += 1

        if enabled is not None:
            updates.append(f"enabled = ${param_idx}")
            params.append(enabled)
            param_idx += 1

        if not updates:
            return await self.get_target_by_id(target_id)

        # `updates` holds only fixed column fragments built above, so the
        # f-string cannot introduce untrusted SQL.
        query = f"""
            UPDATE notification_targets
            SET {", ".join(updates)}
            WHERE id = $1
            RETURNING id, org_id, name, target_type, webhook_url, enabled, created_at
        """
        row = await self.conn.fetchrow(query, *params)
        return dict(row) if row else None

    async def delete_target(self, target_id: UUID) -> bool:
        """Delete a notification target. Returns True if deleted."""
        # Fix: use RETURNING for a robust existence check instead of parsing
        # the "DELETE n" status string, matching the convention stated in the
        # refresh-token repository ("Uses RETURNING for robust row counting
        # instead of string parsing").
        row = await self.conn.fetchrow(
            "DELETE FROM notification_targets WHERE id = $1 RETURNING id",
            target_id,
        )
        return row is not None

    async def create_attempt(
        self,
        attempt_id: UUID,
        incident_id: UUID,
        target_id: UUID,
    ) -> dict:
        """Create a notification attempt (idempotent via unique constraint).

        On conflict the existing row is returned unchanged (the no-op
        DO UPDATE makes RETURNING yield the pre-existing attempt).
        """
        row = await self.conn.fetchrow(
            """
            INSERT INTO notification_attempts (id, incident_id, target_id, status)
            VALUES ($1, $2, $3, 'pending')
            ON CONFLICT (incident_id, target_id) DO UPDATE SET id = notification_attempts.id
            RETURNING id, incident_id, target_id, status, error, sent_at, created_at
            """,
            attempt_id,
            incident_id,
            target_id,
        )
        return dict(row)

    async def get_attempt(self, incident_id: UUID, target_id: UUID) -> dict | None:
        """Get notification attempt for incident and target, or None."""
        row = await self.conn.fetchrow(
            """
            SELECT id, incident_id, target_id, status, error, sent_at, created_at
            FROM notification_attempts
            WHERE incident_id = $1 AND target_id = $2
            """,
            incident_id,
            target_id,
        )
        return dict(row) if row else None

    async def update_attempt_success(
        self,
        attempt_id: UUID,
        sent_at: datetime,
    ) -> dict | None:
        """Mark notification attempt as successful.

        Clears any previous error and records the delivery time.

        Returns:
            Updated attempt dict, or None if the attempt does not exist.
        """
        row = await self.conn.fetchrow(
            """
            UPDATE notification_attempts
            SET status = 'sent', sent_at = $2, error = NULL
            WHERE id = $1
            RETURNING id, incident_id, target_id, status, error, sent_at, created_at
            """,
            attempt_id,
            sent_at,
        )
        return dict(row) if row else None

    async def update_attempt_failure(
        self,
        attempt_id: UUID,
        error: str,
    ) -> dict | None:
        """Mark notification attempt as failed with the given error text.

        Returns:
            Updated attempt dict, or None if the attempt does not exist.
        """
        row = await self.conn.fetchrow(
            """
            UPDATE notification_attempts
            SET status = 'failed', error = $2
            WHERE id = $1
            RETURNING id, incident_id, target_id, status, error, sent_at, created_at
            """,
            attempt_id,
            error,
        )
        return dict(row) if row else None

    async def get_pending_attempts(self, incident_id: UUID) -> list[dict]:
        """Get all pending notification attempts for an incident.

        Each row is joined with its target so the caller has the delivery
        metadata (target_type, webhook_url, target_name) in one fetch.
        """
        rows = await self.conn.fetch(
            """
            SELECT na.id, na.incident_id, na.target_id, na.status, na.error,
                   na.sent_at, na.created_at,
                   nt.target_type, nt.webhook_url, nt.name as target_name
            FROM notification_attempts na
            JOIN notification_targets nt ON nt.id = na.target_id
            WHERE na.incident_id = $1 AND na.status = 'pending'
            """,
            incident_id,
        )
        return [dict(row) for row in rows]
|
|
||||||
@@ -1,125 +0,0 @@
|
|||||||
"""Organization repository for database operations."""
|
|
||||||
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
|
|
||||||
|
|
||||||
class OrgRepository:
    """Database operations for organizations and their memberships."""

    def __init__(self, conn: asyncpg.Connection) -> None:
        # Connection lifecycle is managed by the caller.
        self.conn = conn

    async def create(
        self,
        org_id: UUID,
        name: str,
        slug: str,
    ) -> dict:
        """Create a new organization."""
        sql = """
            INSERT INTO orgs (id, name, slug)
            VALUES ($1, $2, $3)
            RETURNING id, name, slug, created_at
            """
        record = await self.conn.fetchrow(sql, org_id, name, slug)
        return dict(record)

    async def get_by_id(self, org_id: UUID) -> dict | None:
        """Get organization by ID."""
        sql = """
            SELECT id, name, slug, created_at
            FROM orgs
            WHERE id = $1
            """
        record = await self.conn.fetchrow(sql, org_id)
        if record is None:
            return None
        return dict(record)

    async def get_by_slug(self, slug: str) -> dict | None:
        """Get organization by slug."""
        sql = """
            SELECT id, name, slug, created_at
            FROM orgs
            WHERE slug = $1
            """
        record = await self.conn.fetchrow(sql, slug)
        if record is None:
            return None
        return dict(record)

    async def add_member(
        self,
        member_id: UUID,
        user_id: UUID,
        org_id: UUID,
        role: str,
    ) -> dict:
        """Add a member to an organization."""
        sql = """
            INSERT INTO org_members (id, user_id, org_id, role)
            VALUES ($1, $2, $3, $4)
            RETURNING id, user_id, org_id, role, created_at
            """
        record = await self.conn.fetchrow(sql, member_id, user_id, org_id, role)
        return dict(record)

    async def get_member(self, user_id: UUID, org_id: UUID) -> dict | None:
        """Get membership for a user in an organization."""
        sql = """
            SELECT om.id, om.user_id, om.org_id, om.role, om.created_at
            FROM org_members om
            WHERE om.user_id = $1 AND om.org_id = $2
            """
        record = await self.conn.fetchrow(sql, user_id, org_id)
        if record is None:
            return None
        return dict(record)

    async def get_members(self, org_id: UUID) -> list[dict]:
        """Get all members of an organization, joined with user emails."""
        sql = """
            SELECT om.id, om.user_id, u.email, om.role, om.created_at
            FROM org_members om
            JOIN users u ON u.id = om.user_id
            WHERE om.org_id = $1
            ORDER BY om.created_at
            """
        records = await self.conn.fetch(sql, org_id)
        members = []
        for record in records:
            members.append(dict(record))
        return members

    async def get_user_orgs(self, user_id: UUID) -> list[dict]:
        """Get all organizations a user belongs to, with the user's role."""
        sql = """
            SELECT o.id, o.name, o.slug, o.created_at, om.role
            FROM orgs o
            JOIN org_members om ON om.org_id = o.id
            WHERE om.user_id = $1
            ORDER BY o.created_at
            """
        records = await self.conn.fetch(sql, user_id)
        return [dict(record) for record in records]

    async def slug_exists(self, slug: str) -> bool:
        """Check if organization slug exists."""
        return await self.conn.fetchval(
            "SELECT EXISTS(SELECT 1 FROM orgs WHERE slug = $1)",
            slug,
        )
|
|
||||||
@@ -1,396 +0,0 @@
|
|||||||
"""Refresh token repository for database operations.
|
|
||||||
|
|
||||||
Security considerations implemented:
|
|
||||||
- Atomic rotation using SELECT FOR UPDATE to prevent race conditions
|
|
||||||
- Token chain tracking via rotated_to for reuse/theft detection
|
|
||||||
- Defense-in-depth validation with user_id and active_org_id checks
|
|
||||||
- Uses RETURNING for robust row counting instead of string parsing
|
|
||||||
"""
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
|
|
||||||
|
|
||||||
class RefreshTokenRepository:
|
|
||||||
"""Database operations for refresh tokens."""
|
|
||||||
|
|
||||||
def __init__(self, conn: asyncpg.Connection) -> None:
|
|
||||||
self.conn = conn
|
|
||||||
|
|
||||||
async def create(
|
|
||||||
self,
|
|
||||||
token_id: UUID,
|
|
||||||
user_id: UUID,
|
|
||||||
token_hash: str,
|
|
||||||
active_org_id: UUID,
|
|
||||||
expires_at: datetime,
|
|
||||||
) -> dict:
|
|
||||||
"""Create a new refresh token."""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
INSERT INTO refresh_tokens (id, user_id, token_hash, active_org_id, expires_at)
|
|
||||||
VALUES ($1, $2, $3, $4, $5)
|
|
||||||
RETURNING id, user_id, token_hash, active_org_id, expires_at,
|
|
||||||
revoked_at, rotated_to, created_at
|
|
||||||
""",
|
|
||||||
token_id,
|
|
||||||
user_id,
|
|
||||||
token_hash,
|
|
||||||
active_org_id,
|
|
||||||
expires_at,
|
|
||||||
)
|
|
||||||
return dict(row)
|
|
||||||
|
|
||||||
async def get_by_hash(self, token_hash: str) -> dict | None:
|
|
||||||
"""Get refresh token by hash (includes revoked/expired for auditing)."""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
SELECT id, user_id, token_hash, active_org_id, expires_at,
|
|
||||||
revoked_at, rotated_to, created_at
|
|
||||||
FROM refresh_tokens
|
|
||||||
WHERE token_hash = $1
|
|
||||||
""",
|
|
||||||
token_hash,
|
|
||||||
)
|
|
||||||
return dict(row) if row else None
|
|
||||||
|
|
||||||
async def get_valid_by_hash(
|
|
||||||
self,
|
|
||||||
token_hash: str,
|
|
||||||
user_id: UUID | None = None,
|
|
||||||
active_org_id: UUID | None = None,
|
|
||||||
) -> dict | None:
|
|
||||||
"""Get refresh token by hash, only if valid.
|
|
||||||
|
|
||||||
Validates:
|
|
||||||
- Token exists and matches hash
|
|
||||||
- Token is not revoked
|
|
||||||
- Token is not expired
|
|
||||||
- Token has not been rotated (rotated_to is NULL)
|
|
||||||
- Optionally: user_id matches (defense-in-depth)
|
|
||||||
- Optionally: active_org_id matches (defense-in-depth)
|
|
||||||
|
|
||||||
Args:
|
|
||||||
token_hash: The hashed token value
|
|
||||||
user_id: If provided, token must belong to this user
|
|
||||||
active_org_id: If provided, token must be bound to this org
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Token dict if valid, None otherwise
|
|
||||||
"""
|
|
||||||
query = """
|
|
||||||
SELECT id, user_id, token_hash, active_org_id, expires_at,
|
|
||||||
revoked_at, rotated_to, created_at
|
|
||||||
FROM refresh_tokens
|
|
||||||
WHERE token_hash = $1
|
|
||||||
AND revoked_at IS NULL
|
|
||||||
AND rotated_to IS NULL
|
|
||||||
AND expires_at > clock_timestamp()
|
|
||||||
"""
|
|
||||||
params: list = [token_hash]
|
|
||||||
param_idx = 2
|
|
||||||
|
|
||||||
if user_id is not None:
|
|
||||||
query += f" AND user_id = ${param_idx}"
|
|
||||||
params.append(user_id)
|
|
||||||
param_idx += 1
|
|
||||||
|
|
||||||
if active_org_id is not None:
|
|
||||||
query += f" AND active_org_id = ${param_idx}"
|
|
||||||
params.append(active_org_id)
|
|
||||||
|
|
||||||
row = await self.conn.fetchrow(query, *params)
|
|
||||||
return dict(row) if row else None
|
|
||||||
|
|
||||||
async def get_valid_for_rotation(
|
|
||||||
self,
|
|
||||||
token_hash: str,
|
|
||||||
user_id: UUID | None = None,
|
|
||||||
) -> dict | None:
|
|
||||||
"""Get and lock a valid token for rotation using SELECT FOR UPDATE.
|
|
||||||
|
|
||||||
This acquires a row-level lock to prevent concurrent rotation attempts.
|
|
||||||
Must be called within a transaction.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
token_hash: The hashed token value
|
|
||||||
user_id: If provided, token must belong to this user
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Token dict if valid and locked, None otherwise
|
|
||||||
"""
|
|
||||||
query = """
|
|
||||||
SELECT id, user_id, token_hash, active_org_id, expires_at,
|
|
||||||
revoked_at, rotated_to, created_at
|
|
||||||
FROM refresh_tokens
|
|
||||||
WHERE token_hash = $1
|
|
||||||
AND revoked_at IS NULL
|
|
||||||
AND rotated_to IS NULL
|
|
||||||
AND expires_at > clock_timestamp()
|
|
||||||
"""
|
|
||||||
params: list = [token_hash]
|
|
||||||
|
|
||||||
if user_id is not None:
|
|
||||||
query += " AND user_id = $2"
|
|
||||||
params.append(user_id)
|
|
||||||
|
|
||||||
query += " FOR UPDATE"
|
|
||||||
|
|
||||||
row = await self.conn.fetchrow(query, *params)
|
|
||||||
return dict(row) if row else None
|
|
||||||
|
|
||||||
async def check_token_reuse(self, token_hash: str) -> dict | None:
|
|
||||||
"""Check if a token has already been rotated (potential theft).
|
|
||||||
|
|
||||||
If a token is presented that has rotated_to set, it means:
|
|
||||||
1. The token was legitimately rotated earlier
|
|
||||||
2. Someone is now trying to use the old token
|
|
||||||
3. This indicates the token may have been stolen
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Token dict if this is a reused/stolen token, None if not found or not rotated
|
|
||||||
"""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
SELECT id, user_id, token_hash, active_org_id, expires_at,
|
|
||||||
revoked_at, rotated_to, created_at
|
|
||||||
FROM refresh_tokens
|
|
||||||
WHERE token_hash = $1 AND rotated_to IS NOT NULL
|
|
||||||
""",
|
|
||||||
token_hash,
|
|
||||||
)
|
|
||||||
return dict(row) if row else None
|
|
||||||
|
|
||||||
async def revoke_token_chain(self, token_id: UUID) -> int:
|
|
||||||
"""Revoke a token and all tokens in its chain (for breach response).
|
|
||||||
|
|
||||||
When token reuse is detected, this revokes:
|
|
||||||
1. The original stolen token
|
|
||||||
2. Any token it was rotated to (and their rotations, recursively)
|
|
||||||
|
|
||||||
Args:
|
|
||||||
token_id: The ID of the compromised token
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Count of tokens revoked
|
|
||||||
"""
|
|
||||||
# Use recursive CTE to find all tokens in the chain
|
|
||||||
rows = await self.conn.fetch(
|
|
||||||
"""
|
|
||||||
WITH RECURSIVE token_chain AS (
|
|
||||||
-- Start with the given token
|
|
||||||
SELECT id, rotated_to
|
|
||||||
FROM refresh_tokens
|
|
||||||
WHERE id = $1
|
|
||||||
|
|
||||||
UNION ALL
|
|
||||||
|
|
||||||
-- Follow the chain via rotated_to
|
|
||||||
SELECT rt.id, rt.rotated_to
|
|
||||||
FROM refresh_tokens rt
|
|
||||||
INNER JOIN token_chain tc ON rt.id = tc.rotated_to
|
|
||||||
)
|
|
||||||
UPDATE refresh_tokens
|
|
||||||
SET revoked_at = clock_timestamp()
|
|
||||||
WHERE id IN (SELECT id FROM token_chain)
|
|
||||||
AND revoked_at IS NULL
|
|
||||||
RETURNING id
|
|
||||||
""",
|
|
||||||
token_id,
|
|
||||||
)
|
|
||||||
return len(rows)
|
|
||||||
|
|
||||||
async def rotate(
|
|
||||||
self,
|
|
||||||
old_token_hash: str,
|
|
||||||
new_token_id: UUID,
|
|
||||||
new_token_hash: str,
|
|
||||||
new_expires_at: datetime,
|
|
||||||
new_active_org_id: UUID | None = None,
|
|
||||||
expected_user_id: UUID | None = None,
|
|
||||||
) -> dict | None:
|
|
||||||
"""Atomically rotate a refresh token.
|
|
||||||
|
|
||||||
This method:
|
|
||||||
1. Validates the old token (not expired, not revoked, not already rotated)
|
|
||||||
2. Locks the row to prevent concurrent rotation
|
|
||||||
3. Marks old token as rotated (sets rotated_to)
|
|
||||||
4. Creates new token with updated org if specified
|
|
||||||
5. All in a single atomic operation
|
|
||||||
|
|
||||||
Args:
|
|
||||||
old_token_hash: Hash of the token being rotated
|
|
||||||
new_token_id: UUID for the new token
|
|
||||||
new_token_hash: Hash for the new token
|
|
||||||
new_expires_at: Expiry time for the new token
|
|
||||||
new_active_org_id: New org ID (for org-switch), or None to keep current
|
|
||||||
expected_user_id: If provided, validates token belongs to this user
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
New token dict if rotation succeeded, None if old token invalid/expired
|
|
||||||
"""
|
|
||||||
# First, get and lock the old token
|
|
||||||
old_token = await self.get_valid_for_rotation(old_token_hash, expected_user_id)
|
|
||||||
if old_token is None:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Determine the org for the new token
|
|
||||||
active_org_id = new_active_org_id or old_token["active_org_id"]
|
|
||||||
user_id = old_token["user_id"]
|
|
||||||
|
|
||||||
# Create the new token
|
|
||||||
new_token = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
INSERT INTO refresh_tokens (id, user_id, token_hash, active_org_id, expires_at)
|
|
||||||
VALUES ($1, $2, $3, $4, $5)
|
|
||||||
RETURNING id, user_id, token_hash, active_org_id, expires_at,
|
|
||||||
revoked_at, rotated_to, created_at
|
|
||||||
""",
|
|
||||||
new_token_id,
|
|
||||||
user_id,
|
|
||||||
new_token_hash,
|
|
||||||
active_org_id,
|
|
||||||
new_expires_at,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Mark the old token as rotated (not revoked - for reuse detection)
|
|
||||||
await self.conn.execute(
|
|
||||||
"""
|
|
||||||
UPDATE refresh_tokens
|
|
||||||
SET rotated_to = $2
|
|
||||||
WHERE id = $1
|
|
||||||
""",
|
|
||||||
old_token["id"],
|
|
||||||
new_token_id,
|
|
||||||
)
|
|
||||||
|
|
||||||
return dict(new_token)
|
|
||||||
|
|
||||||
async def revoke(self, token_id: UUID) -> bool:
|
|
||||||
"""Revoke a refresh token by ID.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if token was revoked, False if not found or already revoked
|
|
||||||
"""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
UPDATE refresh_tokens
|
|
||||||
SET revoked_at = clock_timestamp()
|
|
||||||
WHERE id = $1 AND revoked_at IS NULL
|
|
||||||
RETURNING id
|
|
||||||
""",
|
|
||||||
token_id,
|
|
||||||
)
|
|
||||||
return row is not None
|
|
||||||
|
|
||||||
async def revoke_by_hash(self, token_hash: str) -> bool:
|
|
||||||
"""Revoke a refresh token by hash.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if token was revoked, False if not found or already revoked
|
|
||||||
"""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
UPDATE refresh_tokens
|
|
||||||
SET revoked_at = clock_timestamp()
|
|
||||||
WHERE token_hash = $1 AND revoked_at IS NULL
|
|
||||||
RETURNING id
|
|
||||||
""",
|
|
||||||
token_hash,
|
|
||||||
)
|
|
||||||
return row is not None
|
|
||||||
|
|
||||||
async def revoke_all_for_user(self, user_id: UUID) -> int:
|
|
||||||
"""Revoke all active refresh tokens for a user.
|
|
||||||
|
|
||||||
Use this for:
|
|
||||||
- User-initiated logout from all devices
|
|
||||||
- Password change
|
|
||||||
- Account compromise response
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Count of tokens revoked
|
|
||||||
"""
|
|
||||||
rows = await self.conn.fetch(
|
|
||||||
"""
|
|
||||||
UPDATE refresh_tokens
|
|
||||||
SET revoked_at = clock_timestamp()
|
|
||||||
WHERE user_id = $1 AND revoked_at IS NULL
|
|
||||||
RETURNING id
|
|
||||||
""",
|
|
||||||
user_id,
|
|
||||||
)
|
|
||||||
return len(rows)
|
|
||||||
|
|
||||||
async def revoke_all_for_user_except(self, user_id: UUID, keep_token_id: UUID) -> int:
|
|
||||||
"""Revoke all tokens for a user except one (logout other sessions).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
user_id: The user whose tokens to revoke
|
|
||||||
keep_token_id: The token ID to keep active (current session)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Count of tokens revoked
|
|
||||||
"""
|
|
||||||
rows = await self.conn.fetch(
|
|
||||||
"""
|
|
||||||
UPDATE refresh_tokens
|
|
||||||
SET revoked_at = clock_timestamp()
|
|
||||||
WHERE user_id = $1 AND revoked_at IS NULL AND id != $2
|
|
||||||
RETURNING id
|
|
||||||
""",
|
|
||||||
user_id,
|
|
||||||
keep_token_id,
|
|
||||||
)
|
|
||||||
return len(rows)
|
|
||||||
|
|
||||||
async def get_active_tokens_for_user(self, user_id: UUID) -> list[dict]:
|
|
||||||
"""Get all active (non-revoked, non-expired, non-rotated) tokens for a user.
|
|
||||||
|
|
||||||
Useful for:
|
|
||||||
- Showing active sessions
|
|
||||||
- Auditing
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of active token records
|
|
||||||
"""
|
|
||||||
rows = await self.conn.fetch(
|
|
||||||
"""
|
|
||||||
SELECT id, user_id, token_hash, active_org_id, expires_at,
|
|
||||||
revoked_at, rotated_to, created_at
|
|
||||||
FROM refresh_tokens
|
|
||||||
WHERE user_id = $1
|
|
||||||
AND revoked_at IS NULL
|
|
||||||
AND rotated_to IS NULL
|
|
||||||
AND expires_at > clock_timestamp()
|
|
||||||
ORDER BY created_at DESC
|
|
||||||
""",
|
|
||||||
user_id,
|
|
||||||
)
|
|
||||||
return [dict(row) for row in rows]
|
|
||||||
|
|
||||||
async def cleanup_expired(self, older_than_days: int = 30) -> int:
|
|
||||||
"""Delete expired tokens older than specified days.
|
|
||||||
|
|
||||||
Note: This performs a hard delete. For audit trails, I think we should:
|
|
||||||
- Archiving to a separate table first
|
|
||||||
- Using partitioning with retention policies
|
|
||||||
- Only calling this for tokens well past their expiry
|
|
||||||
|
|
||||||
Args:
|
|
||||||
older_than_days: Only delete tokens expired more than this many days ago
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Count of tokens deleted
|
|
||||||
"""
|
|
||||||
rows = await self.conn.fetch(
|
|
||||||
"""
|
|
||||||
DELETE FROM refresh_tokens
|
|
||||||
WHERE expires_at < clock_timestamp() - interval '1 day' * $1
|
|
||||||
RETURNING id
|
|
||||||
""",
|
|
||||||
older_than_days,
|
|
||||||
)
|
|
||||||
return len(rows)
|
|
||||||
@@ -1,80 +0,0 @@
|
|||||||
"""Service repository for database operations."""
|
|
||||||
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
|
|
||||||
|
|
||||||
class ServiceRepository:
|
|
||||||
"""Database operations for services."""
|
|
||||||
|
|
||||||
def __init__(self, conn: asyncpg.Connection) -> None:
|
|
||||||
self.conn = conn
|
|
||||||
|
|
||||||
async def create(
|
|
||||||
self,
|
|
||||||
service_id: UUID,
|
|
||||||
org_id: UUID,
|
|
||||||
name: str,
|
|
||||||
slug: str,
|
|
||||||
) -> dict:
|
|
||||||
"""Create a new service."""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
INSERT INTO services (id, org_id, name, slug)
|
|
||||||
VALUES ($1, $2, $3, $4)
|
|
||||||
RETURNING id, org_id, name, slug, created_at
|
|
||||||
""",
|
|
||||||
service_id,
|
|
||||||
org_id,
|
|
||||||
name,
|
|
||||||
slug,
|
|
||||||
)
|
|
||||||
return dict(row)
|
|
||||||
|
|
||||||
async def get_by_id(self, service_id: UUID) -> dict | None:
|
|
||||||
"""Get service by ID."""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
SELECT id, org_id, name, slug, created_at
|
|
||||||
FROM services
|
|
||||||
WHERE id = $1
|
|
||||||
""",
|
|
||||||
service_id,
|
|
||||||
)
|
|
||||||
return dict(row) if row else None
|
|
||||||
|
|
||||||
async def get_by_org(self, org_id: UUID) -> list[dict]:
|
|
||||||
"""Get all services for an organization."""
|
|
||||||
rows = await self.conn.fetch(
|
|
||||||
"""
|
|
||||||
SELECT id, org_id, name, slug, created_at
|
|
||||||
FROM services
|
|
||||||
WHERE org_id = $1
|
|
||||||
ORDER BY name
|
|
||||||
""",
|
|
||||||
org_id,
|
|
||||||
)
|
|
||||||
return [dict(row) for row in rows]
|
|
||||||
|
|
||||||
async def get_by_slug(self, org_id: UUID, slug: str) -> dict | None:
|
|
||||||
"""Get service by org and slug."""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
SELECT id, org_id, name, slug, created_at
|
|
||||||
FROM services
|
|
||||||
WHERE org_id = $1 AND slug = $2
|
|
||||||
""",
|
|
||||||
org_id,
|
|
||||||
slug,
|
|
||||||
)
|
|
||||||
return dict(row) if row else None
|
|
||||||
|
|
||||||
async def slug_exists(self, org_id: UUID, slug: str) -> bool:
|
|
||||||
"""Check if service slug exists in organization."""
|
|
||||||
result = await self.conn.fetchval(
|
|
||||||
"SELECT EXISTS(SELECT 1 FROM services WHERE org_id = $1 AND slug = $2)",
|
|
||||||
org_id,
|
|
||||||
slug,
|
|
||||||
)
|
|
||||||
return result
|
|
||||||
@@ -1,63 +0,0 @@
|
|||||||
"""User repository for database operations."""
|
|
||||||
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
|
|
||||||
|
|
||||||
class UserRepository:
|
|
||||||
"""Database operations for users."""
|
|
||||||
|
|
||||||
def __init__(self, conn: asyncpg.Connection) -> None:
|
|
||||||
self.conn = conn
|
|
||||||
|
|
||||||
async def create(
|
|
||||||
self,
|
|
||||||
user_id: UUID,
|
|
||||||
email: str,
|
|
||||||
password_hash: str,
|
|
||||||
) -> dict:
|
|
||||||
"""Create a new user."""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
INSERT INTO users (id, email, password_hash)
|
|
||||||
VALUES ($1, $2, $3)
|
|
||||||
RETURNING id, email, created_at
|
|
||||||
""",
|
|
||||||
user_id,
|
|
||||||
email,
|
|
||||||
password_hash,
|
|
||||||
)
|
|
||||||
return dict(row)
|
|
||||||
|
|
||||||
async def get_by_id(self, user_id: UUID) -> dict | None:
|
|
||||||
"""Get user by ID."""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
SELECT id, email, password_hash, created_at
|
|
||||||
FROM users
|
|
||||||
WHERE id = $1
|
|
||||||
""",
|
|
||||||
user_id,
|
|
||||||
)
|
|
||||||
return dict(row) if row else None
|
|
||||||
|
|
||||||
async def get_by_email(self, email: str) -> dict | None:
|
|
||||||
"""Get user by email."""
|
|
||||||
row = await self.conn.fetchrow(
|
|
||||||
"""
|
|
||||||
SELECT id, email, password_hash, created_at
|
|
||||||
FROM users
|
|
||||||
WHERE email = $1
|
|
||||||
""",
|
|
||||||
email,
|
|
||||||
)
|
|
||||||
return dict(row) if row else None
|
|
||||||
|
|
||||||
async def exists_by_email(self, email: str) -> bool:
|
|
||||||
"""Check if user exists by email."""
|
|
||||||
result = await self.conn.fetchval(
|
|
||||||
"SELECT EXISTS(SELECT 1 FROM users WHERE email = $1)",
|
|
||||||
email,
|
|
||||||
)
|
|
||||||
return result
|
|
||||||
@@ -1,54 +0,0 @@
|
|||||||
"""Pydantic schemas for request/response models."""
|
|
||||||
|
|
||||||
from app.schemas.auth import (
|
|
||||||
LoginRequest,
|
|
||||||
LogoutRequest,
|
|
||||||
RefreshRequest,
|
|
||||||
RegisterRequest,
|
|
||||||
SwitchOrgRequest,
|
|
||||||
TokenResponse,
|
|
||||||
)
|
|
||||||
from app.schemas.common import CursorParams, ErrorDetail, ErrorResponse, PaginatedResponse
|
|
||||||
from app.schemas.incident import (
|
|
||||||
CommentRequest,
|
|
||||||
IncidentCreate,
|
|
||||||
IncidentEventResponse,
|
|
||||||
IncidentResponse,
|
|
||||||
TransitionRequest,
|
|
||||||
)
|
|
||||||
from app.schemas.org import (
|
|
||||||
MemberResponse,
|
|
||||||
NotificationTargetCreate,
|
|
||||||
NotificationTargetResponse,
|
|
||||||
OrgResponse,
|
|
||||||
ServiceCreate,
|
|
||||||
ServiceResponse,
|
|
||||||
)
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
# Auth
|
|
||||||
"LoginRequest",
|
|
||||||
"LogoutRequest",
|
|
||||||
"RefreshRequest",
|
|
||||||
"RegisterRequest",
|
|
||||||
"SwitchOrgRequest",
|
|
||||||
"TokenResponse",
|
|
||||||
# Common
|
|
||||||
"CursorParams",
|
|
||||||
"ErrorDetail",
|
|
||||||
"ErrorResponse",
|
|
||||||
"PaginatedResponse",
|
|
||||||
# Incident
|
|
||||||
"CommentRequest",
|
|
||||||
"IncidentCreate",
|
|
||||||
"IncidentEventResponse",
|
|
||||||
"IncidentResponse",
|
|
||||||
"TransitionRequest",
|
|
||||||
# Org
|
|
||||||
"MemberResponse",
|
|
||||||
"NotificationTargetCreate",
|
|
||||||
"NotificationTargetResponse",
|
|
||||||
"OrgResponse",
|
|
||||||
"ServiceCreate",
|
|
||||||
"ServiceResponse",
|
|
||||||
]
|
|
||||||
@@ -1,48 +0,0 @@
|
|||||||
"""Authentication schemas."""
|
|
||||||
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
from pydantic import BaseModel, EmailStr, Field
|
|
||||||
|
|
||||||
|
|
||||||
class RegisterRequest(BaseModel):
|
|
||||||
"""Request body for user registration."""
|
|
||||||
|
|
||||||
email: EmailStr
|
|
||||||
password: str = Field(min_length=8, max_length=128)
|
|
||||||
org_name: str = Field(min_length=1, max_length=100, description="Name for the default org")
|
|
||||||
|
|
||||||
|
|
||||||
class LoginRequest(BaseModel):
|
|
||||||
"""Request body for user login."""
|
|
||||||
|
|
||||||
email: EmailStr
|
|
||||||
password: str
|
|
||||||
|
|
||||||
|
|
||||||
class RefreshRequest(BaseModel):
|
|
||||||
"""Request body for token refresh."""
|
|
||||||
|
|
||||||
refresh_token: str
|
|
||||||
|
|
||||||
|
|
||||||
class SwitchOrgRequest(BaseModel):
|
|
||||||
"""Request body for switching active organization."""
|
|
||||||
|
|
||||||
org_id: UUID
|
|
||||||
refresh_token: str
|
|
||||||
|
|
||||||
|
|
||||||
class LogoutRequest(BaseModel):
|
|
||||||
"""Request body for logging out and revoking a refresh token."""
|
|
||||||
|
|
||||||
refresh_token: str
|
|
||||||
|
|
||||||
|
|
||||||
class TokenResponse(BaseModel):
|
|
||||||
"""Response containing access and refresh tokens."""
|
|
||||||
|
|
||||||
access_token: str
|
|
||||||
refresh_token: str
|
|
||||||
token_type: str = "bearer"
|
|
||||||
expires_in: int = Field(description="Access token expiry in seconds")
|
|
||||||
@@ -1,61 +0,0 @@
|
|||||||
"""Common schemas used across the API."""
|
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
|
||||||
|
|
||||||
|
|
||||||
class ErrorDetail(BaseModel):
|
|
||||||
"""Individual error detail for validation errors."""
|
|
||||||
|
|
||||||
loc: list[str | int] = Field(description="Location of the error (field path)")
|
|
||||||
msg: str = Field(description="Error message")
|
|
||||||
type: str = Field(description="Error type identifier")
|
|
||||||
|
|
||||||
|
|
||||||
class ErrorResponse(BaseModel):
|
|
||||||
"""Structured error response returned by all error handlers."""
|
|
||||||
|
|
||||||
error: str = Field(description="Error type (e.g., 'not_found', 'validation_error')")
|
|
||||||
message: str = Field(description="Human-readable error message")
|
|
||||||
details: list[ErrorDetail] | None = Field(
|
|
||||||
default=None, description="Additional error details for validation errors"
|
|
||||||
)
|
|
||||||
request_id: str | None = Field(
|
|
||||||
default=None, description="Request trace ID for debugging"
|
|
||||||
)
|
|
||||||
|
|
||||||
model_config = {
|
|
||||||
"json_schema_extra": {
|
|
||||||
"examples": [
|
|
||||||
{
|
|
||||||
"error": "not_found",
|
|
||||||
"message": "Incident not found",
|
|
||||||
"request_id": "abc123def456",
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"error": "validation_error",
|
|
||||||
"message": "Request validation failed",
|
|
||||||
"details": [
|
|
||||||
{"loc": ["body", "title"], "msg": "Field required", "type": "missing"}
|
|
||||||
],
|
|
||||||
"request_id": "abc123def456",
|
|
||||||
},
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class CursorParams(BaseModel):
|
|
||||||
"""Pagination parameters using cursor-based pagination."""
|
|
||||||
|
|
||||||
cursor: str | None = Field(default=None, description="Cursor for pagination")
|
|
||||||
limit: int = Field(default=20, ge=1, le=100, description="Number of items per page")
|
|
||||||
|
|
||||||
|
|
||||||
class PaginatedResponse[T](BaseModel):
|
|
||||||
"""Generic paginated response wrapper."""
|
|
||||||
|
|
||||||
items: list[T]
|
|
||||||
next_cursor: str | None = Field(
|
|
||||||
default=None, description="Cursor for next page, null if no more items"
|
|
||||||
)
|
|
||||||
has_more: bool = Field(description="Whether there are more items")
|
|
||||||
@@ -1,57 +0,0 @@
|
|||||||
"""Incident-related schemas."""
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
from typing import Any, Literal
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
|
||||||
|
|
||||||
IncidentStatus = Literal["triggered", "acknowledged", "mitigated", "resolved"]
|
|
||||||
IncidentSeverity = Literal["critical", "high", "medium", "low"]
|
|
||||||
|
|
||||||
|
|
||||||
class IncidentCreate(BaseModel):
|
|
||||||
"""Request body for creating an incident."""
|
|
||||||
|
|
||||||
title: str = Field(min_length=1, max_length=200)
|
|
||||||
description: str | None = Field(default=None, max_length=5000)
|
|
||||||
severity: IncidentSeverity = "medium"
|
|
||||||
|
|
||||||
|
|
||||||
class IncidentResponse(BaseModel):
|
|
||||||
"""Incident response."""
|
|
||||||
|
|
||||||
id: UUID
|
|
||||||
service_id: UUID
|
|
||||||
title: str
|
|
||||||
description: str | None
|
|
||||||
status: IncidentStatus
|
|
||||||
severity: IncidentSeverity
|
|
||||||
version: int
|
|
||||||
created_at: datetime
|
|
||||||
updated_at: datetime
|
|
||||||
|
|
||||||
|
|
||||||
class IncidentEventResponse(BaseModel):
|
|
||||||
"""Incident event response."""
|
|
||||||
|
|
||||||
id: UUID
|
|
||||||
incident_id: UUID
|
|
||||||
event_type: str
|
|
||||||
actor_user_id: UUID | None
|
|
||||||
payload: dict[str, Any] | None
|
|
||||||
created_at: datetime
|
|
||||||
|
|
||||||
|
|
||||||
class TransitionRequest(BaseModel):
|
|
||||||
"""Request body for transitioning incident status."""
|
|
||||||
|
|
||||||
to_status: IncidentStatus
|
|
||||||
version: int = Field(description="Current version for optimistic locking")
|
|
||||||
note: str | None = Field(default=None, max_length=1000)
|
|
||||||
|
|
||||||
|
|
||||||
class CommentRequest(BaseModel):
|
|
||||||
"""Request body for adding a comment to an incident."""
|
|
||||||
|
|
||||||
content: str = Field(min_length=1, max_length=5000)
|
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
"""Organization-related schemas."""
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
from typing import Literal
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
from pydantic import BaseModel, Field, HttpUrl
|
|
||||||
|
|
||||||
|
|
||||||
class OrgResponse(BaseModel):
|
|
||||||
"""Organization summary response."""
|
|
||||||
|
|
||||||
id: UUID
|
|
||||||
name: str
|
|
||||||
slug: str
|
|
||||||
created_at: datetime
|
|
||||||
|
|
||||||
|
|
||||||
class MemberResponse(BaseModel):
|
|
||||||
"""Organization member response."""
|
|
||||||
|
|
||||||
id: UUID
|
|
||||||
user_id: UUID
|
|
||||||
email: str
|
|
||||||
role: Literal["admin", "member", "viewer"]
|
|
||||||
created_at: datetime
|
|
||||||
|
|
||||||
|
|
||||||
class ServiceCreate(BaseModel):
|
|
||||||
"""Request body for creating a service."""
|
|
||||||
|
|
||||||
name: str = Field(min_length=1, max_length=100)
|
|
||||||
slug: str = Field(
|
|
||||||
min_length=1,
|
|
||||||
max_length=50,
|
|
||||||
pattern=r"^[a-z0-9]+(?:-[a-z0-9]+)*$",
|
|
||||||
description="URL-friendly identifier (lowercase, hyphens allowed)",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class ServiceResponse(BaseModel):
|
|
||||||
"""Service response."""
|
|
||||||
|
|
||||||
id: UUID
|
|
||||||
name: str
|
|
||||||
slug: str
|
|
||||||
created_at: datetime
|
|
||||||
|
|
||||||
|
|
||||||
class NotificationTargetCreate(BaseModel):
|
|
||||||
"""Request body for creating a notification target."""
|
|
||||||
|
|
||||||
name: str = Field(min_length=1, max_length=100)
|
|
||||||
target_type: Literal["webhook", "email", "slack"]
|
|
||||||
webhook_url: HttpUrl | None = Field(
|
|
||||||
default=None, description="Required for webhook type"
|
|
||||||
)
|
|
||||||
enabled: bool = True
|
|
||||||
|
|
||||||
|
|
||||||
class NotificationTargetResponse(BaseModel):
|
|
||||||
"""Notification target response."""
|
|
||||||
|
|
||||||
id: UUID
|
|
||||||
name: str
|
|
||||||
target_type: Literal["webhook", "email", "slack"]
|
|
||||||
webhook_url: str | None
|
|
||||||
enabled: bool
|
|
||||||
created_at: datetime
|
|
||||||
@@ -1,7 +0,0 @@
|
|||||||
"""Service layer entrypoints."""
|
|
||||||
|
|
||||||
from app.services.auth import AuthService
|
|
||||||
from app.services.incident import IncidentService
|
|
||||||
from app.services.org import OrgService
|
|
||||||
|
|
||||||
__all__ = ["AuthService", "OrgService", "IncidentService"]
|
|
||||||
@@ -1,269 +0,0 @@
|
|||||||
"""Authentication service providing business logic for auth flows."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import re
|
|
||||||
from typing import cast
|
|
||||||
from uuid import UUID, uuid4
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
from asyncpg.pool import PoolConnectionProxy
|
|
||||||
|
|
||||||
from app.api.deps import CurrentUser
|
|
||||||
from app.config import settings
|
|
||||||
from app.core import exceptions as exc, security
|
|
||||||
from app.db import Database, db
|
|
||||||
from app.repositories import OrgRepository, RefreshTokenRepository, UserRepository
|
|
||||||
from app.schemas.auth import (
|
|
||||||
LoginRequest,
|
|
||||||
LogoutRequest,
|
|
||||||
RefreshRequest,
|
|
||||||
RegisterRequest,
|
|
||||||
SwitchOrgRequest,
|
|
||||||
TokenResponse,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
_SLUG_PATTERN = re.compile(r"[^a-z0-9]+")
|
|
||||||
|
|
||||||
|
|
||||||
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
|
|
||||||
"""Helper to satisfy typing when a pool proxy is returned."""
|
|
||||||
|
|
||||||
return cast(asyncpg.Connection, conn)
|
|
||||||
|
|
||||||
|
|
||||||
class AuthService:
|
|
||||||
"""Encapsulates authentication workflows (register/login/refresh/logout)."""
|
|
||||||
|
|
||||||
def __init__(self, database: Database | None = None) -> None:
|
|
||||||
self.db = database or db
|
|
||||||
self._access_token_expires_in = settings.access_token_expire_minutes * 60
|
|
||||||
|
|
||||||
async def register_user(self, data: RegisterRequest) -> TokenResponse:
|
|
||||||
"""Create a new user, default org, membership, and token pair."""
|
|
||||||
|
|
||||||
async with self.db.transaction() as conn:
|
|
||||||
db_conn = _as_conn(conn)
|
|
||||||
user_repo = UserRepository(db_conn)
|
|
||||||
org_repo = OrgRepository(db_conn)
|
|
||||||
refresh_repo = RefreshTokenRepository(db_conn)
|
|
||||||
|
|
||||||
if await user_repo.exists_by_email(data.email):
|
|
||||||
raise exc.ConflictError("Email already registered")
|
|
||||||
|
|
||||||
user_id = uuid4()
|
|
||||||
org_id = uuid4()
|
|
||||||
member_id = uuid4()
|
|
||||||
password_hash = security.hash_password(data.password)
|
|
||||||
|
|
||||||
await user_repo.create(user_id, data.email, password_hash)
|
|
||||||
slug = await self._generate_unique_org_slug(org_repo, data.org_name)
|
|
||||||
await org_repo.create(org_id, data.org_name, slug)
|
|
||||||
await org_repo.add_member(member_id, user_id, org_id, "admin")
|
|
||||||
|
|
||||||
return await self._issue_token_pair(
|
|
||||||
refresh_repo,
|
|
||||||
user_id=user_id,
|
|
||||||
org_id=org_id,
|
|
||||||
role="admin",
|
|
||||||
)
|
|
||||||
|
|
||||||
async def login_user(self, data: LoginRequest) -> TokenResponse:
|
|
||||||
"""Authenticate a user and issue tokens for their first organization."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
db_conn = _as_conn(conn)
|
|
||||||
user_repo = UserRepository(db_conn)
|
|
||||||
org_repo = OrgRepository(db_conn)
|
|
||||||
refresh_repo = RefreshTokenRepository(db_conn)
|
|
||||||
|
|
||||||
user = await user_repo.get_by_email(data.email)
|
|
||||||
if not user or not security.verify_password(data.password, user["password_hash"]):
|
|
||||||
raise exc.UnauthorizedError("Invalid email or password")
|
|
||||||
|
|
||||||
orgs = await org_repo.get_user_orgs(user["id"])
|
|
||||||
if not orgs:
|
|
||||||
raise exc.ForbiddenError("User does not belong to any organization")
|
|
||||||
|
|
||||||
active_org = orgs[0]
|
|
||||||
return await self._issue_token_pair(
|
|
||||||
refresh_repo,
|
|
||||||
user_id=user["id"],
|
|
||||||
org_id=active_org["id"],
|
|
||||||
role=active_org["role"],
|
|
||||||
)
|
|
||||||
|
|
||||||
async def refresh_tokens(self, data: RefreshRequest) -> TokenResponse:
|
|
||||||
"""Rotate refresh token and mint a new access token."""
|
|
||||||
|
|
||||||
old_hash = security.hash_token(data.refresh_token)
|
|
||||||
new_refresh_token = security.generate_refresh_token()
|
|
||||||
new_refresh_hash = security.hash_token(new_refresh_token)
|
|
||||||
new_refresh_id = uuid4()
|
|
||||||
new_refresh_expiry = security.get_refresh_token_expiry()
|
|
||||||
|
|
||||||
rotated: dict | None = None
|
|
||||||
membership: dict | None = None
|
|
||||||
|
|
||||||
async with self.db.transaction() as conn:
|
|
||||||
db_conn = _as_conn(conn)
|
|
||||||
refresh_repo = RefreshTokenRepository(db_conn)
|
|
||||||
rotated = await refresh_repo.rotate(
|
|
||||||
old_token_hash=old_hash,
|
|
||||||
new_token_id=new_refresh_id,
|
|
||||||
new_token_hash=new_refresh_hash,
|
|
||||||
new_expires_at=new_refresh_expiry,
|
|
||||||
)
|
|
||||||
|
|
||||||
if rotated is not None:
|
|
||||||
org_repo = OrgRepository(db_conn)
|
|
||||||
membership = await org_repo.get_member(rotated["user_id"], rotated["active_org_id"])
|
|
||||||
if membership is None:
|
|
||||||
raise exc.UnauthorizedError("Invalid refresh token")
|
|
||||||
|
|
||||||
if rotated is None or membership is None:
|
|
||||||
await self._handle_invalid_refresh(old_hash)
|
|
||||||
|
|
||||||
assert rotated is not None and membership is not None
|
|
||||||
access_token = security.create_access_token(
|
|
||||||
sub=str(rotated["user_id"]),
|
|
||||||
org_id=str(rotated["active_org_id"]),
|
|
||||||
org_role=membership["role"],
|
|
||||||
)
|
|
||||||
|
|
||||||
return TokenResponse(
|
|
||||||
access_token=access_token,
|
|
||||||
refresh_token=new_refresh_token,
|
|
||||||
expires_in=self._access_token_expires_in,
|
|
||||||
)
|
|
||||||
|
|
||||||
async def switch_org(
|
|
||||||
self,
|
|
||||||
current_user: CurrentUser,
|
|
||||||
data: SwitchOrgRequest,
|
|
||||||
) -> TokenResponse:
|
|
||||||
"""Switch active organization (rotates refresh token + issues new JWT)."""
|
|
||||||
|
|
||||||
target_org_id = data.org_id
|
|
||||||
old_hash = security.hash_token(data.refresh_token)
|
|
||||||
new_refresh_token = security.generate_refresh_token()
|
|
||||||
new_refresh_hash = security.hash_token(new_refresh_token)
|
|
||||||
new_refresh_expiry = security.get_refresh_token_expiry()
|
|
||||||
|
|
||||||
rotated: dict | None = None
|
|
||||||
membership: dict | None = None
|
|
||||||
|
|
||||||
async with self.db.transaction() as conn:
|
|
||||||
db_conn = _as_conn(conn)
|
|
||||||
org_repo = OrgRepository(db_conn)
|
|
||||||
membership = await org_repo.get_member(current_user.user_id, target_org_id)
|
|
||||||
if membership is None:
|
|
||||||
raise exc.ForbiddenError("Not a member of the requested organization")
|
|
||||||
|
|
||||||
refresh_repo = RefreshTokenRepository(db_conn)
|
|
||||||
rotated = await refresh_repo.rotate(
|
|
||||||
old_token_hash=old_hash,
|
|
||||||
new_token_id=uuid4(),
|
|
||||||
new_token_hash=new_refresh_hash,
|
|
||||||
new_expires_at=new_refresh_expiry,
|
|
||||||
new_active_org_id=target_org_id,
|
|
||||||
expected_user_id=current_user.user_id,
|
|
||||||
)
|
|
||||||
|
|
||||||
if rotated is None:
|
|
||||||
await self._handle_invalid_refresh(old_hash)
|
|
||||||
|
|
||||||
access_token = security.create_access_token(
|
|
||||||
sub=str(current_user.user_id),
|
|
||||||
org_id=str(target_org_id),
|
|
||||||
org_role=membership["role"],
|
|
||||||
)
|
|
||||||
|
|
||||||
return TokenResponse(
|
|
||||||
access_token=access_token,
|
|
||||||
refresh_token=new_refresh_token,
|
|
||||||
expires_in=self._access_token_expires_in,
|
|
||||||
)
|
|
||||||
|
|
||||||
async def logout(self, current_user: CurrentUser, data: LogoutRequest) -> None:
|
|
||||||
"""Revoke the provided refresh token for the current session."""
|
|
||||||
|
|
||||||
token_hash = security.hash_token(data.refresh_token)
|
|
||||||
|
|
||||||
async with self.db.transaction() as conn:
|
|
||||||
refresh_repo = RefreshTokenRepository(_as_conn(conn))
|
|
||||||
token = await refresh_repo.get_by_hash(token_hash)
|
|
||||||
if token and token["user_id"] != current_user.user_id:
|
|
||||||
raise exc.ForbiddenError("Refresh token does not belong to this user")
|
|
||||||
|
|
||||||
if not token:
|
|
||||||
return
|
|
||||||
|
|
||||||
await refresh_repo.revoke(token["id"])
|
|
||||||
|
|
||||||
async def _issue_token_pair(
|
|
||||||
self,
|
|
||||||
refresh_repo: RefreshTokenRepository,
|
|
||||||
*,
|
|
||||||
user_id: UUID,
|
|
||||||
org_id: UUID,
|
|
||||||
role: str,
|
|
||||||
) -> TokenResponse:
|
|
||||||
"""Create access/refresh tokens and persist the refresh token."""
|
|
||||||
|
|
||||||
access_token = security.create_access_token(
|
|
||||||
sub=str(user_id),
|
|
||||||
org_id=str(org_id),
|
|
||||||
org_role=role,
|
|
||||||
)
|
|
||||||
|
|
||||||
refresh_token = security.generate_refresh_token()
|
|
||||||
await refresh_repo.create(
|
|
||||||
token_id=uuid4(),
|
|
||||||
user_id=user_id,
|
|
||||||
token_hash=security.hash_token(refresh_token),
|
|
||||||
active_org_id=org_id,
|
|
||||||
expires_at=security.get_refresh_token_expiry(),
|
|
||||||
)
|
|
||||||
|
|
||||||
return TokenResponse(
|
|
||||||
access_token=access_token,
|
|
||||||
refresh_token=refresh_token,
|
|
||||||
expires_in=self._access_token_expires_in,
|
|
||||||
)
|
|
||||||
|
|
||||||
async def _handle_invalid_refresh(self, token_hash: str) -> None:
|
|
||||||
"""Raise appropriate errors for invalid/compromised refresh tokens."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
refresh_repo = RefreshTokenRepository(_as_conn(conn))
|
|
||||||
reused = await refresh_repo.check_token_reuse(token_hash)
|
|
||||||
if reused:
|
|
||||||
await refresh_repo.revoke_token_chain(reused["id"])
|
|
||||||
raise exc.UnauthorizedError("Refresh token reuse detected")
|
|
||||||
|
|
||||||
raise exc.UnauthorizedError("Invalid refresh token")
|
|
||||||
|
|
||||||
async def _generate_unique_org_slug(
|
|
||||||
self,
|
|
||||||
org_repo: OrgRepository,
|
|
||||||
org_name: str,
|
|
||||||
) -> str:
|
|
||||||
"""Slugify the org name and append a counter until unique."""
|
|
||||||
|
|
||||||
base_slug = self._slugify(org_name)
|
|
||||||
candidate = base_slug
|
|
||||||
counter = 1
|
|
||||||
while await org_repo.slug_exists(candidate):
|
|
||||||
suffix = f"-{counter}"
|
|
||||||
max_base_len = 50 - len(suffix)
|
|
||||||
candidate = f"{base_slug[:max_base_len]}{suffix}"
|
|
||||||
counter += 1
|
|
||||||
return candidate
|
|
||||||
|
|
||||||
def _slugify(self, value: str) -> str:
|
|
||||||
"""Convert arbitrary text into a URL-friendly slug."""
|
|
||||||
|
|
||||||
slug = _SLUG_PATTERN.sub("-", value.strip().lower()).strip("-")
|
|
||||||
return slug[:50] or "org"
|
|
||||||
@@ -1,247 +0,0 @@
|
|||||||
"""Incident service implementing incident lifecycle operations."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from datetime import datetime
|
|
||||||
from typing import cast
|
|
||||||
from uuid import UUID, uuid4
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
from asyncpg.pool import PoolConnectionProxy
|
|
||||||
|
|
||||||
from app.api.deps import CurrentUser, ensure_org_access
|
|
||||||
from app.config import settings
|
|
||||||
from app.core import exceptions as exc
|
|
||||||
from app.db import Database, db
|
|
||||||
from app.repositories import IncidentRepository, ServiceRepository
|
|
||||||
from app.schemas.common import PaginatedResponse
|
|
||||||
from app.schemas.incident import (
|
|
||||||
CommentRequest,
|
|
||||||
IncidentCreate,
|
|
||||||
IncidentEventResponse,
|
|
||||||
IncidentResponse,
|
|
||||||
TransitionRequest,
|
|
||||||
)
|
|
||||||
from app.taskqueue import TaskQueue
|
|
||||||
from app.taskqueue import task_queue as default_task_queue
|
|
||||||
|
|
||||||
_ALLOWED_TRANSITIONS: dict[str, set[str]] = {
|
|
||||||
"triggered": {"acknowledged"},
|
|
||||||
"acknowledged": {"mitigated"},
|
|
||||||
"mitigated": {"resolved"},
|
|
||||||
"resolved": set(),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
|
|
||||||
"""Helper to satisfy typing when a pool proxy is returned."""
|
|
||||||
|
|
||||||
return cast(asyncpg.Connection, conn)
|
|
||||||
|
|
||||||
|
|
||||||
class IncidentService:
|
|
||||||
"""Encapsulates incident lifecycle operations within an org context."""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
database: Database | None = None,
|
|
||||||
task_queue: TaskQueue | None = None,
|
|
||||||
escalation_delay_seconds: int | None = None,
|
|
||||||
) -> None:
|
|
||||||
self.db = database or db
|
|
||||||
self.task_queue = task_queue or default_task_queue
|
|
||||||
self.escalation_delay_seconds = (
|
|
||||||
escalation_delay_seconds
|
|
||||||
if escalation_delay_seconds is not None
|
|
||||||
else settings.notification_escalation_delay_seconds
|
|
||||||
)
|
|
||||||
|
|
||||||
async def create_incident(
|
|
||||||
self,
|
|
||||||
current_user: CurrentUser,
|
|
||||||
service_id: UUID,
|
|
||||||
data: IncidentCreate,
|
|
||||||
) -> IncidentResponse:
|
|
||||||
"""Create an incident for a service in the active org and record the creation event."""
|
|
||||||
|
|
||||||
async with self.db.transaction() as conn:
|
|
||||||
db_conn = _as_conn(conn)
|
|
||||||
service_repo = ServiceRepository(db_conn)
|
|
||||||
incident_repo = IncidentRepository(db_conn)
|
|
||||||
|
|
||||||
service = await service_repo.get_by_id(service_id)
|
|
||||||
if service is None:
|
|
||||||
raise exc.NotFoundError("Service not found")
|
|
||||||
ensure_org_access(service["org_id"], current_user)
|
|
||||||
|
|
||||||
incident_id = uuid4()
|
|
||||||
incident = await incident_repo.create(
|
|
||||||
incident_id=incident_id,
|
|
||||||
org_id=current_user.org_id,
|
|
||||||
service_id=service_id,
|
|
||||||
title=data.title,
|
|
||||||
description=data.description,
|
|
||||||
severity=data.severity,
|
|
||||||
)
|
|
||||||
|
|
||||||
await incident_repo.add_event(
|
|
||||||
uuid4(),
|
|
||||||
incident_id,
|
|
||||||
"created",
|
|
||||||
actor_user_id=current_user.user_id,
|
|
||||||
payload={
|
|
||||||
"title": data.title,
|
|
||||||
"severity": data.severity,
|
|
||||||
"description": data.description,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
incident_response = IncidentResponse(**incident)
|
|
||||||
|
|
||||||
self.task_queue.incident_triggered(
|
|
||||||
incident_id=incident_response.id,
|
|
||||||
org_id=current_user.org_id,
|
|
||||||
triggered_by=current_user.user_id,
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.escalation_delay_seconds > 0:
|
|
||||||
self.task_queue.schedule_escalation_check(
|
|
||||||
incident_id=incident_response.id,
|
|
||||||
org_id=current_user.org_id,
|
|
||||||
delay_seconds=self.escalation_delay_seconds,
|
|
||||||
)
|
|
||||||
|
|
||||||
return incident_response
|
|
||||||
|
|
||||||
async def get_incidents(
|
|
||||||
self,
|
|
||||||
current_user: CurrentUser,
|
|
||||||
*,
|
|
||||||
status: str | None = None,
|
|
||||||
cursor: datetime | None = None,
|
|
||||||
limit: int = 20,
|
|
||||||
) -> PaginatedResponse[IncidentResponse]:
|
|
||||||
"""Return paginated incidents for the active organization."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
incident_repo = IncidentRepository(_as_conn(conn))
|
|
||||||
rows = await incident_repo.get_by_org(
|
|
||||||
org_id=current_user.org_id,
|
|
||||||
status=status,
|
|
||||||
cursor=cursor,
|
|
||||||
limit=limit,
|
|
||||||
)
|
|
||||||
|
|
||||||
has_more = len(rows) > limit
|
|
||||||
items = rows[:limit]
|
|
||||||
next_cursor = items[-1]["created_at"].isoformat() if has_more and items else None
|
|
||||||
|
|
||||||
incidents = [IncidentResponse(**row) for row in items]
|
|
||||||
return PaginatedResponse[IncidentResponse](
|
|
||||||
items=incidents,
|
|
||||||
next_cursor=next_cursor,
|
|
||||||
has_more=has_more,
|
|
||||||
)
|
|
||||||
|
|
||||||
async def get_incident(self, current_user: CurrentUser, incident_id: UUID) -> IncidentResponse:
|
|
||||||
"""Return a single incident, ensuring it belongs to the active org."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
incident_repo = IncidentRepository(_as_conn(conn))
|
|
||||||
incident = await incident_repo.get_by_id(incident_id)
|
|
||||||
if incident is None:
|
|
||||||
raise exc.NotFoundError("Incident not found")
|
|
||||||
ensure_org_access(incident["org_id"], current_user)
|
|
||||||
return IncidentResponse(**incident)
|
|
||||||
|
|
||||||
async def get_incident_events(
|
|
||||||
self, current_user: CurrentUser, incident_id: UUID
|
|
||||||
) -> list[IncidentEventResponse]:
|
|
||||||
"""Return the timeline events for an incident in the active org."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
incident_repo = IncidentRepository(_as_conn(conn))
|
|
||||||
incident = await incident_repo.get_by_id(incident_id)
|
|
||||||
if incident is None:
|
|
||||||
raise exc.NotFoundError("Incident not found")
|
|
||||||
ensure_org_access(incident["org_id"], current_user)
|
|
||||||
|
|
||||||
events = await incident_repo.get_events(incident_id)
|
|
||||||
return [IncidentEventResponse(**event) for event in events]
|
|
||||||
|
|
||||||
async def transition_incident(
|
|
||||||
self,
|
|
||||||
current_user: CurrentUser,
|
|
||||||
incident_id: UUID,
|
|
||||||
data: TransitionRequest,
|
|
||||||
) -> IncidentResponse:
|
|
||||||
"""Transition an incident status with optimistic locking and event recording."""
|
|
||||||
|
|
||||||
async with self.db.transaction() as conn:
|
|
||||||
db_conn = _as_conn(conn)
|
|
||||||
incident_repo = IncidentRepository(db_conn)
|
|
||||||
|
|
||||||
incident = await incident_repo.get_by_id(incident_id)
|
|
||||||
if incident is None:
|
|
||||||
raise exc.NotFoundError("Incident not found")
|
|
||||||
ensure_org_access(incident["org_id"], current_user)
|
|
||||||
self._validate_transition(incident["status"], data.to_status)
|
|
||||||
|
|
||||||
updated = await incident_repo.update_status(
|
|
||||||
incident_id,
|
|
||||||
data.to_status,
|
|
||||||
data.version,
|
|
||||||
)
|
|
||||||
if updated is None:
|
|
||||||
raise exc.ConflictError("Incident version mismatch")
|
|
||||||
|
|
||||||
payload = {"from": incident["status"], "to": data.to_status}
|
|
||||||
if data.note:
|
|
||||||
payload["note"] = data.note
|
|
||||||
|
|
||||||
await incident_repo.add_event(
|
|
||||||
uuid4(),
|
|
||||||
incident_id,
|
|
||||||
"status_changed",
|
|
||||||
actor_user_id=current_user.user_id,
|
|
||||||
payload=payload,
|
|
||||||
)
|
|
||||||
|
|
||||||
return IncidentResponse(**updated)
|
|
||||||
|
|
||||||
async def add_comment(
|
|
||||||
self,
|
|
||||||
current_user: CurrentUser,
|
|
||||||
incident_id: UUID,
|
|
||||||
data: CommentRequest,
|
|
||||||
) -> IncidentEventResponse:
|
|
||||||
"""Add a comment event to the incident timeline."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
incident_repo = IncidentRepository(_as_conn(conn))
|
|
||||||
incident = await incident_repo.get_by_id(incident_id)
|
|
||||||
if incident is None:
|
|
||||||
raise exc.NotFoundError("Incident not found")
|
|
||||||
ensure_org_access(incident["org_id"], current_user)
|
|
||||||
|
|
||||||
event = await incident_repo.add_event(
|
|
||||||
uuid4(),
|
|
||||||
incident_id,
|
|
||||||
"comment_added",
|
|
||||||
actor_user_id=current_user.user_id,
|
|
||||||
payload={"content": data.content},
|
|
||||||
)
|
|
||||||
return IncidentEventResponse(**event)
|
|
||||||
|
|
||||||
def _validate_transition(self, current_status: str, to_status: str) -> None:
|
|
||||||
"""Validate a requested status transition against the allowed state machine."""
|
|
||||||
|
|
||||||
if current_status == to_status:
|
|
||||||
raise exc.BadRequestError("Incident is already in the requested status")
|
|
||||||
|
|
||||||
allowed = _ALLOWED_TRANSITIONS.get(current_status, set())
|
|
||||||
if to_status not in allowed:
|
|
||||||
raise exc.BadRequestError("Invalid incident status transition")
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["IncidentService"]
|
|
||||||
@@ -1,115 +0,0 @@
|
|||||||
"""Organization service providing org-scoped operations."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from typing import cast
|
|
||||||
from uuid import UUID, uuid4
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
from asyncpg.pool import PoolConnectionProxy
|
|
||||||
|
|
||||||
from app.api.deps import CurrentUser
|
|
||||||
from app.core import exceptions as exc
|
|
||||||
from app.db import Database, db
|
|
||||||
from app.repositories import NotificationRepository, OrgRepository, ServiceRepository
|
|
||||||
from app.schemas.org import (
|
|
||||||
MemberResponse,
|
|
||||||
NotificationTargetCreate,
|
|
||||||
NotificationTargetResponse,
|
|
||||||
OrgResponse,
|
|
||||||
ServiceCreate,
|
|
||||||
ServiceResponse,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _as_conn(conn: asyncpg.Connection | PoolConnectionProxy) -> asyncpg.Connection:
|
|
||||||
"""Helper to satisfy typing when a pool proxy is returned."""
|
|
||||||
|
|
||||||
return cast(asyncpg.Connection, conn)
|
|
||||||
|
|
||||||
|
|
||||||
class OrgService:
|
|
||||||
"""Encapsulates organization-level operations within the active org context."""
|
|
||||||
|
|
||||||
def __init__(self, database: Database | None = None) -> None:
|
|
||||||
self.db = database or db
|
|
||||||
|
|
||||||
async def get_current_org(self, current_user: CurrentUser) -> OrgResponse:
|
|
||||||
"""Return the active organization summary for the current user."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
org_repo = OrgRepository(_as_conn(conn))
|
|
||||||
org = await org_repo.get_by_id(current_user.org_id)
|
|
||||||
if org is None:
|
|
||||||
raise exc.NotFoundError("Organization not found")
|
|
||||||
return OrgResponse(**org)
|
|
||||||
|
|
||||||
async def get_members(self, current_user: CurrentUser) -> list[MemberResponse]:
|
|
||||||
"""List members of the active organization."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
org_repo = OrgRepository(_as_conn(conn))
|
|
||||||
members = await org_repo.get_members(current_user.org_id)
|
|
||||||
return [MemberResponse(**member) for member in members]
|
|
||||||
|
|
||||||
async def create_service(self, current_user: CurrentUser, data: ServiceCreate) -> ServiceResponse:
|
|
||||||
"""Create a new service within the active organization."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
service_repo = ServiceRepository(_as_conn(conn))
|
|
||||||
|
|
||||||
if await service_repo.slug_exists(current_user.org_id, data.slug):
|
|
||||||
raise exc.ConflictError("Service slug already exists in this organization")
|
|
||||||
|
|
||||||
try:
|
|
||||||
service = await service_repo.create(
|
|
||||||
service_id=uuid4(),
|
|
||||||
org_id=current_user.org_id,
|
|
||||||
name=data.name,
|
|
||||||
slug=data.slug,
|
|
||||||
)
|
|
||||||
except asyncpg.UniqueViolationError as err: # pragma: no cover - race protection
|
|
||||||
raise exc.ConflictError("Service slug already exists in this organization") from err
|
|
||||||
|
|
||||||
return ServiceResponse(**service)
|
|
||||||
|
|
||||||
async def get_services(self, current_user: CurrentUser) -> list[ServiceResponse]:
|
|
||||||
"""List services for the active organization."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
service_repo = ServiceRepository(_as_conn(conn))
|
|
||||||
services = await service_repo.get_by_org(current_user.org_id)
|
|
||||||
return [ServiceResponse(**svc) for svc in services]
|
|
||||||
|
|
||||||
async def create_notification_target(
|
|
||||||
self,
|
|
||||||
current_user: CurrentUser,
|
|
||||||
data: NotificationTargetCreate,
|
|
||||||
) -> NotificationTargetResponse:
|
|
||||||
"""Create a notification target for the active organization."""
|
|
||||||
|
|
||||||
if data.target_type == "webhook" and data.webhook_url is None:
|
|
||||||
raise exc.BadRequestError("webhook_url is required for webhook targets")
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
notification_repo = NotificationRepository(_as_conn(conn))
|
|
||||||
target = await notification_repo.create_target(
|
|
||||||
target_id=uuid4(),
|
|
||||||
org_id=current_user.org_id,
|
|
||||||
name=data.name,
|
|
||||||
target_type=data.target_type,
|
|
||||||
webhook_url=str(data.webhook_url) if data.webhook_url else None,
|
|
||||||
enabled=data.enabled,
|
|
||||||
)
|
|
||||||
return NotificationTargetResponse(**target)
|
|
||||||
|
|
||||||
async def get_notification_targets(self, current_user: CurrentUser) -> list[NotificationTargetResponse]:
|
|
||||||
"""List notification targets for the active organization."""
|
|
||||||
|
|
||||||
async with self.db.connection() as conn:
|
|
||||||
notification_repo = NotificationRepository(_as_conn(conn))
|
|
||||||
targets = await notification_repo.get_targets_by_org(current_user.org_id)
|
|
||||||
return [NotificationTargetResponse(**target) for target in targets]
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["OrgService"]
|
|
||||||
@@ -1,188 +0,0 @@
|
|||||||
"""Task queue abstractions for scheduling background work."""
|
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
from abc import ABC, abstractmethod
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from typing import Any
|
|
||||||
from uuid import UUID
|
|
||||||
|
|
||||||
from app.config import settings
|
|
||||||
|
|
||||||
try:
|
|
||||||
from worker.celery_app import celery_app
|
|
||||||
except Exception: # pragma: no cover - celery app may not import during docs builds
|
|
||||||
celery_app = None # type: ignore[assignment]
|
|
||||||
|
|
||||||
|
|
||||||
class TaskQueue(ABC):
|
|
||||||
"""Interface for enqueueing background work."""
|
|
||||||
|
|
||||||
async def startup(self) -> None: # pragma: no cover - default no-op
|
|
||||||
"""Hook for queue initialization."""
|
|
||||||
|
|
||||||
async def shutdown(self) -> None: # pragma: no cover - default no-op
|
|
||||||
"""Hook for queue teardown."""
|
|
||||||
|
|
||||||
async def ping(self) -> bool:
|
|
||||||
"""Check if the queue backend is reachable."""
|
|
||||||
|
|
||||||
return True
|
|
||||||
|
|
||||||
def reset(self) -> None: # pragma: no cover - optional for in-memory impls
|
|
||||||
"""Reset any in-memory state (used in tests)."""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def incident_triggered(
|
|
||||||
self,
|
|
||||||
*,
|
|
||||||
incident_id: UUID,
|
|
||||||
org_id: UUID,
|
|
||||||
triggered_by: UUID | None,
|
|
||||||
) -> None:
|
|
||||||
"""Fan out an incident triggered notification."""
|
|
||||||
|
|
||||||
@abstractmethod
|
|
||||||
def schedule_escalation_check(
|
|
||||||
self,
|
|
||||||
*,
|
|
||||||
incident_id: UUID,
|
|
||||||
org_id: UUID,
|
|
||||||
delay_seconds: int,
|
|
||||||
) -> None:
|
|
||||||
"""Schedule a delayed escalation check."""
|
|
||||||
|
|
||||||
|
|
||||||
class CeleryTaskQueue(TaskQueue):
|
|
||||||
"""Celery-backed task queue that can use Redis or SQS brokers."""
|
|
||||||
|
|
||||||
def __init__(self, default_queue: str, critical_queue: str) -> None:
|
|
||||||
if celery_app is None: # pragma: no cover - guarded by try/except
|
|
||||||
raise RuntimeError("Celery application is unavailable")
|
|
||||||
self._celery = celery_app
|
|
||||||
self._default_queue = default_queue
|
|
||||||
self._critical_queue = critical_queue
|
|
||||||
|
|
||||||
def incident_triggered(
|
|
||||||
self,
|
|
||||||
*,
|
|
||||||
incident_id: UUID,
|
|
||||||
org_id: UUID,
|
|
||||||
triggered_by: UUID | None,
|
|
||||||
) -> None:
|
|
||||||
self._celery.send_task(
|
|
||||||
"worker.tasks.notifications.incident_triggered",
|
|
||||||
kwargs={
|
|
||||||
"incident_id": str(incident_id),
|
|
||||||
"org_id": str(org_id),
|
|
||||||
"triggered_by": str(triggered_by) if triggered_by else None,
|
|
||||||
},
|
|
||||||
queue=self._default_queue,
|
|
||||||
)
|
|
||||||
|
|
||||||
def schedule_escalation_check(
|
|
||||||
self,
|
|
||||||
*,
|
|
||||||
incident_id: UUID,
|
|
||||||
org_id: UUID,
|
|
||||||
delay_seconds: int,
|
|
||||||
) -> None:
|
|
||||||
self._celery.send_task(
|
|
||||||
"worker.tasks.notifications.escalate_if_unacked",
|
|
||||||
kwargs={
|
|
||||||
"incident_id": str(incident_id),
|
|
||||||
"org_id": str(org_id),
|
|
||||||
},
|
|
||||||
countdown=max(delay_seconds, 0),
|
|
||||||
queue=self._critical_queue,
|
|
||||||
)
|
|
||||||
|
|
||||||
async def ping(self) -> bool:
|
|
||||||
loop = asyncio.get_running_loop()
|
|
||||||
return await loop.run_in_executor(None, self._ping_sync)
|
|
||||||
|
|
||||||
def _ping_sync(self) -> bool:
|
|
||||||
connection = self._celery.connection()
|
|
||||||
try:
|
|
||||||
connection.connect()
|
|
||||||
return True
|
|
||||||
except Exception:
|
|
||||||
return False
|
|
||||||
finally:
|
|
||||||
try:
|
|
||||||
connection.release()
|
|
||||||
except Exception: # pragma: no cover - release best effort
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class InMemoryTaskQueue(TaskQueue):
|
|
||||||
"""Test-friendly queue that records dispatched tasks in memory."""
|
|
||||||
|
|
||||||
dispatched: list[tuple[str, dict[str, Any]]] | None = None
|
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
|
||||||
if self.dispatched is None:
|
|
||||||
self.dispatched = []
|
|
||||||
|
|
||||||
def incident_triggered(
|
|
||||||
self,
|
|
||||||
*,
|
|
||||||
incident_id: UUID,
|
|
||||||
org_id: UUID,
|
|
||||||
triggered_by: UUID | None,
|
|
||||||
) -> None:
|
|
||||||
self.dispatched.append(
|
|
||||||
(
|
|
||||||
"incident_triggered",
|
|
||||||
{
|
|
||||||
"incident_id": incident_id,
|
|
||||||
"org_id": org_id,
|
|
||||||
"triggered_by": triggered_by,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
def schedule_escalation_check(
|
|
||||||
self,
|
|
||||||
*,
|
|
||||||
incident_id: UUID,
|
|
||||||
org_id: UUID,
|
|
||||||
delay_seconds: int,
|
|
||||||
) -> None:
|
|
||||||
self.dispatched.append(
|
|
||||||
(
|
|
||||||
"escalate_if_unacked",
|
|
||||||
{
|
|
||||||
"incident_id": incident_id,
|
|
||||||
"org_id": org_id,
|
|
||||||
"delay_seconds": delay_seconds,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
def reset(self) -> None:
|
|
||||||
if self.dispatched is not None:
|
|
||||||
self.dispatched.clear()
|
|
||||||
|
|
||||||
|
|
||||||
def _build_task_queue() -> TaskQueue:
|
|
||||||
if settings.task_queue_driver == "inmemory":
|
|
||||||
return InMemoryTaskQueue()
|
|
||||||
|
|
||||||
return CeleryTaskQueue(
|
|
||||||
default_queue=settings.task_queue_default_queue,
|
|
||||||
critical_queue=settings.task_queue_critical_queue,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
task_queue = _build_task_queue()
|
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"CeleryTaskQueue",
|
|
||||||
"InMemoryTaskQueue",
|
|
||||||
"TaskQueue",
|
|
||||||
"task_queue",
|
|
||||||
]
|
|
||||||
+17
-184
@@ -1,230 +1,63 @@
|
|||||||
version: "3.8"
|
version: '3.8'
|
||||||
|
|
||||||
services:
|
services:
|
||||||
postgres:
|
postgres:
|
||||||
image: postgres:16-alpine
|
image: postgres:16-alpine
|
||||||
container_name: incidentops-postgres
|
|
||||||
environment:
|
environment:
|
||||||
POSTGRES_USER: incidentops
|
POSTGRES_USER: postgres
|
||||||
POSTGRES_PASSWORD: incidentops
|
POSTGRES_PASSWORD: postgres
|
||||||
POSTGRES_DB: incidentops
|
POSTGRES_DB: incidentops
|
||||||
ports:
|
ports:
|
||||||
- "5432:5432"
|
- "5432:5432"
|
||||||
volumes:
|
volumes:
|
||||||
- postgres_data:/var/lib/postgresql/data
|
- postgres_data:/var/lib/postgresql/data
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD-SHELL", "pg_isready -U incidentops"]
|
test: ["CMD-SHELL", "pg_isready -U postgres"]
|
||||||
interval: 10s
|
interval: 5s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
# For Celery broker
|
|
||||||
redis:
|
redis:
|
||||||
image: redis:7-alpine
|
image: redis:7-alpine
|
||||||
container_name: incidentops-redis
|
|
||||||
ports:
|
ports:
|
||||||
- "6379:6379"
|
- "6379:6379"
|
||||||
volumes:
|
|
||||||
- redis_data:/data
|
|
||||||
healthcheck:
|
healthcheck:
|
||||||
test: ["CMD", "redis-cli", "ping"]
|
test: ["CMD", "redis-cli", "ping"]
|
||||||
interval: 10s
|
interval: 5s
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
# api services
|
|
||||||
api:
|
api:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: .
|
||||||
dockerfile: Dockerfile
|
dockerfile: src/IncidentOps.Api/Dockerfile
|
||||||
target: api
|
|
||||||
container_name: incidentops-api
|
|
||||||
ports:
|
ports:
|
||||||
- "8000:8000"
|
- "8080:8080"
|
||||||
- "9464:9464" # Prometheus metrics
|
|
||||||
environment:
|
environment:
|
||||||
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
|
- ConnectionStrings__Postgres=Host=postgres;Port=5432;Database=incidentops;Username=postgres;Password=postgres
|
||||||
REDIS_URL: redis://redis:6379/0
|
- Redis__ConnectionString=redis:6379
|
||||||
JWT_SECRET_KEY: dev-secret-key-change-in-production
|
- Jwt__SigningKey=your-super-secret-key-that-should-be-at-least-32-characters-long
|
||||||
JWT_ALGORITHM: HS256
|
- Jwt__Issuer=incidentops
|
||||||
ACCESS_TOKEN_EXPIRE_MINUTES: 30
|
- Jwt__Audience=incidentops
|
||||||
REFRESH_TOKEN_EXPIRE_DAYS: 30
|
- Cors__Origins__0=http://localhost:3000
|
||||||
# OpenTelemetry
|
|
||||||
OTEL_ENABLED: "true"
|
|
||||||
OTEL_SERVICE_NAME: incidentops-api
|
|
||||||
OTEL_ENVIRONMENT: development
|
|
||||||
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
|
|
||||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
|
||||||
OTEL_LOG_LEVEL: INFO
|
|
||||||
# Metrics
|
|
||||||
PROMETHEUS_PORT: "9464"
|
|
||||||
depends_on:
|
depends_on:
|
||||||
postgres:
|
postgres:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
otel-collector:
|
|
||||||
condition: service_started
|
|
||||||
prometheus:
|
|
||||||
condition: service_started
|
|
||||||
healthcheck:
|
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:8000/v1/healthz"]
|
|
||||||
interval: 30s
|
|
||||||
timeout: 10s
|
|
||||||
retries: 3
|
|
||||||
start_period: 10s
|
|
||||||
|
|
||||||
# Worker service (Celery)
|
|
||||||
worker:
|
worker:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: .
|
||||||
dockerfile: Dockerfile
|
dockerfile: src/IncidentOps.Worker/Dockerfile
|
||||||
target: worker
|
|
||||||
container_name: incidentops-worker
|
|
||||||
environment:
|
environment:
|
||||||
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
|
- ConnectionStrings__Postgres=Host=postgres;Port=5432;Database=incidentops;Username=postgres;Password=postgres
|
||||||
REDIS_URL: redis://redis:6379/0
|
- Redis__ConnectionString=redis:6379
|
||||||
CELERY_BROKER_URL: redis://redis:6379/0
|
|
||||||
CELERY_RESULT_BACKEND: redis://redis:6379/1
|
|
||||||
# OpenTelemetry
|
|
||||||
OTEL_ENABLED: "true"
|
|
||||||
OTEL_SERVICE_NAME: incidentops-worker
|
|
||||||
OTEL_ENVIRONMENT: development
|
|
||||||
OTEL_EXPORTER_OTLP_ENDPOINT: http://otel-collector:4317
|
|
||||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
|
||||||
depends_on:
|
depends_on:
|
||||||
postgres:
|
postgres:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
redis:
|
redis:
|
||||||
condition: service_healthy
|
condition: service_healthy
|
||||||
|
|
||||||
# Web frontend (Next.js)
|
|
||||||
web:
|
|
||||||
build:
|
|
||||||
context: .
|
|
||||||
dockerfile: Dockerfile.web
|
|
||||||
container_name: incidentops-web
|
|
||||||
ports:
|
|
||||||
- "3000:3000"
|
|
||||||
environment:
|
|
||||||
NEXT_PUBLIC_API_URL: http://localhost:8000
|
|
||||||
depends_on:
|
|
||||||
- api
|
|
||||||
|
|
||||||
# Database migrations (run once)
|
|
||||||
migrate:
|
|
||||||
build:
|
|
||||||
context: .
|
|
||||||
dockerfile: Dockerfile
|
|
||||||
target: api
|
|
||||||
container_name: incidentops-migrate
|
|
||||||
command: python migrations/migrate.py apply
|
|
||||||
environment:
|
|
||||||
DATABASE_URL: postgresql://incidentops:incidentops@postgres:5432/incidentops
|
|
||||||
depends_on:
|
|
||||||
postgres:
|
|
||||||
condition: service_healthy
|
|
||||||
profiles:
|
|
||||||
- migrate
|
|
||||||
|
|
||||||
# Flower for Celery monitoring (dev only)
|
|
||||||
flower:
|
|
||||||
image: mher/flower:2.0
|
|
||||||
container_name: incidentops-flower
|
|
||||||
ports:
|
|
||||||
- "5555:5555"
|
|
||||||
environment:
|
|
||||||
CELERY_BROKER_URL: redis://redis:6379/0
|
|
||||||
FLOWER_BASIC_AUTH: admin:admin
|
|
||||||
depends_on:
|
|
||||||
- redis
|
|
||||||
profiles:
|
|
||||||
- monitoring
|
|
||||||
|
|
||||||
# ============================================
|
|
||||||
# Observability Stack
|
|
||||||
# ============================================
|
|
||||||
|
|
||||||
# OpenTelemetry Collector - receives traces/logs from apps
|
|
||||||
otel-collector:
|
|
||||||
image: otel/opentelemetry-collector-contrib:0.96.0
|
|
||||||
container_name: incidentops-otel-collector
|
|
||||||
command: ["--config=/etc/otel-collector/config.yaml"]
|
|
||||||
volumes:
|
|
||||||
- ./observability/otel-collector/config.yaml:/etc/otel-collector/config.yaml:ro
|
|
||||||
ports:
|
|
||||||
- "4317:4317" # OTLP gRPC
|
|
||||||
- "4318:4318" # OTLP HTTP
|
|
||||||
depends_on:
|
|
||||||
- tempo
|
|
||||||
- loki
|
|
||||||
|
|
||||||
# Tempo - distributed tracing backend
|
|
||||||
tempo:
|
|
||||||
image: grafana/tempo:2.4.1
|
|
||||||
container_name: incidentops-tempo
|
|
||||||
command: ["-config.file=/etc/tempo/config.yaml"]
|
|
||||||
volumes:
|
|
||||||
- ./observability/tempo/config.yaml:/etc/tempo/config.yaml:ro
|
|
||||||
- tempo_data:/var/tempo
|
|
||||||
ports:
|
|
||||||
- "3200:3200" # Tempo HTTP
|
|
||||||
- "4320:4317" # Tempo OTLP gRPC (different host port to avoid conflict)
|
|
||||||
|
|
||||||
# Loki - log aggregation
|
|
||||||
loki:
|
|
||||||
image: grafana/loki:2.9.6
|
|
||||||
container_name: incidentops-loki
|
|
||||||
command: ["-config.file=/etc/loki/config.yaml"]
|
|
||||||
volumes:
|
|
||||||
- ./observability/loki/config.yaml:/etc/loki/config.yaml:ro
|
|
||||||
- loki_data:/loki
|
|
||||||
ports:
|
|
||||||
- "3100:3100" # Loki HTTP
|
|
||||||
|
|
||||||
# Prometheus - metrics storage
|
|
||||||
prometheus:
|
|
||||||
image: prom/prometheus:v2.51.0
|
|
||||||
container_name: incidentops-prometheus
|
|
||||||
command:
|
|
||||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
|
||||||
- "--storage.tsdb.path=/prometheus"
|
|
||||||
- "--web.enable-lifecycle"
|
|
||||||
volumes:
|
|
||||||
- ./observability/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
|
||||||
- prometheus_data:/prometheus
|
|
||||||
ports:
|
|
||||||
- "9090:9090" # Prometheus UI
|
|
||||||
|
|
||||||
# Grafana - visualization
|
|
||||||
grafana:
|
|
||||||
image: grafana/grafana:10.4.1
|
|
||||||
container_name: incidentops-grafana
|
|
||||||
environment:
|
|
||||||
GF_SECURITY_ADMIN_USER: admin
|
|
||||||
GF_SECURITY_ADMIN_PASSWORD: admin
|
|
||||||
GF_USERS_ALLOW_SIGN_UP: "false"
|
|
||||||
GF_EXPLORE_ENABLED: "true"
|
|
||||||
GF_FEATURE_TOGGLES_ENABLE: traceqlEditor tempoSearch tempoBackendSearch tempoApmTable
|
|
||||||
volumes:
|
|
||||||
- ./observability/grafana/provisioning:/etc/grafana/provisioning:ro
|
|
||||||
- ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
|
|
||||||
- grafana_data:/var/lib/grafana
|
|
||||||
ports:
|
|
||||||
- "3001:3000" # Grafana UI (3001 to avoid conflict with web frontend)
|
|
||||||
depends_on:
|
|
||||||
- tempo
|
|
||||||
- loki
|
|
||||||
- prometheus
|
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
postgres_data:
|
postgres_data:
|
||||||
redis_data:
|
|
||||||
tempo_data:
|
|
||||||
loki_data:
|
|
||||||
prometheus_data:
|
|
||||||
grafana_data:
|
|
||||||
|
|
||||||
networks:
|
|
||||||
default:
|
|
||||||
name: incidentops-network
|
|
||||||
|
|||||||
+657
@@ -0,0 +1,657 @@
|
|||||||
|
# IncidentOps Specification
|
||||||
|
|
||||||
|
A multi-tenant incident management system with implicit active-org context from JWT.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Project Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
incidentops/
|
||||||
|
├── IncidentOps.sln
|
||||||
|
├── docker-compose.yml
|
||||||
|
├── skaffold.yaml
|
||||||
|
├── .gitignore
|
||||||
|
│
|
||||||
|
├── src/
|
||||||
|
│ ├── IncidentOps.Api/ # ASP.NET Core REST API
|
||||||
|
│ │ ├── Auth/
|
||||||
|
│ │ │ ├── ClaimsPrincipalExtensions.cs
|
||||||
|
│ │ │ ├── RequestContext.cs
|
||||||
|
│ │ │ └── RoleRequirement.cs
|
||||||
|
│ │ ├── Controllers/
|
||||||
|
│ │ │ ├── AuthController.cs
|
||||||
|
│ │ │ ├── HealthController.cs
|
||||||
|
│ │ │ ├── IncidentsController.cs
|
||||||
|
│ │ │ └── OrgController.cs
|
||||||
|
│ │ ├── Dockerfile
|
||||||
|
│ │ ├── Program.cs
|
||||||
|
│ │ ├── appsettings.json
|
||||||
|
│ │ └── appsettings.Development.json
|
||||||
|
│ │
|
||||||
|
│ ├── IncidentOps.Worker/ # Hangfire Worker Service
|
||||||
|
│ │ ├── Jobs/
|
||||||
|
│ │ │ ├── EscalateIfUnackedJob.cs
|
||||||
|
│ │ │ ├── IncidentTriggeredJob.cs
|
||||||
|
│ │ │ └── SendWebhookNotificationJob.cs
|
||||||
|
│ │ ├── Dockerfile
|
||||||
|
│ │ ├── Program.cs
|
||||||
|
│ │ └── appsettings.json
|
||||||
|
│ │
|
||||||
|
│ ├── IncidentOps.Domain/ # Domain Entities & Enums
|
||||||
|
│ │ ├── Entities/
|
||||||
|
│ │ │ ├── Incident.cs
|
||||||
|
│ │ │ ├── IncidentEvent.cs
|
||||||
|
│ │ │ ├── NotificationAttempt.cs
|
||||||
|
│ │ │ ├── NotificationTarget.cs
|
||||||
|
│ │ │ ├── Org.cs
|
||||||
|
│ │ │ ├── OrgMember.cs
|
||||||
|
│ │ │ ├── RefreshToken.cs
|
||||||
|
│ │ │ ├── Service.cs
|
||||||
|
│ │ │ └── User.cs
|
||||||
|
│ │ └── Enums/
|
||||||
|
│ │ ├── IncidentEventType.cs
|
||||||
|
│ │ ├── IncidentStatus.cs
|
||||||
|
│ │ ├── NotificationTargetType.cs
|
||||||
|
│ │ └── OrgRole.cs
|
||||||
|
│ │
|
||||||
|
│ ├── IncidentOps.Infrastructure/ # Data Access & Services
|
||||||
|
│ │ ├── Auth/
|
||||||
|
│ │ │ ├── IPasswordService.cs
|
||||||
|
│ │ │ ├── ITokenService.cs
|
||||||
|
│ │ │ └── JwtSettings.cs
|
||||||
|
│ │ ├── Data/
|
||||||
|
│ │ │ ├── DbConnectionFactory.cs
|
||||||
|
│ │ │ └── Repositories/
|
||||||
|
│ │ │ ├── IIncidentEventRepository.cs
|
||||||
|
│ │ │ ├── IIncidentRepository.cs
|
||||||
|
│ │ │ ├── INotificationTargetRepository.cs
|
||||||
|
│ │ │ ├── IOrgMemberRepository.cs
|
||||||
|
│ │ │ ├── IOrgRepository.cs
|
||||||
|
│ │ │ ├── IRefreshTokenRepository.cs
|
||||||
|
│ │ │ ├── IServiceRepository.cs
|
||||||
|
│ │ │ └── IUserRepository.cs
|
||||||
|
│ │ ├── Jobs/
|
||||||
|
│ │ │ ├── IEscalateIfUnackedJob.cs
|
||||||
|
│ │ │ ├── IIncidentTriggeredJob.cs
|
||||||
|
│ │ │ └── ISendWebhookNotificationJob.cs
|
||||||
|
│ │ ├── Migrations/
|
||||||
|
│ │ │ ├── Migration0001_InitialSchema.cs
|
||||||
|
│ │ │ ├── Migration0002_RefreshTokens.cs
|
||||||
|
│ │ │ └── Migration0003_NotificationTargets.cs
|
||||||
|
│ │ └── ServiceCollectionExtensions.cs
|
||||||
|
│ │
|
||||||
|
│ └── IncidentOps.Contracts/ # DTOs / API Contracts
|
||||||
|
│ ├── Auth/
|
||||||
|
│ │ ├── AuthResponse.cs
|
||||||
|
│ │ ├── LoginRequest.cs
|
||||||
|
│ │ ├── LogoutRequest.cs
|
||||||
|
│ │ ├── MeResponse.cs
|
||||||
|
│ │ ├── RefreshRequest.cs
|
||||||
|
│ │ ├── RegisterRequest.cs
|
||||||
|
│ │ └── SwitchOrgRequest.cs
|
||||||
|
│ ├── Incidents/
|
||||||
|
│ │ ├── CommentRequest.cs
|
||||||
|
│ │ ├── CreateIncidentRequest.cs
|
||||||
|
│ │ ├── IncidentDto.cs
|
||||||
|
│ │ ├── IncidentEventDto.cs
|
||||||
|
│ │ ├── IncidentListResponse.cs
|
||||||
|
│ │ └── TransitionRequest.cs
|
||||||
|
│ ├── Orgs/
|
||||||
|
│ │ ├── CreateNotificationTargetRequest.cs
|
||||||
|
│ │ ├── NotificationTargetDto.cs
|
||||||
|
│ │ ├── OrgDto.cs
|
||||||
|
│ │ └── OrgMemberDto.cs
|
||||||
|
│ └── Services/
|
||||||
|
│ ├── CreateServiceRequest.cs
|
||||||
|
│ └── ServiceDto.cs
|
||||||
|
│
|
||||||
|
├── web/ # Next.js Frontend
|
||||||
|
│ ├── app/
|
||||||
|
│ │ ├── dashboard/page.tsx
|
||||||
|
│ │ ├── login/page.tsx
|
||||||
|
│ │ ├── register/page.tsx
|
||||||
|
│ │ ├── layout.tsx
|
||||||
|
│ │ ├── page.tsx
|
||||||
|
│ │ └── globals.css
|
||||||
|
│ ├── lib/
|
||||||
|
│ │ └── api.ts
|
||||||
|
│ ├── types/
|
||||||
|
│ │ └── index.ts
|
||||||
|
│ ├── Dockerfile
|
||||||
|
│ ├── package.json
|
||||||
|
│ ├── tsconfig.json
|
||||||
|
│ └── next.config.js
|
||||||
|
│
|
||||||
|
├── helm/incidentops/ # Helm Chart
|
||||||
|
│ ├── Chart.yaml
|
||||||
|
│ ├── values.yaml
|
||||||
|
│ └── templates/
|
||||||
|
│ ├── _helpers.tpl
|
||||||
|
│ ├── api-deployment.yaml
|
||||||
|
│ ├── api-service.yaml
|
||||||
|
│ ├── worker-deployment.yaml
|
||||||
|
│ ├── web-deployment.yaml
|
||||||
|
│ ├── web-service.yaml
|
||||||
|
│ ├── ingress.yaml
|
||||||
|
│ └── secrets.yaml
|
||||||
|
│
|
||||||
|
└── docs/
|
||||||
|
└── specs.md
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Architecture (microservices-lite)
|
||||||
|
|
||||||
|
### Deployables
|
||||||
|
|
||||||
|
1. **api-service** (.NET 10, ASP.NET Core)
|
||||||
|
- REST API (implicit org scope from JWT)
|
||||||
|
- JWT access + refresh (both returned in JSON)
|
||||||
|
- RBAC enforced using `org_role` claim + DB ownership checks
|
||||||
|
- Writes incidents + timeline events
|
||||||
|
- Enqueues background jobs to Hangfire
|
||||||
|
|
||||||
|
2. **worker-service** (.NET 10 Worker Service)
|
||||||
|
- Runs **Hangfire Server** using Redis storage
|
||||||
|
- Executes jobs: notification send, escalation checks, rollups
|
||||||
|
- Writes notification attempts and system events
|
||||||
|
|
||||||
|
3. **web** (Next.js 14 + TypeScript)
|
||||||
|
- Auth pages + dashboard + incident detail
|
||||||
|
|
||||||
|
### Dependencies (in kind via Helm)
|
||||||
|
- PostgreSQL (Bitnami)
|
||||||
|
- Redis (Bitnami) - Hangfire storage
|
||||||
|
- ingress-nginx
|
||||||
|
- (later) Prometheus/Grafana/OTel
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Auth Model (active org in JWT, implicit org scope)
|
||||||
|
|
||||||
|
### JWT Access Token Claims
|
||||||
|
| Claim | Description |
|
||||||
|
|-------|-------------|
|
||||||
|
| `sub` | userId (uuid) |
|
||||||
|
| `org_id` | activeOrgId (uuid) |
|
||||||
|
| `org_role` | `admin\|member\|viewer` |
|
||||||
|
| `iss` | Issuer |
|
||||||
|
| `aud` | Audience |
|
||||||
|
| `iat` | Issued at |
|
||||||
|
| `exp` | Expiration |
|
||||||
|
| `jti` | (optional) Token ID |
|
||||||
|
|
||||||
|
### Refresh Token Model (JSON, not cookie)
|
||||||
|
- Random opaque token returned in JSON
|
||||||
|
- Stored hashed in DB
|
||||||
|
- Rotated on refresh and switch-org
|
||||||
|
- Refresh token row stores `active_org_id` (per-session org selection)
|
||||||
|
|
||||||
|
### DB: `refresh_tokens`
|
||||||
|
```sql
|
||||||
|
id uuid PRIMARY KEY
|
||||||
|
user_id uuid NOT NULL
|
||||||
|
token_hash text NOT NULL UNIQUE
|
||||||
|
active_org_id uuid NOT NULL
|
||||||
|
expires_at timestamptz NOT NULL
|
||||||
|
revoked_at timestamptz NULL
|
||||||
|
created_at timestamptz NOT NULL
|
||||||
|
```
|
||||||
|
|
||||||
|
### Auth Endpoints
|
||||||
|
| Method | Endpoint | Description |
|
||||||
|
|--------|----------|-------------|
|
||||||
|
| POST | `/v1/auth/register` | Create user + default org |
|
||||||
|
| POST | `/v1/auth/login` | Authenticate, return tokens |
|
||||||
|
| POST | `/v1/auth/refresh` | Rotate refresh token |
|
||||||
|
| POST | `/v1/auth/switch-org` | Switch active org context |
|
||||||
|
| POST | `/v1/auth/logout` | Revoke refresh token |
|
||||||
|
|
||||||
|
#### Registration Flow
|
||||||
|
On `POST /v1/auth/register { email, password, displayName }`:
|
||||||
|
1. Create user record
|
||||||
|
2. Create a default org automatically (e.g., "John's Org")
|
||||||
|
3. Create org_member with role=Admin
|
||||||
|
4. Return access + refresh tokens
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Authorization Rules (implicit org scope)
|
||||||
|
|
||||||
|
### Request Context
|
||||||
|
Middleware extracts from JWT:
|
||||||
|
- `UserId` from `sub`
|
||||||
|
- `OrgId` from `org_id`
|
||||||
|
- `Role` from `org_role`
|
||||||
|
|
||||||
|
### Authorization Approach
|
||||||
|
- **Role check**: enforce viewer/member/admin by claim
|
||||||
|
- **Ownership check**: for any resource ID in path, load its `org_id` from DB and require it equals token `org_id`
|
||||||
|
- Prevents cross-tenant IDOR even though org isn't in the URL
|
||||||
|
|
||||||
|
### Role Permissions
|
||||||
|
| Role | Permissions |
|
||||||
|
|------|-------------|
|
||||||
|
| viewer | Read-only access |
|
||||||
|
| member | Create incidents, transitions, comments |
|
||||||
|
| admin | Manage members, notification targets, on-call schedules |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. API Surface (implicit org in JWT)
|
||||||
|
|
||||||
|
All routes under `/v1`. Unless noted, routes require auth.
|
||||||
|
|
||||||
|
### Auth
|
||||||
|
| Method | Endpoint | Auth | Description |
|
||||||
|
|--------|----------|------|-------------|
|
||||||
|
| POST | `/auth/register` | No | Register new user |
|
||||||
|
| POST | `/auth/login` | No | Login |
|
||||||
|
| POST | `/auth/refresh` | No | Refresh tokens |
|
||||||
|
| POST | `/auth/switch-org` | No | Switch org context |
|
||||||
|
| POST | `/auth/logout` | No | Logout |
|
||||||
|
| GET | `/me` | Yes | Get current user info |
|
||||||
|
|
||||||
|
### Org (current org context)
|
||||||
|
| Method | Endpoint | Role | Description |
|
||||||
|
|--------|----------|------|-------------|
|
||||||
|
| GET | `/org` | viewer+ | Current org summary + role |
|
||||||
|
| GET | `/org/members` | admin | List org members |
|
||||||
|
| POST | `/org/members` | admin | Invite/add member (stretch) |
|
||||||
|
| GET | `/org/services` | viewer+ | List services |
|
||||||
|
| POST | `/org/services` | member+ | Create service |
|
||||||
|
| GET | `/org/notification-targets` | admin | List notification targets |
|
||||||
|
| POST | `/org/notification-targets` | admin | Create notification target |
|
||||||
|
|
||||||
|
### Incidents
|
||||||
|
| Method | Endpoint | Role | Description |
|
||||||
|
|--------|----------|------|-------------|
|
||||||
|
| GET | `/incidents` | viewer+ | List incidents (cursor pagination) |
|
||||||
|
| POST | `/services/{serviceId}/incidents` | member+ | Create incident |
|
||||||
|
| GET | `/incidents/{incidentId}` | viewer+ | Get incident detail |
|
||||||
|
| GET | `/incidents/{incidentId}/events` | viewer+ | Get incident timeline |
|
||||||
|
| POST | `/incidents/{incidentId}/transition` | member+ | Transition incident state |
|
||||||
|
| POST | `/incidents/{incidentId}/comment` | member+ | Add comment |
|
||||||
|
|
||||||
|
### Health
|
||||||
|
| Method | Endpoint | Description |
|
||||||
|
|--------|----------|-------------|
|
||||||
|
| GET | `/healthz` | Liveness probe |
|
||||||
|
| GET | `/readyz` | Readiness probe (checks Postgres + Redis) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Domain Workflows
|
||||||
|
|
||||||
|
### Incident State Machine
|
||||||
|
```
|
||||||
|
Triggered → Acknowledged → Mitigated → Resolved
|
||||||
|
```
|
||||||
|
|
||||||
|
### Enforcement
|
||||||
|
- Application-level validation (allowed transitions)
|
||||||
|
- DB optimistic concurrency using `incidents.version`
|
||||||
|
|
||||||
|
### Transition Write Pattern
|
||||||
|
```sql
|
||||||
|
UPDATE incidents
|
||||||
|
SET status = @newStatus, version = version + 1, updated_at = NOW()
|
||||||
|
WHERE id = @id AND org_id = @orgId AND version = @expectedVersion
|
||||||
|
```
|
||||||
|
- If 0 rows updated → `409 Conflict` (stale client) or `404` if not found in org
|
||||||
|
|
||||||
|
### Timeline Model
|
||||||
|
Append-only `incident_events` records for:
|
||||||
|
- Incident created
|
||||||
|
- Transitions (ack, mitigate, resolve)
|
||||||
|
- Comments
|
||||||
|
- Notifications sent/failed
|
||||||
|
- Escalations triggered
|
||||||
|
|
||||||
|
`actor_user_id` is null for system/worker actions.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. PostgreSQL Schema (core tables)
|
||||||
|
|
||||||
|
### Users
|
||||||
|
```sql
|
||||||
|
CREATE TABLE users (
|
||||||
|
id uuid PRIMARY KEY,
|
||||||
|
email text NOT NULL UNIQUE,
|
||||||
|
password_hash text NOT NULL,
|
||||||
|
display_name text NOT NULL,
|
||||||
|
created_at timestamptz NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Orgs
|
||||||
|
```sql
|
||||||
|
CREATE TABLE orgs (
|
||||||
|
id uuid PRIMARY KEY,
|
||||||
|
name text NOT NULL,
|
||||||
|
slug text NOT NULL UNIQUE,
|
||||||
|
created_at timestamptz NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Org Members
|
||||||
|
```sql
|
||||||
|
CREATE TABLE org_members (
|
||||||
|
id uuid PRIMARY KEY,
|
||||||
|
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
|
||||||
|
user_id uuid NOT NULL REFERENCES users(id) ON DELETE CASCADE,
|
||||||
|
role text NOT NULL CHECK (role IN ('admin', 'member', 'viewer')),
|
||||||
|
created_at timestamptz NOT NULL DEFAULT NOW(),
|
||||||
|
UNIQUE(org_id, user_id)
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Services
|
||||||
|
```sql
|
||||||
|
CREATE TABLE services (
|
||||||
|
id uuid PRIMARY KEY,
|
||||||
|
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
|
||||||
|
name text NOT NULL,
|
||||||
|
slug text NOT NULL,
|
||||||
|
description text,
|
||||||
|
created_at timestamptz NOT NULL DEFAULT NOW(),
|
||||||
|
UNIQUE(org_id, slug)
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Incidents
|
||||||
|
```sql
|
||||||
|
CREATE TABLE incidents (
|
||||||
|
id uuid PRIMARY KEY,
|
||||||
|
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
|
||||||
|
service_id uuid NOT NULL REFERENCES services(id) ON DELETE CASCADE,
|
||||||
|
title text NOT NULL,
|
||||||
|
description text,
|
||||||
|
status text NOT NULL DEFAULT 'triggered'
|
||||||
|
CHECK (status IN ('triggered', 'acknowledged', 'mitigated', 'resolved')),
|
||||||
|
severity text NOT NULL DEFAULT 'sev3'
|
||||||
|
CHECK (severity IN ('sev1', 'sev2', 'sev3', 'sev4')),
|
||||||
|
version integer NOT NULL DEFAULT 1,
|
||||||
|
created_at timestamptz NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at timestamptz
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_incidents_org_status ON incidents(org_id, status);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Incident Events
|
||||||
|
```sql
|
||||||
|
CREATE TABLE incident_events (
|
||||||
|
id uuid PRIMARY KEY,
|
||||||
|
incident_id uuid NOT NULL REFERENCES incidents(id) ON DELETE CASCADE,
|
||||||
|
event_type text NOT NULL,
|
||||||
|
actor_user_id uuid REFERENCES users(id),
|
||||||
|
payload jsonb,
|
||||||
|
created_at timestamptz NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_incident_events_incident ON incident_events(incident_id, created_at);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Notification Targets
|
||||||
|
```sql
|
||||||
|
CREATE TABLE notification_targets (
|
||||||
|
id uuid PRIMARY KEY,
|
||||||
|
org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE,
|
||||||
|
name text NOT NULL,
|
||||||
|
target_type text NOT NULL CHECK (target_type IN ('webhook', 'email', 'slack')),
|
||||||
|
configuration text NOT NULL,
|
||||||
|
is_enabled boolean NOT NULL DEFAULT true,
|
||||||
|
created_at timestamptz NOT NULL DEFAULT NOW(),
|
||||||
|
updated_at timestamptz
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Notification Attempts
|
||||||
|
```sql
|
||||||
|
CREATE TABLE notification_attempts (
|
||||||
|
id uuid PRIMARY KEY,
|
||||||
|
incident_id uuid NOT NULL REFERENCES incidents(id) ON DELETE CASCADE,
|
||||||
|
target_id uuid NOT NULL REFERENCES notification_targets(id) ON DELETE CASCADE,
|
||||||
|
success boolean NOT NULL,
|
||||||
|
error_message text,
|
||||||
|
attempt_number integer NOT NULL DEFAULT 1,
|
||||||
|
created_at timestamptz NOT NULL DEFAULT NOW(),
|
||||||
|
UNIQUE(incident_id, target_id)
|
||||||
|
);
|
||||||
|
```
|
||||||
|
|
||||||
|
### Refresh Tokens
|
||||||
|
```sql
|
||||||
|
CREATE TABLE refresh_tokens (
|
||||||
|
id uuid PRIMARY KEY,
|
||||||
|
user_id uuid NOT NULL REFERENCES users(id) ON DELETE CASCADE,
|
||||||
|
token_hash text NOT NULL UNIQUE,
|
||||||
|
active_org_id uuid NOT NULL REFERENCES orgs(id),
|
||||||
|
expires_at timestamptz NOT NULL,
|
||||||
|
revoked_at timestamptz,
|
||||||
|
created_at timestamptz NOT NULL DEFAULT NOW()
|
||||||
|
);
|
||||||
|
CREATE INDEX idx_refresh_tokens_user ON refresh_tokens(user_id);
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 7. Data Access (Dapper) and Migrations (FluentMigrator)
|
||||||
|
|
||||||
|
### Dapper Conventions
|
||||||
|
- Repositories receive `OrgId` as an explicit parameter and include it in WHERE clauses
|
||||||
|
- Keep SQL close to repositories (or separate `.sql` files)
|
||||||
|
- Use `NpgsqlConnection` + `IDbTransaction` for multi-statement operations
|
||||||
|
|
||||||
|
### FluentMigrator
|
||||||
|
| Migration | Tables |
|
||||||
|
|-----------|--------|
|
||||||
|
| 0001 | users, orgs, org_members, services, incidents, incident_events |
|
||||||
|
| 0002 | refresh_tokens |
|
||||||
|
| 0003 | notification_targets, notification_attempts |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 8. Hangfire Job Design (Redis storage)
|
||||||
|
|
||||||
|
### Setup
|
||||||
|
- API configures Hangfire Client (enqueue)
|
||||||
|
- Worker hosts Hangfire Server (process)
|
||||||
|
|
||||||
|
### Queues
|
||||||
|
| Queue | Purpose |
|
||||||
|
|-------|---------|
|
||||||
|
| critical | Escalations |
|
||||||
|
| default | Notifications |
|
||||||
|
| low | Rollups |
|
||||||
|
|
||||||
|
### Jobs
|
||||||
|
|
||||||
|
#### 1. IncidentTriggeredJob(incidentId)
|
||||||
|
- Reads incident (must belong to org in incident row)
|
||||||
|
- Loads enabled notification targets for the org
|
||||||
|
- Inserts `notification_attempts` rows (idempotent)
|
||||||
|
- Enqueues per-target send jobs
|
||||||
|
|
||||||
|
#### 2. SendWebhookNotificationJob(incidentId, targetId)
|
||||||
|
- Attempts HTTP POST with incident summary payload
|
||||||
|
- Updates attempt status + writes `incident_event` of type `system.notification_sent` or `system.notification_failed`
|
||||||
|
- Throws on transient failures to trigger retry; safe due to DB idempotency
|
||||||
|
|
||||||
|
#### 3. EscalateIfUnackedJob(incidentId, step) (stretch)
|
||||||
|
- Runs delayed
|
||||||
|
- Checks status; if still Triggered, sends secondary notifications
|
||||||
|
|
||||||
|
### Operational Note
|
||||||
|
- Expose Hangfire Dashboard **only in local** and protect it (basic auth or require a dev token)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Kubernetes (kind) + Helm + Skaffold (local-only)
|
||||||
|
|
||||||
|
### Helm Umbrella Chart Deploys
|
||||||
|
- bitnami/postgresql
|
||||||
|
- bitnami/redis
|
||||||
|
- api Deployment/Service
|
||||||
|
- worker Deployment
|
||||||
|
- web Deployment/Service
|
||||||
|
- Ingress with host `incidentops.local`:
|
||||||
|
- `/api`, `/v1`, `/healthz`, `/readyz` → api-service
|
||||||
|
- `/` → web
|
||||||
|
|
||||||
|
### Configuration via Environment
|
||||||
|
| Variable | Description |
|
||||||
|
|----------|-------------|
|
||||||
|
| `ConnectionStrings__Postgres` | PostgreSQL connection string |
|
||||||
|
| `Redis__ConnectionString` | Redis connection string |
|
||||||
|
| `Jwt__Issuer` | JWT issuer |
|
||||||
|
| `Jwt__Audience` | JWT audience |
|
||||||
|
| `Jwt__SigningKey` | JWT signing key (secret) |
|
||||||
|
|
||||||
|
### Readiness
|
||||||
|
- API checks Postgres + Redis
|
||||||
|
- Worker checks Postgres + Redis at startup
|
||||||
|
|
||||||
|
### Skaffold
|
||||||
|
- Builds three images (api, worker, web)
|
||||||
|
- `helm upgrade --install` on changes
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 10. Frontend UX Requirements (implicit org)
|
||||||
|
|
||||||
|
- On login, display `activeOrg` from response
|
||||||
|
- Org switcher calls `/v1/auth/switch-org` and replaces tokens
|
||||||
|
- All subsequent API calls use only `Authorization` header; no orgId params
|
||||||
|
- Store tokens in localStorage or secure cookie
|
||||||
|
- Handle 401 by attempting token refresh
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 11. Key Highlights (README/Resume)
|
||||||
|
|
||||||
|
- "Multi-tenant org context embedded in JWT; org switching re-issues tokens."
|
||||||
|
- "DB ownership checks prevent cross-tenant resource access."
|
||||||
|
- "Optimistic concurrency for incident transitions."
|
||||||
|
- "Background jobs with retries + idempotent notification attempts."
|
||||||
|
- "Deployed locally to Kubernetes via Helm + Skaffold."
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 12. Technology Stack
|
||||||
|
|
||||||
|
| Layer | Technology |
|
||||||
|
|-------|------------|
|
||||||
|
| Runtime | .NET 10 |
|
||||||
|
| API Framework | ASP.NET Core |
|
||||||
|
| Worker | .NET Worker Service |
|
||||||
|
| Background Jobs | Hangfire with Redis |
|
||||||
|
| Database | PostgreSQL |
|
||||||
|
| ORM | Dapper |
|
||||||
|
| Migrations | FluentMigrator |
|
||||||
|
| Auth | JWT Bearer + BCrypt |
|
||||||
|
| Frontend | Next.js 14 + TypeScript |
|
||||||
|
| Container | Docker |
|
||||||
|
| Orchestration | Kubernetes (kind) |
|
||||||
|
| Deployment | Helm + Skaffold |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 13. Local Development
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
- .NET 10 SDK
|
||||||
|
- Node.js 20+
|
||||||
|
- Docker
|
||||||
|
- kind (Kubernetes in Docker)
|
||||||
|
- Helm
|
||||||
|
- Skaffold
|
||||||
|
|
||||||
|
### Quick Start
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# With Docker Compose (simplest)
|
||||||
|
docker-compose up -d
|
||||||
|
|
||||||
|
# Run API
|
||||||
|
cd src/IncidentOps.Api
|
||||||
|
dotnet run
|
||||||
|
|
||||||
|
# Run Worker (separate terminal)
|
||||||
|
cd src/IncidentOps.Worker
|
||||||
|
dotnet run
|
||||||
|
|
||||||
|
# Run Web (separate terminal)
|
||||||
|
cd web
|
||||||
|
npm install
|
||||||
|
npm run dev
|
||||||
|
```
|
||||||
|
|
||||||
|
### With Kubernetes (kind)
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Create cluster
|
||||||
|
kind create cluster --name incidentops
|
||||||
|
|
||||||
|
# Deploy with Skaffold
|
||||||
|
skaffold dev
|
||||||
|
|
||||||
|
# Access at http://incidentops.local (add to /etc/hosts)
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 14. API Request/Response Examples
|
||||||
|
|
||||||
|
### Register
|
||||||
|
```http
|
||||||
|
POST /v1/auth/register
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"email": "user@example.com",
|
||||||
|
"password": "SecurePass123!",
|
||||||
|
"displayName": "John Doe"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
Response:
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"accessToken": "eyJhbG...",
|
||||||
|
"refreshToken": "a1b2c3d4...",
|
||||||
|
"activeOrg": {
|
||||||
|
"id": "uuid",
|
||||||
|
"name": "John Doe's Org",
|
||||||
|
"slug": "org-abc123",
|
||||||
|
"role": "admin"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Create Incident
|
||||||
|
```http
|
||||||
|
POST /v1/services/{serviceId}/incidents
|
||||||
|
Authorization: Bearer {accessToken}
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"title": "Database connection timeout",
|
||||||
|
"description": "Users experiencing slow queries",
|
||||||
|
"severity": "sev2"
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Transition Incident
|
||||||
|
```http
|
||||||
|
POST /v1/incidents/{incidentId}/transition
|
||||||
|
Authorization: Bearer {accessToken}
|
||||||
|
Content-Type: application/json
|
||||||
|
|
||||||
|
{
|
||||||
|
"action": "ack",
|
||||||
|
"expectedVersion": 1
|
||||||
|
}
|
||||||
|
```
|
||||||
+11
-10
@@ -1,15 +1,16 @@
|
|||||||
apiVersion: v2
|
apiVersion: v2
|
||||||
name: incidentops
|
name: incidentops
|
||||||
description: A Helm chart for IncidentOps - Incident Management Platform
|
description: IncidentOps - Incident Management Platform
|
||||||
type: application
|
type: application
|
||||||
version: 0.1.0
|
version: 0.1.0
|
||||||
appVersion: "0.1.0"
|
appVersion: "1.0.0"
|
||||||
|
|
||||||
keywords:
|
dependencies:
|
||||||
- incidentops
|
- name: postgresql
|
||||||
- incident-management
|
version: "14.0.0"
|
||||||
- on-call
|
repository: "https://charts.bitnami.com/bitnami"
|
||||||
- alerting
|
condition: postgresql.enabled
|
||||||
|
- name: redis
|
||||||
maintainers:
|
version: "18.0.0"
|
||||||
- name: IncidentOps Team
|
repository: "https://charts.bitnami.com/bitnami"
|
||||||
|
condition: redis.enabled
|
||||||
|
|||||||
@@ -1,33 +0,0 @@
|
|||||||
IncidentOps has been deployed!
|
|
||||||
|
|
||||||
{{- if .Values.ingress.enabled }}
|
|
||||||
|
|
||||||
Access the application at:
|
|
||||||
http{{ if $.Values.ingress.tls }}s{{ end }}://{{ .Values.ingress.host }}
|
|
||||||
|
|
||||||
{{- else }}
|
|
||||||
|
|
||||||
To access the application, run:
|
|
||||||
|
|
||||||
API:
|
|
||||||
kubectl port-forward svc/{{ include "incidentops.fullname" . }}-api {{ .Values.api.service.port }}:{{ .Values.api.service.port }} -n {{ .Release.Namespace }}
|
|
||||||
Then open: http://localhost:{{ .Values.api.service.port }}
|
|
||||||
|
|
||||||
Web:
|
|
||||||
kubectl port-forward svc/{{ include "incidentops.fullname" . }}-web {{ .Values.web.service.port }}:{{ .Values.web.service.port }} -n {{ .Release.Namespace }}
|
|
||||||
Then open: http://localhost:{{ .Values.web.service.port }}
|
|
||||||
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
To check the status of your deployment:
|
|
||||||
kubectl get pods -n {{ .Release.Namespace }} -l "app.kubernetes.io/instance={{ .Release.Name }}"
|
|
||||||
|
|
||||||
{{- if .Values.migration.enabled }}
|
|
||||||
|
|
||||||
Database migrations will run automatically as a Helm hook.
|
|
||||||
Check migration status:
|
|
||||||
kubectl get jobs -n {{ .Release.Namespace }} -l "app.kubernetes.io/component=migration"
|
|
||||||
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
For more information, visit the documentation.
|
|
||||||
@@ -49,170 +49,15 @@ app.kubernetes.io/instance: {{ .Release.Name }}
|
|||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
{{/*
|
{{/*
|
||||||
API labels
|
PostgreSQL connection string
|
||||||
*/}}
|
*/}}
|
||||||
{{- define "incidentops.api.labels" -}}
|
{{- define "incidentops.postgresConnectionString" -}}
|
||||||
{{ include "incidentops.labels" . }}
|
Host={{ .Release.Name }}-postgresql;Port=5432;Database={{ .Values.postgresql.auth.database }};Username={{ .Values.postgresql.auth.username }};Password={{ .Values.postgresql.auth.password }}
|
||||||
app.kubernetes.io/component: api
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{- define "incidentops.api.selectorLabels" -}}
|
|
||||||
{{ include "incidentops.selectorLabels" . }}
|
|
||||||
app.kubernetes.io/component: api
|
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
{{/*
|
{{/*
|
||||||
Worker labels
|
Redis connection string
|
||||||
*/}}
|
*/}}
|
||||||
{{- define "incidentops.worker.labels" -}}
|
{{- define "incidentops.redisConnectionString" -}}
|
||||||
{{ include "incidentops.labels" . }}
|
{{ .Release.Name }}-redis-master:6379
|
||||||
app.kubernetes.io/component: worker
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{- define "incidentops.worker.selectorLabels" -}}
|
|
||||||
{{ include "incidentops.selectorLabels" . }}
|
|
||||||
app.kubernetes.io/component: worker
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
Web labels
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.web.labels" -}}
|
|
||||||
{{ include "incidentops.labels" . }}
|
|
||||||
app.kubernetes.io/component: web
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{- define "incidentops.web.selectorLabels" -}}
|
|
||||||
{{ include "incidentops.selectorLabels" . }}
|
|
||||||
app.kubernetes.io/component: web
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
Create the name of the service account to use
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.serviceAccountName" -}}
|
|
||||||
{{- if .Values.serviceAccount.create }}
|
|
||||||
{{- default (include "incidentops.fullname" .) .Values.serviceAccount.name }}
|
|
||||||
{{- else }}
|
|
||||||
{{- default "default" .Values.serviceAccount.name }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
PostgreSQL host
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.postgresql.host" -}}
|
|
||||||
{{- if .Values.postgresql.enabled }}
|
|
||||||
{{- printf "%s-postgresql" (include "incidentops.fullname" .) }}
|
|
||||||
{{- else }}
|
|
||||||
{{- .Values.externalDatabase.host }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
PostgreSQL port
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.postgresql.port" -}}
|
|
||||||
{{- if .Values.postgresql.enabled }}
|
|
||||||
{{- printf "5432" }}
|
|
||||||
{{- else }}
|
|
||||||
{{- .Values.externalDatabase.port | default "5432" }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
Database URL
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.databaseUrl" -}}
|
|
||||||
{{- $host := include "incidentops.postgresql.host" . }}
|
|
||||||
{{- $port := include "incidentops.postgresql.port" . }}
|
|
||||||
{{- if .Values.postgresql.enabled }}
|
|
||||||
{{- printf "postgresql://%s:%s@%s:%s/%s" .Values.postgresql.auth.username .Values.postgresql.auth.password $host $port .Values.postgresql.auth.database }}
|
|
||||||
{{- else }}
|
|
||||||
{{- printf "postgresql://%s:%s@%s:%s/%s" .Values.externalDatabase.user .Values.externalDatabase.password $host $port .Values.externalDatabase.database }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
Redis host
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.redis.host" -}}
|
|
||||||
{{- if .Values.redis.enabled }}
|
|
||||||
{{- printf "%s-redis" (include "incidentops.fullname" .) }}
|
|
||||||
{{- else }}
|
|
||||||
{{- .Values.externalRedis.host }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
Redis URL
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.redisUrl" -}}
|
|
||||||
{{- $host := include "incidentops.redis.host" . }}
|
|
||||||
{{- if .Values.redis.enabled }}
|
|
||||||
{{- printf "redis://%s:6379/0" $host }}
|
|
||||||
{{- else }}
|
|
||||||
{{- printf "redis://%s:%s/%s" $host (.Values.externalRedis.port | default "6379") (.Values.externalRedis.database | default "0") }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
Celery broker URL
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.celeryBrokerUrl" -}}
|
|
||||||
{{ include "incidentops.redisUrl" . }}
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
Celery result backend URL
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.celeryResultBackend" -}}
|
|
||||||
{{- $host := include "incidentops.redis.host" . }}
|
|
||||||
{{- if .Values.redis.enabled }}
|
|
||||||
{{- printf "redis://%s:6379/1" $host }}
|
|
||||||
{{- else }}
|
|
||||||
{{- printf "redis://%s:%s/%s" $host (.Values.externalRedis.port | default "6379") (add (.Values.externalRedis.database | default 0) 1) }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
API image
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.api.image" -}}
|
|
||||||
{{- $registry := .Values.global.imageRegistry | default "" }}
|
|
||||||
{{- $repository := .Values.api.image.repository }}
|
|
||||||
{{- $tag := .Values.api.image.tag | default .Chart.AppVersion }}
|
|
||||||
{{- if $registry }}
|
|
||||||
{{- printf "%s/%s:%s" $registry $repository $tag }}
|
|
||||||
{{- else }}
|
|
||||||
{{- printf "%s:%s" $repository $tag }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
Worker image
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.worker.image" -}}
|
|
||||||
{{- $registry := .Values.global.imageRegistry | default "" }}
|
|
||||||
{{- $repository := .Values.worker.image.repository }}
|
|
||||||
{{- $tag := .Values.worker.image.tag | default .Chart.AppVersion }}
|
|
||||||
{{- if $registry }}
|
|
||||||
{{- printf "%s/%s:%s" $registry $repository $tag }}
|
|
||||||
{{- else }}
|
|
||||||
{{- printf "%s:%s" $repository $tag }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
|
|
||||||
{{/*
|
|
||||||
Web image
|
|
||||||
*/}}
|
|
||||||
{{- define "incidentops.web.image" -}}
|
|
||||||
{{- $registry := .Values.global.imageRegistry | default "" }}
|
|
||||||
{{- $repository := .Values.web.image.repository }}
|
|
||||||
{{- $tag := .Values.web.image.tag | default .Chart.AppVersion }}
|
|
||||||
{{- if $registry }}
|
|
||||||
{{- printf "%s/%s:%s" $registry $repository $tag }}
|
|
||||||
{{- else }}
|
|
||||||
{{- printf "%s:%s" $repository $tag }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|||||||
@@ -3,102 +3,59 @@ kind: Deployment
|
|||||||
metadata:
|
metadata:
|
||||||
name: {{ include "incidentops.fullname" . }}-api
|
name: {{ include "incidentops.fullname" . }}-api
|
||||||
labels:
|
labels:
|
||||||
{{- include "incidentops.api.labels" . | nindent 4 }}
|
{{- include "incidentops.labels" . | nindent 4 }}
|
||||||
|
app.kubernetes.io/component: api
|
||||||
spec:
|
spec:
|
||||||
{{- if not .Values.api.autoscaling.enabled }}
|
replicas: {{ .Values.api.replicas }}
|
||||||
replicas: {{ .Values.api.replicaCount }}
|
|
||||||
{{- end }}
|
|
||||||
selector:
|
selector:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
{{- include "incidentops.api.selectorLabels" . | nindent 6 }}
|
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||||
|
app.kubernetes.io/component: api
|
||||||
template:
|
template:
|
||||||
metadata:
|
metadata:
|
||||||
annotations:
|
|
||||||
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
|
|
||||||
checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }}
|
|
||||||
{{- with .Values.api.podAnnotations }}
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
labels:
|
labels:
|
||||||
{{- include "incidentops.api.selectorLabels" . | nindent 8 }}
|
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||||
|
app.kubernetes.io/component: api
|
||||||
spec:
|
spec:
|
||||||
{{- with .Values.global.imagePullSecrets }}
|
|
||||||
imagePullSecrets:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
|
|
||||||
securityContext:
|
|
||||||
{{- toYaml .Values.podSecurityContext | nindent 8 }}
|
|
||||||
initContainers:
|
|
||||||
- name: wait-for-postgres
|
|
||||||
image: busybox:1.36
|
|
||||||
command:
|
|
||||||
- sh
|
|
||||||
- -c
|
|
||||||
- |
|
|
||||||
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
|
|
||||||
echo "Waiting for PostgreSQL..."
|
|
||||||
sleep 2
|
|
||||||
done
|
|
||||||
echo "PostgreSQL is ready"
|
|
||||||
- name: wait-for-redis
|
|
||||||
image: busybox:1.36
|
|
||||||
command:
|
|
||||||
- sh
|
|
||||||
- -c
|
|
||||||
- |
|
|
||||||
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
|
|
||||||
echo "Waiting for Redis..."
|
|
||||||
sleep 2
|
|
||||||
done
|
|
||||||
echo "Redis is ready"
|
|
||||||
containers:
|
containers:
|
||||||
- name: api
|
- name: api
|
||||||
securityContext:
|
image: "{{ .Values.api.image }}:{{ .Values.api.tag }}"
|
||||||
{{- toYaml .Values.securityContext | nindent 12 }}
|
imagePullPolicy: IfNotPresent
|
||||||
image: {{ include "incidentops.api.image" . }}
|
|
||||||
imagePullPolicy: {{ .Values.api.image.pullPolicy }}
|
|
||||||
ports:
|
ports:
|
||||||
- name: http
|
- name: http
|
||||||
containerPort: 8000
|
containerPort: {{ .Values.api.port }}
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
{{- if .Values.metrics.enabled }}
|
env:
|
||||||
- name: metrics
|
- name: ConnectionStrings__Postgres
|
||||||
containerPort: {{ .Values.metrics.port }}
|
value: {{ include "incidentops.postgresConnectionString" . | quote }}
|
||||||
protocol: TCP
|
- name: Redis__ConnectionString
|
||||||
{{- end }}
|
value: {{ include "incidentops.redisConnectionString" . | quote }}
|
||||||
envFrom:
|
- name: Jwt__Issuer
|
||||||
- configMapRef:
|
value: {{ .Values.jwt.issuer | quote }}
|
||||||
name: {{ include "incidentops.fullname" . }}-config
|
- name: Jwt__Audience
|
||||||
- secretRef:
|
value: {{ .Values.jwt.audience | quote }}
|
||||||
name: {{ include "incidentops.fullname" . }}-secret
|
- name: Jwt__SigningKey
|
||||||
|
valueFrom:
|
||||||
|
secretKeyRef:
|
||||||
|
name: {{ include "incidentops.fullname" . }}-secrets
|
||||||
|
key: jwt-signing-key
|
||||||
|
- name: Jwt__AccessTokenExpirationMinutes
|
||||||
|
value: {{ .Values.jwt.accessTokenExpirationMinutes | quote }}
|
||||||
|
- name: Jwt__RefreshTokenExpirationDays
|
||||||
|
value: {{ .Values.jwt.refreshTokenExpirationDays | quote }}
|
||||||
|
- name: Cors__Origins__0
|
||||||
|
value: "http://{{ .Values.ingress.host }}"
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /v1/healthz
|
path: /healthz
|
||||||
port: http
|
port: http
|
||||||
initialDelaySeconds: 10
|
initialDelaySeconds: 10
|
||||||
periodSeconds: 30
|
periodSeconds: 10
|
||||||
timeoutSeconds: 5
|
|
||||||
failureThreshold: 3
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /v1/readyz
|
path: /readyz
|
||||||
port: http
|
port: http
|
||||||
initialDelaySeconds: 5
|
initialDelaySeconds: 5
|
||||||
periodSeconds: 10
|
periodSeconds: 5
|
||||||
timeoutSeconds: 5
|
|
||||||
failureThreshold: 3
|
|
||||||
resources:
|
resources:
|
||||||
{{- toYaml .Values.api.resources | nindent 12 }}
|
{{- toYaml .Values.api.resources | nindent 12 }}
|
||||||
{{- with .Values.api.nodeSelector }}
|
|
||||||
nodeSelector:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
{{- with .Values.api.affinity }}
|
|
||||||
affinity:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
{{- with .Values.api.tolerations }}
|
|
||||||
tolerations:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
|
|||||||
@@ -1,22 +0,0 @@
|
|||||||
{{- if .Values.api.autoscaling.enabled }}
|
|
||||||
apiVersion: autoscaling/v2
|
|
||||||
kind: HorizontalPodAutoscaler
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-api
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.api.labels" . | nindent 4 }}
|
|
||||||
spec:
|
|
||||||
scaleTargetRef:
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
name: {{ include "incidentops.fullname" . }}-api
|
|
||||||
minReplicas: {{ .Values.api.autoscaling.minReplicas }}
|
|
||||||
maxReplicas: {{ .Values.api.autoscaling.maxReplicas }}
|
|
||||||
metrics:
|
|
||||||
- type: Resource
|
|
||||||
resource:
|
|
||||||
name: cpu
|
|
||||||
target:
|
|
||||||
type: Utilization
|
|
||||||
averageUtilization: {{ .Values.api.autoscaling.targetCPUUtilizationPercentage }}
|
|
||||||
{{- end }}
|
|
||||||
@@ -3,19 +3,15 @@ kind: Service
|
|||||||
metadata:
|
metadata:
|
||||||
name: {{ include "incidentops.fullname" . }}-api
|
name: {{ include "incidentops.fullname" . }}-api
|
||||||
labels:
|
labels:
|
||||||
{{- include "incidentops.api.labels" . | nindent 4 }}
|
{{- include "incidentops.labels" . | nindent 4 }}
|
||||||
|
app.kubernetes.io/component: api
|
||||||
spec:
|
spec:
|
||||||
type: {{ .Values.api.service.type }}
|
type: ClusterIP
|
||||||
ports:
|
ports:
|
||||||
- port: {{ .Values.api.service.port }}
|
- port: {{ .Values.api.port }}
|
||||||
targetPort: http
|
targetPort: http
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
name: http
|
name: http
|
||||||
{{- if .Values.metrics.enabled }}
|
|
||||||
- port: {{ .Values.metrics.port }}
|
|
||||||
targetPort: metrics
|
|
||||||
protocol: TCP
|
|
||||||
name: metrics
|
|
||||||
{{- end }}
|
|
||||||
selector:
|
selector:
|
||||||
{{- include "incidentops.api.selectorLabels" . | nindent 4 }}
|
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||||
|
app.kubernetes.io/component: api
|
||||||
|
|||||||
@@ -1,23 +0,0 @@
|
|||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-config
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
data:
|
|
||||||
JWT_ALGORITHM: {{ .Values.config.jwtAlgorithm | quote }}
|
|
||||||
ACCESS_TOKEN_EXPIRE_MINUTES: {{ .Values.config.accessTokenExpireMinutes | quote }}
|
|
||||||
REFRESH_TOKEN_EXPIRE_DAYS: {{ .Values.config.refreshTokenExpireDays | quote }}
|
|
||||||
# OpenTelemetry configuration
|
|
||||||
OTEL_ENABLED: {{ .Values.observability.enabled | quote }}
|
|
||||||
OTEL_SERVICE_NAME: "incidentops-api"
|
|
||||||
OTEL_ENVIRONMENT: {{ .Values.config.environment | default "production" | quote }}
|
|
||||||
{{- if .Values.observability.enabled }}
|
|
||||||
OTEL_EXPORTER_OTLP_ENDPOINT: "http://{{ include "incidentops.fullname" . }}-otel-collector:4317"
|
|
||||||
{{- end }}
|
|
||||||
OTEL_EXPORTER_OTLP_INSECURE: "true"
|
|
||||||
OTEL_LOG_LEVEL: {{ .Values.config.logLevel | default "INFO" | quote }}
|
|
||||||
# Metrics configuration
|
|
||||||
{{- if .Values.metrics.enabled }}
|
|
||||||
PROMETHEUS_PORT: {{ .Values.metrics.port | quote }}
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,387 +0,0 @@
|
|||||||
{{- if .Values.observability.enabled }}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana-datasources
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
data:
|
|
||||||
datasources.yaml: |
|
|
||||||
apiVersion: 1
|
|
||||||
datasources:
|
|
||||||
- name: Prometheus
|
|
||||||
type: prometheus
|
|
||||||
uid: prometheus
|
|
||||||
url: http://{{ include "incidentops.fullname" . }}-prometheus:9090
|
|
||||||
access: proxy
|
|
||||||
isDefault: false
|
|
||||||
jsonData:
|
|
||||||
httpMethod: POST
|
|
||||||
exemplarTraceIdDestinations:
|
|
||||||
- name: trace_id
|
|
||||||
datasourceUid: tempo
|
|
||||||
|
|
||||||
- name: Tempo
|
|
||||||
type: tempo
|
|
||||||
uid: tempo
|
|
||||||
url: http://{{ include "incidentops.fullname" . }}-tempo:3200
|
|
||||||
access: proxy
|
|
||||||
isDefault: false
|
|
||||||
jsonData:
|
|
||||||
tracesToLogsV2:
|
|
||||||
datasourceUid: loki
|
|
||||||
spanStartTimeShift: '-1h'
|
|
||||||
spanEndTimeShift: '1h'
|
|
||||||
filterByTraceID: true
|
|
||||||
filterBySpanID: true
|
|
||||||
tracesToMetrics:
|
|
||||||
datasourceUid: prometheus
|
|
||||||
spanStartTimeShift: '-1h'
|
|
||||||
spanEndTimeShift: '1h'
|
|
||||||
serviceMap:
|
|
||||||
datasourceUid: prometheus
|
|
||||||
nodeGraph:
|
|
||||||
enabled: true
|
|
||||||
lokiSearch:
|
|
||||||
datasourceUid: loki
|
|
||||||
|
|
||||||
- name: Loki
|
|
||||||
type: loki
|
|
||||||
uid: loki
|
|
||||||
url: http://{{ include "incidentops.fullname" . }}-loki:3100
|
|
||||||
access: proxy
|
|
||||||
isDefault: true
|
|
||||||
jsonData:
|
|
||||||
derivedFields:
|
|
||||||
- datasourceUid: tempo
|
|
||||||
matcherRegex: '"trace_id":"([a-f0-9]+)"'
|
|
||||||
name: TraceID
|
|
||||||
url: '$${__value.raw}'
|
|
||||||
urlDisplayLabel: 'View Trace'
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
data:
|
|
||||||
dashboards.yaml: |
|
|
||||||
apiVersion: 1
|
|
||||||
providers:
|
|
||||||
- name: 'default'
|
|
||||||
orgId: 1
|
|
||||||
folder: 'IncidentOps'
|
|
||||||
folderUid: 'incidentops'
|
|
||||||
type: file
|
|
||||||
disableDeletion: false
|
|
||||||
editable: true
|
|
||||||
options:
|
|
||||||
path: /var/lib/grafana/dashboards
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
data:
|
|
||||||
api-overview.json: |
|
|
||||||
{
|
|
||||||
"title": "IncidentOps API Overview",
|
|
||||||
"uid": "incidentops-api",
|
|
||||||
"tags": ["incidentops", "api"],
|
|
||||||
"timezone": "browser",
|
|
||||||
"editable": true,
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"title": "Request Rate",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
|
||||||
"legendFormat": "Requests/sec",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "reqps"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 2,
|
|
||||||
"title": "Request Duration (p50, p95, p99)",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
|
||||||
"legendFormat": "p50",
|
|
||||||
"refId": "A"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
|
||||||
"legendFormat": "p95",
|
|
||||||
"refId": "B"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
|
||||||
"legendFormat": "p99",
|
|
||||||
"refId": "C"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "s"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 3,
|
|
||||||
"title": "Error Rate",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
|
|
||||||
"legendFormat": "Error %",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "percent",
|
|
||||||
"min": 0,
|
|
||||||
"max": 100
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 4,
|
|
||||||
"title": "Requests by Status Code",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
|
||||||
"legendFormat": "{{ "{{" }}http_status_code{{ "}}" }}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "reqps"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 5,
|
|
||||||
"title": "Requests by Endpoint",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
|
||||||
"legendFormat": "{{ "{{" }}http_route{{ "}}" }}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"unit": "reqps"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 6,
|
|
||||||
"title": "Recent Logs",
|
|
||||||
"type": "logs",
|
|
||||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 16},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "loki", "uid": "loki"},
|
|
||||||
"expr": "{service_name=\"incidentops-api\"} | json",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"options": {
|
|
||||||
"showTime": true,
|
|
||||||
"showLabels": true,
|
|
||||||
"wrapLogMessage": true,
|
|
||||||
"enableLogDetails": true,
|
|
||||||
"sortOrder": "Descending"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 7,
|
|
||||||
"title": "Recent Traces",
|
|
||||||
"type": "traces",
|
|
||||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 26},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "tempo", "uid": "tempo"},
|
|
||||||
"queryType": "traceqlSearch",
|
|
||||||
"filters": [
|
|
||||||
{
|
|
||||||
"id": "service-name",
|
|
||||||
"operator": "=",
|
|
||||||
"scope": "resource",
|
|
||||||
"tag": "service.name",
|
|
||||||
"value": ["incidentops-api"]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"schemaVersion": 38,
|
|
||||||
"version": 2
|
|
||||||
}
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
annotations:
|
|
||||||
checksum/datasources: {{ .Values.observability.grafana.image.tag | sha256sum }}
|
|
||||||
spec:
|
|
||||||
securityContext:
|
|
||||||
fsGroup: 472
|
|
||||||
runAsUser: 472
|
|
||||||
containers:
|
|
||||||
- name: grafana
|
|
||||||
image: "{{ .Values.observability.grafana.image.repository }}:{{ .Values.observability.grafana.image.tag }}"
|
|
||||||
imagePullPolicy: {{ .Values.observability.grafana.image.pullPolicy }}
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
containerPort: 3000
|
|
||||||
protocol: TCP
|
|
||||||
env:
|
|
||||||
- name: GF_SECURITY_ADMIN_USER
|
|
||||||
value: {{ .Values.observability.grafana.adminUser | quote }}
|
|
||||||
- name: GF_SECURITY_ADMIN_PASSWORD
|
|
||||||
valueFrom:
|
|
||||||
secretKeyRef:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana
|
|
||||||
key: admin-password
|
|
||||||
- name: GF_USERS_ALLOW_SIGN_UP
|
|
||||||
value: "false"
|
|
||||||
- name: GF_EXPLORE_ENABLED
|
|
||||||
value: "true"
|
|
||||||
- name: GF_FEATURE_TOGGLES_ENABLE
|
|
||||||
value: "traceqlEditor tempoSearch tempoBackendSearch tempoApmTable"
|
|
||||||
volumeMounts:
|
|
||||||
- name: datasources
|
|
||||||
mountPath: /etc/grafana/provisioning/datasources
|
|
||||||
- name: dashboards-provider
|
|
||||||
mountPath: /etc/grafana/provisioning/dashboards
|
|
||||||
- name: dashboards
|
|
||||||
mountPath: /var/lib/grafana/dashboards
|
|
||||||
- name: data
|
|
||||||
mountPath: /var/lib/grafana
|
|
||||||
resources:
|
|
||||||
{{- toYaml .Values.observability.grafana.resources | nindent 12 }}
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /api/health
|
|
||||||
port: http
|
|
||||||
initialDelaySeconds: 10
|
|
||||||
periodSeconds: 10
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /api/health
|
|
||||||
port: http
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 30
|
|
||||||
volumes:
|
|
||||||
- name: datasources
|
|
||||||
configMap:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana-datasources
|
|
||||||
- name: dashboards-provider
|
|
||||||
configMap:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards-provider
|
|
||||||
- name: dashboards
|
|
||||||
configMap:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana-dashboards
|
|
||||||
- name: data
|
|
||||||
{{- if .Values.observability.grafana.persistence.enabled }}
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: {{ include "incidentops.fullname" . }}-grafana
|
|
||||||
{{- else }}
|
|
||||||
emptyDir: {}
|
|
||||||
{{- end }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Secret
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
type: Opaque
|
|
||||||
data:
|
|
||||||
admin-password: {{ .Values.observability.grafana.adminPassword | b64enc | quote }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
spec:
|
|
||||||
type: {{ .Values.observability.grafana.service.type }}
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 80
|
|
||||||
targetPort: http
|
|
||||||
protocol: TCP
|
|
||||||
selector:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
{{- if .Values.observability.grafana.persistence.enabled }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: {{ .Values.observability.grafana.persistence.size }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
{{- if and .Values.observability.enabled .Values.observability.grafana.ingress.enabled -}}
|
|
||||||
apiVersion: networking.k8s.io/v1
|
|
||||||
kind: Ingress
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: grafana
|
|
||||||
{{- with .Values.observability.grafana.ingress.annotations }}
|
|
||||||
annotations:
|
|
||||||
{{- toYaml . | nindent 4 }}
|
|
||||||
{{- end }}
|
|
||||||
spec:
|
|
||||||
{{- if .Values.ingress.className }}
|
|
||||||
ingressClassName: {{ .Values.ingress.className }}
|
|
||||||
{{- end }}
|
|
||||||
{{- if .Values.observability.grafana.ingress.tls }}
|
|
||||||
tls:
|
|
||||||
{{- range .Values.observability.grafana.ingress.tls }}
|
|
||||||
- hosts:
|
|
||||||
{{- range .hosts }}
|
|
||||||
- {{ . | quote }}
|
|
||||||
{{- end }}
|
|
||||||
secretName: {{ .secretName }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
rules:
|
|
||||||
- host: {{ .Values.observability.grafana.ingress.host | quote }}
|
|
||||||
http:
|
|
||||||
paths:
|
|
||||||
- path: /
|
|
||||||
pathType: Prefix
|
|
||||||
backend:
|
|
||||||
service:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-grafana
|
|
||||||
port:
|
|
||||||
number: 80
|
|
||||||
{{- end }}
|
|
||||||
@@ -13,16 +13,6 @@ spec:
|
|||||||
{{- if .Values.ingress.className }}
|
{{- if .Values.ingress.className }}
|
||||||
ingressClassName: {{ .Values.ingress.className }}
|
ingressClassName: {{ .Values.ingress.className }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
{{- if .Values.ingress.tls }}
|
|
||||||
tls:
|
|
||||||
{{- range .Values.ingress.tls }}
|
|
||||||
- hosts:
|
|
||||||
{{- range .hosts }}
|
|
||||||
- {{ . | quote }}
|
|
||||||
{{- end }}
|
|
||||||
secretName: {{ .secretName }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
rules:
|
rules:
|
||||||
- host: {{ .Values.ingress.host | quote }}
|
- host: {{ .Values.ingress.host | quote }}
|
||||||
http:
|
http:
|
||||||
@@ -33,19 +23,33 @@ spec:
|
|||||||
service:
|
service:
|
||||||
name: {{ include "incidentops.fullname" . }}-api
|
name: {{ include "incidentops.fullname" . }}-api
|
||||||
port:
|
port:
|
||||||
number: {{ .Values.api.service.port }}
|
number: {{ .Values.api.port }}
|
||||||
- path: /v1
|
- path: /v1
|
||||||
pathType: Prefix
|
pathType: Prefix
|
||||||
backend:
|
backend:
|
||||||
service:
|
service:
|
||||||
name: {{ include "incidentops.fullname" . }}-api
|
name: {{ include "incidentops.fullname" . }}-api
|
||||||
port:
|
port:
|
||||||
number: {{ .Values.api.service.port }}
|
number: {{ .Values.api.port }}
|
||||||
|
- path: /healthz
|
||||||
|
pathType: Exact
|
||||||
|
backend:
|
||||||
|
service:
|
||||||
|
name: {{ include "incidentops.fullname" . }}-api
|
||||||
|
port:
|
||||||
|
number: {{ .Values.api.port }}
|
||||||
|
- path: /readyz
|
||||||
|
pathType: Exact
|
||||||
|
backend:
|
||||||
|
service:
|
||||||
|
name: {{ include "incidentops.fullname" . }}-api
|
||||||
|
port:
|
||||||
|
number: {{ .Values.api.port }}
|
||||||
- path: /
|
- path: /
|
||||||
pathType: Prefix
|
pathType: Prefix
|
||||||
backend:
|
backend:
|
||||||
service:
|
service:
|
||||||
name: {{ include "incidentops.fullname" . }}-web
|
name: {{ include "incidentops.fullname" . }}-web
|
||||||
port:
|
port:
|
||||||
number: {{ .Values.web.service.port }}
|
number: {{ .Values.web.port }}
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|||||||
@@ -1,155 +0,0 @@
|
|||||||
{{- if .Values.observability.enabled }}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-loki-config
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: loki
|
|
||||||
data:
|
|
||||||
loki.yaml: |
|
|
||||||
auth_enabled: false
|
|
||||||
|
|
||||||
server:
|
|
||||||
http_listen_port: 3100
|
|
||||||
grpc_listen_port: 9096
|
|
||||||
|
|
||||||
common:
|
|
||||||
path_prefix: /loki
|
|
||||||
storage:
|
|
||||||
filesystem:
|
|
||||||
chunks_directory: /loki/chunks
|
|
||||||
rules_directory: /loki/rules
|
|
||||||
replication_factor: 1
|
|
||||||
ring:
|
|
||||||
kvstore:
|
|
||||||
store: inmemory
|
|
||||||
|
|
||||||
query_range:
|
|
||||||
results_cache:
|
|
||||||
cache:
|
|
||||||
embedded_cache:
|
|
||||||
enabled: true
|
|
||||||
max_size_mb: 100
|
|
||||||
|
|
||||||
schema_config:
|
|
||||||
configs:
|
|
||||||
- from: "2020-10-24"
|
|
||||||
store: tsdb
|
|
||||||
object_store: filesystem
|
|
||||||
schema: v13
|
|
||||||
index:
|
|
||||||
prefix: index_
|
|
||||||
period: 24h
|
|
||||||
|
|
||||||
ruler:
|
|
||||||
alertmanager_url: http://localhost:9093
|
|
||||||
|
|
||||||
limits_config:
|
|
||||||
retention_period: {{ .Values.observability.loki.retention }}
|
|
||||||
allow_structured_metadata: true
|
|
||||||
volume_enabled: true
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-loki
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: loki
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
|
||||||
app.kubernetes.io/component: loki
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
|
||||||
app.kubernetes.io/component: loki
|
|
||||||
annotations:
|
|
||||||
checksum/config: {{ .Values.observability.loki.image.tag | sha256sum }}
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: loki
|
|
||||||
image: "{{ .Values.observability.loki.image.repository }}:{{ .Values.observability.loki.image.tag }}"
|
|
||||||
imagePullPolicy: {{ .Values.observability.loki.image.pullPolicy }}
|
|
||||||
args:
|
|
||||||
- -config.file=/etc/loki/loki.yaml
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
containerPort: 3100
|
|
||||||
protocol: TCP
|
|
||||||
- name: grpc
|
|
||||||
containerPort: 9096
|
|
||||||
protocol: TCP
|
|
||||||
volumeMounts:
|
|
||||||
- name: config
|
|
||||||
mountPath: /etc/loki
|
|
||||||
- name: data
|
|
||||||
mountPath: /loki
|
|
||||||
resources:
|
|
||||||
{{- toYaml .Values.observability.loki.resources | nindent 12 }}
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /ready
|
|
||||||
port: http
|
|
||||||
initialDelaySeconds: 10
|
|
||||||
periodSeconds: 10
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /ready
|
|
||||||
port: http
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 30
|
|
||||||
volumes:
|
|
||||||
- name: config
|
|
||||||
configMap:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-loki-config
|
|
||||||
- name: data
|
|
||||||
{{- if .Values.observability.loki.persistence.enabled }}
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: {{ include "incidentops.fullname" . }}-loki
|
|
||||||
{{- else }}
|
|
||||||
emptyDir: {}
|
|
||||||
{{- end }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-loki
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: loki
|
|
||||||
spec:
|
|
||||||
type: ClusterIP
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 3100
|
|
||||||
targetPort: http
|
|
||||||
protocol: TCP
|
|
||||||
- name: grpc
|
|
||||||
port: 9096
|
|
||||||
targetPort: grpc
|
|
||||||
protocol: TCP
|
|
||||||
selector:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: loki
|
|
||||||
{{- if .Values.observability.loki.persistence.enabled }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-loki
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: loki
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: {{ .Values.observability.loki.persistence.size }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,51 +0,0 @@
|
|||||||
{{- if .Values.migration.enabled }}
|
|
||||||
apiVersion: batch/v1
|
|
||||||
kind: Job
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-migrate
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: migration
|
|
||||||
annotations:
|
|
||||||
"helm.sh/hook": post-install,post-upgrade
|
|
||||||
"helm.sh/hook-weight": "-5"
|
|
||||||
"helm.sh/hook-delete-policy": before-hook-creation,hook-succeeded
|
|
||||||
spec:
|
|
||||||
backoffLimit: {{ .Values.migration.backoffLimit }}
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
|
||||||
app.kubernetes.io/component: migration
|
|
||||||
spec:
|
|
||||||
{{- with .Values.global.imagePullSecrets }}
|
|
||||||
imagePullSecrets:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
|
|
||||||
securityContext:
|
|
||||||
{{- toYaml .Values.podSecurityContext | nindent 8 }}
|
|
||||||
restartPolicy: Never
|
|
||||||
containers:
|
|
||||||
- name: migrate
|
|
||||||
securityContext:
|
|
||||||
{{- toYaml .Values.securityContext | nindent 12 }}
|
|
||||||
image: "{{ .Values.migration.image.repository }}:{{ .Values.migration.image.tag }}"
|
|
||||||
imagePullPolicy: {{ .Values.migration.image.pullPolicy }}
|
|
||||||
command:
|
|
||||||
- uv
|
|
||||||
- run
|
|
||||||
- python
|
|
||||||
- migrations/migrate.py
|
|
||||||
- apply
|
|
||||||
envFrom:
|
|
||||||
- secretRef:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-secret
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 50m
|
|
||||||
memory: 128Mi
|
|
||||||
limits:
|
|
||||||
cpu: 200m
|
|
||||||
memory: 256Mi
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,132 +0,0 @@
|
|||||||
{{- if .Values.observability.enabled }}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-otel-collector-config
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: otel-collector
|
|
||||||
data:
|
|
||||||
otel-collector-config.yaml: |
|
|
||||||
extensions:
|
|
||||||
health_check:
|
|
||||||
endpoint: 0.0.0.0:13133
|
|
||||||
|
|
||||||
receivers:
|
|
||||||
otlp:
|
|
||||||
protocols:
|
|
||||||
grpc:
|
|
||||||
endpoint: 0.0.0.0:4317
|
|
||||||
http:
|
|
||||||
endpoint: 0.0.0.0:4318
|
|
||||||
|
|
||||||
processors:
|
|
||||||
batch:
|
|
||||||
timeout: 1s
|
|
||||||
send_batch_size: 1024
|
|
||||||
memory_limiter:
|
|
||||||
check_interval: 1s
|
|
||||||
limit_mib: 512
|
|
||||||
spike_limit_mib: 128
|
|
||||||
|
|
||||||
exporters:
|
|
||||||
otlp/tempo:
|
|
||||||
endpoint: {{ include "incidentops.fullname" . }}-tempo:4317
|
|
||||||
tls:
|
|
||||||
insecure: true
|
|
||||||
loki:
|
|
||||||
endpoint: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
|
|
||||||
default_labels_enabled:
|
|
||||||
exporter: true
|
|
||||||
job: true
|
|
||||||
|
|
||||||
service:
|
|
||||||
extensions: [health_check]
|
|
||||||
pipelines:
|
|
||||||
traces:
|
|
||||||
receivers: [otlp]
|
|
||||||
processors: [memory_limiter, batch]
|
|
||||||
exporters: [otlp/tempo]
|
|
||||||
logs:
|
|
||||||
receivers: [otlp]
|
|
||||||
processors: [memory_limiter, batch]
|
|
||||||
exporters: [loki]
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-otel-collector
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: otel-collector
|
|
||||||
spec:
|
|
||||||
replicas: {{ .Values.observability.otelCollector.replicaCount }}
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
|
||||||
app.kubernetes.io/component: otel-collector
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
|
||||||
app.kubernetes.io/component: otel-collector
|
|
||||||
annotations:
|
|
||||||
checksum/config: {{ .Values.observability.otelCollector.image.tag | sha256sum }}
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: otel-collector
|
|
||||||
image: "{{ .Values.observability.otelCollector.image.repository }}:{{ .Values.observability.otelCollector.image.tag }}"
|
|
||||||
imagePullPolicy: {{ .Values.observability.otelCollector.image.pullPolicy }}
|
|
||||||
args:
|
|
||||||
- --config=/etc/otel-collector/otel-collector-config.yaml
|
|
||||||
ports:
|
|
||||||
- name: otlp-grpc
|
|
||||||
containerPort: 4317
|
|
||||||
protocol: TCP
|
|
||||||
- name: otlp-http
|
|
||||||
containerPort: 4318
|
|
||||||
protocol: TCP
|
|
||||||
volumeMounts:
|
|
||||||
- name: config
|
|
||||||
mountPath: /etc/otel-collector
|
|
||||||
resources:
|
|
||||||
{{- toYaml .Values.observability.otelCollector.resources | nindent 12 }}
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /
|
|
||||||
port: 13133
|
|
||||||
initialDelaySeconds: 10
|
|
||||||
periodSeconds: 30
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /
|
|
||||||
port: 13133
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 10
|
|
||||||
volumes:
|
|
||||||
- name: config
|
|
||||||
configMap:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-otel-collector-config
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-otel-collector
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: otel-collector
|
|
||||||
spec:
|
|
||||||
type: ClusterIP
|
|
||||||
ports:
|
|
||||||
- name: otlp-grpc
|
|
||||||
port: 4317
|
|
||||||
targetPort: otlp-grpc
|
|
||||||
protocol: TCP
|
|
||||||
- name: otlp-http
|
|
||||||
port: 4318
|
|
||||||
targetPort: otlp-http
|
|
||||||
protocol: TCP
|
|
||||||
selector:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: otel-collector
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,91 +0,0 @@
|
|||||||
{{- if .Values.postgresql.enabled }}
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: StatefulSet
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-postgresql
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: postgresql
|
|
||||||
spec:
|
|
||||||
serviceName: {{ include "incidentops.fullname" . }}-postgresql
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
|
||||||
app.kubernetes.io/component: postgresql
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
|
||||||
app.kubernetes.io/component: postgresql
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: postgresql
|
|
||||||
image: "{{ .Values.postgresql.image.repository }}:{{ .Values.postgresql.image.tag }}"
|
|
||||||
imagePullPolicy: {{ .Values.postgresql.image.pullPolicy }}
|
|
||||||
ports:
|
|
||||||
- name: postgresql
|
|
||||||
containerPort: 5432
|
|
||||||
protocol: TCP
|
|
||||||
env:
|
|
||||||
- name: POSTGRES_USER
|
|
||||||
value: {{ .Values.postgresql.auth.username | quote }}
|
|
||||||
- name: POSTGRES_PASSWORD
|
|
||||||
value: {{ .Values.postgresql.auth.password | quote }}
|
|
||||||
- name: POSTGRES_DB
|
|
||||||
value: {{ .Values.postgresql.auth.database | quote }}
|
|
||||||
- name: PGDATA
|
|
||||||
value: /var/lib/postgresql/data/pgdata
|
|
||||||
volumeMounts:
|
|
||||||
- name: data
|
|
||||||
mountPath: /var/lib/postgresql/data
|
|
||||||
livenessProbe:
|
|
||||||
exec:
|
|
||||||
command:
|
|
||||||
- pg_isready
|
|
||||||
- -U
|
|
||||||
- {{ .Values.postgresql.auth.username }}
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 10
|
|
||||||
timeoutSeconds: 5
|
|
||||||
failureThreshold: 6
|
|
||||||
readinessProbe:
|
|
||||||
exec:
|
|
||||||
command:
|
|
||||||
- pg_isready
|
|
||||||
- -U
|
|
||||||
- {{ .Values.postgresql.auth.username }}
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 10
|
|
||||||
timeoutSeconds: 5
|
|
||||||
failureThreshold: 6
|
|
||||||
resources:
|
|
||||||
{{- toYaml .Values.postgresql.resources | nindent 12 }}
|
|
||||||
volumeClaimTemplates:
|
|
||||||
- metadata:
|
|
||||||
name: data
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: {{ .Values.postgresql.persistence.size }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-postgresql
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: postgresql
|
|
||||||
spec:
|
|
||||||
type: ClusterIP
|
|
||||||
ports:
|
|
||||||
- port: 5432
|
|
||||||
targetPort: postgresql
|
|
||||||
protocol: TCP
|
|
||||||
name: postgresql
|
|
||||||
selector:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: postgresql
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,163 +0,0 @@
|
|||||||
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: prometheus
|
|
||||||
data:
|
|
||||||
prometheus.yml: |
|
|
||||||
global:
|
|
||||||
scrape_interval: {{ .Values.observability.prometheus.scrapeInterval | default "15s" }}
|
|
||||||
evaluation_interval: 15s
|
|
||||||
|
|
||||||
scrape_configs:
|
|
||||||
- job_name: "prometheus"
|
|
||||||
static_configs:
|
|
||||||
- targets: ["localhost:9090"]
|
|
||||||
|
|
||||||
- job_name: "incidentops-api"
|
|
||||||
kubernetes_sd_configs:
|
|
||||||
- role: pod
|
|
||||||
namespaces:
|
|
||||||
names:
|
|
||||||
- {{ .Release.Namespace }}
|
|
||||||
relabel_configs:
|
|
||||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
|
|
||||||
action: keep
|
|
||||||
regex: api
|
|
||||||
- source_labels: [__meta_kubernetes_pod_container_port_name]
|
|
||||||
action: keep
|
|
||||||
regex: metrics
|
|
||||||
- source_labels: [__meta_kubernetes_namespace]
|
|
||||||
target_label: namespace
|
|
||||||
- source_labels: [__meta_kubernetes_pod_name]
|
|
||||||
target_label: pod
|
|
||||||
metrics_path: /metrics
|
|
||||||
scrape_interval: 10s
|
|
||||||
|
|
||||||
- job_name: "incidentops-worker"
|
|
||||||
kubernetes_sd_configs:
|
|
||||||
- role: pod
|
|
||||||
namespaces:
|
|
||||||
names:
|
|
||||||
- {{ .Release.Namespace }}
|
|
||||||
relabel_configs:
|
|
||||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
|
|
||||||
action: keep
|
|
||||||
regex: worker
|
|
||||||
- source_labels: [__meta_kubernetes_pod_container_port_name]
|
|
||||||
action: keep
|
|
||||||
regex: metrics
|
|
||||||
- source_labels: [__meta_kubernetes_namespace]
|
|
||||||
target_label: namespace
|
|
||||||
- source_labels: [__meta_kubernetes_pod_name]
|
|
||||||
target_label: pod
|
|
||||||
metrics_path: /metrics
|
|
||||||
scrape_interval: 10s
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: prometheus
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
|
||||||
app.kubernetes.io/component: prometheus
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
|
||||||
app.kubernetes.io/component: prometheus
|
|
||||||
annotations:
|
|
||||||
checksum/config: {{ .Values.observability.prometheus.image.tag | sha256sum }}
|
|
||||||
spec:
|
|
||||||
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
|
|
||||||
securityContext:
|
|
||||||
fsGroup: 65534
|
|
||||||
runAsUser: 65534
|
|
||||||
runAsNonRoot: true
|
|
||||||
containers:
|
|
||||||
- name: prometheus
|
|
||||||
image: "{{ .Values.observability.prometheus.image.repository }}:{{ .Values.observability.prometheus.image.tag }}"
|
|
||||||
imagePullPolicy: {{ .Values.observability.prometheus.image.pullPolicy }}
|
|
||||||
args:
|
|
||||||
- "--config.file=/etc/prometheus/prometheus.yml"
|
|
||||||
- "--storage.tsdb.path=/prometheus"
|
|
||||||
- "--storage.tsdb.retention.time={{ .Values.observability.prometheus.retention }}"
|
|
||||||
- "--web.enable-lifecycle"
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
containerPort: 9090
|
|
||||||
protocol: TCP
|
|
||||||
volumeMounts:
|
|
||||||
- name: config
|
|
||||||
mountPath: /etc/prometheus
|
|
||||||
- name: data
|
|
||||||
mountPath: /prometheus
|
|
||||||
resources:
|
|
||||||
{{- toYaml .Values.observability.prometheus.resources | nindent 12 }}
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /-/ready
|
|
||||||
port: http
|
|
||||||
initialDelaySeconds: 10
|
|
||||||
periodSeconds: 10
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /-/healthy
|
|
||||||
port: http
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 30
|
|
||||||
volumes:
|
|
||||||
- name: config
|
|
||||||
configMap:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
|
||||||
- name: data
|
|
||||||
{{- if .Values.observability.prometheus.persistence.enabled }}
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: {{ include "incidentops.fullname" . }}-prometheus
|
|
||||||
{{- else }}
|
|
||||||
emptyDir: {}
|
|
||||||
{{- end }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: prometheus
|
|
||||||
spec:
|
|
||||||
type: ClusterIP
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 9090
|
|
||||||
targetPort: http
|
|
||||||
protocol: TCP
|
|
||||||
selector:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: prometheus
|
|
||||||
{{- if .Values.observability.prometheus.persistence.enabled }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: prometheus
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: {{ .Values.observability.prometheus.persistence.size }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,29 +0,0 @@
|
|||||||
{{- if and .Values.observability.enabled .Values.metrics.enabled }}
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: Role
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: prometheus
|
|
||||||
rules:
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources: ["pods", "endpoints", "services"]
|
|
||||||
verbs: ["get", "list", "watch"]
|
|
||||||
---
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: RoleBinding
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: prometheus
|
|
||||||
subjects:
|
|
||||||
- kind: ServiceAccount
|
|
||||||
name: {{ include "incidentops.serviceAccountName" . }}
|
|
||||||
namespace: {{ .Release.Namespace }}
|
|
||||||
roleRef:
|
|
||||||
apiGroup: rbac.authorization.k8s.io
|
|
||||||
kind: Role
|
|
||||||
name: {{ include "incidentops.fullname" . }}-prometheus
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,169 +0,0 @@
|
|||||||
{{- if and .Values.observability.enabled .Values.observability.promtail.enabled }}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-promtail-config
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: promtail
|
|
||||||
data:
|
|
||||||
promtail.yaml: |
|
|
||||||
server:
|
|
||||||
http_listen_port: 3101
|
|
||||||
grpc_listen_port: 0
|
|
||||||
|
|
||||||
positions:
|
|
||||||
filename: /run/promtail/positions.yaml
|
|
||||||
|
|
||||||
clients:
|
|
||||||
- url: http://{{ include "incidentops.fullname" . }}-loki:3100/loki/api/v1/push
|
|
||||||
|
|
||||||
scrape_configs:
|
|
||||||
- job_name: kubernetes-pods
|
|
||||||
pipeline_stages:
|
|
||||||
- cri: {}
|
|
||||||
kubernetes_sd_configs:
|
|
||||||
- role: pod
|
|
||||||
namespaces:
|
|
||||||
names: [{{ .Release.Namespace }}]
|
|
||||||
relabel_configs:
|
|
||||||
- source_labels: [__meta_kubernetes_pod_container_init]
|
|
||||||
regex: "true"
|
|
||||||
action: drop
|
|
||||||
- source_labels: [__meta_kubernetes_pod_phase]
|
|
||||||
regex: Pending|Failed|Succeeded
|
|
||||||
action: drop
|
|
||||||
- source_labels: [__meta_kubernetes_pod_name, __meta_kubernetes_pod_namespace, __meta_kubernetes_pod_container_name]
|
|
||||||
target_label: __path__
|
|
||||||
replacement: /var/log/containers/$1_$2_$3-*.log
|
|
||||||
- source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_component]
|
|
||||||
regex: (.*)
|
|
||||||
target_label: service_name
|
|
||||||
replacement: {{ include "incidentops.fullname" . }}-$1
|
|
||||||
- source_labels: [__meta_kubernetes_pod_namespace]
|
|
||||||
target_label: namespace
|
|
||||||
- source_labels: [__meta_kubernetes_pod_name]
|
|
||||||
target_label: pod
|
|
||||||
- source_labels: [__meta_kubernetes_pod_container_name]
|
|
||||||
target_label: container
|
|
||||||
- source_labels: [__meta_kubernetes_pod_uid]
|
|
||||||
target_label: pod_uid
|
|
||||||
- target_label: cluster
|
|
||||||
replacement: {{ .Release.Namespace }}
|
|
||||||
|
|
||||||
- job_name: containers-fallback
|
|
||||||
pipeline_stages:
|
|
||||||
- cri: {}
|
|
||||||
static_configs:
|
|
||||||
- labels:
|
|
||||||
job: containers
|
|
||||||
namespace: {{ .Release.Namespace }}
|
|
||||||
service_name: incidentops-api
|
|
||||||
__path__: /var/log/containers/incidentops-api-*_incidentops_api-*.log
|
|
||||||
- labels:
|
|
||||||
job: containers
|
|
||||||
namespace: {{ .Release.Namespace }}
|
|
||||||
service_name: incidentops-worker
|
|
||||||
__path__: /var/log/containers/incidentops-worker-*_incidentops_worker-*.log
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ServiceAccount
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-promtail
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: promtail
|
|
||||||
---
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRole
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-promtail
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: promtail
|
|
||||||
rules:
|
|
||||||
- apiGroups: [""]
|
|
||||||
resources: ["pods", "pods/log", "namespaces", "services", "endpoints", "nodes"]
|
|
||||||
verbs: ["get", "list", "watch"]
|
|
||||||
---
|
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
|
||||||
kind: ClusterRoleBinding
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-promtail
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: promtail
|
|
||||||
subjects:
|
|
||||||
- kind: ServiceAccount
|
|
||||||
name: {{ include "incidentops.fullname" . }}-promtail
|
|
||||||
namespace: {{ .Release.Namespace }}
|
|
||||||
roleRef:
|
|
||||||
apiGroup: rbac.authorization.k8s.io
|
|
||||||
kind: ClusterRole
|
|
||||||
name: {{ include "incidentops.fullname" . }}-promtail
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: DaemonSet
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-promtail
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: promtail
|
|
||||||
spec:
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
|
||||||
app.kubernetes.io/component: promtail
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
|
||||||
app.kubernetes.io/component: promtail
|
|
||||||
annotations:
|
|
||||||
checksum/config: {{ .Values.observability.promtail.image.tag | sha256sum }}
|
|
||||||
spec:
|
|
||||||
serviceAccountName: {{ include "incidentops.fullname" . }}-promtail
|
|
||||||
securityContext:
|
|
||||||
runAsUser: 0
|
|
||||||
containers:
|
|
||||||
- name: promtail
|
|
||||||
image: "{{ .Values.observability.promtail.image.repository }}:{{ .Values.observability.promtail.image.tag }}"
|
|
||||||
imagePullPolicy: {{ .Values.observability.promtail.image.pullPolicy }}
|
|
||||||
args:
|
|
||||||
- -config.file=/etc/promtail/promtail.yaml
|
|
||||||
ports:
|
|
||||||
- name: http-metrics
|
|
||||||
containerPort: 3101
|
|
||||||
protocol: TCP
|
|
||||||
volumeMounts:
|
|
||||||
- name: config
|
|
||||||
mountPath: /etc/promtail
|
|
||||||
- name: positions
|
|
||||||
mountPath: /run/promtail
|
|
||||||
- name: varlog
|
|
||||||
mountPath: /var/log
|
|
||||||
readOnly: true
|
|
||||||
- name: varlogpods
|
|
||||||
mountPath: /var/log/pods
|
|
||||||
readOnly: true
|
|
||||||
- name: varlogcontainers
|
|
||||||
mountPath: /var/log/containers
|
|
||||||
readOnly: true
|
|
||||||
resources:
|
|
||||||
{{- toYaml .Values.observability.promtail.resources | nindent 12 }}
|
|
||||||
volumes:
|
|
||||||
- name: config
|
|
||||||
configMap:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-promtail-config
|
|
||||||
- name: positions
|
|
||||||
emptyDir: {}
|
|
||||||
- name: varlog
|
|
||||||
hostPath:
|
|
||||||
path: /var/log
|
|
||||||
- name: varlogpods
|
|
||||||
hostPath:
|
|
||||||
path: /var/log/pods
|
|
||||||
- name: varlogcontainers
|
|
||||||
hostPath:
|
|
||||||
path: /var/log/containers
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,80 +0,0 @@
|
|||||||
{{- if .Values.redis.enabled }}
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: StatefulSet
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-redis
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: redis
|
|
||||||
spec:
|
|
||||||
serviceName: {{ include "incidentops.fullname" . }}-redis
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
|
||||||
app.kubernetes.io/component: redis
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
|
||||||
app.kubernetes.io/component: redis
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: redis
|
|
||||||
image: "{{ .Values.redis.image.repository }}:{{ .Values.redis.image.tag }}"
|
|
||||||
imagePullPolicy: {{ .Values.redis.image.pullPolicy }}
|
|
||||||
ports:
|
|
||||||
- name: redis
|
|
||||||
containerPort: 6379
|
|
||||||
protocol: TCP
|
|
||||||
volumeMounts:
|
|
||||||
- name: data
|
|
||||||
mountPath: /data
|
|
||||||
livenessProbe:
|
|
||||||
exec:
|
|
||||||
command:
|
|
||||||
- redis-cli
|
|
||||||
- ping
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 10
|
|
||||||
timeoutSeconds: 5
|
|
||||||
failureThreshold: 6
|
|
||||||
readinessProbe:
|
|
||||||
exec:
|
|
||||||
command:
|
|
||||||
- redis-cli
|
|
||||||
- ping
|
|
||||||
initialDelaySeconds: 5
|
|
||||||
periodSeconds: 10
|
|
||||||
timeoutSeconds: 5
|
|
||||||
failureThreshold: 6
|
|
||||||
resources:
|
|
||||||
{{- toYaml .Values.redis.resources | nindent 12 }}
|
|
||||||
volumeClaimTemplates:
|
|
||||||
- metadata:
|
|
||||||
name: data
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: {{ .Values.redis.persistence.size }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-redis
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: redis
|
|
||||||
spec:
|
|
||||||
type: ClusterIP
|
|
||||||
ports:
|
|
||||||
- port: 6379
|
|
||||||
targetPort: redis
|
|
||||||
protocol: TCP
|
|
||||||
name: redis
|
|
||||||
selector:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: redis
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,13 +0,0 @@
|
|||||||
apiVersion: v1
|
|
||||||
kind: Secret
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-secret
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
type: Opaque
|
|
||||||
stringData:
|
|
||||||
DATABASE_URL: {{ include "incidentops.databaseUrl" . | quote }}
|
|
||||||
REDIS_URL: {{ include "incidentops.redisUrl" . | quote }}
|
|
||||||
CELERY_BROKER_URL: {{ include "incidentops.celeryBrokerUrl" . | quote }}
|
|
||||||
CELERY_RESULT_BACKEND: {{ include "incidentops.celeryResultBackend" . | quote }}
|
|
||||||
JWT_SECRET_KEY: {{ .Values.secrets.jwtSecretKey | quote }}
|
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Secret
|
||||||
|
metadata:
|
||||||
|
name: {{ include "incidentops.fullname" . }}-secrets
|
||||||
|
labels:
|
||||||
|
{{- include "incidentops.labels" . | nindent 4 }}
|
||||||
|
type: Opaque
|
||||||
|
stringData:
|
||||||
|
jwt-signing-key: {{ .Values.jwt.signingKey | quote }}
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
{{- if .Values.serviceAccount.create -}}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ServiceAccount
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.serviceAccountName" . }}
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
{{- with .Values.serviceAccount.annotations }}
|
|
||||||
annotations:
|
|
||||||
{{- toYaml . | nindent 4 }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,153 +0,0 @@
|
|||||||
{{- if .Values.observability.enabled }}
|
|
||||||
apiVersion: v1
|
|
||||||
kind: ConfigMap
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-tempo-config
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: tempo
|
|
||||||
data:
|
|
||||||
tempo.yaml: |
|
|
||||||
server:
|
|
||||||
http_listen_port: 3200
|
|
||||||
|
|
||||||
distributor:
|
|
||||||
receivers:
|
|
||||||
otlp:
|
|
||||||
protocols:
|
|
||||||
grpc:
|
|
||||||
endpoint: 0.0.0.0:4317
|
|
||||||
http:
|
|
||||||
endpoint: 0.0.0.0:4318
|
|
||||||
|
|
||||||
ingester:
|
|
||||||
trace_idle_period: 10s
|
|
||||||
max_block_bytes: 1048576
|
|
||||||
max_block_duration: 5m
|
|
||||||
|
|
||||||
compactor:
|
|
||||||
compaction:
|
|
||||||
block_retention: {{ .Values.observability.tempo.retention }}
|
|
||||||
|
|
||||||
storage:
|
|
||||||
trace:
|
|
||||||
backend: local
|
|
||||||
local:
|
|
||||||
path: /var/tempo/traces
|
|
||||||
wal:
|
|
||||||
path: /var/tempo/wal
|
|
||||||
|
|
||||||
querier:
|
|
||||||
search:
|
|
||||||
query_timeout: 30s
|
|
||||||
---
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-tempo
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: tempo
|
|
||||||
spec:
|
|
||||||
replicas: 1
|
|
||||||
selector:
|
|
||||||
matchLabels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
|
||||||
app.kubernetes.io/component: tempo
|
|
||||||
template:
|
|
||||||
metadata:
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
|
||||||
app.kubernetes.io/component: tempo
|
|
||||||
annotations:
|
|
||||||
checksum/config: {{ .Values.observability.tempo.image.tag | sha256sum }}
|
|
||||||
spec:
|
|
||||||
containers:
|
|
||||||
- name: tempo
|
|
||||||
image: "{{ .Values.observability.tempo.image.repository }}:{{ .Values.observability.tempo.image.tag }}"
|
|
||||||
imagePullPolicy: {{ .Values.observability.tempo.image.pullPolicy }}
|
|
||||||
args:
|
|
||||||
- -config.file=/etc/tempo/tempo.yaml
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
containerPort: 3200
|
|
||||||
protocol: TCP
|
|
||||||
- name: otlp-grpc
|
|
||||||
containerPort: 4317
|
|
||||||
protocol: TCP
|
|
||||||
- name: otlp-http
|
|
||||||
containerPort: 4318
|
|
||||||
protocol: TCP
|
|
||||||
volumeMounts:
|
|
||||||
- name: config
|
|
||||||
mountPath: /etc/tempo
|
|
||||||
- name: data
|
|
||||||
mountPath: /var/tempo
|
|
||||||
resources:
|
|
||||||
{{- toYaml .Values.observability.tempo.resources | nindent 12 }}
|
|
||||||
readinessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /ready
|
|
||||||
port: http
|
|
||||||
initialDelaySeconds: 10
|
|
||||||
periodSeconds: 10
|
|
||||||
livenessProbe:
|
|
||||||
httpGet:
|
|
||||||
path: /ready
|
|
||||||
port: http
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 30
|
|
||||||
volumes:
|
|
||||||
- name: config
|
|
||||||
configMap:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-tempo-config
|
|
||||||
- name: data
|
|
||||||
{{- if .Values.observability.tempo.persistence.enabled }}
|
|
||||||
persistentVolumeClaim:
|
|
||||||
claimName: {{ include "incidentops.fullname" . }}-tempo
|
|
||||||
{{- else }}
|
|
||||||
emptyDir: {}
|
|
||||||
{{- end }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: Service
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-tempo
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: tempo
|
|
||||||
spec:
|
|
||||||
type: ClusterIP
|
|
||||||
ports:
|
|
||||||
- name: http
|
|
||||||
port: 3200
|
|
||||||
targetPort: http
|
|
||||||
protocol: TCP
|
|
||||||
- name: otlp-grpc
|
|
||||||
port: 4317
|
|
||||||
targetPort: otlp-grpc
|
|
||||||
protocol: TCP
|
|
||||||
- name: otlp-http
|
|
||||||
port: 4318
|
|
||||||
targetPort: otlp-http
|
|
||||||
protocol: TCP
|
|
||||||
selector:
|
|
||||||
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: tempo
|
|
||||||
{{- if .Values.observability.tempo.persistence.enabled }}
|
|
||||||
---
|
|
||||||
apiVersion: v1
|
|
||||||
kind: PersistentVolumeClaim
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-tempo
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.labels" . | nindent 4 }}
|
|
||||||
app.kubernetes.io/component: tempo
|
|
||||||
spec:
|
|
||||||
accessModes:
|
|
||||||
- ReadWriteOnce
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
storage: {{ .Values.observability.tempo.persistence.size }}
|
|
||||||
{{- end }}
|
|
||||||
{{- end }}
|
|
||||||
@@ -3,70 +3,42 @@ kind: Deployment
|
|||||||
metadata:
|
metadata:
|
||||||
name: {{ include "incidentops.fullname" . }}-web
|
name: {{ include "incidentops.fullname" . }}-web
|
||||||
labels:
|
labels:
|
||||||
{{- include "incidentops.web.labels" . | nindent 4 }}
|
{{- include "incidentops.labels" . | nindent 4 }}
|
||||||
|
app.kubernetes.io/component: web
|
||||||
spec:
|
spec:
|
||||||
{{- if not .Values.web.autoscaling.enabled }}
|
replicas: {{ .Values.web.replicas }}
|
||||||
replicas: {{ .Values.web.replicaCount }}
|
|
||||||
{{- end }}
|
|
||||||
selector:
|
selector:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
{{- include "incidentops.web.selectorLabels" . | nindent 6 }}
|
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||||
|
app.kubernetes.io/component: web
|
||||||
template:
|
template:
|
||||||
metadata:
|
metadata:
|
||||||
{{- with .Values.web.podAnnotations }}
|
|
||||||
annotations:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
labels:
|
labels:
|
||||||
{{- include "incidentops.web.selectorLabels" . | nindent 8 }}
|
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||||
|
app.kubernetes.io/component: web
|
||||||
spec:
|
spec:
|
||||||
{{- with .Values.global.imagePullSecrets }}
|
|
||||||
imagePullSecrets:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
|
|
||||||
securityContext:
|
|
||||||
{{- toYaml .Values.podSecurityContext | nindent 8 }}
|
|
||||||
containers:
|
containers:
|
||||||
- name: web
|
- name: web
|
||||||
securityContext:
|
image: "{{ .Values.web.image }}:{{ .Values.web.tag }}"
|
||||||
{{- toYaml .Values.securityContext | nindent 12 }}
|
imagePullPolicy: IfNotPresent
|
||||||
image: {{ include "incidentops.web.image" . }}
|
|
||||||
imagePullPolicy: {{ .Values.web.image.pullPolicy }}
|
|
||||||
ports:
|
ports:
|
||||||
- name: http
|
- name: http
|
||||||
containerPort: 3000
|
containerPort: {{ .Values.web.port }}
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
env:
|
env:
|
||||||
- name: NEXT_PUBLIC_API_URL
|
- name: NEXT_PUBLIC_API_URL
|
||||||
value: "http://{{ include "incidentops.fullname" . }}-api:{{ .Values.api.service.port }}"
|
value: "http://{{ .Values.ingress.host }}/api"
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /
|
path: /
|
||||||
port: http
|
port: http
|
||||||
initialDelaySeconds: 10
|
initialDelaySeconds: 10
|
||||||
periodSeconds: 30
|
periodSeconds: 10
|
||||||
timeoutSeconds: 5
|
|
||||||
failureThreshold: 3
|
|
||||||
readinessProbe:
|
readinessProbe:
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /
|
path: /
|
||||||
port: http
|
port: http
|
||||||
initialDelaySeconds: 5
|
initialDelaySeconds: 5
|
||||||
periodSeconds: 10
|
periodSeconds: 5
|
||||||
timeoutSeconds: 5
|
|
||||||
failureThreshold: 3
|
|
||||||
resources:
|
resources:
|
||||||
{{- toYaml .Values.web.resources | nindent 12 }}
|
{{- toYaml .Values.web.resources | nindent 12 }}
|
||||||
{{- with .Values.web.nodeSelector }}
|
|
||||||
nodeSelector:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
{{- with .Values.web.affinity }}
|
|
||||||
affinity:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
{{- with .Values.web.tolerations }}
|
|
||||||
tolerations:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
|
|||||||
@@ -1,22 +0,0 @@
|
|||||||
{{- if .Values.web.autoscaling.enabled }}
|
|
||||||
apiVersion: autoscaling/v2
|
|
||||||
kind: HorizontalPodAutoscaler
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-web
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.web.labels" . | nindent 4 }}
|
|
||||||
spec:
|
|
||||||
scaleTargetRef:
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
name: {{ include "incidentops.fullname" . }}-web
|
|
||||||
minReplicas: {{ .Values.web.autoscaling.minReplicas }}
|
|
||||||
maxReplicas: {{ .Values.web.autoscaling.maxReplicas }}
|
|
||||||
metrics:
|
|
||||||
- type: Resource
|
|
||||||
resource:
|
|
||||||
name: cpu
|
|
||||||
target:
|
|
||||||
type: Utilization
|
|
||||||
averageUtilization: {{ .Values.web.autoscaling.targetCPUUtilizationPercentage }}
|
|
||||||
{{- end }}
|
|
||||||
@@ -3,13 +3,15 @@ kind: Service
|
|||||||
metadata:
|
metadata:
|
||||||
name: {{ include "incidentops.fullname" . }}-web
|
name: {{ include "incidentops.fullname" . }}-web
|
||||||
labels:
|
labels:
|
||||||
{{- include "incidentops.web.labels" . | nindent 4 }}
|
{{- include "incidentops.labels" . | nindent 4 }}
|
||||||
|
app.kubernetes.io/component: web
|
||||||
spec:
|
spec:
|
||||||
type: {{ .Values.web.service.type }}
|
type: ClusterIP
|
||||||
ports:
|
ports:
|
||||||
- port: {{ .Values.web.service.port }}
|
- port: {{ .Values.web.port }}
|
||||||
targetPort: http
|
targetPort: http
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
name: http
|
name: http
|
||||||
selector:
|
selector:
|
||||||
{{- include "incidentops.web.selectorLabels" . | nindent 4 }}
|
{{- include "incidentops.selectorLabels" . | nindent 4 }}
|
||||||
|
app.kubernetes.io/component: web
|
||||||
|
|||||||
@@ -3,104 +3,28 @@ kind: Deployment
|
|||||||
metadata:
|
metadata:
|
||||||
name: {{ include "incidentops.fullname" . }}-worker
|
name: {{ include "incidentops.fullname" . }}-worker
|
||||||
labels:
|
labels:
|
||||||
{{- include "incidentops.worker.labels" . | nindent 4 }}
|
{{- include "incidentops.labels" . | nindent 4 }}
|
||||||
|
app.kubernetes.io/component: worker
|
||||||
spec:
|
spec:
|
||||||
{{- if not .Values.worker.autoscaling.enabled }}
|
replicas: {{ .Values.worker.replicas }}
|
||||||
replicas: {{ .Values.worker.replicaCount }}
|
|
||||||
{{- end }}
|
|
||||||
selector:
|
selector:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
{{- include "incidentops.worker.selectorLabels" . | nindent 6 }}
|
{{- include "incidentops.selectorLabels" . | nindent 6 }}
|
||||||
|
app.kubernetes.io/component: worker
|
||||||
template:
|
template:
|
||||||
metadata:
|
metadata:
|
||||||
annotations:
|
|
||||||
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
|
|
||||||
checksum/secret: {{ include (print $.Template.BasePath "/secret.yaml") . | sha256sum }}
|
|
||||||
{{- with .Values.worker.podAnnotations }}
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
labels:
|
labels:
|
||||||
{{- include "incidentops.worker.selectorLabels" . | nindent 8 }}
|
{{- include "incidentops.selectorLabels" . | nindent 8 }}
|
||||||
|
app.kubernetes.io/component: worker
|
||||||
spec:
|
spec:
|
||||||
{{- with .Values.global.imagePullSecrets }}
|
|
||||||
imagePullSecrets:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
serviceAccountName: {{ include "incidentops.serviceAccountName" . }}
|
|
||||||
securityContext:
|
|
||||||
{{- toYaml .Values.podSecurityContext | nindent 8 }}
|
|
||||||
initContainers:
|
|
||||||
- name: wait-for-postgres
|
|
||||||
image: busybox:1.36
|
|
||||||
command:
|
|
||||||
- sh
|
|
||||||
- -c
|
|
||||||
- |
|
|
||||||
until nc -z {{ include "incidentops.fullname" . }}-postgresql 5432; do
|
|
||||||
echo "Waiting for PostgreSQL..."
|
|
||||||
sleep 2
|
|
||||||
done
|
|
||||||
echo "PostgreSQL is ready"
|
|
||||||
- name: wait-for-redis
|
|
||||||
image: busybox:1.36
|
|
||||||
command:
|
|
||||||
- sh
|
|
||||||
- -c
|
|
||||||
- |
|
|
||||||
until nc -z {{ include "incidentops.fullname" . }}-redis 6379; do
|
|
||||||
echo "Waiting for Redis..."
|
|
||||||
sleep 2
|
|
||||||
done
|
|
||||||
echo "Redis is ready"
|
|
||||||
containers:
|
containers:
|
||||||
- name: worker
|
- name: worker
|
||||||
securityContext:
|
image: "{{ .Values.worker.image }}:{{ .Values.worker.tag }}"
|
||||||
{{- toYaml .Values.securityContext | nindent 12 }}
|
imagePullPolicy: IfNotPresent
|
||||||
image: {{ include "incidentops.worker.image" . }}
|
env:
|
||||||
imagePullPolicy: {{ .Values.worker.image.pullPolicy }}
|
- name: ConnectionStrings__Postgres
|
||||||
command:
|
value: {{ include "incidentops.postgresConnectionString" . | quote }}
|
||||||
- uv
|
- name: Redis__ConnectionString
|
||||||
- run
|
value: {{ include "incidentops.redisConnectionString" . | quote }}
|
||||||
- celery
|
|
||||||
- -A
|
|
||||||
- worker.celery_app
|
|
||||||
- worker
|
|
||||||
- --loglevel=info
|
|
||||||
- -Q
|
|
||||||
- {{ .Values.worker.queues }}
|
|
||||||
- --concurrency={{ .Values.worker.concurrency }}
|
|
||||||
envFrom:
|
|
||||||
- configMapRef:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-config
|
|
||||||
- secretRef:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-secret
|
|
||||||
livenessProbe:
|
|
||||||
exec:
|
|
||||||
command:
|
|
||||||
- uv
|
|
||||||
- run
|
|
||||||
- celery
|
|
||||||
- -A
|
|
||||||
- worker.celery_app
|
|
||||||
- inspect
|
|
||||||
- ping
|
|
||||||
- -d
|
|
||||||
- celery@$HOSTNAME
|
|
||||||
initialDelaySeconds: 30
|
|
||||||
periodSeconds: 60
|
|
||||||
timeoutSeconds: 10
|
|
||||||
failureThreshold: 3
|
|
||||||
resources:
|
resources:
|
||||||
{{- toYaml .Values.worker.resources | nindent 12 }}
|
{{- toYaml .Values.worker.resources | nindent 12 }}
|
||||||
{{- with .Values.worker.nodeSelector }}
|
|
||||||
nodeSelector:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
{{- with .Values.worker.affinity }}
|
|
||||||
affinity:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
{{- with .Values.worker.tolerations }}
|
|
||||||
tolerations:
|
|
||||||
{{- toYaml . | nindent 8 }}
|
|
||||||
{{- end }}
|
|
||||||
|
|||||||
@@ -1,22 +0,0 @@
|
|||||||
{{- if .Values.worker.autoscaling.enabled }}
|
|
||||||
apiVersion: autoscaling/v2
|
|
||||||
kind: HorizontalPodAutoscaler
|
|
||||||
metadata:
|
|
||||||
name: {{ include "incidentops.fullname" . }}-worker
|
|
||||||
labels:
|
|
||||||
{{- include "incidentops.worker.labels" . | nindent 4 }}
|
|
||||||
spec:
|
|
||||||
scaleTargetRef:
|
|
||||||
apiVersion: apps/v1
|
|
||||||
kind: Deployment
|
|
||||||
name: {{ include "incidentops.fullname" . }}-worker
|
|
||||||
minReplicas: {{ .Values.worker.autoscaling.minReplicas }}
|
|
||||||
maxReplicas: {{ .Values.worker.autoscaling.maxReplicas }}
|
|
||||||
metrics:
|
|
||||||
- type: Resource
|
|
||||||
resource:
|
|
||||||
name: cpu
|
|
||||||
target:
|
|
||||||
type: Utilization
|
|
||||||
averageUtilization: {{ .Values.worker.autoscaling.targetCPUUtilizationPercentage }}
|
|
||||||
{{- end }}
|
|
||||||
@@ -1,142 +0,0 @@
|
|||||||
# Production values for incidentops
|
|
||||||
# Use external secrets management in production
|
|
||||||
|
|
||||||
api:
|
|
||||||
replicaCount: 3
|
|
||||||
autoscaling:
|
|
||||||
enabled: true
|
|
||||||
minReplicas: 3
|
|
||||||
maxReplicas: 10
|
|
||||||
targetCPUUtilizationPercentage: 70
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 512Mi
|
|
||||||
limits:
|
|
||||||
cpu: 1000m
|
|
||||||
memory: 1Gi
|
|
||||||
|
|
||||||
worker:
|
|
||||||
replicaCount: 3
|
|
||||||
autoscaling:
|
|
||||||
enabled: true
|
|
||||||
minReplicas: 3
|
|
||||||
maxReplicas: 10
|
|
||||||
targetCPUUtilizationPercentage: 70
|
|
||||||
concurrency: 8
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 512Mi
|
|
||||||
limits:
|
|
||||||
cpu: 1000m
|
|
||||||
memory: 1Gi
|
|
||||||
|
|
||||||
web:
|
|
||||||
replicaCount: 3
|
|
||||||
autoscaling:
|
|
||||||
enabled: true
|
|
||||||
minReplicas: 3
|
|
||||||
maxReplicas: 10
|
|
||||||
targetCPUUtilizationPercentage: 70
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 100m
|
|
||||||
memory: 256Mi
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
|
||||||
|
|
||||||
ingress:
|
|
||||||
enabled: true
|
|
||||||
className: nginx
|
|
||||||
annotations:
|
|
||||||
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
|
|
||||||
cert-manager.io/cluster-issuer: letsencrypt-prod
|
|
||||||
host: incidentops.example.com
|
|
||||||
tls:
|
|
||||||
- secretName: incidentops-tls
|
|
||||||
hosts:
|
|
||||||
- incidentops.example.com
|
|
||||||
|
|
||||||
postgresql:
|
|
||||||
persistence:
|
|
||||||
size: 50Gi
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 1Gi
|
|
||||||
limits:
|
|
||||||
cpu: 2000m
|
|
||||||
memory: 4Gi
|
|
||||||
|
|
||||||
redis:
|
|
||||||
persistence:
|
|
||||||
size: 10Gi
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 512Mi
|
|
||||||
limits:
|
|
||||||
cpu: 1000m
|
|
||||||
memory: 1Gi
|
|
||||||
|
|
||||||
# Application configuration
|
|
||||||
config:
|
|
||||||
environment: production
|
|
||||||
logLevel: INFO
|
|
||||||
|
|
||||||
# Observability Stack - Production settings
|
|
||||||
observability:
|
|
||||||
enabled: true
|
|
||||||
|
|
||||||
otelCollector:
|
|
||||||
replicaCount: 2
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 100m
|
|
||||||
memory: 256Mi
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
|
||||||
|
|
||||||
tempo:
|
|
||||||
retention: "720h" # 30 days
|
|
||||||
persistence:
|
|
||||||
enabled: true
|
|
||||||
size: 50Gi
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 512Mi
|
|
||||||
limits:
|
|
||||||
cpu: 1000m
|
|
||||||
memory: 2Gi
|
|
||||||
|
|
||||||
loki:
|
|
||||||
retention: "720h" # 30 days
|
|
||||||
persistence:
|
|
||||||
enabled: true
|
|
||||||
size: 100Gi
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 250m
|
|
||||||
memory: 512Mi
|
|
||||||
limits:
|
|
||||||
cpu: 1000m
|
|
||||||
memory: 2Gi
|
|
||||||
|
|
||||||
grafana:
|
|
||||||
adminPassword: "" # Set via external secret in production
|
|
||||||
service:
|
|
||||||
type: ClusterIP
|
|
||||||
persistence:
|
|
||||||
enabled: true
|
|
||||||
size: 5Gi
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 100m
|
|
||||||
memory: 256Mi
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
|
||||||
+45
-252
@@ -1,279 +1,72 @@
|
|||||||
# Default values for incidentops
|
|
||||||
|
|
||||||
global:
|
|
||||||
imageRegistry: ""
|
|
||||||
imagePullSecrets: []
|
|
||||||
|
|
||||||
api:
|
api:
|
||||||
replicaCount: 2
|
image: incidentops-api
|
||||||
image:
|
tag: latest
|
||||||
repository: incidentops/api
|
replicas: 1
|
||||||
tag: latest
|
port: 8080
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
service:
|
|
||||||
type: ClusterIP
|
|
||||||
port: 8000
|
|
||||||
resources:
|
resources:
|
||||||
requests:
|
|
||||||
cpu: 100m
|
|
||||||
memory: 256Mi
|
|
||||||
limits:
|
limits:
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
memory: 512Mi
|
||||||
autoscaling:
|
cpu: 500m
|
||||||
enabled: false
|
requests:
|
||||||
minReplicas: 2
|
memory: 256Mi
|
||||||
maxReplicas: 10
|
cpu: 100m
|
||||||
targetCPUUtilizationPercentage: 80
|
|
||||||
podAnnotations: {}
|
|
||||||
nodeSelector: {}
|
|
||||||
tolerations: []
|
|
||||||
affinity: {}
|
|
||||||
|
|
||||||
# Worker Service (Celery)
|
|
||||||
worker:
|
worker:
|
||||||
replicaCount: 2
|
image: incidentops-worker
|
||||||
image:
|
tag: latest
|
||||||
repository: incidentops/worker
|
replicas: 1
|
||||||
tag: latest
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
resources:
|
resources:
|
||||||
requests:
|
|
||||||
cpu: 100m
|
|
||||||
memory: 256Mi
|
|
||||||
limits:
|
limits:
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
memory: 512Mi
|
||||||
autoscaling:
|
cpu: 500m
|
||||||
enabled: false
|
|
||||||
minReplicas: 2
|
|
||||||
maxReplicas: 10
|
|
||||||
targetCPUUtilizationPercentage: 80
|
|
||||||
queues: "critical,default,low"
|
|
||||||
concurrency: 4
|
|
||||||
podAnnotations: {}
|
|
||||||
nodeSelector: {}
|
|
||||||
tolerations: []
|
|
||||||
affinity: {}
|
|
||||||
|
|
||||||
# Web Frontend (Next.js)
|
|
||||||
web:
|
|
||||||
replicaCount: 2
|
|
||||||
image:
|
|
||||||
repository: incidentops/web
|
|
||||||
tag: latest
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
service:
|
|
||||||
type: ClusterIP
|
|
||||||
port: 3000
|
|
||||||
resources:
|
|
||||||
requests:
|
requests:
|
||||||
cpu: 50m
|
|
||||||
memory: 128Mi
|
|
||||||
limits:
|
|
||||||
cpu: 200m
|
|
||||||
memory: 256Mi
|
memory: 256Mi
|
||||||
autoscaling:
|
cpu: 100m
|
||||||
enabled: false
|
|
||||||
minReplicas: 2
|
web:
|
||||||
maxReplicas: 10
|
image: incidentops-web
|
||||||
targetCPUUtilizationPercentage: 80
|
tag: latest
|
||||||
podAnnotations: {}
|
replicas: 1
|
||||||
nodeSelector: {}
|
port: 3000
|
||||||
tolerations: []
|
resources:
|
||||||
affinity: {}
|
limits:
|
||||||
|
memory: 256Mi
|
||||||
|
cpu: 200m
|
||||||
|
requests:
|
||||||
|
memory: 128Mi
|
||||||
|
cpu: 50m
|
||||||
|
|
||||||
|
jwt:
|
||||||
|
issuer: incidentops
|
||||||
|
audience: incidentops
|
||||||
|
signingKey: your-super-secret-key-that-should-be-at-least-32-characters-long
|
||||||
|
accessTokenExpirationMinutes: 15
|
||||||
|
refreshTokenExpirationDays: 7
|
||||||
|
|
||||||
# Ingress configuration
|
|
||||||
ingress:
|
ingress:
|
||||||
enabled: true
|
enabled: true
|
||||||
className: nginx
|
className: nginx
|
||||||
|
host: incidentops.local
|
||||||
annotations:
|
annotations:
|
||||||
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
|
nginx.ingress.kubernetes.io/proxy-body-size: "10m"
|
||||||
host: incidentops.local
|
|
||||||
tls: []
|
|
||||||
|
|
||||||
# Database migration job
|
|
||||||
migration:
|
|
||||||
enabled: true
|
|
||||||
image:
|
|
||||||
repository: incidentops/api
|
|
||||||
tag: latest
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
backoffLimit: 3
|
|
||||||
|
|
||||||
# Application configuration
|
|
||||||
config:
|
|
||||||
jwtAlgorithm: HS256
|
|
||||||
accessTokenExpireMinutes: 30
|
|
||||||
refreshTokenExpireDays: 30
|
|
||||||
environment: development
|
|
||||||
logLevel: INFO
|
|
||||||
|
|
||||||
# Secrets (use external secrets in production)
|
|
||||||
secrets:
|
|
||||||
jwtSecretKey: "change-me-in-production"
|
|
||||||
|
|
||||||
# PostgreSQL configuration (using official postgres image)
|
|
||||||
postgresql:
|
postgresql:
|
||||||
enabled: true
|
enabled: true
|
||||||
image:
|
|
||||||
repository: postgres
|
|
||||||
tag: "16-alpine"
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
auth:
|
auth:
|
||||||
username: incidentops
|
username: postgres
|
||||||
password: incidentops
|
password: postgres
|
||||||
database: incidentops
|
database: incidentops
|
||||||
persistence:
|
primary:
|
||||||
size: 8Gi
|
persistence:
|
||||||
resources:
|
enabled: true
|
||||||
requests:
|
size: 1Gi
|
||||||
cpu: 100m
|
|
||||||
memory: 256Mi
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
|
||||||
|
|
||||||
redis:
|
redis:
|
||||||
enabled: true
|
enabled: true
|
||||||
image:
|
architecture: standalone
|
||||||
repository: redis
|
auth:
|
||||||
tag: "7-alpine"
|
enabled: false
|
||||||
pullPolicy: IfNotPresent
|
master:
|
||||||
persistence:
|
|
||||||
size: 2Gi
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 50m
|
|
||||||
memory: 128Mi
|
|
||||||
limits:
|
|
||||||
cpu: 200m
|
|
||||||
memory: 256Mi
|
|
||||||
|
|
||||||
# Service Account
|
|
||||||
serviceAccount:
|
|
||||||
create: true
|
|
||||||
annotations: {}
|
|
||||||
name: ""
|
|
||||||
|
|
||||||
# Pod Security Context
|
|
||||||
podSecurityContext:
|
|
||||||
fsGroup: 1000
|
|
||||||
|
|
||||||
securityContext:
|
|
||||||
runAsNonRoot: true
|
|
||||||
runAsUser: 1000
|
|
||||||
|
|
||||||
# Observability Stack (Grafana + Loki + Tempo + OpenTelemetry Collector)
|
|
||||||
observability:
|
|
||||||
enabled: true
|
|
||||||
|
|
||||||
otelCollector:
|
|
||||||
replicaCount: 1
|
|
||||||
image:
|
|
||||||
repository: otel/opentelemetry-collector-contrib
|
|
||||||
tag: "0.96.0"
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 50m
|
|
||||||
memory: 128Mi
|
|
||||||
limits:
|
|
||||||
cpu: 200m
|
|
||||||
memory: 256Mi
|
|
||||||
|
|
||||||
tempo:
|
|
||||||
image:
|
|
||||||
repository: grafana/tempo
|
|
||||||
tag: "2.4.1"
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
retention: "168h" # 7 days
|
|
||||||
persistence:
|
persistence:
|
||||||
enabled: false
|
enabled: true
|
||||||
size: 10Gi
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 50m
|
|
||||||
memory: 128Mi
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
|
||||||
|
|
||||||
loki:
|
|
||||||
image:
|
|
||||||
repository: grafana/loki
|
|
||||||
tag: "2.9.6"
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
retention: "168h" # 7 days
|
|
||||||
persistence:
|
|
||||||
enabled: false
|
|
||||||
size: 10Gi
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 50m
|
|
||||||
memory: 128Mi
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
|
||||||
|
|
||||||
prometheus:
|
|
||||||
image:
|
|
||||||
repository: prom/prometheus
|
|
||||||
tag: "v2.51.0"
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
retention: "15d"
|
|
||||||
scrapeInterval: "15s"
|
|
||||||
persistence:
|
|
||||||
enabled: false
|
|
||||||
size: 10Gi
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 50m
|
|
||||||
memory: 128Mi
|
|
||||||
limits:
|
|
||||||
cpu: 500m
|
|
||||||
memory: 512Mi
|
|
||||||
|
|
||||||
grafana:
|
|
||||||
image:
|
|
||||||
repository: grafana/grafana
|
|
||||||
tag: "10.4.1"
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
adminUser: admin
|
|
||||||
adminPassword: "admin" # Change in production!
|
|
||||||
service:
|
|
||||||
type: ClusterIP
|
|
||||||
ingress:
|
|
||||||
enabled: false
|
|
||||||
host: grafana.incidentops.local
|
|
||||||
annotations: {}
|
|
||||||
tls: []
|
|
||||||
persistence:
|
|
||||||
enabled: false
|
|
||||||
size: 1Gi
|
size: 1Gi
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 50m
|
|
||||||
memory: 128Mi
|
|
||||||
limits:
|
|
||||||
cpu: 200m
|
|
||||||
memory: 256Mi
|
|
||||||
|
|
||||||
promtail:
|
|
||||||
enabled: true
|
|
||||||
image:
|
|
||||||
repository: grafana/promtail
|
|
||||||
tag: "2.9.6"
|
|
||||||
pullPolicy: IfNotPresent
|
|
||||||
resources:
|
|
||||||
requests:
|
|
||||||
cpu: 25m
|
|
||||||
memory: 64Mi
|
|
||||||
limits:
|
|
||||||
cpu: 200m
|
|
||||||
memory: 256Mi
|
|
||||||
|
|
||||||
# Metrics configuration
|
|
||||||
metrics:
|
|
||||||
enabled: true
|
|
||||||
port: 9464
|
|
||||||
|
|||||||
@@ -1,6 +0,0 @@
|
|||||||
def main():
|
|
||||||
print("Hello from incidentops!")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -1,61 +0,0 @@
|
|||||||
-- Initial schema for IncidentOps
|
|
||||||
-- Creates core tables: users, orgs, org_members, services, incidents, incident_events
|
|
||||||
|
|
||||||
CREATE TABLE users (
|
|
||||||
id UUID PRIMARY KEY,
|
|
||||||
email TEXT NOT NULL UNIQUE,
|
|
||||||
password_hash TEXT NOT NULL,
|
|
||||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE orgs (
|
|
||||||
id UUID PRIMARY KEY,
|
|
||||||
name TEXT NOT NULL,
|
|
||||||
slug TEXT NOT NULL UNIQUE,
|
|
||||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE org_members (
|
|
||||||
id UUID PRIMARY KEY,
|
|
||||||
user_id UUID NOT NULL REFERENCES users(id),
|
|
||||||
org_id UUID NOT NULL REFERENCES orgs(id),
|
|
||||||
role TEXT NOT NULL CHECK (role IN ('admin', 'member', 'viewer')),
|
|
||||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
||||||
UNIQUE (user_id, org_id)
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE services (
|
|
||||||
id UUID PRIMARY KEY,
|
|
||||||
org_id UUID NOT NULL REFERENCES orgs(id),
|
|
||||||
name TEXT NOT NULL,
|
|
||||||
slug TEXT NOT NULL,
|
|
||||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
||||||
UNIQUE (org_id, slug)
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE TABLE incidents (
|
|
||||||
id UUID PRIMARY KEY,
|
|
||||||
org_id UUID NOT NULL REFERENCES orgs(id),
|
|
||||||
service_id UUID NOT NULL REFERENCES services(id),
|
|
||||||
title TEXT NOT NULL,
|
|
||||||
description TEXT,
|
|
||||||
status TEXT NOT NULL CHECK (status IN ('triggered', 'acknowledged', 'mitigated', 'resolved')),
|
|
||||||
severity TEXT NOT NULL CHECK (severity IN ('critical', 'high', 'medium', 'low')),
|
|
||||||
version INTEGER NOT NULL DEFAULT 1,
|
|
||||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
||||||
updated_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX idx_incidents_org_status ON incidents(org_id, status);
|
|
||||||
CREATE INDEX idx_incidents_org_created ON incidents(org_id, created_at DESC);
|
|
||||||
|
|
||||||
CREATE TABLE incident_events (
|
|
||||||
id UUID PRIMARY KEY,
|
|
||||||
incident_id UUID NOT NULL REFERENCES incidents(id),
|
|
||||||
event_type TEXT NOT NULL,
|
|
||||||
actor_user_id UUID REFERENCES users(id),
|
|
||||||
payload JSONB,
|
|
||||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX idx_incident_events_incident ON incident_events(incident_id, created_at);
|
|
||||||
@@ -1,15 +0,0 @@
|
|||||||
-- Refresh tokens table for JWT token rotation
|
|
||||||
-- Stores hashed refresh tokens with active org context
|
|
||||||
|
|
||||||
CREATE TABLE refresh_tokens (
|
|
||||||
id UUID PRIMARY KEY,
|
|
||||||
user_id UUID NOT NULL REFERENCES users(id),
|
|
||||||
token_hash TEXT NOT NULL UNIQUE,
|
|
||||||
active_org_id UUID NOT NULL REFERENCES orgs(id),
|
|
||||||
expires_at TIMESTAMPTZ NOT NULL,
|
|
||||||
revoked_at TIMESTAMPTZ,
|
|
||||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX idx_refresh_tokens_user ON refresh_tokens(user_id);
|
|
||||||
CREATE INDEX idx_refresh_tokens_hash ON refresh_tokens(token_hash);
|
|
||||||
@@ -1,25 +0,0 @@
|
|||||||
-- Notification system tables
|
|
||||||
-- Stores notification targets and delivery attempts
|
|
||||||
|
|
||||||
CREATE TABLE notification_targets (
|
|
||||||
id UUID PRIMARY KEY,
|
|
||||||
org_id UUID NOT NULL REFERENCES orgs(id),
|
|
||||||
name TEXT NOT NULL,
|
|
||||||
target_type TEXT NOT NULL CHECK (target_type IN ('webhook', 'email', 'slack')),
|
|
||||||
webhook_url TEXT,
|
|
||||||
enabled BOOLEAN NOT NULL DEFAULT true,
|
|
||||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
||||||
);
|
|
||||||
|
|
||||||
CREATE INDEX idx_notification_targets_org ON notification_targets(org_id);
|
|
||||||
|
|
||||||
CREATE TABLE notification_attempts (
|
|
||||||
id UUID PRIMARY KEY,
|
|
||||||
incident_id UUID NOT NULL REFERENCES incidents(id),
|
|
||||||
target_id UUID NOT NULL REFERENCES notification_targets(id),
|
|
||||||
status TEXT NOT NULL CHECK (status IN ('pending', 'sent', 'failed')),
|
|
||||||
error TEXT,
|
|
||||||
sent_at TIMESTAMPTZ,
|
|
||||||
created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
|
|
||||||
UNIQUE (incident_id, target_id)
|
|
||||||
);
|
|
||||||
@@ -1,18 +0,0 @@
|
|||||||
-- Enhance refresh tokens for secure rotation and reuse detection
|
|
||||||
-- Adds rotated_to column to track token chains and detect stolen token reuse
|
|
||||||
|
|
||||||
-- Add rotated_to column to track which token this was rotated into
|
|
||||||
-- When a token is rotated, we store the ID of the new token here
|
|
||||||
-- If a token with rotated_to set is used again, it indicates token theft
|
|
||||||
ALTER TABLE refresh_tokens ADD COLUMN rotated_to UUID REFERENCES refresh_tokens(id);
|
|
||||||
|
|
||||||
-- Index for efficient cleanup queries on expires_at
|
|
||||||
CREATE INDEX idx_refresh_tokens_expires ON refresh_tokens(expires_at);
|
|
||||||
|
|
||||||
-- Index for finding active tokens per user (for revoke_all and listing)
|
|
||||||
CREATE INDEX idx_refresh_tokens_user_active ON refresh_tokens(user_id, revoked_at)
|
|
||||||
WHERE revoked_at IS NULL;
|
|
||||||
|
|
||||||
-- Index for reuse detection queries
|
|
||||||
CREATE INDEX idx_refresh_tokens_rotated ON refresh_tokens(rotated_to)
|
|
||||||
WHERE rotated_to IS NOT NULL;
|
|
||||||
@@ -1,119 +0,0 @@
|
|||||||
"""
|
|
||||||
Simple migration runner using asyncpg.
|
|
||||||
Tracks applied migrations in a _migrations table.
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
DATABASE_URL=postgresql://user:pass@localhost/db uv run python migrations/migrate.py apply
|
|
||||||
DATABASE_URL=postgresql://user:pass@localhost/db uv run python migrations/migrate.py status
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import os
|
|
||||||
import sys
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import asyncpg
|
|
||||||
|
|
||||||
MIGRATIONS_DIR = Path(__file__).parent
|
|
||||||
|
|
||||||
|
|
||||||
async def ensure_migrations_table(conn: asyncpg.Connection) -> None:
|
|
||||||
"""Create the migrations tracking table if it doesn't exist."""
|
|
||||||
await conn.execute("""
|
|
||||||
CREATE TABLE IF NOT EXISTS _migrations (
|
|
||||||
id SERIAL PRIMARY KEY,
|
|
||||||
name TEXT NOT NULL UNIQUE,
|
|
||||||
applied_at TIMESTAMPTZ NOT NULL DEFAULT now()
|
|
||||||
)
|
|
||||||
""")
|
|
||||||
|
|
||||||
|
|
||||||
async def get_applied_migrations(conn: asyncpg.Connection) -> set[str]:
|
|
||||||
"""Get the set of already applied migration names."""
|
|
||||||
rows = await conn.fetch("SELECT name FROM _migrations")
|
|
||||||
return {row["name"] for row in rows}
|
|
||||||
|
|
||||||
|
|
||||||
async def get_pending_migrations(conn: asyncpg.Connection) -> list[Path]:
|
|
||||||
"""Get list of migration files that haven't been applied yet."""
|
|
||||||
applied = await get_applied_migrations(conn)
|
|
||||||
sql_files = sorted(MIGRATIONS_DIR.glob("*.sql"))
|
|
||||||
return [f for f in sql_files if f.name not in applied]
|
|
||||||
|
|
||||||
|
|
||||||
async def apply_migration(conn: asyncpg.Connection, migration_file: Path) -> None:
|
|
||||||
"""Apply a single migration file within a transaction."""
|
|
||||||
sql = migration_file.read_text()
|
|
||||||
async with conn.transaction():
|
|
||||||
await conn.execute(sql)
|
|
||||||
await conn.execute(
|
|
||||||
"INSERT INTO _migrations (name) VALUES ($1)",
|
|
||||||
migration_file.name
|
|
||||||
)
|
|
||||||
print(f"Applied: {migration_file.name}")
|
|
||||||
|
|
||||||
|
|
||||||
async def migrate(database_url: str) -> None:
|
|
||||||
"""Apply all pending migrations."""
|
|
||||||
conn = await asyncpg.connect(database_url)
|
|
||||||
try:
|
|
||||||
await ensure_migrations_table(conn)
|
|
||||||
pending = await get_pending_migrations(conn)
|
|
||||||
|
|
||||||
if not pending:
|
|
||||||
print("No pending migrations.")
|
|
||||||
return
|
|
||||||
|
|
||||||
for migration_file in pending:
|
|
||||||
await apply_migration(conn, migration_file)
|
|
||||||
|
|
||||||
print(f"Applied {len(pending)} migration(s).")
|
|
||||||
finally:
|
|
||||||
await conn.close()
|
|
||||||
|
|
||||||
|
|
||||||
async def status(database_url: str) -> None:
|
|
||||||
"""Show migration status."""
|
|
||||||
conn = await asyncpg.connect(database_url)
|
|
||||||
try:
|
|
||||||
await ensure_migrations_table(conn)
|
|
||||||
applied = await get_applied_migrations(conn)
|
|
||||||
pending = await get_pending_migrations(conn)
|
|
||||||
|
|
||||||
print("Applied migrations:")
|
|
||||||
for name in sorted(applied):
|
|
||||||
print(f" [x] {name}")
|
|
||||||
|
|
||||||
print("\nPending migrations:")
|
|
||||||
for f in pending:
|
|
||||||
print(f" [ ] {f.name}")
|
|
||||||
|
|
||||||
if not applied and not pending:
|
|
||||||
print(" (none)")
|
|
||||||
finally:
|
|
||||||
await conn.close()
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
|
||||||
database_url = os.environ.get("DATABASE_URL")
|
|
||||||
if not database_url:
|
|
||||||
print("Error: DATABASE_URL environment variable is required")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
if len(sys.argv) < 2:
|
|
||||||
print("Usage: python migrate.py [apply|status]")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
command = sys.argv[1]
|
|
||||||
if command == "apply":
|
|
||||||
asyncio.run(migrate(database_url))
|
|
||||||
elif command == "status":
|
|
||||||
asyncio.run(status(database_url))
|
|
||||||
else:
|
|
||||||
print(f"Unknown command: {command}")
|
|
||||||
print("Usage: python migrate.py [apply|status]")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
@@ -1,294 +0,0 @@
|
|||||||
{
|
|
||||||
"title": "IncidentOps API Overview",
|
|
||||||
"uid": "incidentops-api",
|
|
||||||
"tags": ["incidentops", "api"],
|
|
||||||
"timezone": "browser",
|
|
||||||
"editable": true,
|
|
||||||
"panels": [
|
|
||||||
{
|
|
||||||
"id": 1,
|
|
||||||
"title": "Request Rate",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {"h": 8, "w": 8, "x": 0, "y": 0},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
|
||||||
"legendFormat": "Requests/sec",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {"mode": "palette-classic"},
|
|
||||||
"unit": "reqps"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 2,
|
|
||||||
"title": "Request Duration (p50, p95, p99)",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {"h": 8, "w": 8, "x": 8, "y": 0},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "histogram_quantile(0.50, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
|
||||||
"legendFormat": "p50",
|
|
||||||
"refId": "A"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "histogram_quantile(0.95, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
|
||||||
"legendFormat": "p95",
|
|
||||||
"refId": "B"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "histogram_quantile(0.99, sum(rate(http_server_request_duration_seconds_bucket{job=\"incidentops-api\"}[5m])) by (le))",
|
|
||||||
"legendFormat": "p99",
|
|
||||||
"refId": "C"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {"mode": "palette-classic"},
|
|
||||||
"unit": "s"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 3,
|
|
||||||
"title": "Error Rate",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {"h": 8, "w": 8, "x": 16, "y": 0},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\", http_status_code=~\"5..\"}[1m])) / sum(rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m])) * 100",
|
|
||||||
"legendFormat": "Error %",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {"fixedColor": "red", "mode": "fixed"},
|
|
||||||
"unit": "percent",
|
|
||||||
"min": 0,
|
|
||||||
"max": 100
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 4,
|
|
||||||
"title": "Requests by Status Code",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 8},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "sum by (http_status_code) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
|
||||||
"legendFormat": "{{http_status_code}}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {"mode": "palette-classic"},
|
|
||||||
"unit": "reqps"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 5,
|
|
||||||
"title": "Requests by Endpoint",
|
|
||||||
"type": "timeseries",
|
|
||||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 8},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "sum by (http_route) (rate(http_server_request_duration_seconds_count{job=\"incidentops-api\"}[1m]))",
|
|
||||||
"legendFormat": "{{http_route}}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {"mode": "palette-classic"},
|
|
||||||
"unit": "reqps"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 6,
|
|
||||||
"title": "System CPU Usage",
|
|
||||||
"type": "gauge",
|
|
||||||
"gridPos": {"h": 6, "w": 6, "x": 0, "y": 16},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "avg(system_cpu_utilization{job=\"incidentops-api\"}) * 100",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {"mode": "thresholds"},
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{"color": "green", "value": null},
|
|
||||||
{"color": "yellow", "value": 60},
|
|
||||||
{"color": "red", "value": 80}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "percent",
|
|
||||||
"min": 0,
|
|
||||||
"max": 100
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 7,
|
|
||||||
"title": "Memory Usage",
|
|
||||||
"type": "gauge",
|
|
||||||
"gridPos": {"h": 6, "w": 6, "x": 6, "y": 16},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "process_runtime_cpython_memory_bytes{job=\"incidentops-api\", type=\"rss\"} / 1024 / 1024",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {"mode": "thresholds"},
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{"color": "green", "value": null},
|
|
||||||
{"color": "yellow", "value": 256},
|
|
||||||
{"color": "red", "value": 512}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "decmbytes"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 8,
|
|
||||||
"title": "Active Threads",
|
|
||||||
"type": "stat",
|
|
||||||
"gridPos": {"h": 6, "w": 6, "x": 12, "y": 16},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "process_runtime_cpython_thread_count{job=\"incidentops-api\"}",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {"mode": "thresholds"},
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{"color": "green", "value": null},
|
|
||||||
{"color": "yellow", "value": 50},
|
|
||||||
{"color": "red", "value": 100}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 9,
|
|
||||||
"title": "GC Collections",
|
|
||||||
"type": "stat",
|
|
||||||
"gridPos": {"h": 6, "w": 6, "x": 18, "y": 16},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "prometheus", "uid": "prometheus"},
|
|
||||||
"expr": "sum(rate(process_runtime_cpython_gc_count{job=\"incidentops-api\"}[5m]))",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"fieldConfig": {
|
|
||||||
"defaults": {
|
|
||||||
"color": {"mode": "thresholds"},
|
|
||||||
"thresholds": {
|
|
||||||
"mode": "absolute",
|
|
||||||
"steps": [
|
|
||||||
{"color": "green", "value": null}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"unit": "cps"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 10,
|
|
||||||
"title": "Recent Logs",
|
|
||||||
"type": "logs",
|
|
||||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 22},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "loki", "uid": "loki"},
|
|
||||||
"expr": "{service_name=\"incidentops-api\"} | json",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"options": {
|
|
||||||
"showTime": true,
|
|
||||||
"showLabels": true,
|
|
||||||
"wrapLogMessage": true,
|
|
||||||
"enableLogDetails": true,
|
|
||||||
"sortOrder": "Descending"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 11,
|
|
||||||
"title": "Error Logs",
|
|
||||||
"type": "logs",
|
|
||||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 32},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "loki", "uid": "loki"},
|
|
||||||
"expr": "{service_name=\"incidentops-api\"} |= \"ERROR\" | json",
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"options": {
|
|
||||||
"showTime": true,
|
|
||||||
"showLabels": true,
|
|
||||||
"wrapLogMessage": true,
|
|
||||||
"enableLogDetails": true,
|
|
||||||
"sortOrder": "Descending"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": 12,
|
|
||||||
"title": "Recent Traces",
|
|
||||||
"type": "traces",
|
|
||||||
"gridPos": {"h": 10, "w": 24, "x": 0, "y": 40},
|
|
||||||
"targets": [
|
|
||||||
{
|
|
||||||
"datasource": {"type": "tempo", "uid": "tempo"},
|
|
||||||
"queryType": "traceqlSearch",
|
|
||||||
"filters": [
|
|
||||||
{
|
|
||||||
"id": "service-name",
|
|
||||||
"operator": "=",
|
|
||||||
"scope": "resource",
|
|
||||||
"tag": "service.name",
|
|
||||||
"value": ["incidentops-api"]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"refId": "A"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"schemaVersion": 38,
|
|
||||||
"version": 2
|
|
||||||
}
|
|
||||||
@@ -1,12 +0,0 @@
|
|||||||
apiVersion: 1
|
|
||||||
|
|
||||||
providers:
|
|
||||||
- name: 'default'
|
|
||||||
orgId: 1
|
|
||||||
folder: 'IncidentOps'
|
|
||||||
folderUid: 'incidentops'
|
|
||||||
type: file
|
|
||||||
disableDeletion: false
|
|
||||||
editable: true
|
|
||||||
options:
|
|
||||||
path: /var/lib/grafana/dashboards
|
|
||||||
@@ -1,48 +0,0 @@
|
|||||||
apiVersion: 1
|
|
||||||
|
|
||||||
datasources:
|
|
||||||
- name: Prometheus
|
|
||||||
type: prometheus
|
|
||||||
uid: prometheus
|
|
||||||
url: http://prometheus:9090
|
|
||||||
access: proxy
|
|
||||||
isDefault: false
|
|
||||||
jsonData:
|
|
||||||
httpMethod: POST
|
|
||||||
exemplarTraceIdDestinations:
|
|
||||||
- name: trace_id
|
|
||||||
datasourceUid: tempo
|
|
||||||
|
|
||||||
- name: Tempo
|
|
||||||
type: tempo
|
|
||||||
uid: tempo
|
|
||||||
url: http://tempo:3200
|
|
||||||
access: proxy
|
|
||||||
isDefault: false
|
|
||||||
jsonData:
|
|
||||||
tracesToLogsV2:
|
|
||||||
datasourceUid: loki
|
|
||||||
spanStartTimeShift: '-1h'
|
|
||||||
spanEndTimeShift: '1h'
|
|
||||||
filterByTraceID: true
|
|
||||||
filterBySpanID: true
|
|
||||||
tracesToMetrics:
|
|
||||||
datasourceUid: prometheus
|
|
||||||
nodeGraph:
|
|
||||||
enabled: true
|
|
||||||
lokiSearch:
|
|
||||||
datasourceUid: loki
|
|
||||||
|
|
||||||
- name: Loki
|
|
||||||
type: loki
|
|
||||||
uid: loki
|
|
||||||
url: http://loki:3100
|
|
||||||
access: proxy
|
|
||||||
isDefault: true
|
|
||||||
jsonData:
|
|
||||||
derivedFields:
|
|
||||||
- datasourceUid: tempo
|
|
||||||
matcherRegex: '"trace_id":"([a-f0-9]+)"'
|
|
||||||
name: TraceID
|
|
||||||
url: '$${__value.raw}'
|
|
||||||
urlDisplayLabel: 'View Trace'
|
|
||||||
@@ -1,41 +0,0 @@
|
|||||||
auth_enabled: false
|
|
||||||
|
|
||||||
server:
|
|
||||||
http_listen_port: 3100
|
|
||||||
grpc_listen_port: 9096
|
|
||||||
|
|
||||||
common:
|
|
||||||
path_prefix: /loki
|
|
||||||
storage:
|
|
||||||
filesystem:
|
|
||||||
chunks_directory: /loki/chunks
|
|
||||||
rules_directory: /loki/rules
|
|
||||||
replication_factor: 1
|
|
||||||
ring:
|
|
||||||
kvstore:
|
|
||||||
store: inmemory
|
|
||||||
|
|
||||||
query_range:
|
|
||||||
results_cache:
|
|
||||||
cache:
|
|
||||||
embedded_cache:
|
|
||||||
enabled: true
|
|
||||||
max_size_mb: 100
|
|
||||||
|
|
||||||
schema_config:
|
|
||||||
configs:
|
|
||||||
- from: "2020-10-24"
|
|
||||||
store: tsdb
|
|
||||||
object_store: filesystem
|
|
||||||
schema: v13
|
|
||||||
index:
|
|
||||||
prefix: index_
|
|
||||||
period: 24h
|
|
||||||
|
|
||||||
ruler:
|
|
||||||
alertmanager_url: http://localhost:9093
|
|
||||||
|
|
||||||
limits_config:
|
|
||||||
retention_period: 168h # 7 days
|
|
||||||
allow_structured_metadata: true
|
|
||||||
volume_enabled: true
|
|
||||||
@@ -1,38 +0,0 @@
|
|||||||
receivers:
|
|
||||||
otlp:
|
|
||||||
protocols:
|
|
||||||
grpc:
|
|
||||||
endpoint: 0.0.0.0:4317
|
|
||||||
http:
|
|
||||||
endpoint: 0.0.0.0:4318
|
|
||||||
|
|
||||||
processors:
|
|
||||||
batch:
|
|
||||||
timeout: 1s
|
|
||||||
send_batch_size: 1024
|
|
||||||
memory_limiter:
|
|
||||||
check_interval: 1s
|
|
||||||
limit_mib: 256
|
|
||||||
spike_limit_mib: 64
|
|
||||||
|
|
||||||
exporters:
|
|
||||||
otlp/tempo:
|
|
||||||
endpoint: tempo:4317
|
|
||||||
tls:
|
|
||||||
insecure: true
|
|
||||||
loki:
|
|
||||||
endpoint: http://loki:3100/loki/api/v1/push
|
|
||||||
default_labels_enabled:
|
|
||||||
exporter: true
|
|
||||||
job: true
|
|
||||||
|
|
||||||
service:
|
|
||||||
pipelines:
|
|
||||||
traces:
|
|
||||||
receivers: [otlp]
|
|
||||||
processors: [memory_limiter, batch]
|
|
||||||
exporters: [otlp/tempo]
|
|
||||||
logs:
|
|
||||||
receivers: [otlp]
|
|
||||||
processors: [memory_limiter, batch]
|
|
||||||
exporters: [loki]
|
|
||||||
@@ -1,23 +0,0 @@
|
|||||||
global:
|
|
||||||
scrape_interval: 15s
|
|
||||||
evaluation_interval: 15s
|
|
||||||
|
|
||||||
scrape_configs:
|
|
||||||
# Scrape Prometheus itself
|
|
||||||
- job_name: "prometheus"
|
|
||||||
static_configs:
|
|
||||||
- targets: ["localhost:9090"]
|
|
||||||
|
|
||||||
# Scrape IncidentOps API metrics
|
|
||||||
- job_name: "incidentops-api"
|
|
||||||
static_configs:
|
|
||||||
- targets: ["api:9464"]
|
|
||||||
metrics_path: /metrics
|
|
||||||
scrape_interval: 10s
|
|
||||||
|
|
||||||
# Scrape IncidentOps Worker metrics (when metrics are enabled)
|
|
||||||
- job_name: "incidentops-worker"
|
|
||||||
static_configs:
|
|
||||||
- targets: ["worker:9464"]
|
|
||||||
metrics_path: /metrics
|
|
||||||
scrape_interval: 10s
|
|
||||||
@@ -1,32 +0,0 @@
|
|||||||
server:
|
|
||||||
http_listen_port: 3200
|
|
||||||
|
|
||||||
distributor:
|
|
||||||
receivers:
|
|
||||||
otlp:
|
|
||||||
protocols:
|
|
||||||
grpc:
|
|
||||||
endpoint: 0.0.0.0:4317
|
|
||||||
http:
|
|
||||||
endpoint: 0.0.0.0:4318
|
|
||||||
|
|
||||||
ingester:
|
|
||||||
trace_idle_period: 10s
|
|
||||||
max_block_bytes: 1048576
|
|
||||||
max_block_duration: 5m
|
|
||||||
|
|
||||||
compactor:
|
|
||||||
compaction:
|
|
||||||
block_retention: 168h # 7 days
|
|
||||||
|
|
||||||
storage:
|
|
||||||
trace:
|
|
||||||
backend: local
|
|
||||||
local:
|
|
||||||
path: /var/tempo/traces
|
|
||||||
wal:
|
|
||||||
path: /var/tempo/wal
|
|
||||||
|
|
||||||
querier:
|
|
||||||
search:
|
|
||||||
query_timeout: 30s
|
|
||||||
@@ -1,58 +0,0 @@
|
|||||||
[project]
|
|
||||||
name = "incidentops"
|
|
||||||
version = "0.1.0"
|
|
||||||
description = "Incident management API with multi-tenant org support"
|
|
||||||
readme = "README.md"
|
|
||||||
requires-python = ">=3.14"
|
|
||||||
dependencies = [
|
|
||||||
"fastapi>=0.115.0",
|
|
||||||
"uvicorn[standard]>=0.32.0",
|
|
||||||
"asyncpg>=0.30.0",
|
|
||||||
"pydantic[email]>=2.0.0",
|
|
||||||
"pydantic-settings>=2.0.0",
|
|
||||||
"python-jose[cryptography]>=3.3.0",
|
|
||||||
"bcrypt>=4.0.0",
|
|
||||||
"celery[redis]>=5.4.0",
|
|
||||||
"redis>=5.0.0",
|
|
||||||
"httpx>=0.28.0",
|
|
||||||
# OpenTelemetry
|
|
||||||
"opentelemetry-api>=1.27.0",
|
|
||||||
"opentelemetry-sdk>=1.27.0",
|
|
||||||
"opentelemetry-exporter-otlp>=1.27.0",
|
|
||||||
"opentelemetry-exporter-prometheus>=0.48b0",
|
|
||||||
"opentelemetry-instrumentation-fastapi>=0.48b0",
|
|
||||||
"opentelemetry-instrumentation-asyncpg>=0.48b0",
|
|
||||||
"opentelemetry-instrumentation-httpx>=0.48b0",
|
|
||||||
"opentelemetry-instrumentation-redis>=0.48b0",
|
|
||||||
"opentelemetry-instrumentation-logging>=0.48b0",
|
|
||||||
"opentelemetry-instrumentation-system-metrics>=0.48b0",
|
|
||||||
"prometheus-client>=0.20.0",
|
|
||||||
]
|
|
||||||
|
|
||||||
[project.optional-dependencies]
|
|
||||||
dev = [
|
|
||||||
"pytest>=8.0.0",
|
|
||||||
"pytest-asyncio>=0.24.0",
|
|
||||||
"ruff>=0.8.0",
|
|
||||||
]
|
|
||||||
|
|
||||||
[build-system]
|
|
||||||
requires = ["hatchling"]
|
|
||||||
build-backend = "hatchling.build"
|
|
||||||
|
|
||||||
[tool.hatch.build.targets.wheel]
|
|
||||||
packages = ["app", "migrations", "worker"]
|
|
||||||
|
|
||||||
[tool.ruff]
|
|
||||||
line-length = 100
|
|
||||||
target-version = "py314"
|
|
||||||
|
|
||||||
[tool.ruff.lint]
|
|
||||||
select = ["E", "F", "I", "N", "W", "UP"]
|
|
||||||
|
|
||||||
[tool.ruff.lint.per-file-ignores]
|
|
||||||
"tests/**/*.py" = ["E501"] # Allow longer lines in tests for descriptive method names
|
|
||||||
|
|
||||||
[tool.pytest.ini_options]
|
|
||||||
asyncio_mode = "auto"
|
|
||||||
testpaths = ["tests"]
|
|
||||||
+19
-128
@@ -1,46 +1,24 @@
|
|||||||
apiVersion: skaffold/v4beta11
|
apiVersion: skaffold/v4beta6
|
||||||
kind: Config
|
kind: Config
|
||||||
metadata:
|
metadata:
|
||||||
name: incidentops
|
name: incidentops
|
||||||
|
|
||||||
build:
|
build:
|
||||||
artifacts:
|
artifacts:
|
||||||
- image: incidentops/api
|
- image: incidentops-api
|
||||||
|
context: .
|
||||||
|
docker:
|
||||||
|
dockerfile: src/IncidentOps.Api/Dockerfile
|
||||||
|
- image: incidentops-worker
|
||||||
|
context: .
|
||||||
|
docker:
|
||||||
|
dockerfile: src/IncidentOps.Worker/Dockerfile
|
||||||
|
- image: incidentops-web
|
||||||
|
context: web
|
||||||
docker:
|
docker:
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile
|
||||||
target: api
|
|
||||||
sync:
|
|
||||||
manual:
|
|
||||||
- src: "app/**/*.py"
|
|
||||||
dest: /app
|
|
||||||
- src: "worker/**/*.py"
|
|
||||||
dest: /app
|
|
||||||
|
|
||||||
- image: incidentops/worker
|
|
||||||
docker:
|
|
||||||
dockerfile: Dockerfile
|
|
||||||
target: worker
|
|
||||||
sync:
|
|
||||||
manual:
|
|
||||||
- src: "app/**/*.py"
|
|
||||||
dest: /app
|
|
||||||
- src: "worker/**/*.py"
|
|
||||||
dest: /app
|
|
||||||
|
|
||||||
# Web frontend disabled until implemented
|
|
||||||
# - image: incidentops/web
|
|
||||||
# docker:
|
|
||||||
# dockerfile: Dockerfile.web
|
|
||||||
# context: .
|
|
||||||
# sync:
|
|
||||||
# manual:
|
|
||||||
# - src: "web/src/**/*"
|
|
||||||
# dest: /app
|
|
||||||
|
|
||||||
local:
|
local:
|
||||||
push: false
|
push: false
|
||||||
useBuildkit: true
|
useBuildkit: true
|
||||||
|
|
||||||
deploy:
|
deploy:
|
||||||
helm:
|
helm:
|
||||||
releases:
|
releases:
|
||||||
@@ -49,102 +27,15 @@ deploy:
|
|||||||
valuesFiles:
|
valuesFiles:
|
||||||
- helm/incidentops/values.yaml
|
- helm/incidentops/values.yaml
|
||||||
setValues:
|
setValues:
|
||||||
web.replicaCount: 0 # Disabled until frontend is implemented
|
api.image: incidentops-api
|
||||||
migration.enabled: true
|
worker.image: incidentops-worker
|
||||||
setValueTemplates:
|
web.image: incidentops-web
|
||||||
api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
|
||||||
api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
|
||||||
worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
|
|
||||||
worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
|
|
||||||
migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
|
||||||
migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
|
||||||
createNamespace: true
|
|
||||||
namespace: incidentops
|
|
||||||
|
|
||||||
profiles:
|
|
||||||
- name: dev
|
|
||||||
activation:
|
|
||||||
- command: dev
|
|
||||||
build:
|
|
||||||
local:
|
|
||||||
push: false
|
|
||||||
deploy:
|
|
||||||
helm:
|
|
||||||
releases:
|
|
||||||
- name: incidentops
|
|
||||||
chartPath: helm/incidentops
|
|
||||||
valuesFiles:
|
|
||||||
- helm/incidentops/values.yaml
|
|
||||||
setValues:
|
|
||||||
api.replicaCount: 1
|
|
||||||
worker.replicaCount: 1
|
|
||||||
web.replicaCount: 0 # Disabled until frontend is implemented
|
|
||||||
migration.enabled: true
|
|
||||||
setValueTemplates:
|
|
||||||
api.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
|
||||||
api.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
|
||||||
worker.image.repository: "{{.IMAGE_REPO_incidentops_worker}}"
|
|
||||||
worker.image.tag: "{{.IMAGE_TAG_incidentops_worker}}"
|
|
||||||
migration.image.repository: "{{.IMAGE_REPO_incidentops_api}}"
|
|
||||||
migration.image.tag: "{{.IMAGE_TAG_incidentops_api}}"
|
|
||||||
createNamespace: true
|
|
||||||
namespace: incidentops
|
|
||||||
|
|
||||||
- name: production
|
|
||||||
activation:
|
|
||||||
- env: SKAFFOLD_PROFILE=production
|
|
||||||
build:
|
|
||||||
local:
|
|
||||||
push: true
|
|
||||||
deploy:
|
|
||||||
helm:
|
|
||||||
releases:
|
|
||||||
- name: incidentops
|
|
||||||
chartPath: helm/incidentops
|
|
||||||
valuesFiles:
|
|
||||||
- helm/incidentops/values.yaml
|
|
||||||
- helm/incidentops/values-production.yaml
|
|
||||||
createNamespace: true
|
|
||||||
namespace: incidentops-prod
|
|
||||||
|
|
||||||
- name: kind
|
|
||||||
activation:
|
|
||||||
- kubeContext: kind-.*
|
|
||||||
patches:
|
|
||||||
- op: add
|
|
||||||
path: /build/local/push
|
|
||||||
value: false
|
|
||||||
|
|
||||||
portForward:
|
portForward:
|
||||||
- resourceType: service
|
- resourceType: service
|
||||||
resourceName: incidentops-api
|
resourceName: incidentops-api
|
||||||
namespace: incidentops
|
port: 8080
|
||||||
port: 8000
|
localPort: 8080
|
||||||
localPort: 8000
|
|
||||||
# Web frontend disabled until implemented
|
|
||||||
# - resourceType: service
|
|
||||||
# resourceName: incidentops-web
|
|
||||||
# namespace: incidentops
|
|
||||||
# port: 3000
|
|
||||||
# localPort: 3000
|
|
||||||
# Observability
|
|
||||||
- resourceType: service
|
- resourceType: service
|
||||||
resourceName: incidentops-grafana
|
resourceName: incidentops-web
|
||||||
namespace: incidentops
|
port: 3000
|
||||||
port: 80
|
localPort: 3000
|
||||||
localPort: 3001
|
|
||||||
- resourceType: service
|
|
||||||
resourceName: incidentops-prometheus
|
|
||||||
namespace: incidentops
|
|
||||||
port: 9090
|
|
||||||
localPort: 9090
|
|
||||||
- resourceType: service
|
|
||||||
resourceName: incidentops-tempo
|
|
||||||
namespace: incidentops
|
|
||||||
port: 3200
|
|
||||||
localPort: 3200
|
|
||||||
- resourceType: service
|
|
||||||
resourceName: incidentops-loki
|
|
||||||
namespace: incidentops
|
|
||||||
port: 3100
|
|
||||||
localPort: 3100
|
|
||||||
|
|||||||
@@ -0,0 +1,22 @@
|
|||||||
|
using System.Security.Claims;
|
||||||
|
using IncidentOps.Domain.Enums;
|
||||||
|
|
||||||
|
namespace IncidentOps.Api.Auth;
|
||||||
|
|
||||||
|
public static class ClaimsPrincipalExtensions
|
||||||
|
{
|
||||||
|
public static RequestContext GetRequestContext(this ClaimsPrincipal principal)
|
||||||
|
{
|
||||||
|
var userId = Guid.Parse(principal.FindFirstValue("sub") ?? throw new InvalidOperationException("Missing sub claim"));
|
||||||
|
var orgId = Guid.Parse(principal.FindFirstValue("org_id") ?? throw new InvalidOperationException("Missing org_id claim"));
|
||||||
|
var roleStr = principal.FindFirstValue("org_role") ?? throw new InvalidOperationException("Missing org_role claim");
|
||||||
|
var role = Enum.Parse<OrgRole>(roleStr, ignoreCase: true);
|
||||||
|
|
||||||
|
return new RequestContext
|
||||||
|
{
|
||||||
|
UserId = userId,
|
||||||
|
OrgId = orgId,
|
||||||
|
Role = role
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,10 @@
|
|||||||
|
using IncidentOps.Domain.Enums;
|
||||||
|
|
||||||
|
namespace IncidentOps.Api.Auth;
|
||||||
|
|
||||||
|
public class RequestContext
|
||||||
|
{
|
||||||
|
public Guid UserId { get; set; }
|
||||||
|
public Guid OrgId { get; set; }
|
||||||
|
public OrgRole Role { get; set; }
|
||||||
|
}
|
||||||
@@ -0,0 +1,38 @@
|
|||||||
|
using IncidentOps.Domain.Enums;
|
||||||
|
using Microsoft.AspNetCore.Authorization;
|
||||||
|
|
||||||
|
namespace IncidentOps.Api.Auth;
|
||||||
|
|
||||||
|
public class RoleRequirement : IAuthorizationRequirement
|
||||||
|
{
|
||||||
|
public OrgRole MinimumRole { get; }
|
||||||
|
|
||||||
|
public RoleRequirement(OrgRole minimumRole)
|
||||||
|
{
|
||||||
|
MinimumRole = minimumRole;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public class RoleRequirementHandler : AuthorizationHandler<RoleRequirement>
|
||||||
|
{
|
||||||
|
protected override Task HandleRequirementAsync(AuthorizationHandlerContext context, RoleRequirement requirement)
|
||||||
|
{
|
||||||
|
var roleClaim = context.User.FindFirst("org_role")?.Value;
|
||||||
|
if (roleClaim == null)
|
||||||
|
{
|
||||||
|
return Task.CompletedTask;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!Enum.TryParse<OrgRole>(roleClaim, ignoreCase: true, out var userRole))
|
||||||
|
{
|
||||||
|
return Task.CompletedTask;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (userRole >= requirement.MinimumRole)
|
||||||
|
{
|
||||||
|
context.Succeed(requirement);
|
||||||
|
}
|
||||||
|
|
||||||
|
return Task.CompletedTask;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,226 @@
|
|||||||
|
using IncidentOps.Api.Auth;
|
||||||
|
using IncidentOps.Contracts.Auth;
|
||||||
|
using IncidentOps.Domain.Entities;
|
||||||
|
using IncidentOps.Domain.Enums;
|
||||||
|
using IncidentOps.Infrastructure.Auth;
|
||||||
|
using IncidentOps.Infrastructure.Data.Repositories;
|
||||||
|
using Microsoft.AspNetCore.Authorization;
|
||||||
|
using Microsoft.AspNetCore.Mvc;
|
||||||
|
using OrgEntity = IncidentOps.Domain.Entities.Org;
|
||||||
|
|
||||||
|
namespace IncidentOps.Api.Controllers;
|
||||||
|
|
||||||
|
[ApiController]
|
||||||
|
[Route("v1/auth")]
|
||||||
|
public class AuthController : ControllerBase
|
||||||
|
{
|
||||||
|
private readonly IUserRepository _userRepository;
|
||||||
|
private readonly IOrgRepository _orgRepository;
|
||||||
|
private readonly IOrgMemberRepository _orgMemberRepository;
|
||||||
|
private readonly IRefreshTokenRepository _refreshTokenRepository;
|
||||||
|
private readonly ITokenService _tokenService;
|
||||||
|
private readonly IPasswordService _passwordService;
|
||||||
|
private readonly JwtSettings _jwtSettings;
|
||||||
|
|
||||||
|
public AuthController(
|
||||||
|
IUserRepository userRepository,
|
||||||
|
IOrgRepository orgRepository,
|
||||||
|
IOrgMemberRepository orgMemberRepository,
|
||||||
|
IRefreshTokenRepository refreshTokenRepository,
|
||||||
|
ITokenService tokenService,
|
||||||
|
IPasswordService passwordService,
|
||||||
|
JwtSettings jwtSettings)
|
||||||
|
{
|
||||||
|
_userRepository = userRepository;
|
||||||
|
_orgRepository = orgRepository;
|
||||||
|
_orgMemberRepository = orgMemberRepository;
|
||||||
|
_refreshTokenRepository = refreshTokenRepository;
|
||||||
|
_tokenService = tokenService;
|
||||||
|
_passwordService = passwordService;
|
||||||
|
_jwtSettings = jwtSettings;
|
||||||
|
}
|
||||||
|
|
||||||
|
[HttpPost("register")]
|
||||||
|
public async Task<ActionResult<AuthResponse>> Register([FromBody] RegisterRequest request)
|
||||||
|
{
|
||||||
|
var existingUser = await _userRepository.GetByEmailAsync(request.Email);
|
||||||
|
if (existingUser != null)
|
||||||
|
return Conflict(new { message = "Email already registered" });
|
||||||
|
|
||||||
|
var user = new User
|
||||||
|
{
|
||||||
|
Id = Guid.NewGuid(),
|
||||||
|
Email = request.Email.ToLowerInvariant(),
|
||||||
|
PasswordHash = _passwordService.HashPassword(request.Password),
|
||||||
|
DisplayName = request.DisplayName,
|
||||||
|
CreatedAt = DateTime.UtcNow
|
||||||
|
};
|
||||||
|
await _userRepository.CreateAsync(user);
|
||||||
|
|
||||||
|
// Create a default org for the user
|
||||||
|
var org = new OrgEntity
|
||||||
|
{
|
||||||
|
Id = Guid.NewGuid(),
|
||||||
|
Name = $"{request.DisplayName}'s Org",
|
||||||
|
Slug = $"org-{Guid.NewGuid():N}".Substring(0, 20),
|
||||||
|
CreatedAt = DateTime.UtcNow
|
||||||
|
};
|
||||||
|
await _orgRepository.CreateAsync(org);
|
||||||
|
|
||||||
|
var member = new OrgMember
|
||||||
|
{
|
||||||
|
Id = Guid.NewGuid(),
|
||||||
|
OrgId = org.Id,
|
||||||
|
UserId = user.Id,
|
||||||
|
Role = OrgRole.Admin,
|
||||||
|
CreatedAt = DateTime.UtcNow
|
||||||
|
};
|
||||||
|
await _orgMemberRepository.CreateAsync(member);
|
||||||
|
|
||||||
|
return await GenerateAuthResponse(user, org, member.Role);
|
||||||
|
}
|
||||||
|
|
||||||
|
[HttpPost("login")]
|
||||||
|
public async Task<ActionResult<AuthResponse>> Login([FromBody] LoginRequest request)
|
||||||
|
{
|
||||||
|
var user = await _userRepository.GetByEmailAsync(request.Email);
|
||||||
|
if (user == null || !_passwordService.VerifyPassword(request.Password, user.PasswordHash))
|
||||||
|
return Unauthorized(new { message = "Invalid credentials" });
|
||||||
|
|
||||||
|
var orgs = await _orgRepository.GetByUserIdAsync(user.Id);
|
||||||
|
if (orgs.Count == 0)
|
||||||
|
return Unauthorized(new { message = "User has no organizations" });
|
||||||
|
|
||||||
|
OrgEntity activeOrg;
|
||||||
|
if (request.OrgId.HasValue)
|
||||||
|
{
|
||||||
|
activeOrg = orgs.FirstOrDefault(o => o.Id == request.OrgId.Value)
|
||||||
|
?? throw new InvalidOperationException("User is not a member of the specified organization");
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
activeOrg = orgs.First();
|
||||||
|
}
|
||||||
|
|
||||||
|
var member = await _orgMemberRepository.GetByUserAndOrgAsync(user.Id, activeOrg.Id);
|
||||||
|
if (member == null)
|
||||||
|
return Unauthorized(new { message = "User is not a member of the organization" });
|
||||||
|
|
||||||
|
return await GenerateAuthResponse(user, activeOrg, member.Role);
|
||||||
|
}
|
||||||
|
|
||||||
|
[HttpPost("refresh")]
|
||||||
|
public async Task<ActionResult<AuthResponse>> Refresh([FromBody] RefreshRequest request)
|
||||||
|
{
|
||||||
|
var tokenHash = _tokenService.HashToken(request.RefreshToken);
|
||||||
|
var refreshToken = await _refreshTokenRepository.GetByHashAsync(tokenHash);
|
||||||
|
if (refreshToken == null)
|
||||||
|
return Unauthorized(new { message = "Invalid refresh token" });
|
||||||
|
|
||||||
|
var user = await _userRepository.GetByIdAsync(refreshToken.UserId);
|
||||||
|
if (user == null)
|
||||||
|
return Unauthorized(new { message = "User not found" });
|
||||||
|
|
||||||
|
var org = await _orgRepository.GetByIdAsync(refreshToken.ActiveOrgId);
|
||||||
|
if (org == null)
|
||||||
|
return Unauthorized(new { message = "Organization not found" });
|
||||||
|
|
||||||
|
var member = await _orgMemberRepository.GetByUserAndOrgAsync(user.Id, org.Id);
|
||||||
|
if (member == null)
|
||||||
|
return Unauthorized(new { message = "User is not a member of the organization" });
|
||||||
|
|
||||||
|
// Rotate refresh token
|
||||||
|
await _refreshTokenRepository.RevokeAsync(refreshToken.Id);
|
||||||
|
|
||||||
|
return await GenerateAuthResponse(user, org, member.Role);
|
||||||
|
}
|
||||||
|
|
||||||
|
[HttpPost("switch-org")]
|
||||||
|
public async Task<ActionResult<AuthResponse>> SwitchOrg([FromBody] SwitchOrgRequest request)
|
||||||
|
{
|
||||||
|
var tokenHash = _tokenService.HashToken(request.RefreshToken);
|
||||||
|
var refreshToken = await _refreshTokenRepository.GetByHashAsync(tokenHash);
|
||||||
|
if (refreshToken == null)
|
||||||
|
return Unauthorized(new { message = "Invalid refresh token" });
|
||||||
|
|
||||||
|
var user = await _userRepository.GetByIdAsync(refreshToken.UserId);
|
||||||
|
if (user == null)
|
||||||
|
return Unauthorized(new { message = "User not found" });
|
||||||
|
|
||||||
|
var org = await _orgRepository.GetByIdAsync(request.OrgId);
|
||||||
|
if (org == null)
|
||||||
|
return NotFound(new { message = "Organization not found" });
|
||||||
|
|
||||||
|
var member = await _orgMemberRepository.GetByUserAndOrgAsync(user.Id, org.Id);
|
||||||
|
if (member == null)
|
||||||
|
return Forbidden("User is not a member of the organization");
|
||||||
|
|
||||||
|
// Rotate refresh token with new org
|
||||||
|
await _refreshTokenRepository.RevokeAsync(refreshToken.Id);
|
||||||
|
|
||||||
|
return await GenerateAuthResponse(user, org, member.Role);
|
||||||
|
}
|
||||||
|
|
||||||
|
[HttpPost("logout")]
|
||||||
|
public async Task<IActionResult> Logout([FromBody] LogoutRequest request)
|
||||||
|
{
|
||||||
|
var tokenHash = _tokenService.HashToken(request.RefreshToken);
|
||||||
|
var refreshToken = await _refreshTokenRepository.GetByHashAsync(tokenHash);
|
||||||
|
if (refreshToken != null)
|
||||||
|
{
|
||||||
|
await _refreshTokenRepository.RevokeAsync(refreshToken.Id);
|
||||||
|
}
|
||||||
|
|
||||||
|
return NoContent();
|
||||||
|
}
|
||||||
|
|
||||||
|
[Authorize]
|
||||||
|
[HttpGet("/v1/me")]
|
||||||
|
public async Task<ActionResult<MeResponse>> Me()
|
||||||
|
{
|
||||||
|
var ctx = User.GetRequestContext();
|
||||||
|
var user = await _userRepository.GetByIdAsync(ctx.UserId);
|
||||||
|
if (user == null)
|
||||||
|
return NotFound();
|
||||||
|
|
||||||
|
var org = await _orgRepository.GetByIdAsync(ctx.OrgId);
|
||||||
|
if (org == null)
|
||||||
|
return NotFound();
|
||||||
|
|
||||||
|
return new MeResponse(
|
||||||
|
user.Id,
|
||||||
|
user.Email,
|
||||||
|
user.DisplayName,
|
||||||
|
new ActiveOrgDto(org.Id, org.Name, org.Slug, ctx.Role.ToString().ToLowerInvariant())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
private async Task<ActionResult<AuthResponse>> GenerateAuthResponse(User user, OrgEntity org, OrgRole role)
|
||||||
|
{
|
||||||
|
var accessToken = _tokenService.GenerateAccessToken(user.Id, org.Id, role);
|
||||||
|
var refreshTokenValue = _tokenService.GenerateRefreshToken();
|
||||||
|
var refreshTokenHash = _tokenService.HashToken(refreshTokenValue);
|
||||||
|
|
||||||
|
var refreshToken = new RefreshToken
|
||||||
|
{
|
||||||
|
Id = Guid.NewGuid(),
|
||||||
|
UserId = user.Id,
|
||||||
|
TokenHash = refreshTokenHash,
|
||||||
|
ActiveOrgId = org.Id,
|
||||||
|
ExpiresAt = DateTime.UtcNow.AddDays(_jwtSettings.RefreshTokenExpirationDays),
|
||||||
|
CreatedAt = DateTime.UtcNow
|
||||||
|
};
|
||||||
|
await _refreshTokenRepository.CreateAsync(refreshToken);
|
||||||
|
|
||||||
|
return new AuthResponse(
|
||||||
|
accessToken,
|
||||||
|
refreshTokenValue,
|
||||||
|
new ActiveOrgDto(org.Id, org.Name, org.Slug, role.ToString().ToLowerInvariant())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
private ObjectResult Forbidden(string message)
|
||||||
|
{
|
||||||
|
return StatusCode(403, new { message });
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,60 @@
|
|||||||
|
using Microsoft.AspNetCore.Mvc;
|
||||||
|
using Npgsql;
|
||||||
|
using StackExchange.Redis;
|
||||||
|
|
||||||
|
namespace IncidentOps.Api.Controllers;
|
||||||
|
|
||||||
|
[ApiController]
|
||||||
|
public class HealthController : ControllerBase
|
||||||
|
{
|
||||||
|
private readonly IConfiguration _configuration;
|
||||||
|
|
||||||
|
public HealthController(IConfiguration configuration)
|
||||||
|
{
|
||||||
|
_configuration = configuration;
|
||||||
|
}
|
||||||
|
|
||||||
|
[HttpGet("healthz")]
|
||||||
|
public IActionResult Healthz()
|
||||||
|
{
|
||||||
|
return Ok(new { status = "healthy" });
|
||||||
|
}
|
||||||
|
|
||||||
|
[HttpGet("readyz")]
|
||||||
|
public async Task<IActionResult> Readyz()
|
||||||
|
{
|
||||||
|
var checks = new Dictionary<string, string>();
|
||||||
|
|
||||||
|
// Check PostgreSQL
|
||||||
|
try
|
||||||
|
{
|
||||||
|
var connectionString = _configuration.GetConnectionString("Postgres");
|
||||||
|
await using var connection = new NpgsqlConnection(connectionString);
|
||||||
|
await connection.OpenAsync();
|
||||||
|
checks["postgres"] = "healthy";
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
checks["postgres"] = $"unhealthy: {ex.Message}";
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check Redis
|
||||||
|
try
|
||||||
|
{
|
||||||
|
var redisConnectionString = _configuration["Redis:ConnectionString"];
|
||||||
|
var redis = await ConnectionMultiplexer.ConnectAsync(redisConnectionString!);
|
||||||
|
var db = redis.GetDatabase();
|
||||||
|
await db.PingAsync();
|
||||||
|
checks["redis"] = "healthy";
|
||||||
|
}
|
||||||
|
catch (Exception ex)
|
||||||
|
{
|
||||||
|
checks["redis"] = $"unhealthy: {ex.Message}";
|
||||||
|
}
|
||||||
|
|
||||||
|
var allHealthy = checks.Values.All(v => v == "healthy");
|
||||||
|
return allHealthy
|
||||||
|
? Ok(new { status = "ready", checks })
|
||||||
|
: StatusCode(503, new { status = "not ready", checks });
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,290 @@
|
|||||||
|
using Hangfire;
|
||||||
|
using IncidentOps.Api.Auth;
|
||||||
|
using IncidentOps.Contracts.Incidents;
|
||||||
|
using IncidentOps.Domain.Entities;
|
||||||
|
using IncidentOps.Domain.Enums;
|
||||||
|
using IncidentOps.Infrastructure.Data.Repositories;
|
||||||
|
using IncidentOps.Infrastructure.Jobs;
|
||||||
|
using Microsoft.AspNetCore.Authorization;
|
||||||
|
using Microsoft.AspNetCore.Mvc;
|
||||||
|
|
||||||
|
namespace IncidentOps.Api.Controllers;
|
||||||
|
|
||||||
|
[ApiController]
|
||||||
|
[Authorize]
|
||||||
|
public class IncidentsController : ControllerBase
|
||||||
|
{
|
||||||
|
private readonly IIncidentRepository _incidentRepository;
|
||||||
|
private readonly IIncidentEventRepository _incidentEventRepository;
|
||||||
|
private readonly IServiceRepository _serviceRepository;
|
||||||
|
private readonly IUserRepository _userRepository;
|
||||||
|
private readonly IBackgroundJobClient _backgroundJobClient;
|
||||||
|
|
||||||
|
public IncidentsController(
|
||||||
|
IIncidentRepository incidentRepository,
|
||||||
|
IIncidentEventRepository incidentEventRepository,
|
||||||
|
IServiceRepository serviceRepository,
|
||||||
|
IUserRepository userRepository,
|
||||||
|
IBackgroundJobClient backgroundJobClient)
|
||||||
|
{
|
||||||
|
_incidentRepository = incidentRepository;
|
||||||
|
_incidentEventRepository = incidentEventRepository;
|
||||||
|
_serviceRepository = serviceRepository;
|
||||||
|
_userRepository = userRepository;
|
||||||
|
_backgroundJobClient = backgroundJobClient;
|
||||||
|
}
|
||||||
|
|
||||||
|
[HttpGet("v1/incidents")]
|
||||||
|
public async Task<ActionResult<IncidentListResponse>> GetIncidents(
|
||||||
|
[FromQuery] string? status = null,
|
||||||
|
[FromQuery] string? cursor = null,
|
||||||
|
[FromQuery] int limit = 20)
|
||||||
|
{
|
||||||
|
var ctx = User.GetRequestContext();
|
||||||
|
|
||||||
|
IncidentStatus? statusFilter = null;
|
||||||
|
if (!string.IsNullOrEmpty(status) && Enum.TryParse<IncidentStatus>(status, ignoreCase: true, out var parsed))
|
||||||
|
{
|
||||||
|
statusFilter = parsed;
|
||||||
|
}
|
||||||
|
|
||||||
|
var incidents = await _incidentRepository.GetByOrgIdAsync(ctx.OrgId, statusFilter, limit + 1, cursor);
|
||||||
|
var hasMore = incidents.Count > limit;
|
||||||
|
var items = incidents.Take(limit).ToList();
|
||||||
|
|
||||||
|
var dtos = new List<IncidentDto>();
|
||||||
|
foreach (var incident in items)
|
||||||
|
{
|
||||||
|
var service = await _serviceRepository.GetByIdAsync(incident.ServiceId, ctx.OrgId);
|
||||||
|
var assignedUser = incident.AssignedToUserId.HasValue
|
||||||
|
? await _userRepository.GetByIdAsync(incident.AssignedToUserId.Value)
|
||||||
|
: null;
|
||||||
|
|
||||||
|
dtos.Add(new IncidentDto(
|
||||||
|
incident.Id,
|
||||||
|
incident.ServiceId,
|
||||||
|
service?.Name ?? "Unknown",
|
||||||
|
incident.Title,
|
||||||
|
incident.Description,
|
||||||
|
incident.Status.ToString().ToLowerInvariant(),
|
||||||
|
incident.Version,
|
||||||
|
incident.AssignedToUserId,
|
||||||
|
assignedUser?.DisplayName,
|
||||||
|
incident.CreatedAt,
|
||||||
|
incident.AcknowledgedAt,
|
||||||
|
incident.MitigatedAt,
|
||||||
|
incident.ResolvedAt
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
var nextCursor = hasMore ? items.Last().CreatedAt.ToString("O") : null;
|
||||||
|
return new IncidentListResponse(dtos, nextCursor);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Opens a new incident against a service, records the creation event in the
/// audit timeline, and schedules the "incident triggered" notification job.
/// </summary>
[HttpPost("v1/services/{serviceId}/incidents")]
[Authorize(Policy = "Member")]
public async Task<ActionResult<IncidentDto>> CreateIncident(Guid serviceId, [FromBody] CreateIncidentRequest request)
{
    var ctx = User.GetRequestContext();

    // The target service must exist and belong to the caller's org.
    var service = await _serviceRepository.GetByIdAsync(serviceId, ctx.OrgId);
    if (service == null)
        return NotFound(new { message = "Service not found" });

    var incident = new Incident
    {
        Id = Guid.NewGuid(),
        OrgId = ctx.OrgId,
        ServiceId = serviceId,
        Title = request.Title,
        Description = request.Description,
        Status = IncidentStatus.Triggered,
        Version = 1,
        CreatedAt = DateTime.UtcNow
    };
    await _incidentRepository.CreateAsync(incident);

    // The audit trail starts with a "created" entry attributed to the caller.
    await _incidentEventRepository.CreateAsync(new IncidentEvent
    {
        Id = Guid.NewGuid(),
        IncidentId = incident.Id,
        EventType = IncidentEventType.Created,
        ActorUserId = ctx.UserId,
        CreatedAt = DateTime.UtcNow
    });

    // Fan out notifications off the request path via the background job queue.
    _backgroundJobClient.Enqueue<IIncidentTriggeredJob>(j => j.ExecuteAsync(incident.Id));

    // A fresh incident is unassigned and has no lifecycle timestamps yet,
    // hence the null assignee and null ack/mitigate/resolve fields.
    var dto = new IncidentDto(
        incident.Id,
        incident.ServiceId,
        service.Name,
        incident.Title,
        incident.Description,
        incident.Status.ToString().ToLowerInvariant(),
        incident.Version,
        null,
        null,
        incident.CreatedAt,
        null,
        null,
        null);
    return CreatedAtAction(nameof(GetIncident), new { incidentId = incident.Id }, dto);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Returns a single incident (scoped to the caller's org) with its service
/// name and assignee display name resolved for presentation.
/// </summary>
[HttpGet("v1/incidents/{incidentId}")]
public async Task<ActionResult<IncidentDto>> GetIncident(Guid incidentId)
{
    var ctx = User.GetRequestContext();

    var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
    if (incident == null)
        return NotFound();

    var service = await _serviceRepository.GetByIdAsync(incident.ServiceId, ctx.OrgId);

    // Look up the assignee only when someone is actually assigned.
    var assignee = incident.AssignedToUserId is { } assigneeId
        ? await _userRepository.GetByIdAsync(assigneeId)
        : null;

    return new IncidentDto(
        incident.Id,
        incident.ServiceId,
        service?.Name ?? "Unknown",
        incident.Title,
        incident.Description,
        incident.Status.ToString().ToLowerInvariant(),
        incident.Version,
        incident.AssignedToUserId,
        assignee?.DisplayName,
        incident.CreatedAt,
        incident.AcknowledgedAt,
        incident.MitigatedAt,
        incident.ResolvedAt);
}
|
||||||
|
|
||||||
|
/// <summary>
/// Returns the event timeline for an incident, with each entry's acting
/// user resolved to a display name where one exists.
/// </summary>
[HttpGet("v1/incidents/{incidentId}/events")]
public async Task<ActionResult<IReadOnlyList<IncidentEventDto>>> GetIncidentEvents(Guid incidentId)
{
    var ctx = User.GetRequestContext();

    // Existence check doubles as the org-scoping guard.
    var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
    if (incident == null)
        return NotFound();

    var events = await _incidentEventRepository.GetByIncidentIdAsync(incidentId);

    var timeline = new List<IncidentEventDto>();
    foreach (var entry in events)
    {
        // System-generated entries may have no acting user.
        var actor = entry.ActorUserId is { } actorId
            ? await _userRepository.GetByIdAsync(actorId)
            : null;

        timeline.Add(new IncidentEventDto(
            entry.Id,
            entry.EventType.ToString().ToLowerInvariant(),
            entry.ActorUserId,
            actor?.DisplayName,
            entry.Payload,
            entry.CreatedAt));
    }

    return timeline;
}
|
||||||
|
|
||||||
|
/// <summary>
/// Moves an incident along its lifecycle (triggered -> acknowledged ->
/// mitigated -> resolved) using optimistic concurrency: the caller supplies
/// the version it last read, and a stale version yields HTTP 409.
/// </summary>
[HttpPost("v1/incidents/{incidentId}/transition")]
[Authorize(Policy = "Member")]
public async Task<ActionResult<IncidentDto>> TransitionIncident(Guid incidentId, [FromBody] TransitionRequest request)
{
    var ctx = User.GetRequestContext();

    var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
    if (incident == null)
        return NotFound();

    // Guard: a missing/blank action previously hit ToLowerInvariant() on null
    // and surfaced as an unhandled 500; report it as a normal 400 instead.
    if (string.IsNullOrWhiteSpace(request.Action))
        return BadRequest(new { message = "Invalid action" });

    var newStatus = request.Action.ToLowerInvariant() switch
    {
        "ack" or "acknowledge" => IncidentStatus.Acknowledged,
        "mitigate" => IncidentStatus.Mitigated,
        "resolve" => IncidentStatus.Resolved,
        _ => (IncidentStatus?)null
    };

    if (newStatus == null)
        return BadRequest(new { message = "Invalid action" });

    // The lifecycle is strictly linear; a tuple pattern expresses the legal
    // steps without allocating a lookup dictionary on every request.
    var isLegalStep = (incident.Status, newStatus.Value) is
        (IncidentStatus.Triggered, IncidentStatus.Acknowledged) or
        (IncidentStatus.Acknowledged, IncidentStatus.Mitigated) or
        (IncidentStatus.Mitigated, IncidentStatus.Resolved);
    if (!isLegalStep)
    {
        return BadRequest(new { message = $"Cannot transition from {incident.Status} to {newStatus}" });
    }

    // Compare-and-swap on the version column; failure means another caller
    // transitioned the incident first.
    var timestamp = DateTime.UtcNow;
    var success = await _incidentRepository.TransitionAsync(incidentId, ctx.OrgId, request.ExpectedVersion, newStatus.Value, timestamp);
    if (!success)
        return Conflict(new { message = "Concurrent modification detected. Please refresh and try again." });

    var eventType = newStatus.Value switch
    {
        IncidentStatus.Acknowledged => IncidentEventType.Acknowledged,
        IncidentStatus.Mitigated => IncidentEventType.Mitigated,
        IncidentStatus.Resolved => IncidentEventType.Resolved,
        _ => throw new InvalidOperationException()
    };

    // Record the transition in the audit timeline with the same timestamp
    // that was written to the incident row.
    await _incidentEventRepository.CreateAsync(new IncidentEvent
    {
        Id = Guid.NewGuid(),
        IncidentId = incidentId,
        EventType = eventType,
        ActorUserId = ctx.UserId,
        CreatedAt = timestamp
    });

    // Re-read so the response reflects the post-transition state.
    return await GetIncident(incidentId);
}
|
||||||
|
|
||||||
|
/// <summary>Appends a free-text comment to an incident's event timeline.</summary>
[HttpPost("v1/incidents/{incidentId}/comment")]
[Authorize(Policy = "Member")]
public async Task<ActionResult<IncidentEventDto>> AddComment(Guid incidentId, [FromBody] CommentRequest request)
{
    var ctx = User.GetRequestContext();

    var incident = await _incidentRepository.GetByIdAsync(incidentId, ctx.OrgId);
    if (incident == null)
        return NotFound();

    var comment = new IncidentEvent
    {
        Id = Guid.NewGuid(),
        IncidentId = incidentId,
        EventType = IncidentEventType.Comment,
        ActorUserId = ctx.UserId,
        Payload = request.Content,
        CreatedAt = DateTime.UtcNow
    };
    await _incidentEventRepository.CreateAsync(comment);

    // Resolve the author's display name so the UI can render without a refetch.
    var author = await _userRepository.GetByIdAsync(ctx.UserId);

    var dto = new IncidentEventDto(
        comment.Id,
        comment.EventType.ToString().ToLowerInvariant(),
        ctx.UserId,
        author?.DisplayName,
        comment.Payload,
        comment.CreatedAt);
    return CreatedAtAction(nameof(GetIncidentEvents), new { incidentId }, dto);
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,151 @@
|
|||||||
|
using IncidentOps.Api.Auth;
|
||||||
|
using IncidentOps.Contracts.Orgs;
|
||||||
|
using IncidentOps.Contracts.Services;
|
||||||
|
using IncidentOps.Domain.Entities;
|
||||||
|
using IncidentOps.Domain.Enums;
|
||||||
|
using IncidentOps.Infrastructure.Data.Repositories;
|
||||||
|
using Microsoft.AspNetCore.Authorization;
|
||||||
|
using Microsoft.AspNetCore.Mvc;
|
||||||
|
|
||||||
|
namespace IncidentOps.Api.Controllers;
|
||||||
|
|
||||||
|
/// <summary>
/// Endpoints for the caller's active organization: org details, membership,
/// services, and notification targets. All routes require authentication;
/// write and admin routes additionally require the matching policy.
/// </summary>
[ApiController]
[Route("v1/org")]
[Authorize]
public class OrgController : ControllerBase
{
    private readonly IOrgRepository _orgRepository;
    private readonly IOrgMemberRepository _orgMemberRepository;
    private readonly IUserRepository _userRepository;
    private readonly IServiceRepository _serviceRepository;
    private readonly INotificationTargetRepository _notificationTargetRepository;

    public OrgController(
        IOrgRepository orgRepository,
        IOrgMemberRepository orgMemberRepository,
        IUserRepository userRepository,
        IServiceRepository serviceRepository,
        INotificationTargetRepository notificationTargetRepository)
    {
        _orgRepository = orgRepository;
        _orgMemberRepository = orgMemberRepository;
        _userRepository = userRepository;
        _serviceRepository = serviceRepository;
        _notificationTargetRepository = notificationTargetRepository;
    }

    /// <summary>Returns the caller's active organization and their role within it.</summary>
    [HttpGet]
    public async Task<ActionResult<OrgDto>> GetCurrentOrg()
    {
        var ctx = User.GetRequestContext();

        var org = await _orgRepository.GetByIdAsync(ctx.OrgId);
        if (org == null)
            return NotFound();

        return new OrgDto(org.Id, org.Name, org.Slug, ctx.Role.ToString().ToLowerInvariant());
    }

    /// <summary>Lists the organization's members with their user details and roles.</summary>
    [HttpGet("members")]
    [Authorize(Policy = "Admin")]
    public async Task<ActionResult<IReadOnlyList<OrgMemberDto>>> GetMembers()
    {
        var ctx = User.GetRequestContext();
        var members = await _orgMemberRepository.GetByOrgIdAsync(ctx.OrgId);

        // NOTE(review): users are loaded one at a time (N+1 round-trips);
        // acceptable for small orgs — consider a batch lookup on the
        // repository if member counts grow.
        var result = new List<OrgMemberDto>();
        foreach (var member in members)
        {
            var user = await _userRepository.GetByIdAsync(member.UserId);
            if (user == null)
                continue; // Skip memberships whose user record no longer resolves.

            result.Add(new OrgMemberDto(
                member.Id,
                user.Id,
                user.Email,
                user.DisplayName,
                member.Role.ToString().ToLowerInvariant(),
                member.CreatedAt));
        }

        return result;
    }

    /// <summary>Lists the organization's services.</summary>
    [HttpGet("services")]
    public async Task<ActionResult<IReadOnlyList<ServiceDto>>> GetServices()
    {
        var ctx = User.GetRequestContext();
        var services = await _serviceRepository.GetByOrgIdAsync(ctx.OrgId);

        var dtos = services
            .Select(s => new ServiceDto(s.Id, s.Name, s.Slug, s.Description, s.CreatedAt))
            .ToList();
        return dtos;
    }

    /// <summary>Creates a service within the caller's organization.</summary>
    [HttpPost("services")]
    [Authorize(Policy = "Member")]
    public async Task<ActionResult<ServiceDto>> CreateService([FromBody] CreateServiceRequest request)
    {
        var ctx = User.GetRequestContext();

        var service = new Service
        {
            Id = Guid.NewGuid(),
            OrgId = ctx.OrgId,
            Name = request.Name,
            Slug = request.Slug,
            Description = request.Description,
            CreatedAt = DateTime.UtcNow
        };
        await _serviceRepository.CreateAsync(service);

        var dto = new ServiceDto(service.Id, service.Name, service.Slug, service.Description, service.CreatedAt);
        return CreatedAtAction(nameof(GetServices), dto);
    }

    /// <summary>Lists the organization's notification targets (admin only).</summary>
    [HttpGet("notification-targets")]
    [Authorize(Policy = "Admin")]
    public async Task<ActionResult<IReadOnlyList<NotificationTargetDto>>> GetNotificationTargets()
    {
        var ctx = User.GetRequestContext();
        var targets = await _notificationTargetRepository.GetByOrgIdAsync(ctx.OrgId);

        var dtos = targets
            .Select(t => new NotificationTargetDto(
                t.Id,
                t.Name,
                t.TargetType.ToString().ToLowerInvariant(),
                t.Configuration,
                t.IsEnabled,
                t.CreatedAt))
            .ToList();
        return dtos;
    }

    /// <summary>Creates a notification target for the organization (admin only).</summary>
    [HttpPost("notification-targets")]
    [Authorize(Policy = "Admin")]
    public async Task<ActionResult<NotificationTargetDto>> CreateNotificationTarget([FromBody] CreateNotificationTargetRequest request)
    {
        var ctx = User.GetRequestContext();

        // Reject unknown delivery-channel names up front (case-insensitive match).
        if (!Enum.TryParse<NotificationTargetType>(request.TargetType, ignoreCase: true, out var targetType))
            return BadRequest(new { message = "Invalid target type" });

        var target = new NotificationTarget
        {
            Id = Guid.NewGuid(),
            OrgId = ctx.OrgId,
            Name = request.Name,
            TargetType = targetType,
            Configuration = request.Configuration,
            IsEnabled = request.IsEnabled,
            CreatedAt = DateTime.UtcNow
        };
        await _notificationTargetRepository.CreateAsync(target);

        var dto = new NotificationTargetDto(
            target.Id,
            target.Name,
            target.TargetType.ToString().ToLowerInvariant(),
            target.Configuration,
            target.IsEnabled,
            target.CreatedAt);
        return CreatedAtAction(nameof(GetNotificationTargets), dto);
    }
}
|
||||||
@@ -0,0 +1,23 @@
|
|||||||
|
FROM mcr.microsoft.com/dotnet/sdk:10.0 AS build
WORKDIR /src

# Copy only the project files first so the restore layer is cached until a
# package reference actually changes.
COPY src/IncidentOps.Contracts/IncidentOps.Contracts.csproj src/IncidentOps.Contracts/
COPY src/IncidentOps.Domain/IncidentOps.Domain.csproj src/IncidentOps.Domain/
COPY src/IncidentOps.Infrastructure/IncidentOps.Infrastructure.csproj src/IncidentOps.Infrastructure/
COPY src/IncidentOps.Api/IncidentOps.Api.csproj src/IncidentOps.Api/
RUN dotnet restore src/IncidentOps.Api/IncidentOps.Api.csproj

# Copy the full source and publish without repeating the restore.
COPY src/ src/
WORKDIR /src/src/IncidentOps.Api
RUN dotnet publish -c Release -o /app --no-restore

FROM mcr.microsoft.com/dotnet/aspnet:10.0 AS runtime
WORKDIR /app
COPY --from=build /app .

# Port 8080 is the non-privileged default for recent .NET images.
ENV ASPNETCORE_URLS=http://+:8080
EXPOSE 8080

# Fix: don't run the service as root. The aspnet base image ships a
# non-root 'app' user for exactly this purpose.
USER app

ENTRYPOINT ["dotnet", "IncidentOps.Api.dll"]
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
<Project Sdk="Microsoft.NET.Sdk.Web">

  <PropertyGroup>
    <TargetFramework>net10.0</TargetFramework>
    <Nullable>enable</Nullable>
    <ImplicitUsings>enable</ImplicitUsings>
    <AllowMissingPrunePackageData>true</AllowMissingPrunePackageData>
  </PropertyGroup>

  <ItemGroup>
    <!-- Database schema migrations -->
    <PackageReference Include="FluentMigrator.Runner" Version="7.2.0" />
    <PackageReference Include="FluentMigrator.Runner.Postgres" Version="7.2.0" />
    <!-- Background jobs (client side; the Worker hosts the processing server) -->
    <PackageReference Include="Hangfire.AspNetCore" Version="1.8.22" />
    <PackageReference Include="Hangfire.Core" Version="1.8.22" />
    <PackageReference Include="Hangfire.Redis.StackExchange" Version="1.12.0" />
    <!-- Auth, OpenAPI, and data stores -->
    <PackageReference Include="Microsoft.AspNetCore.Authentication.JwtBearer" Version="10.0.1" />
    <PackageReference Include="Microsoft.AspNetCore.OpenApi" Version="10.0.0" />
    <PackageReference Include="Npgsql" Version="10.0.1" />
    <PackageReference Include="StackExchange.Redis" Version="2.10.1" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\IncidentOps.Infrastructure\IncidentOps.Infrastructure.csproj" />
    <ProjectReference Include="..\IncidentOps.Domain\IncidentOps.Domain.csproj" />
    <ProjectReference Include="..\IncidentOps.Contracts\IncidentOps.Contracts.csproj" />
  </ItemGroup>

</Project>
|
||||||
@@ -0,0 +1,108 @@
|
|||||||
|
using System.Text;
using FluentMigrator.Runner;
using Hangfire;
using Hangfire.Redis.StackExchange;
using IncidentOps.Api.Auth;
using IncidentOps.Infrastructure;
using IncidentOps.Infrastructure.Auth;
using IncidentOps.Infrastructure.Migrations;
using Microsoft.AspNetCore.Authentication.JwtBearer;
using Microsoft.AspNetCore.Authorization;
using Microsoft.IdentityModel.Tokens;
using StackExchange.Redis;

var builder = WebApplication.CreateBuilder(args);

// MVC controllers + OpenAPI surface.
builder.Services.AddControllers();
builder.Services.AddEndpointsApiExplorer();
builder.Services.AddOpenApi();

// JWT settings — the signing key is mandatory; everything else has a default.
var jwtSettings = new JwtSettings
{
    Issuer = builder.Configuration["Jwt:Issuer"] ?? "incidentops",
    Audience = builder.Configuration["Jwt:Audience"] ?? "incidentops",
    SigningKey = builder.Configuration["Jwt:SigningKey"] ?? throw new InvalidOperationException("JWT signing key not configured"),
    AccessTokenExpirationMinutes = builder.Configuration.GetValue<int>("Jwt:AccessTokenExpirationMinutes", 15),
    RefreshTokenExpirationDays = builder.Configuration.GetValue<int>("Jwt:RefreshTokenExpirationDays", 7)
};

// Data access and domain services.
var connectionString = builder.Configuration.GetConnectionString("Postgres")
    ?? throw new InvalidOperationException("Postgres connection string not configured");
builder.Services.AddInfrastructure(connectionString, jwtSettings);

// FluentMigrator — migrations are applied at startup, below.
builder.Services.AddFluentMigratorCore()
    .ConfigureRunner(runner => runner
        .AddPostgres()
        .WithGlobalConnectionString(connectionString)
        .ScanIn(typeof(Migration0001_InitialSchema).Assembly).For.Migrations())
    .AddLogging(logging => logging.AddFluentMigratorConsole());

// Bearer-token authentication.
builder.Services.AddAuthentication(JwtBearerDefaults.AuthenticationScheme)
    .AddJwtBearer(options =>
    {
        options.TokenValidationParameters = new TokenValidationParameters
        {
            ValidateIssuer = true,
            ValidateAudience = true,
            ValidateLifetime = true,
            ValidateIssuerSigningKey = true,
            ValidIssuer = jwtSettings.Issuer,
            ValidAudience = jwtSettings.Audience,
            IssuerSigningKey = new SymmetricSecurityKey(Encoding.UTF8.GetBytes(jwtSettings.SigningKey))
        };
    });

// Role-based authorization policies.
builder.Services.AddSingleton<IAuthorizationHandler, RoleRequirementHandler>();
builder.Services.AddAuthorizationBuilder()
    .AddPolicy("Viewer", policy => policy.Requirements.Add(new RoleRequirement(IncidentOps.Domain.Enums.OrgRole.Viewer)))
    .AddPolicy("Member", policy => policy.Requirements.Add(new RoleRequirement(IncidentOps.Domain.Enums.OrgRole.Member)))
    .AddPolicy("Admin", policy => policy.Requirements.Add(new RoleRequirement(IncidentOps.Domain.Enums.OrgRole.Admin)));

// Hangfire client only — the processing server runs in the Worker host.
var redisConnectionString = builder.Configuration["Redis:ConnectionString"]
    ?? throw new InvalidOperationException("Redis connection string not configured");
builder.Services.AddHangfire(config => config
    .SetDataCompatibilityLevel(CompatibilityLevel.Version_180)
    .UseSimpleAssemblyNameTypeSerializer()
    .UseRecommendedSerializerSettings()
    .UseRedisStorage(ConnectionMultiplexer.Connect(redisConnectionString)));

// CORS for the browser frontend.
builder.Services.AddCors(options =>
{
    options.AddDefaultPolicy(policy =>
    {
        var origins = builder.Configuration.GetSection("Cors:Origins").Get<string[]>() ?? ["http://localhost:3000"];
        policy.WithOrigins(origins)
            .AllowAnyHeader()
            .AllowAnyMethod()
            .AllowCredentials();
    });
});

var app = builder.Build();

// Bring the database schema up to date before serving traffic.
using (var scope = app.Services.CreateScope())
{
    scope.ServiceProvider.GetRequiredService<IMigrationRunner>().MigrateUp();
}

if (app.Environment.IsDevelopment())
{
    app.MapOpenApi();
}

// CORS must run before auth so preflight requests succeed unauthenticated.
app.UseCors();
app.UseAuthentication();
app.UseAuthorization();
app.MapControllers();

app.Run();
|
||||||
@@ -0,0 +1,5 @@
|
|||||||
|
namespace IncidentOps.Contracts.Auth;

/// <summary>Issued token pair plus the organization the session is scoped to.</summary>
public record AuthResponse(string AccessToken, string RefreshToken, ActiveOrgDto ActiveOrg);

/// <summary>The caller's currently active organization and their role within it.</summary>
public record ActiveOrgDto(Guid Id, string Name, string Slug, string Role);
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
namespace IncidentOps.Contracts.Auth;

/// <summary>Credentials for a login attempt; OrgId optionally pins the session to a specific org.</summary>
public record LoginRequest(string Email, string Password, Guid? OrgId = null);
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
namespace IncidentOps.Contracts.Auth;

/// <summary>Identifies the refresh token to revoke on logout.</summary>
public record LogoutRequest(string RefreshToken);
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
namespace IncidentOps.Contracts.Auth;

/// <summary>The authenticated user's profile plus their active organization.</summary>
public record MeResponse(Guid Id, string Email, string DisplayName, ActiveOrgDto ActiveOrg);
|
||||||
@@ -0,0 +1,3 @@
|
|||||||
|
namespace IncidentOps.Contracts.Auth;

/// <summary>Carries the refresh token to exchange for a new access token.</summary>
public record RefreshRequest(string RefreshToken);
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user