From 49ec9cd99730e5bd230ab5f752ee5183bbe7108d Mon Sep 17 00:00:00 2001 From: minhtrannhat Date: Sun, 15 Dec 2024 12:00:00 -0500 Subject: [PATCH] chore: initialize project structure and specification --- .gitignore | 60 +++++ IncidentOps.slnx | 9 + docs/specs.md | 657 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 726 insertions(+) create mode 100644 .gitignore create mode 100644 IncidentOps.slnx create mode 100644 docs/specs.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..961ba3e --- /dev/null +++ b/.gitignore @@ -0,0 +1,60 @@ +# .NET +bin/ +obj/ +*.user +*.suo +*.userosscache +*.sln.docstates +*.userprefs +.vs/ + +# Build results +[Dd]ebug/ +[Rr]elease/ +x64/ +x86/ +[Aa][Rr][Mm]/ +[Aa][Rr][Mm]64/ +bld/ +[Bb]in/ +[Oo]bj/ +[Ll]og/ +[Ll]ogs/ + +# NuGet +*.nupkg +*.snupkg +.nuget/ +packages/ + +# Node.js +node_modules/ +.next/ +out/ +.npm/ + +# IDE +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Environment +.env +.env.local +.env.*.local +appsettings.Local.json +appsettings.*.Local.json + +# Helm +helm/incidentops/charts/ + +# Docker +.docker/ + +# Kubernetes +*.kubeconfig diff --git a/IncidentOps.slnx b/IncidentOps.slnx new file mode 100644 index 0000000..b703e48 --- /dev/null +++ b/IncidentOps.slnx @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/docs/specs.md b/docs/specs.md new file mode 100644 index 0000000..e4e98a0 --- /dev/null +++ b/docs/specs.md @@ -0,0 +1,657 @@ +# IncidentOps Specification + +A multi-tenant incident management system with implicit active-org context from JWT. 
+ +--- + +## Project Structure + +``` +incidentops/ +├── IncidentOps.slnx +├── docker-compose.yml +├── skaffold.yaml +├── .gitignore +│ +├── src/ +│ ├── IncidentOps.Api/ # ASP.NET Core REST API +│ │ ├── Auth/ +│ │ │ ├── ClaimsPrincipalExtensions.cs +│ │ │ ├── RequestContext.cs +│ │ │ └── RoleRequirement.cs +│ │ ├── Controllers/ +│ │ │ ├── AuthController.cs +│ │ │ ├── HealthController.cs +│ │ │ ├── IncidentsController.cs +│ │ │ └── OrgController.cs +│ │ ├── Dockerfile +│ │ ├── Program.cs +│ │ ├── appsettings.json +│ │ └── appsettings.Development.json +│ │ +│ ├── IncidentOps.Worker/ # Hangfire Worker Service +│ │ ├── Jobs/ +│ │ │ ├── EscalateIfUnackedJob.cs +│ │ │ ├── IncidentTriggeredJob.cs +│ │ │ └── SendWebhookNotificationJob.cs +│ │ ├── Dockerfile +│ │ ├── Program.cs +│ │ └── appsettings.json +│ │ +│ ├── IncidentOps.Domain/ # Domain Entities & Enums +│ │ ├── Entities/ +│ │ │ ├── Incident.cs +│ │ │ ├── IncidentEvent.cs +│ │ │ ├── NotificationAttempt.cs +│ │ │ ├── NotificationTarget.cs +│ │ │ ├── Org.cs +│ │ │ ├── OrgMember.cs +│ │ │ ├── RefreshToken.cs +│ │ │ ├── Service.cs +│ │ │ └── User.cs +│ │ └── Enums/ +│ │ ├── IncidentEventType.cs +│ │ ├── IncidentStatus.cs +│ │ ├── NotificationTargetType.cs +│ │ └── OrgRole.cs +│ │ +│ ├── IncidentOps.Infrastructure/ # Data Access & Services +│ │ ├── Auth/ +│ │ │ ├── IPasswordService.cs +│ │ │ ├── ITokenService.cs +│ │ │ └── JwtSettings.cs +│ │ ├── Data/ +│ │ │ ├── DbConnectionFactory.cs +│ │ │ └── Repositories/ +│ │ │ ├── IIncidentEventRepository.cs +│ │ │ ├── IIncidentRepository.cs +│ │ │ ├── INotificationTargetRepository.cs +│ │ │ ├── IOrgMemberRepository.cs +│ │ │ ├── IOrgRepository.cs +│ │ │ ├── IRefreshTokenRepository.cs +│ │ │ ├── IServiceRepository.cs +│ │ │ └── IUserRepository.cs +│ │ ├── Jobs/ +│ │ │ ├── IEscalateIfUnackedJob.cs +│ │ │ ├── IIncidentTriggeredJob.cs +│ │ │ └── ISendWebhookNotificationJob.cs +│ │ ├── Migrations/ +│ │ │ ├── Migration0001_InitialSchema.cs +│ │ │ ├── Migration0002_RefreshTokens.cs +│ │ │
└── Migration0003_NotificationTargets.cs +│ │ └── ServiceCollectionExtensions.cs +│ │ +│ └── IncidentOps.Contracts/ # DTOs / API Contracts +│ ├── Auth/ +│ │ ├── AuthResponse.cs +│ │ ├── LoginRequest.cs +│ │ ├── LogoutRequest.cs +│ │ ├── MeResponse.cs +│ │ ├── RefreshRequest.cs +│ │ ├── RegisterRequest.cs +│ │ └── SwitchOrgRequest.cs +│ ├── Incidents/ +│ │ ├── CommentRequest.cs +│ │ ├── CreateIncidentRequest.cs +│ │ ├── IncidentDto.cs +│ │ ├── IncidentEventDto.cs +│ │ ├── IncidentListResponse.cs +│ │ └── TransitionRequest.cs +│ ├── Orgs/ +│ │ ├── CreateNotificationTargetRequest.cs +│ │ ├── NotificationTargetDto.cs +│ │ ├── OrgDto.cs +│ │ └── OrgMemberDto.cs +│ └── Services/ +│ ├── CreateServiceRequest.cs +│ └── ServiceDto.cs +│ +├── web/ # Next.js Frontend +│ ├── app/ +│ │ ├── dashboard/page.tsx +│ │ ├── login/page.tsx +│ │ ├── register/page.tsx +│ │ ├── layout.tsx +│ │ ├── page.tsx +│ │ └── globals.css +│ ├── lib/ +│ │ └── api.ts +│ ├── types/ +│ │ └── index.ts +│ ├── Dockerfile +│ ├── package.json +│ ├── tsconfig.json +│ └── next.config.js +│ +├── helm/incidentops/ # Helm Chart +│ ├── Chart.yaml +│ ├── values.yaml +│ └── templates/ +│ ├── _helpers.tpl +│ ├── api-deployment.yaml +│ ├── api-service.yaml +│ ├── worker-deployment.yaml +│ ├── web-deployment.yaml +│ ├── web-service.yaml +│ ├── ingress.yaml +│ └── secrets.yaml +│ +└── docs/ + └── specs.md +``` + +--- + +## 1. Architecture (microservices-lite) + +### Deployables + +1. **api-service** (.NET 10, ASP.NET Core) + - REST API (implicit org scope from JWT) + - JWT access + refresh (both returned in JSON) + - RBAC enforced using `org_role` claim + DB ownership checks + - Writes incidents + timeline events + - Enqueues background jobs to Hangfire + +2. **worker-service** (.NET 10 Worker Service) + - Runs **Hangfire Server** using Redis storage + - Executes jobs: notification send, escalation checks, rollups + - Writes notification attempts and system events + +3. 
**web** (Next.js 14 + TypeScript) + - Auth pages + dashboard + incident detail + +### Dependencies (in kind via Helm) +- PostgreSQL (Bitnami) +- Redis (Bitnami) - Hangfire storage +- ingress-nginx +- (later) Prometheus/Grafana/OTel + +--- + +## 2. Auth Model (active org in JWT, implicit org scope) + +### JWT Access Token Claims +| Claim | Description | +|-------|-------------| +| `sub` | userId (uuid) | +| `org_id` | activeOrgId (uuid) | +| `org_role` | `admin\|member\|viewer` | +| `iss` | Issuer | +| `aud` | Audience | +| `iat` | Issued at | +| `exp` | Expiration | +| `jti` | (optional) Token ID | + +### Refresh Token Model (JSON, not cookie) +- Random opaque token returned in JSON +- Stored hashed in DB +- Rotated on refresh and switch-org +- Refresh token row stores `active_org_id` (per-session org selection) + +### DB: `refresh_tokens` +```sql +id uuid PRIMARY KEY +user_id uuid NOT NULL +token_hash text NOT NULL UNIQUE +active_org_id uuid NOT NULL +expires_at timestamptz NOT NULL +revoked_at timestamptz NULL +created_at timestamptz NOT NULL +``` + +### Auth Endpoints +| Method | Endpoint | Description | +|--------|----------|-------------| +| POST | `/v1/auth/register` | Create user + default org | +| POST | `/v1/auth/login` | Authenticate, return tokens | +| POST | `/v1/auth/refresh` | Rotate refresh token | +| POST | `/v1/auth/switch-org` | Switch active org context | +| POST | `/v1/auth/logout` | Revoke refresh token | + +#### Registration Flow +On `POST /v1/auth/register { email, password, displayName }`: +1. Create user record +2. Create a default org automatically, named after the user's `displayName` (e.g., "John Doe's Org") +3. Create org_member with role=`admin` +4. Return access + refresh tokens + +--- + +## 3.
Authorization Rules (implicit org scope) + +### Request Context +Middleware extracts from JWT: +- `UserId` from `sub` +- `OrgId` from `org_id` +- `Role` from `org_role` + +### Authorization Approach +- **Role check**: enforce viewer/member/admin by claim +- **Ownership check**: for any resource ID in the path, load its `org_id` from the DB and require that it equals the token's `org_id` + - Prevents cross-tenant IDOR even though org isn't in the URL + +### Role Permissions +| Role | Permissions | +|------|-------------| +| viewer | Read-only access | +| member | Create incidents, transitions, comments | +| admin | Manage members, notification targets, on-call schedules | + +--- + +## 4. API Surface (implicit org in JWT) + +All routes are under `/v1`, except the health probes (`/healthz`, `/readyz`), which are served at the root. Unless noted, routes require auth. + +### Auth +| Method | Endpoint | Auth | Description | +|--------|----------|------|-------------| +| POST | `/auth/register` | No | Register new user | +| POST | `/auth/login` | No | Login | +| POST | `/auth/refresh` | No | Refresh tokens | +| POST | `/auth/switch-org` | No | Switch org context | +| POST | `/auth/logout` | No | Logout | +| GET | `/me` | Yes | Get current user info | + +### Org (current org context) +| Method | Endpoint | Role | Description | +|--------|----------|------|-------------| +| GET | `/org` | viewer+ | Current org summary + role | +| GET | `/org/members` | admin | List org members | +| POST | `/org/members` | admin | Invite/add member (stretch) | +| GET | `/org/services` | viewer+ | List services | +| POST | `/org/services` | member+ | Create service | +| GET | `/org/notification-targets` | admin | List notification targets | +| POST | `/org/notification-targets` | admin | Create notification target | + +### Incidents +| Method | Endpoint | Role | Description | +|--------|----------|------|-------------| +| GET | `/incidents` | viewer+ | List incidents (cursor pagination) | +| POST | `/services/{serviceId}/incidents` | member+ | Create incident | +| GET |
`/incidents/{incidentId}` | viewer+ | Get incident detail | +| GET | `/incidents/{incidentId}/events` | viewer+ | Get incident timeline | +| POST | `/incidents/{incidentId}/transition` | member+ | Transition incident state | +| POST | `/incidents/{incidentId}/comment` | member+ | Add comment | + +### Health +| Method | Endpoint | Description | +|--------|----------|-------------| +| GET | `/healthz` | Liveness probe | +| GET | `/readyz` | Readiness probe (checks Postgres + Redis) | + +--- + +## 5. Domain Workflows + +### Incident State Machine +``` +Triggered → Acknowledged → Mitigated → Resolved +``` + +### Enforcement +- Application-level validation (allowed transitions) +- DB optimistic concurrency using `incidents.version` + +### Transition Write Pattern +```sql +UPDATE incidents +SET status = @newStatus, version = version + 1, updated_at = NOW() +WHERE id = @id AND org_id = @orgId AND version = @expectedVersion +``` +- If 0 rows updated → `409 Conflict` (stale client) or `404` if not found in org + +### Timeline Model +Append-only `incident_events` records for: +- Incident created +- Transitions (ack, mitigate, resolve) +- Comments +- Notifications sent/failed +- Escalations triggered + +`actor_user_id` is null for system/worker actions. + +--- + +## 6. 
PostgreSQL Schema (core tables) + +### Users +```sql +CREATE TABLE users ( + id uuid PRIMARY KEY, + email text NOT NULL UNIQUE, + password_hash text NOT NULL, + display_name text NOT NULL, + created_at timestamptz NOT NULL DEFAULT NOW() +); +``` + +### Orgs +```sql +CREATE TABLE orgs ( + id uuid PRIMARY KEY, + name text NOT NULL, + slug text NOT NULL UNIQUE, + created_at timestamptz NOT NULL DEFAULT NOW() +); +``` + +### Org Members +```sql +CREATE TABLE org_members ( + id uuid PRIMARY KEY, + org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE, + user_id uuid NOT NULL REFERENCES users(id) ON DELETE CASCADE, + role text NOT NULL CHECK (role IN ('admin', 'member', 'viewer')), + created_at timestamptz NOT NULL DEFAULT NOW(), + UNIQUE(org_id, user_id) +); +``` + +### Services +```sql +CREATE TABLE services ( + id uuid PRIMARY KEY, + org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE, + name text NOT NULL, + slug text NOT NULL, + description text, + created_at timestamptz NOT NULL DEFAULT NOW(), + UNIQUE(org_id, slug) +); +``` + +### Incidents +```sql +CREATE TABLE incidents ( + id uuid PRIMARY KEY, + org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE, + service_id uuid NOT NULL REFERENCES services(id) ON DELETE CASCADE, + title text NOT NULL, + description text, + status text NOT NULL DEFAULT 'triggered' + CHECK (status IN ('triggered', 'acknowledged', 'mitigated', 'resolved')), + severity text NOT NULL DEFAULT 'sev3' + CHECK (severity IN ('sev1', 'sev2', 'sev3', 'sev4')), + version integer NOT NULL DEFAULT 1, + created_at timestamptz NOT NULL DEFAULT NOW(), + updated_at timestamptz +); +CREATE INDEX idx_incidents_org_status ON incidents(org_id, status); +``` + +### Incident Events +```sql +CREATE TABLE incident_events ( + id uuid PRIMARY KEY, + incident_id uuid NOT NULL REFERENCES incidents(id) ON DELETE CASCADE, + event_type text NOT NULL, + actor_user_id uuid REFERENCES users(id), + payload jsonb, + created_at timestamptz NOT NULL DEFAULT 
NOW() +); +CREATE INDEX idx_incident_events_incident ON incident_events(incident_id, created_at); +``` + +### Notification Targets +```sql +CREATE TABLE notification_targets ( + id uuid PRIMARY KEY, + org_id uuid NOT NULL REFERENCES orgs(id) ON DELETE CASCADE, + name text NOT NULL, + target_type text NOT NULL CHECK (target_type IN ('webhook', 'email', 'slack')), + configuration text NOT NULL, + is_enabled boolean NOT NULL DEFAULT true, + created_at timestamptz NOT NULL DEFAULT NOW(), + updated_at timestamptz +); +``` + +### Notification Attempts +```sql +CREATE TABLE notification_attempts ( + id uuid PRIMARY KEY, + incident_id uuid NOT NULL REFERENCES incidents(id) ON DELETE CASCADE, + target_id uuid NOT NULL REFERENCES notification_targets(id) ON DELETE CASCADE, + success boolean NOT NULL, + error_message text, + attempt_number integer NOT NULL DEFAULT 1, + created_at timestamptz NOT NULL DEFAULT NOW(), + UNIQUE(incident_id, target_id) +); +``` + +### Refresh Tokens +```sql +CREATE TABLE refresh_tokens ( + id uuid PRIMARY KEY, + user_id uuid NOT NULL REFERENCES users(id) ON DELETE CASCADE, + token_hash text NOT NULL UNIQUE, + active_org_id uuid NOT NULL REFERENCES orgs(id), + expires_at timestamptz NOT NULL, + revoked_at timestamptz, + created_at timestamptz NOT NULL DEFAULT NOW() +); +CREATE INDEX idx_refresh_tokens_user ON refresh_tokens(user_id); +``` + +--- + +## 7. Data Access (Dapper) and Migrations (FluentMigrator) + +### Dapper Conventions +- Repositories receive `OrgId` as an explicit parameter and include it in WHERE clauses +- Keep SQL close to repositories (or separate `.sql` files) +- Use `NpgsqlConnection` + `IDbTransaction` for multi-statement operations + +### FluentMigrator +| Migration | Tables | +|-----------|--------| +| 0001 | users, orgs, org_members, services, incidents, incident_events | +| 0002 | refresh_tokens | +| 0003 | notification_targets, notification_attempts | + +--- + +## 8. 
Hangfire Job Design (Redis storage) + +### Setup +- API configures Hangfire Client (enqueue) +- Worker hosts Hangfire Server (process) + +### Queues +| Queue | Purpose | +|-------|---------| +| critical | Escalations | +| default | Notifications | +| low | Rollups | + +### Jobs + +#### 1. IncidentTriggeredJob(incidentId) +- Reads the incident; the org scope is taken from the incident row's `org_id` +- Loads enabled notification targets for the org +- Inserts `notification_attempts` rows (idempotent via `UNIQUE(incident_id, target_id)`) +- Enqueues per-target send jobs + +#### 2. SendWebhookNotificationJob(incidentId, targetId) +- Attempts HTTP POST with incident summary payload +- Updates the attempt row (`success`, `error_message`) + writes `incident_event` of type `system.notification_sent` or `system.notification_failed` +- Throws on transient failures to trigger retry; safe due to DB idempotency + +#### 3. EscalateIfUnackedJob(incidentId, step) (stretch) +- Runs delayed +- Checks status; if still Triggered, sends secondary notifications + +### Operational Note +- Expose Hangfire Dashboard **only in local environments** and protect it (basic auth or require a dev token) + +--- + +## 9. Kubernetes (kind) + Helm + Skaffold (local-only) + +### Helm Umbrella Chart Deploys +- bitnami/postgresql +- bitnami/redis +- api Deployment/Service +- worker Deployment +- web Deployment/Service +- Ingress with host `incidentops.local`: + - `/api`, `/v1`, `/healthz`, `/readyz` → api-service + - `/` → web + +### Configuration via Environment +| Variable | Description | +|----------|-------------| +| `ConnectionStrings__Postgres` | PostgreSQL connection string | +| `Redis__ConnectionString` | Redis connection string | +| `Jwt__Issuer` | JWT issuer | +| `Jwt__Audience` | JWT audience | +| `Jwt__SigningKey` | JWT signing key (secret) | + +### Readiness +- API checks Postgres + Redis +- Worker checks Postgres + Redis at startup + +### Skaffold +- Builds three images (api, worker, web) +- `helm upgrade --install` on changes + +--- + +## 10.
Frontend UX Requirements (implicit org) + +- On login, display `activeOrg` from response +- Org switcher calls `/v1/auth/switch-org` and replaces tokens +- All subsequent API calls use only `Authorization` header; no orgId params +- Store tokens in localStorage or secure cookie +- Handle 401 by attempting token refresh + +--- + +## 11. Key Highlights (README/Resume) + +- "Multi-tenant org context embedded in JWT; org switching re-issues tokens." +- "DB ownership checks prevent cross-tenant resource access." +- "Optimistic concurrency for incident transitions." +- "Background jobs with retries + idempotent notification attempts." +- "Deployed locally to Kubernetes via Helm + Skaffold." + +--- + +## 12. Technology Stack + +| Layer | Technology | +|-------|------------| +| Runtime | .NET 10 | +| API Framework | ASP.NET Core | +| Worker | .NET Worker Service | +| Background Jobs | Hangfire with Redis | +| Database | PostgreSQL | +| ORM | Dapper | +| Migrations | FluentMigrator | +| Auth | JWT Bearer + BCrypt | +| Frontend | Next.js 14 + TypeScript | +| Container | Docker | +| Orchestration | Kubernetes (kind) | +| Deployment | Helm + Skaffold | + +--- + +## 13. Local Development + +### Prerequisites +- .NET 10 SDK +- Node.js 20+ +- Docker +- kind (Kubernetes in Docker) +- Helm +- Skaffold + +### Quick Start + +```bash +# With Docker Compose (simplest) +docker-compose up -d + +# Run API +cd src/IncidentOps.Api +dotnet run + +# Run Worker (separate terminal) +cd src/IncidentOps.Worker +dotnet run + +# Run Web (separate terminal) +cd web +npm install +npm run dev +``` + +### With Kubernetes (kind) + +```bash +# Create cluster +kind create cluster --name incidentops + +# Deploy with Skaffold +skaffold dev + +# Access at http://incidentops.local (add to /etc/hosts) +``` + +--- + +## 14. 
API Request/Response Examples + +### Register +```http +POST /v1/auth/register +Content-Type: application/json + +{ + "email": "user@example.com", + "password": "SecurePass123!", + "displayName": "John Doe" +} +``` + +Response: +```json +{ + "accessToken": "eyJhbG...", + "refreshToken": "a1b2c3d4...", + "activeOrg": { + "id": "uuid", + "name": "John Doe's Org", + "slug": "org-abc123", + "role": "admin" + } +} +``` + +### Create Incident +```http +POST /v1/services/{serviceId}/incidents +Authorization: Bearer {accessToken} +Content-Type: application/json + +{ + "title": "Database connection timeout", + "description": "Users experiencing slow queries", + "severity": "sev2" +} +``` + +### Transition Incident +```http +POST /v1/incidents/{incidentId}/transition +Authorization: Bearer {accessToken} +Content-Type: application/json + +{ + "action": "ack", + "expectedVersion": 1 +} +```