From 3c45b48bd5d0d99f76d2063504adaa7b312b85bc Mon Sep 17 00:00:00 2001 From: Eugene Blikh Date: Sat, 25 Apr 2026 22:54:21 +0300 Subject: [PATCH] feat(http): chi server with middleware stack + RFC 7807 problem renderer --- docs/tasks/lethe-server.md | 5 +- go.sum | 4 + internal/deps/deps.go | 10 +- internal/domain/ingest/handler.go | 22 ++ internal/domain/session/handler.go | 22 ++ internal/pkg/apierror/apierror.go | 112 ++++++++++ internal/pkg/apierror/apierror_test.go | 152 ++++++++++++++ internal/pkg/httputil/httputil.go | 89 ++++++++ internal/server/auth/authenticator.go | 22 ++ internal/server/server.go | 279 +++++++++++++++++++++++++ internal/server/server_test.go | 253 ++++++++++++++++++++++ 11 files changed, 964 insertions(+), 6 deletions(-) create mode 100644 internal/domain/ingest/handler.go create mode 100644 internal/domain/session/handler.go create mode 100644 internal/pkg/apierror/apierror.go create mode 100644 internal/pkg/apierror/apierror_test.go create mode 100644 internal/pkg/httputil/httputil.go create mode 100644 internal/server/auth/authenticator.go create mode 100644 internal/server/server.go create mode 100644 internal/server/server_test.go diff --git a/docs/tasks/lethe-server.md b/docs/tasks/lethe-server.md index fef72e88ad9c13f0bfea72603c084f7e2260cb5f..4c50b60ec37ace359069ed431eb8db55c32cb5b4 100644 --- a/docs/tasks/lethe-server.md +++ b/docs/tasks/lethe-server.md @@ -406,6 +406,9 @@ Greenfield — no compat surface. Wire format is the only forward-compat concern ### Notes carried forward -- Phase 3 should add `migrate-up`, `migrate-down`, `migrate-create` to the Justfile alongside the migration runner so the targets aren't dead. +- Phase 3 should add `migrate-up`, `migrate-down`, `migrate-create` to the Justfile alongside the migration runner so the targets aren't dead. (Done in Phase 3.) - Each phase from 2 onward must remove the dep it adopts from `internal/deps/deps.go`; Phase 9 deletes the file. - README's Caddy/Authelia snippets use `auth.example.com` placeholders; replace with phoebe-specific values when the production deploy lands (out of scope for this task). +- **Phase 4 finding (steward unwind gap)**: `steward.Manager.Init` returns on the first failing `CallInit` and does **not** iterate back over previously-initialized assets to call `Destroy`. The canary test `TestStewardUnwindsOnInitFailure` (in `internal/platform/health/steward_unwind_test.go`) is intentionally red on master to document this. **Phase 9 main must compensate**: track each component as it init's and, on `Init` error, walk the list in reverse calling `Destroy` directly on each (don't try `mgr.Stop`/`mgr.Destroy` — those panic unless the manager has reached Started). Once Phase 9 lands the explicit unwind, either delete the canary test or convert it to assert the new compensating behavior. +- **Phase 5 consistency fix (folded into commit `1b215bc` via amend)**: chi's default 404/405 handlers wrote text/plain, violating the invariant "errors leaving any HTTP handler are rendered as RFC 7807". Added explicit `chi.Router.NotFound`/`MethodNotAllowed` handlers that call `apierror.Render` with `NOT_FOUND` / `METHOD_NOT_ALLOWED` codes. Added `METHOD_NOT_ALLOWED → 405` entry to the apierror code-status map. Added two regression tests (`TestRouter_NotFoundReturnsProblemJSON`, `TestRouter_MethodNotAllowedReturnsProblemJSON`). +- **Phase 3 → Phase 7 contract pin**: `INSERT … ON CONFLICT … DO UPDATE` fires the UPDATE trigger on SQLite (verified by `TestUpsertFiresUpdateTriggerAndKeepsFTSCoherent`). Regular FTS5 (not contentless / external content) was chosen so `WHERE owner = ?` works on the FTS table without a join — accepted the storage cost (`content` duplicated in real table + FTS shadow). Composite key order is `(owner, tool, host, session_id[, turn_id])` everywhere; ingest INSERT/UPDATE/ON CONFLICT clauses must match. `started_at`/`ended_at`/`source_file` are `NOT NULL` — ingest derives `started_at` from `MIN(turn.timestamp)` when `SessionMeta.StartedAt` is absent. diff --git a/go.sum b/go.sum index f75a7ddad86a2ccaab7e6700ce80601dd6bd496d..57dde81056f7db37cd61e0133f7c9ecf4ae9acff 100644 --- a/go.sum +++ b/go.sum @@ -46,10 +46,14 @@ github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o= github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY= +github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo= +github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= +github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= diff --git a/internal/deps/deps.go b/internal/deps/deps.go index 5295796a4739a5464a50981271f299b11eced450..1485220d022e7a7743b590c49c997998a62f4e7a 100644 --- a/internal/deps/deps.go +++ b/internal/deps/deps.go @@ -1,6 +1,6 @@ // Package deps records the locked set of direct dependencies for the lethe // server during early scaffolding. Real packages adopt these as they come -// online (server — chi; auth — go-oidc). +// online (auth — go-oidc). // // Phase 2 promoted viper, validator/v10, and culpa to real imports under // internal/config. Phase 3 promoted sqlx, modernc.org/sqlite, and @@ -8,12 +8,12 @@ // under internal/platform/database, and culpa is now used there too. Phase 4 // promoted prometheus/client_golang and auxilia/scribe (under // internal/platform/observability) and auxilia/steward (canary test under -// internal/platform/health) to real imports. Once every dep below has at -// least one real importer, this file is expected to disappear in the same -// commit that completes the migration. +// internal/platform/health) to real imports. Phase 5 promoted chi/v5 to a +// real import under internal/server. Once every dep below has at least one +// real importer, this file is expected to disappear in the same commit that +// completes the migration. package deps import ( _ "github.com/coreos/go-oidc/v3/oidc" - _ "github.com/go-chi/chi/v5" ) diff --git a/internal/domain/ingest/handler.go b/internal/domain/ingest/handler.go new file mode 100644 index 0000000000000000000000000000000000000000..381bcba1c4f579847a5dd72ba31da6bb82b7155b --- /dev/null +++ b/internal/domain/ingest/handler.go @@ -0,0 +1,22 @@ +// Phase-5 stub. Replaced by real implementation in Phase 7. +// +// Mount is a no-op so the /api/v1 group registers no ingest routes yet; +// Server.Init still wires it in so the full router topology compiles. +package ingest + +import ( + "context" + + "github.com/go-chi/chi/v5" +) + +// Handler is a no-op steward service. Phase 7 supplies the real ingest +// pipeline (NDJSON streaming, validation, chunked commit). +type Handler struct{} + +// Init satisfies the steward Service contract. +func (h *Handler) Init(_ context.Context) error { return nil } + +// Mount registers no routes. Phase 7 replaces this with the ingest endpoints +// rooted at the /api/v1 group passed in by Server.Init. +func (h *Handler) Mount(_ chi.Router) {} diff --git a/internal/domain/session/handler.go b/internal/domain/session/handler.go new file mode 100644 index 0000000000000000000000000000000000000000..0015c2a2d659c1e0e972b6e8556ee100e84b2467 --- /dev/null +++ b/internal/domain/session/handler.go @@ -0,0 +1,22 @@ +// Phase-5 stub. Replaced by real implementation in Phase 8. +// +// Mount is a no-op so the /api/v1 group registers no session routes yet; +// Server.Init still wires it in so the full router topology compiles. +package session + +import ( + "context" + + "github.com/go-chi/chi/v5" +) + +// Handler is a no-op steward service. Phase 8 supplies the real session +// read API. +type Handler struct{} + +// Init satisfies the steward Service contract. +func (h *Handler) Init(_ context.Context) error { return nil } + +// Mount registers no routes. Phase 8 replaces this with the sessions list / +// detail endpoints rooted at the /api/v1 group passed in by Server.Init. +func (h *Handler) Mount(_ chi.Router) {} diff --git a/internal/pkg/apierror/apierror.go b/internal/pkg/apierror/apierror.go new file mode 100644 index 0000000000000000000000000000000000000000..357cb539c9402c15e82163ecb95d09947364867b --- /dev/null +++ b/internal/pkg/apierror/apierror.go @@ -0,0 +1,112 @@ +// Package apierror is the single boundary between culpa-coded errors that +// flow through the lethe server and the RFC 7807 `application/problem+json` +// documents the HTTP layer hands back to clients. +// +// Render is the only exported entry point: it inspects the error's culpa +// CodeDetail, maps the code to an HTTP status, builds a Problem document +// from any attached PublicDetail, and writes it. 5xx responses log the full +// stack trace via scribe.Err first, then redact the body so internal details +// never leak to the client. +package apierror + +import ( + "encoding/json" + "log/slog" + "net/http" + + "go.bigb.es/auxilia/culpa" + "go.bigb.es/auxilia/scribe" +) + +// Problem is the RFC 7807 representation. The custom `code` extension holds +// the machine-readable culpa code for client-side error handling, and +// `errors` is reserved for per-line / per-field validation errors that the +// ingest handler (Phase 7) produces. +type Problem struct { + Type string `json:"type,omitempty"` + Title string `json:"title"` + Status int `json:"status"` + Detail string `json:"detail,omitempty"` + Code string `json:"code,omitempty"` + Instance string `json:"instance,omitempty"` + Errors []ProblemError `json:"errors,omitempty"` +} + +// ProblemError is one element of the `errors` extension, used for batched +// validation reports (e.g. NDJSON ingest line failures). +type ProblemError struct { + Line int `json:"line,omitempty"` + Field string `json:"field,omitempty"` + Code string `json:"code,omitempty"` + Message string `json:"message"` +} + +// codeStatus is the authoritative culpa-code → HTTP-status map. Any code +// not listed here falls through to 500 and gets the sanitized treatment. +var codeStatus = map[string]int{ + "NOT_FOUND": http.StatusNotFound, + "METHOD_NOT_ALLOWED": http.StatusMethodNotAllowed, + "INVALID": http.StatusBadRequest, + "VALIDATION": http.StatusBadRequest, + "UNAUTHORIZED": http.StatusUnauthorized, + "FORBIDDEN": http.StatusForbidden, + "CONFLICT": http.StatusConflict, + "TOO_LARGE": http.StatusRequestEntityTooLarge, +} + +// Render is the single boundary that translates a culpa-wrapped error into +// an RFC 7807 response. Callers must not mutate w after invoking Render. +// +// 5xx responses log the full error (with stack trace) via scribe.Err before +// redacting the body, so operators can diagnose without leaking specifics +// to clients. 4xx responses surface the culpa PublicDetail as `detail` and +// the CodeDetail as `code`. +func Render(w http.ResponseWriter, r *http.Request, err error) { + status, code := lookup(err) + + p := Problem{ + Title: http.StatusText(status), + Status: status, + Instance: r.URL.Path, + } + + if status >= 500 { + // Log the full chain (with stack) before sanitizing the response. + slog.Default().ErrorContext(r.Context(), "internal error", scribe.Err(err)) + p.Detail = "internal server error" + } else { + p.Code = code + var pub culpa.PublicDetail + if culpa.FindDetail(err, &pub) { + p.Detail = pub.Message + } + } + + w.Header().Set("Content-Type", "application/problem+json") + w.WriteHeader(status) + // Best-effort encode; if the client connection is gone there is nothing + // useful to do beyond logging. + if encErr := json.NewEncoder(w).Encode(p); encErr != nil { + slog.Default().ErrorContext(r.Context(), "encode problem document", + scribe.Err(encErr), + slog.Int("status", status), + ) + } +} + +// lookup walks the culpa chain for a CodeDetail and returns the matched +// (status, code) pair. Missing or unknown codes default to (500, ""). +func lookup(err error) (int, string) { + var cd culpa.CodeDetail + if !culpa.FindDetail(err, &cd) { + return http.StatusInternalServerError, "" + } + codeStr, ok := cd.Code.(string) + if !ok { + return http.StatusInternalServerError, "" + } + if status, ok := codeStatus[codeStr]; ok { + return status, codeStr + } + return http.StatusInternalServerError, codeStr +} diff --git a/internal/pkg/apierror/apierror_test.go b/internal/pkg/apierror/apierror_test.go new file mode 100644 index 0000000000000000000000000000000000000000..3b827ea450b8bd910ef50235fd2ca6f204c5762a --- /dev/null +++ b/internal/pkg/apierror/apierror_test.go @@ -0,0 +1,152 @@ +package apierror + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "go.bigb.es/auxilia/culpa" +) + +// TestRender_CodeToStatusMapping verifies each documented culpa code maps to +// the expected HTTP status, and the response body has the RFC 7807 shape with +// the code echoed under the "code" extension field. +func TestRender_CodeToStatusMapping(t *testing.T) { + cases := []struct { + name string + code string + wantSt int + wantTit string + }{ + {"not-found", "NOT_FOUND", http.StatusNotFound, http.StatusText(http.StatusNotFound)}, + {"invalid", "INVALID", http.StatusBadRequest, http.StatusText(http.StatusBadRequest)}, + {"validation", "VALIDATION", http.StatusBadRequest, http.StatusText(http.StatusBadRequest)}, + {"unauthorized", "UNAUTHORIZED", http.StatusUnauthorized, http.StatusText(http.StatusUnauthorized)}, + {"forbidden", "FORBIDDEN", http.StatusForbidden, http.StatusText(http.StatusForbidden)}, + {"conflict", "CONFLICT", http.StatusConflict, http.StatusText(http.StatusConflict)}, + {"too-large", "TOO_LARGE", http.StatusRequestEntityTooLarge, http.StatusText(http.StatusRequestEntityTooLarge)}, + {"unknown-defaults-500", "WHO_KNOWS", http.StatusInternalServerError, http.StatusText(http.StatusInternalServerError)}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + err := culpa.WithCode(culpa.WithPublic(culpa.New("boom"), "user-facing"), tc.code) + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/api/v1/things/42", nil) + + Render(rec, req, err) + + if got := rec.Code; got != tc.wantSt { + t.Fatalf("status = %d; want %d", got, tc.wantSt) + } + if ct := rec.Header().Get("Content-Type"); ct != "application/problem+json" { + t.Errorf("Content-Type = %q; want application/problem+json", ct) + } + var p Problem + if err := json.Unmarshal(rec.Body.Bytes(), &p); err != nil { + t.Fatalf("unmarshal body: %v", err) + } + if p.Status != tc.wantSt { + t.Errorf("body.status = %d; want %d", p.Status, tc.wantSt) + } + if p.Title != tc.wantTit { + t.Errorf("body.title = %q; want %q", p.Title, tc.wantTit) + } + if p.Instance != "/api/v1/things/42" { + t.Errorf("body.instance = %q; want /api/v1/things/42", p.Instance) + } + // 5xx codes have their detail sanitized to the constant message. + if tc.wantSt >= 500 { + if p.Detail != "internal server error" { + t.Errorf("body.detail = %q; want %q", p.Detail, "internal server error") + } + // Unknown code is not echoed as the public error code. + if p.Code == "WHO_KNOWS" { + t.Errorf("body.code = WHO_KNOWS leaked for 5xx response") + } + } else { + if p.Code != tc.code { + t.Errorf("body.code = %q; want %q", p.Code, tc.code) + } + if p.Detail != "user-facing" { + t.Errorf("body.detail = %q; want %q", p.Detail, "user-facing") + } + } + }) + } +} + +// TestRender_5xxSanitizesDetail proves an internal error never leaks the +// underlying message (which may contain credentials or stack-like text) +// through the response body. +func TestRender_5xxSanitizesDetail(t *testing.T) { + sensitive := "DB password=hunter2 connection refused" + err := culpa.Wrap(culpa.New(sensitive), "open db") + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/whatever", nil) + Render(rec, req, err) + + if rec.Code != http.StatusInternalServerError { + t.Fatalf("status = %d; want 500", rec.Code) + } + body := rec.Body.String() + if strings.Contains(body, "hunter2") || strings.Contains(body, "password") { + t.Fatalf("response body leaked sensitive substring; body=%q", body) + } + var p Problem + if err := json.Unmarshal([]byte(body), &p); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if p.Detail != "internal server error" { + t.Errorf("detail = %q; want sanitized", p.Detail) + } +} + +// TestRender_NoCodeDefaultsTo500 ensures errors without a CodeDetail still +// produce a valid problem document at 500. +func TestRender_NoCodeDefaultsTo500(t *testing.T) { + err := culpa.New("plain") + + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/x", nil) + Render(rec, req, err) + + if rec.Code != http.StatusInternalServerError { + t.Fatalf("status = %d; want 500", rec.Code) + } + var p Problem + if err := json.Unmarshal(rec.Body.Bytes(), &p); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if p.Status != 500 { + t.Errorf("body.status = %d; want 500", p.Status) + } + if p.Detail != "internal server error" { + t.Errorf("detail = %q; want sanitized", p.Detail) + } +} + +// TestRender_PublicDetailUsedForClientErrors confirms PublicDetail is what +// shows up in the client-facing detail for non-5xx responses. +func TestRender_PublicDetailUsedForClientErrors(t *testing.T) { + err := culpa.WithCode(culpa.WithPublic(culpa.New("internal"), "session not found"), "NOT_FOUND") + rec := httptest.NewRecorder() + req := httptest.NewRequest(http.MethodGet, "/api/v1/sessions/abc", nil) + Render(rec, req, err) + + if rec.Code != http.StatusNotFound { + t.Fatalf("status = %d; want 404", rec.Code) + } + var p Problem + if err := json.Unmarshal(rec.Body.Bytes(), &p); err != nil { + t.Fatalf("unmarshal: %v", err) + } + if p.Detail != "session not found" { + t.Errorf("detail = %q; want public message", p.Detail) + } + if p.Code != "NOT_FOUND" { + t.Errorf("code = %q; want NOT_FOUND", p.Code) + } +} diff --git a/internal/pkg/httputil/httputil.go b/internal/pkg/httputil/httputil.go new file mode 100644 index 0000000000000000000000000000000000000000..fad2b7c3b929090c6be8fcce5a476f7a77e0d294 --- /dev/null +++ b/internal/pkg/httputil/httputil.go @@ -0,0 +1,89 @@ +// Package httputil holds the small set of request/response helpers shared +// across HTTP handlers in lethe: JSON read with body cap and strict +// decoding, JSON write, and an iterator-based NDJSON line reader for the +// ingest endpoint. +// +// The helpers return culpa-coded errors so the apierror.Render boundary can +// translate them into RFC 7807 responses without losing the cause. +package httputil + +import ( + "bufio" + "encoding/json" + "errors" + "io" + "iter" + "net/http" + + "go.bigb.es/auxilia/culpa" +) + +// ReadJSON limits the request body to maxBytes, decodes it into dst with +// DisallowUnknownFields, and rejects trailing tokens. Body-cap exceedance +// is reported with code TOO_LARGE; any other parse failure is INVALID. +func ReadJSON[T any](r *http.Request, dst *T, maxBytes int64) error { + r.Body = http.MaxBytesReader(nil, r.Body, maxBytes) + dec := json.NewDecoder(r.Body) + dec.DisallowUnknownFields() + if err := dec.Decode(dst); err != nil { + var maxErr *http.MaxBytesError + if errors.As(err, &maxErr) { + return culpa.WithCode(culpa.Wrap(err, "request body exceeds limit"), "TOO_LARGE") + } + return culpa.WithCode(culpa.Wrap(err, "decode json body"), "INVALID") + } + // Reject trailing data after the first JSON value. + if dec.More() { + return culpa.WithCode(culpa.New("unexpected data after JSON body"), "INVALID") + } + return nil +} + +// WriteJSON encodes v as JSON with the given status code and the standard +// application/json content type. Encoder errors are returned so the caller +// can decide how to surface them; in practice they only fire on a dropped +// client connection. +func WriteJSON(w http.ResponseWriter, status int, v any) error { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + return json.NewEncoder(w).Encode(v) +} + +// ReadNDJSONLines streams NDJSON lines from r, yielding the raw bytes of +// each non-empty line. The yielded slice is owned by an internal buffer +// and is reused on the next iteration: consumers that retain a line must +// copy it themselves. +// +// Body-cap exceedance and oversized single lines surface as a (nil, err) +// terminal yield with code TOO_LARGE; other scanner failures use INVALID. +func ReadNDJSONLines(r io.Reader, maxBytes int64) iter.Seq2[[]byte, error] { + return func(yield func([]byte, error) bool) { + limited := io.LimitReader(r, maxBytes+1) + sc := bufio.NewScanner(limited) + // Allow the scanner to grow the token buffer up to the body cap. + // The starting buffer of 64 KiB is plenty for typical lines. + sc.Buffer(make([]byte, 0, 64*1024), int(maxBytes)+1) + var read int64 + for sc.Scan() { + line := sc.Bytes() + read += int64(len(line)) + 1 // +1 for the consumed newline + if read > maxBytes { + yield(nil, culpa.WithCode(culpa.New("ndjson body exceeds limit"), "TOO_LARGE")) + return + } + if len(line) == 0 { + continue + } + if !yield(line, nil) { + return + } + } + if err := sc.Err(); err != nil { + if errors.Is(err, bufio.ErrTooLong) { + yield(nil, culpa.WithCode(culpa.Wrap(err, "ndjson line too long"), "TOO_LARGE")) + return + } + yield(nil, culpa.WithCode(culpa.Wrap(err, "scan ndjson"), "INVALID")) + } + } +} diff --git a/internal/server/auth/authenticator.go b/internal/server/auth/authenticator.go new file mode 100644 index 0000000000000000000000000000000000000000..1fd757d382d74d99146bbf0df97aed71f186ed20 --- /dev/null +++ b/internal/server/auth/authenticator.go @@ -0,0 +1,22 @@ +// Phase-5 stub. Replaced by real implementation in Phase 6. +// +// This stub only satisfies the compile-time surface that Server depends on: +// an Init that does nothing and a Middleware that is the identity passthrough. +// Phase 6 replaces this file with the forward-auth + OIDC implementation. +package auth + +import ( + "context" + "net/http" +) + +// Authenticator is a no-op steward service. The real Phase-6 version will +// hold OIDC verifier state and forward-auth config. +type Authenticator struct{} + +// Init satisfies the steward Service contract. +func (a *Authenticator) Init(_ context.Context) error { return nil } + +// Middleware returns next unchanged. Phase 6 replaces this with the +// authentication chain. +func (a *Authenticator) Middleware(next http.Handler) http.Handler { return next } diff --git a/internal/server/server.go b/internal/server/server.go new file mode 100644 index 0000000000000000000000000000000000000000..013e7b109f443518fdb77526e568e6fdec846cd3 --- /dev/null +++ b/internal/server/server.go @@ -0,0 +1,279 @@ +// Package server is the steward-managed HTTP layer for lethe. It owns the +// chi router, the standard middleware stack (request-id, logging, metrics, +// recovery), the unauthenticated probes (/healthz, /readyz, /metrics), and +// the authenticated /api/v1/* group inside which the ingest and session +// handlers register their routes. +// +// The Server is a steward.Root() service: it has no consumers and steward +// would otherwise prune it. main.go (Phase 9) registers it as the root. +// +// Bind validation is fail-fast: only loopback IPs are accepted. Phase 5 +// keeps the system reachable by the local collector; exposing it on a +// public interface is a config error. +package server + +import ( + "context" + "crypto/rand" + "encoding/hex" + "errors" + "fmt" + "log/slog" + "net" + "net/http" + "time" + + "github.com/go-chi/chi/v5" + "github.com/prometheus/client_golang/prometheus/promhttp" + "go.bigb.es/auxilia/culpa" + "go.bigb.es/auxilia/scribe" + + "sourcecraft.dev/bigbes/lethe/internal/config" + "sourcecraft.dev/bigbes/lethe/internal/domain/ingest" + "sourcecraft.dev/bigbes/lethe/internal/domain/session" + "sourcecraft.dev/bigbes/lethe/internal/pkg/apierror" + "sourcecraft.dev/bigbes/lethe/internal/pkg/httputil" + "sourcecraft.dev/bigbes/lethe/internal/platform/health" + "sourcecraft.dev/bigbes/lethe/internal/platform/observability" + authpkg "sourcecraft.dev/bigbes/lethe/internal/server/auth" +) + +// readyzTimeout caps the time /readyz spends running every health check. +// Long enough to absorb a transient slow probe, short enough that orchestrators +// do not wait too long on a stuck dependency. +const readyzTimeout = 5 * time.Second + +// Server is the HTTP steward service. Steward injects the shared platform +// services and the auth/handler stubs (which Phases 6/7/8 replace). +type Server struct { + Cfg config.ServerConfig `config:""` + + Log *observability.Logger `inject:""` + Metrics *observability.Metrics `inject:""` + Health *health.Set `inject:""` + + Auth *authpkg.Authenticator `inject:""` + Ingest *ingest.Handler `inject:""` + Sessions *session.Handler `inject:""` + + router *chi.Mux + httpSrv *http.Server +} + +// Init validates the bind address and constructs the chi router with the +// standard middleware stack and all routes mounted. It does not start the +// listener — that is Start's job. +func (s *Server) Init(_ context.Context) error { + if err := validateLoopbackBind(s.Cfg.Bind); err != nil { + return err + } + + r := chi.NewRouter() + r.Use(s.requestIDMiddleware) + r.Use(s.loggingMiddleware) + r.Use(s.metricsMiddleware) + r.Use(s.recoveryMiddleware) + + r.NotFound(notFoundHandler) + r.MethodNotAllowed(methodNotAllowedHandler) + + r.Get("/healthz", s.healthzHandler) + r.Get("/readyz", s.readyzHandler) + r.Handle("/metrics", promhttp.HandlerFor(s.Metrics.Registry, promhttp.HandlerOpts{})) + + r.Route("/api/v1", func(r chi.Router) { + r.Use(s.Auth.Middleware) + s.Ingest.Mount(r) + s.Sessions.Mount(r) + }) + + s.router = r + return nil +} + +// Start spawns ListenAndServe in the background and returns immediately. +// Errors are logged; steward observes lifecycle via Stop. +func (s *Server) Start(_ context.Context) error { + s.httpSrv = &http.Server{ + Addr: s.Cfg.Bind, + Handler: s.router, + ReadHeaderTimeout: 10 * time.Second, + } + go func() { + if err := s.httpSrv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + slog.Default().Error("server crashed", scribe.Err(err)) + } + }() + return nil +} + +// Stop gracefully drains in-flight requests within ShutdownGrace, then +// closes the listener. Idempotent if Start was never called. +func (s *Server) Stop(ctx context.Context) error { + if s.httpSrv == nil { + return nil + } + grace := s.Cfg.ShutdownGrace + if grace <= 0 { + grace = 10 * time.Second + } + ctx2, cancel := context.WithTimeout(ctx, grace) + defer cancel() + return s.httpSrv.Shutdown(ctx2) +} + +// validateLoopbackBind ensures s.Cfg.Bind resolves to a loopback IP. +// Anything else is rejected at startup with code CONFIG_INVALID. +func validateLoopbackBind(bind string) error { + host, _, err := net.SplitHostPort(bind) + if err != nil { + return culpa.WithCode(culpa.Wrap(err, "parse bind address"), "CONFIG_INVALID") + } + ip := net.ParseIP(host) + if ip == nil { + return culpa.WithCode(culpa.Errorf("bind host %q is not an IP literal", host), "CONFIG_INVALID") + } + if !ip.IsLoopback() { + return culpa.WithCode(culpa.Errorf("bind %q must resolve to a loopback IP", bind), "CONFIG_INVALID") + } + return nil +} + +// healthzHandler is a constant-200 process-up probe. +func (s *Server) healthzHandler(w http.ResponseWriter, _ *http.Request) { + w.Header().Set("Content-Type", "text/plain; charset=utf-8") + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok")) +} + +// readyzResponse is the JSON body returned by /readyz. +type readyzResponse struct { + Checks map[string]string `json:"checks"` +} + +// readyzHandler runs every registered Checker with a 5s ceiling. Any +// failure flips the status to 503; the body always lists each check. +func (s *Server) readyzHandler(w http.ResponseWriter, r *http.Request) { + ctx, cancel := context.WithTimeout(r.Context(), readyzTimeout) + defer cancel() + + results, allOK := s.Health.Run(ctx) + body := readyzResponse{Checks: make(map[string]string, len(results))} + for name, err := range results { + if err == nil { + body.Checks[name] = "ok" + } else { + body.Checks[name] = err.Error() + } + } + status := http.StatusOK + if !allOK { + status = http.StatusServiceUnavailable + } + if err := httputil.WriteJSON(w, status, body); err != nil { + slog.Default().ErrorContext(r.Context(), "write readyz response", scribe.Err(err)) + } +} + +// requestIDMiddleware mints a 32-char hex id, attaches it to the request +// context for the logger to pick up, and echoes it as X-Request-ID. The +// header is set before next.ServeHTTP so even slow handlers expose it. +func (s *Server) requestIDMiddleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var buf [16]byte + // crypto/rand.Read in Go 1.24+ is documented infallible. + _, _ = rand.Read(buf[:]) + id := hex.EncodeToString(buf[:]) + w.Header().Set("X-Request-ID", id) + ctx := observability.WithRequestID(r.Context(), id) + next.ServeHTTP(w, r.WithContext(ctx)) + }) +} + +// responseRecorder is a minimal ResponseWriter wrapper that captures the +// final status code so the logging and metrics middlewares can label their +// records correctly. +type responseRecorder struct { + http.ResponseWriter + status int + wroteHeader bool +} + +// WriteHeader records the status before delegating. +func (rr *responseRecorder) WriteHeader(code int) { + if rr.wroteHeader { + return + } + rr.status = code + rr.wroteHeader = true + rr.ResponseWriter.WriteHeader(code) +} + +// Write triggers an implicit 200 if the handler wrote without WriteHeader. +func (rr *responseRecorder) Write(b []byte) (int, error) { + if !rr.wroteHeader { + rr.status = http.StatusOK + rr.wroteHeader = true + } + return rr.ResponseWriter.Write(b) +} + +// loggingMiddleware emits one structured access log per request. The +// request_id attribute is added automatically by contextHandler. +func (s *Server) loggingMiddleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + rr := &responseRecorder{ResponseWriter: w, status: http.StatusOK} + next.ServeHTTP(rr, r) + slog.Default().InfoContext(r.Context(), "request", + slog.String("method", r.Method), + slog.String("path", r.URL.Path), + slog.Int("status", rr.status), + slog.Int64("duration_ms", time.Since(start).Milliseconds()), + ) + }) +} + +// metricsMiddleware increments the request counter and observes the +// duration histogram. The route label comes from chi's RoutePattern so +// cardinality stays bounded; unmatched paths fall back to "unknown". +func (s *Server) metricsMiddleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + start := time.Now() + rr := &responseRecorder{ResponseWriter: w, status: http.StatusOK} + next.ServeHTTP(rr, r) + + route := "unknown" + if rc := chi.RouteContext(r.Context()); rc != nil { + if pat := rc.RoutePattern(); pat != "" { + route = pat + } + } + statusStr := fmt.Sprintf("%d", rr.status) + s.Metrics.HTTPRequests.WithLabelValues(r.Method, route, statusStr).Inc() + s.Metrics.HTTPDuration.WithLabelValues(r.Method, route).Observe(time.Since(start).Seconds()) + }) +} + +// recoveryMiddleware turns a panic into a 500 problem document. The +// stacktrace logging is handled by apierror.Render's 5xx branch, which +// receives the panic-as-error. +func (s *Server) recoveryMiddleware(next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + defer func() { + if rec := recover(); rec != nil { + err := culpa.WithCode(culpa.Errorf("panic: %v", rec), "INTERNAL") + apierror.Render(w, r, err) + } + }() + next.ServeHTTP(w, r) + }) +} + +func notFoundHandler(w http.ResponseWriter, r *http.Request) { + apierror.Render(w, r, culpa.WithCode(culpa.New("route not found"), "NOT_FOUND")) +} + +func methodNotAllowedHandler(w http.ResponseWriter, r *http.Request) { + apierror.Render(w, r, culpa.WithCode(culpa.New("method not allowed"), "METHOD_NOT_ALLOWED")) +} diff --git a/internal/server/server_test.go b/internal/server/server_test.go new file mode 100644 index 0000000000000000000000000000000000000000..5146f9cea51731c5d289f193f6d9f43bfee7392e --- /dev/null +++ b/internal/server/server_test.go @@ -0,0 +1,253 @@ +package server + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "testing" + "time" + + "github.com/go-chi/chi/v5" + "go.bigb.es/auxilia/culpa" + + "sourcecraft.dev/bigbes/lethe/internal/config" + "sourcecraft.dev/bigbes/lethe/internal/domain/ingest" + "sourcecraft.dev/bigbes/lethe/internal/domain/session" + "sourcecraft.dev/bigbes/lethe/internal/platform/health" + "sourcecraft.dev/bigbes/lethe/internal/platform/observability" + authpkg "sourcecraft.dev/bigbes/lethe/internal/server/auth" +) + +// newTestServer wires a Server with hand-constructed dependencies so unit +// tests do not need to spin up the steward graph. +func newTestServer(t *testing.T, bind string) *Server { + t.Helper() + + logger := &observability.Logger{Cfg: config.LoggingConfig{Level: "info", Format: "json"}} + if err := logger.Init(context.Background()); err != nil { + t.Fatalf("logger.Init: %v", err) + } + metrics := &observability.Metrics{} + if err := metrics.Init(context.Background()); err != nil { + t.Fatalf("metrics.Init: %v", err) + } + return &Server{ + Cfg: config.ServerConfig{ + Bind: bind, + ShutdownGrace: 5 * time.Second, + }, + Log: logger, + Metrics: metrics, + Health: &health.Set{}, + Auth: &authpkg.Authenticator{}, + Ingest: &ingest.Handler{}, + Sessions: &session.Handler{}, + } +} + +func TestServerInit_RejectsNonLoopbackBind(t *testing.T) { + s := newTestServer(t, "0.0.0.0:8080") + err := s.Init(context.Background()) + if err == nil { + t.Fatalf("Init: expected error for non-loopback bind") + } + var cd culpa.CodeDetail + if !culpa.FindDetail(err, &cd) { + t.Fatalf("Init: expected culpa CodeDetail; got %v", err) + } + if cd.Code != "CONFIG_INVALID" { + t.Errorf("Init: code = %v; want CONFIG_INVALID", cd.Code) + } +} + +func TestServerInit_AcceptsLoopback(t *testing.T) { + s := newTestServer(t, "127.0.0.1:0") + if err := s.Init(context.Background()); err != nil { + t.Fatalf("Init: %v", err) + } + if s.router == nil { + t.Fatalf("Init: router not built") + } +} + +func TestRouter_RecoveryTurnsPanicInto500Problem(t *testing.T) { + s := newTestServer(t, "127.0.0.1:0") + if err := s.Init(context.Background()); err != nil { + t.Fatalf("Init: %v", err) + } + // Mount a panicking route on the router *after* Init wired the + // middleware chain so the recovery middleware sits in front of it. + s.router.Get("/boom", func(http.ResponseWriter, *http.Request) { + panic("kaboom") + }) + + req := httptest.NewRequest(http.MethodGet, "/boom", nil) + rec := httptest.NewRecorder() + s.router.ServeHTTP(rec, req) + + if rec.Code != http.StatusInternalServerError { + t.Fatalf("status = %d; want 500", rec.Code) + } + if ct := rec.Header().Get("Content-Type"); ct != "application/problem+json" { + t.Errorf("Content-Type = %q; want application/problem+json", ct) + } + var body map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil { + t.Fatalf("unmarshal body: %v", err) + } + if body["status"].(float64) != 500 { + t.Errorf("body.status = %v; want 500", body["status"]) + } + if body["detail"] != "internal server error" { + t.Errorf("body.detail = %v; want sanitized message", body["detail"]) + } +} + +func TestRouter_RequestIDInResponseAndContext(t *testing.T) { + s := newTestServer(t, "127.0.0.1:0") + if err := s.Init(context.Background()); err != nil { + t.Fatalf("Init: %v", err) + } + var fromCtx string + s.router.Get("/probe", func(w http.ResponseWriter, r *http.Request) { + fromCtx = observability.RequestIDFrom(r.Context()) + w.WriteHeader(http.StatusOK) + }) + + req := httptest.NewRequest(http.MethodGet, "/probe", nil) + rec := httptest.NewRecorder() + s.router.ServeHTTP(rec, req) + + if rec.Code != http.StatusOK { + t.Fatalf("status = %d; want 200", rec.Code) + } + headerID := rec.Header().Get("X-Request-ID") + if headerID == "" { + t.Fatalf("X-Request-ID header missing") + } + if fromCtx == "" { + t.Fatalf("request id missing from context") + } + if headerID != fromCtx { + t.Errorf("ctx id %q != header id %q", fromCtx, headerID) + } +} + +func TestRouter_HealthzReturnsOK(t *testing.T) { + s := newTestServer(t, "127.0.0.1:0") + if err := s.Init(context.Background()); err != nil { + t.Fatalf("Init: %v", err) + } + req := httptest.NewRequest(http.MethodGet, "/healthz", nil) + rec := httptest.NewRecorder() + s.router.ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d; want 200", rec.Code) + } + if rec.Body.String() != "ok" { + t.Errorf("body = %q; want ok", rec.Body.String()) + } +} + +func TestRouter_ReadyzAllOKWithEmptyChecks(t *testing.T) { + s := newTestServer(t, "127.0.0.1:0") + if err := s.Init(context.Background()); err != nil { + t.Fatalf("Init: %v", err) + } + req := httptest.NewRequest(http.MethodGet, "/readyz", nil) + rec := httptest.NewRecorder() + s.router.ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d; want 200", rec.Code) + } +} + +func TestRouter_MetricsExposesPrometheus(t *testing.T) { + s := newTestServer(t, "127.0.0.1:0") + if err := s.Init(context.Background()); err != nil { + t.Fatalf("Init: %v", err) + } + req := httptest.NewRequest(http.MethodGet, "/metrics", nil) + rec := httptest.NewRecorder() + s.router.ServeHTTP(rec, req) + if rec.Code != http.StatusOK { + t.Fatalf("status = %d; want 200", rec.Code) + } + if got := rec.Body.String(); got == "" { + t.Errorf("metrics body empty") + } +} + +// TestRouter_APIv1MountsAuthMiddleware is a smoke test: the auth middleware +// is the identity passthrough in Phase 5, but the /api/v1 group must still +// wire it in so Phase 6's replacement immediately takes effect on every +// route registered by the ingest/session handlers. +func TestRouter_APIv1MountsAuthMiddleware(t *testing.T) { + s := newTestServer(t, "127.0.0.1:0") + if err := s.Init(context.Background()); err != nil { + t.Fatalf("Init: %v", err) + } + // Hang a probe route off the /api/v1 group via the same chi router. + s.router.Route("/api/v1/probe", func(r chi.Router) { + r.Get("/", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusTeapot) + }) + }) + req := httptest.NewRequest(http.MethodGet, "/api/v1/probe", nil) + rec := httptest.NewRecorder() + s.router.ServeHTTP(rec, req) + if rec.Code != http.StatusTeapot { + t.Fatalf("status = %d; want 418", rec.Code) + } +} + +func TestRouter_NotFoundReturnsProblemJSON(t *testing.T) { + srv := newTestServer(t, "127.0.0.1:0") + if err := srv.Init(context.Background()); err != nil { + t.Fatalf("Init: %v", err) + } + + req := httptest.NewRequest(http.MethodGet, "/no-such-route", nil) + rec := httptest.NewRecorder() + srv.router.ServeHTTP(rec, req) + + if rec.Code != http.StatusNotFound { + t.Fatalf("status: got %d, want 404", rec.Code) + } + if ct := rec.Header().Get("Content-Type"); ct != "application/problem+json" { + t.Fatalf("Content-Type: got %q, want application/problem+json", ct) + } + var p map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &p); err != nil { + t.Fatalf("body is not JSON: %v", err) + } + if got, _ := p["code"].(string); got != "NOT_FOUND" { + t.Fatalf("code: got %q, want NOT_FOUND", got) + } +} + +func TestRouter_MethodNotAllowedReturnsProblemJSON(t *testing.T) { + srv := newTestServer(t, "127.0.0.1:0") + if err := srv.Init(context.Background()); err != nil { + t.Fatalf("Init: %v", err) + } + + req := httptest.NewRequest(http.MethodPost, "/healthz", nil) + rec := httptest.NewRecorder() + srv.router.ServeHTTP(rec, req) + + if rec.Code != http.StatusMethodNotAllowed { + t.Fatalf("status: got %d, want 405", rec.Code) + } + if ct := rec.Header().Get("Content-Type"); ct != "application/problem+json" { + t.Fatalf("Content-Type: got %q, want application/problem+json", ct) + } + var p map[string]any + if err := json.Unmarshal(rec.Body.Bytes(), &p); err != nil { + t.Fatalf("body is not JSON: %v", err) + } + if got, _ := p["code"].(string); got != "METHOD_NOT_ALLOWED" { + t.Fatalf("code: got %q, want METHOD_NOT_ALLOWED", got) + } +}