~bigbes/lethe

3c45b48bd5d0d99f76d2063504adaa7b312b85bc — Eugene Blikh a month ago b91a08d
feat(http): chi server with middleware stack + RFC 7807 problem renderer
M docs/tasks/lethe-server.md => docs/tasks/lethe-server.md +4 -1
@@ 406,6 406,9 @@ Greenfield — no compat surface. Wire format is the only forward-compat concern

### Notes carried forward

- Phase 3 should add `migrate-up`, `migrate-down`, `migrate-create` to the Justfile alongside the migration runner so the targets aren't dead.
- Phase 3 should add `migrate-up`, `migrate-down`, `migrate-create` to the Justfile alongside the migration runner so the targets aren't dead. (Done in Phase 3.)
- Each phase from 2 onward must remove the dep it adopts from `internal/deps/deps.go`; Phase 9 deletes the file.
- README's Caddy/Authelia snippets use `auth.example.com` placeholders; replace with phoebe-specific values when the production deploy lands (out of scope for this task).
- **Phase 4 finding (steward unwind gap)**: `steward.Manager.Init` returns on the first failing `CallInit` and does **not** iterate back over previously-initialized assets to call `Destroy`. The canary test `TestStewardUnwindsOnInitFailure` (in `internal/platform/health/steward_unwind_test.go`) is intentionally red on master to document this. **Phase 9 main must compensate**: track each component as it init's and, on `Init` error, walk the list in reverse calling `Destroy` directly on each (don't try `mgr.Stop`/`mgr.Destroy` — those panic unless the manager has reached Started). Once Phase 9 lands the explicit unwind, either delete the canary test or convert it to assert the new compensating behavior.
- **Phase 5 consistency fix (folded into commit `1b215bc` via amend)**: chi's default 404/405 handlers wrote text/plain, violating the invariant "errors leaving any HTTP handler are rendered as RFC 7807". Added explicit `chi.Router.NotFound`/`MethodNotAllowed` handlers that call `apierror.Render` with `NOT_FOUND` / `METHOD_NOT_ALLOWED` codes. Added `METHOD_NOT_ALLOWED → 405` entry to the apierror code-status map. Added two regression tests (`TestRouter_NotFoundReturnsProblemJSON`, `TestRouter_MethodNotAllowedReturnsProblemJSON`).
- **Phase 3 → Phase 7 contract pin**: `INSERT … ON CONFLICT … DO UPDATE` fires the UPDATE trigger on SQLite (verified by `TestUpsertFiresUpdateTriggerAndKeepsFTSCoherent`). Regular FTS5 (not contentless / external content) was chosen so `WHERE owner = ?` works on the FTS table without a join — accepted the storage cost (`content` duplicated in real table + FTS shadow). Composite key order is `(owner, tool, host, session_id[, turn_id])` everywhere; ingest INSERT/UPDATE/ON CONFLICT clauses must match. `started_at`/`ended_at`/`source_file` are `NOT NULL` — ingest derives `started_at` from `MIN(turn.timestamp)` when `SessionMeta.StartedAt` is absent.

M go.sum => go.sum +4 -0
@@ 46,10 46,14 @@ github.com/hashicorp/golang-lru/v2 v2.0.7 h1:a+bsQ5rvGLjzHuww6tVxozPZFVghXaHOwFs
github.com/hashicorp/golang-lru/v2 v2.0.7/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM=
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=
github.com/jmoiron/sqlx v1.4.0/go.mod h1:ZrZ7UsYB/weZdl2Bxg6jCRO9c3YHl8r3ahlKmRT4JLY=
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc=
github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw=
github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ=
github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=

M internal/deps/deps.go => internal/deps/deps.go +5 -5
@@ 1,6 1,6 @@
// Package deps records the locked set of direct dependencies for the lethe
// server during early scaffolding. Real packages adopt these as they come
// online (server — chi; auth — go-oidc).
// online (auth — go-oidc).
//
// Phase 2 promoted viper, validator/v10, and culpa to real imports under
// internal/config. Phase 3 promoted sqlx, modernc.org/sqlite, and


@@ 8,12 8,12 @@
// under internal/platform/database, and culpa is now used there too. Phase 4
// promoted prometheus/client_golang and auxilia/scribe (under
// internal/platform/observability) and auxilia/steward (canary test under
// internal/platform/health) to real imports. Once every dep below has at
// least one real importer, this file is expected to disappear in the same
// commit that completes the migration.
// internal/platform/health) to real imports. Phase 5 promoted chi/v5 to a
// real import under internal/server. Once every dep below has at least one
// real importer, this file is expected to disappear in the same commit that
// completes the migration.
package deps

import (
	_ "github.com/coreos/go-oidc/v3/oidc"
	_ "github.com/go-chi/chi/v5"
)

A internal/domain/ingest/handler.go => internal/domain/ingest/handler.go +22 -0
@@ 0,0 1,22 @@
// Phase-5 stub. Replaced by real implementation in Phase 7.
//
// Mount is a no-op so the /api/v1 group registers no ingest routes yet;
// Server.Init still wires it in so the full router topology compiles.
package ingest

import (
	"context"

	"github.com/go-chi/chi/v5"
)

// Handler is a no-op steward service. Phase 7 supplies the real ingest
// pipeline (NDJSON streaming, validation, chunked commit).
type Handler struct{}

// Init satisfies the steward Service contract.
func (h *Handler) Init(_ context.Context) error { return nil }

// Mount registers no routes. Phase 7 replaces this with the ingest endpoints
// rooted at the /api/v1 group passed in by Server.Init.
func (h *Handler) Mount(_ chi.Router) {}

A internal/domain/session/handler.go => internal/domain/session/handler.go +22 -0
@@ 0,0 1,22 @@
// Phase-5 stub. Replaced by real implementation in Phase 8.
//
// Mount is a no-op so the /api/v1 group registers no session routes yet;
// Server.Init still wires it in so the full router topology compiles.
package session

import (
	"context"

	"github.com/go-chi/chi/v5"
)

// Handler is a no-op steward service. Phase 8 supplies the real session
// read API.
type Handler struct{}

// Init satisfies the steward Service contract.
func (h *Handler) Init(_ context.Context) error { return nil }

// Mount registers no routes. Phase 8 replaces this with the sessions list /
// detail endpoints rooted at the /api/v1 group passed in by Server.Init.
func (h *Handler) Mount(_ chi.Router) {}

A internal/pkg/apierror/apierror.go => internal/pkg/apierror/apierror.go +112 -0
@@ 0,0 1,112 @@
// Package apierror is the single boundary between culpa-coded errors that
// flow through the lethe server and the RFC 7807 `application/problem+json`
// documents the HTTP layer hands back to clients.
//
// Render is the only exported entry point: it inspects the error's culpa
// CodeDetail, maps the code to an HTTP status, builds a Problem document
// from any attached PublicDetail, and writes it. 5xx responses log the full
// stack trace via scribe.Err first, then redact the body so internal details
// never leak to the client.
package apierror

import (
	"encoding/json"
	"log/slog"
	"net/http"

	"go.bigb.es/auxilia/culpa"
	"go.bigb.es/auxilia/scribe"
)

// Problem is the RFC 7807 representation. The custom `code` extension holds
// the machine-readable culpa code for client-side error handling, and
// `errors` is reserved for per-line / per-field validation errors that the
// ingest handler (Phase 7) produces.
type Problem struct {
	Type     string         `json:"type,omitempty"`
	Title    string         `json:"title"`
	Status   int            `json:"status"`
	Detail   string         `json:"detail,omitempty"`
	Code     string         `json:"code,omitempty"`
	Instance string         `json:"instance,omitempty"`
	Errors   []ProblemError `json:"errors,omitempty"`
}

// ProblemError is one element of the `errors` extension, used for batched
// validation reports (e.g. NDJSON ingest line failures).
type ProblemError struct {
	Line    int    `json:"line,omitempty"`
	Field   string `json:"field,omitempty"`
	Code    string `json:"code,omitempty"`
	Message string `json:"message"`
}

// codeStatus is the authoritative culpa-code → HTTP-status map. Any code
// not listed here falls through to 500 and gets the sanitized treatment.
var codeStatus = map[string]int{
	"NOT_FOUND":          http.StatusNotFound,
	"METHOD_NOT_ALLOWED": http.StatusMethodNotAllowed,
	"INVALID":            http.StatusBadRequest,
	"VALIDATION":         http.StatusBadRequest,
	"UNAUTHORIZED":       http.StatusUnauthorized,
	"FORBIDDEN":          http.StatusForbidden,
	"CONFLICT":           http.StatusConflict,
	"TOO_LARGE":          http.StatusRequestEntityTooLarge,
}

// Render is the single boundary that translates a culpa-wrapped error into
// an RFC 7807 response. Callers must not mutate w after invoking Render.
//
// 5xx responses log the full error (with stack trace) via scribe.Err before
// redacting the body, so operators can diagnose without leaking specifics
// to clients. 4xx responses surface the culpa PublicDetail as `detail` and
// the CodeDetail as `code`.
func Render(w http.ResponseWriter, r *http.Request, err error) {
	status, code := lookup(err)

	p := Problem{
		Title:    http.StatusText(status),
		Status:   status,
		Instance: r.URL.Path,
	}

	if status >= 500 {
		// Log the full chain (with stack) before sanitizing the response.
		slog.Default().ErrorContext(r.Context(), "internal error", scribe.Err(err))
		p.Detail = "internal server error"
	} else {
		p.Code = code
		var pub culpa.PublicDetail
		if culpa.FindDetail(err, &pub) {
			p.Detail = pub.Message
		}
	}

	w.Header().Set("Content-Type", "application/problem+json")
	w.WriteHeader(status)
	// Best-effort encode; if the client connection is gone there is nothing
	// useful to do beyond logging.
	if encErr := json.NewEncoder(w).Encode(p); encErr != nil {
		slog.Default().ErrorContext(r.Context(), "encode problem document",
			scribe.Err(encErr),
			slog.Int("status", status),
		)
	}
}

// lookup walks the culpa chain for a CodeDetail and returns the matched
// (status, code) pair. Missing or unknown codes default to (500, "").
func lookup(err error) (int, string) {
	var cd culpa.CodeDetail
	if !culpa.FindDetail(err, &cd) {
		return http.StatusInternalServerError, ""
	}
	codeStr, ok := cd.Code.(string)
	if !ok {
		return http.StatusInternalServerError, ""
	}
	if status, ok := codeStatus[codeStr]; ok {
		return status, codeStr
	}
	return http.StatusInternalServerError, codeStr
}

A internal/pkg/apierror/apierror_test.go => internal/pkg/apierror/apierror_test.go +152 -0
@@ 0,0 1,152 @@
package apierror

import (
	"encoding/json"
	"net/http"
	"net/http/httptest"
	"strings"
	"testing"

	"go.bigb.es/auxilia/culpa"
)

// TestRender_CodeToStatusMapping verifies each documented culpa code maps to
// the expected HTTP status, and the response body has the RFC 7807 shape with
// the code echoed under the "code" extension field.
func TestRender_CodeToStatusMapping(t *testing.T) {
	cases := []struct {
		name    string
		code    string
		wantSt  int
		wantTit string
	}{
		{"not-found", "NOT_FOUND", http.StatusNotFound, http.StatusText(http.StatusNotFound)},
		{"invalid", "INVALID", http.StatusBadRequest, http.StatusText(http.StatusBadRequest)},
		{"validation", "VALIDATION", http.StatusBadRequest, http.StatusText(http.StatusBadRequest)},
		{"unauthorized", "UNAUTHORIZED", http.StatusUnauthorized, http.StatusText(http.StatusUnauthorized)},
		{"forbidden", "FORBIDDEN", http.StatusForbidden, http.StatusText(http.StatusForbidden)},
		{"conflict", "CONFLICT", http.StatusConflict, http.StatusText(http.StatusConflict)},
		{"too-large", "TOO_LARGE", http.StatusRequestEntityTooLarge, http.StatusText(http.StatusRequestEntityTooLarge)},
		{"unknown-defaults-500", "WHO_KNOWS", http.StatusInternalServerError, http.StatusText(http.StatusInternalServerError)},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			err := culpa.WithCode(culpa.WithPublic(culpa.New("boom"), "user-facing"), tc.code)
			rec := httptest.NewRecorder()
			req := httptest.NewRequest(http.MethodGet, "/api/v1/things/42", nil)

			Render(rec, req, err)

			if got := rec.Code; got != tc.wantSt {
				t.Fatalf("status = %d; want %d", got, tc.wantSt)
			}
			if ct := rec.Header().Get("Content-Type"); ct != "application/problem+json" {
				t.Errorf("Content-Type = %q; want application/problem+json", ct)
			}
			var p Problem
			if err := json.Unmarshal(rec.Body.Bytes(), &p); err != nil {
				t.Fatalf("unmarshal body: %v", err)
			}
			if p.Status != tc.wantSt {
				t.Errorf("body.status = %d; want %d", p.Status, tc.wantSt)
			}
			if p.Title != tc.wantTit {
				t.Errorf("body.title = %q; want %q", p.Title, tc.wantTit)
			}
			if p.Instance != "/api/v1/things/42" {
				t.Errorf("body.instance = %q; want /api/v1/things/42", p.Instance)
			}
			// 5xx codes have their detail sanitized to the constant message.
			if tc.wantSt >= 500 {
				if p.Detail != "internal server error" {
					t.Errorf("body.detail = %q; want %q", p.Detail, "internal server error")
				}
				// Unknown code is not echoed as the public error code.
				if p.Code == "WHO_KNOWS" {
					t.Errorf("body.code = WHO_KNOWS leaked for 5xx response")
				}
			} else {
				if p.Code != tc.code {
					t.Errorf("body.code = %q; want %q", p.Code, tc.code)
				}
				if p.Detail != "user-facing" {
					t.Errorf("body.detail = %q; want %q", p.Detail, "user-facing")
				}
			}
		})
	}
}

// TestRender_5xxSanitizesDetail proves an internal error never leaks the
// underlying message (which may contain credentials or stack-like text)
// through the response body.
func TestRender_5xxSanitizesDetail(t *testing.T) {
	sensitive := "DB password=hunter2 connection refused"
	err := culpa.Wrap(culpa.New(sensitive), "open db")

	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/whatever", nil)
	Render(rec, req, err)

	if rec.Code != http.StatusInternalServerError {
		t.Fatalf("status = %d; want 500", rec.Code)
	}
	body := rec.Body.String()
	if strings.Contains(body, "hunter2") || strings.Contains(body, "password") {
		t.Fatalf("response body leaked sensitive substring; body=%q", body)
	}
	var p Problem
	if err := json.Unmarshal([]byte(body), &p); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}
	if p.Detail != "internal server error" {
		t.Errorf("detail = %q; want sanitized", p.Detail)
	}
}

// TestRender_NoCodeDefaultsTo500 ensures errors without a CodeDetail still
// produce a valid problem document at 500.
func TestRender_NoCodeDefaultsTo500(t *testing.T) {
	err := culpa.New("plain")

	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/x", nil)
	Render(rec, req, err)

	if rec.Code != http.StatusInternalServerError {
		t.Fatalf("status = %d; want 500", rec.Code)
	}
	var p Problem
	if err := json.Unmarshal(rec.Body.Bytes(), &p); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}
	if p.Status != 500 {
		t.Errorf("body.status = %d; want 500", p.Status)
	}
	if p.Detail != "internal server error" {
		t.Errorf("detail = %q; want sanitized", p.Detail)
	}
}

// TestRender_PublicDetailUsedForClientErrors confirms PublicDetail is what
// shows up in the client-facing detail for non-5xx responses.
func TestRender_PublicDetailUsedForClientErrors(t *testing.T) {
	err := culpa.WithCode(culpa.WithPublic(culpa.New("internal"), "session not found"), "NOT_FOUND")
	rec := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/api/v1/sessions/abc", nil)
	Render(rec, req, err)

	if rec.Code != http.StatusNotFound {
		t.Fatalf("status = %d; want 404", rec.Code)
	}
	var p Problem
	if err := json.Unmarshal(rec.Body.Bytes(), &p); err != nil {
		t.Fatalf("unmarshal: %v", err)
	}
	if p.Detail != "session not found" {
		t.Errorf("detail = %q; want public message", p.Detail)
	}
	if p.Code != "NOT_FOUND" {
		t.Errorf("code = %q; want NOT_FOUND", p.Code)
	}
}

A internal/pkg/httputil/httputil.go => internal/pkg/httputil/httputil.go +89 -0
@@ 0,0 1,89 @@
// Package httputil holds the small set of request/response helpers shared
// across HTTP handlers in lethe: JSON read with body cap and strict
// decoding, JSON write, and an iterator-based NDJSON line reader for the
// ingest endpoint.
//
// The helpers return culpa-coded errors so the apierror.Render boundary can
// translate them into RFC 7807 responses without losing the cause.
package httputil

import (
	"bufio"
	"encoding/json"
	"errors"
	"io"
	"iter"
	"net/http"

	"go.bigb.es/auxilia/culpa"
)

// ReadJSON limits the request body to maxBytes, decodes it into dst with
// DisallowUnknownFields, and rejects trailing tokens. Body-cap exceedance
// is reported with code TOO_LARGE; any other parse failure is INVALID.
func ReadJSON[T any](r *http.Request, dst *T, maxBytes int64) error {
	r.Body = http.MaxBytesReader(nil, r.Body, maxBytes)
	dec := json.NewDecoder(r.Body)
	dec.DisallowUnknownFields()
	if err := dec.Decode(dst); err != nil {
		var maxErr *http.MaxBytesError
		if errors.As(err, &maxErr) {
			return culpa.WithCode(culpa.Wrap(err, "request body exceeds limit"), "TOO_LARGE")
		}
		return culpa.WithCode(culpa.Wrap(err, "decode json body"), "INVALID")
	}
	// Reject trailing data after the first JSON value.
	if dec.More() {
		return culpa.WithCode(culpa.New("unexpected data after JSON body"), "INVALID")
	}
	return nil
}

// WriteJSON encodes v as JSON with the given status code and the standard
// application/json content type. Encoder errors are returned so the caller
// can decide how to surface them; in practice they only fire on a dropped
// client connection.
func WriteJSON(w http.ResponseWriter, status int, v any) error {
	w.Header().Set("Content-Type", "application/json")
	w.WriteHeader(status)
	return json.NewEncoder(w).Encode(v)
}

// ReadNDJSONLines streams NDJSON lines from r, yielding the raw bytes of
// each non-empty line. The yielded slice is owned by an internal buffer
// and is reused on the next iteration: consumers that retain a line must
// copy it themselves.
//
// Body-cap exceedance and oversized single lines surface as a (nil, err)
// terminal yield with code TOO_LARGE; other scanner failures use INVALID.
func ReadNDJSONLines(r io.Reader, maxBytes int64) iter.Seq2[[]byte, error] {
	return func(yield func([]byte, error) bool) {
		limited := io.LimitReader(r, maxBytes+1)
		sc := bufio.NewScanner(limited)
		// Allow the scanner to grow the token buffer up to the body cap.
		// The starting buffer of 64 KiB is plenty for typical lines.
		sc.Buffer(make([]byte, 0, 64*1024), int(maxBytes)+1)
		var read int64
		for sc.Scan() {
			line := sc.Bytes()
			read += int64(len(line)) + 1 // +1 for the consumed newline
			if read > maxBytes {
				yield(nil, culpa.WithCode(culpa.New("ndjson body exceeds limit"), "TOO_LARGE"))
				return
			}
			if len(line) == 0 {
				continue
			}
			if !yield(line, nil) {
				return
			}
		}
		if err := sc.Err(); err != nil {
			if errors.Is(err, bufio.ErrTooLong) {
				yield(nil, culpa.WithCode(culpa.Wrap(err, "ndjson line too long"), "TOO_LARGE"))
				return
			}
			yield(nil, culpa.WithCode(culpa.Wrap(err, "scan ndjson"), "INVALID"))
		}
	}
}

A internal/server/auth/authenticator.go => internal/server/auth/authenticator.go +22 -0
@@ 0,0 1,22 @@
// Phase-5 stub. Replaced by real implementation in Phase 6.
//
// This stub only satisfies the compile-time surface that Server depends on:
// an Init that does nothing and a Middleware that is the identity passthrough.
// Phase 6 replaces this file with the forward-auth + OIDC implementation.
package auth

import (
	"context"
	"net/http"
)

// Authenticator is a no-op steward service. The real Phase-6 version will
// hold OIDC verifier state and forward-auth config.
type Authenticator struct{}

// Init satisfies the steward Service contract.
func (a *Authenticator) Init(_ context.Context) error { return nil }

// Middleware returns next unchanged. Phase 6 replaces this with the
// authentication chain.
func (a *Authenticator) Middleware(next http.Handler) http.Handler { return next }

A internal/server/server.go => internal/server/server.go +279 -0
@@ 0,0 1,279 @@
// Package server is the steward-managed HTTP layer for lethe. It owns the
// chi router, the standard middleware stack (request-id, logging, metrics,
// recovery), the unauthenticated probes (/healthz, /readyz, /metrics), and
// the authenticated /api/v1/* group inside which the ingest and session
// handlers register their routes.
//
// The Server is a steward.Root() service: it has no consumers and steward
// would otherwise prune it. main.go (Phase 9) registers it as the root.
//
// Bind validation is fail-fast: only loopback IPs are accepted. Phase 5
// keeps the system reachable by the local collector; exposing it on a
// public interface is a config error.
package server

import (
	"context"
	"crypto/rand"
	"encoding/hex"
	"errors"
	"fmt"
	"log/slog"
	"net"
	"net/http"
	"time"

	"github.com/go-chi/chi/v5"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	"go.bigb.es/auxilia/culpa"
	"go.bigb.es/auxilia/scribe"

	"sourcecraft.dev/bigbes/lethe/internal/config"
	"sourcecraft.dev/bigbes/lethe/internal/domain/ingest"
	"sourcecraft.dev/bigbes/lethe/internal/domain/session"
	"sourcecraft.dev/bigbes/lethe/internal/pkg/apierror"
	"sourcecraft.dev/bigbes/lethe/internal/pkg/httputil"
	"sourcecraft.dev/bigbes/lethe/internal/platform/health"
	"sourcecraft.dev/bigbes/lethe/internal/platform/observability"
	authpkg "sourcecraft.dev/bigbes/lethe/internal/server/auth"
)

// readyzTimeout caps the time /readyz spends running every health check.
// Long enough to absorb a transient slow probe, short enough that orchestrators
// do not wait too long on a stuck dependency.
const readyzTimeout = 5 * time.Second

// Server is the HTTP steward service. Steward injects the shared platform
// services and the auth/handler stubs (which Phases 6/7/8 replace).
type Server struct {
	Cfg config.ServerConfig `config:""`

	Log     *observability.Logger  `inject:""`
	Metrics *observability.Metrics `inject:""`
	Health  *health.Set            `inject:""`

	Auth     *authpkg.Authenticator `inject:""`
	Ingest   *ingest.Handler        `inject:""`
	Sessions *session.Handler       `inject:""`

	router  *chi.Mux
	httpSrv *http.Server
}

// Init validates the bind address and constructs the chi router with the
// standard middleware stack and all routes mounted. It does not start the
// listener — that is Start's job.
func (s *Server) Init(_ context.Context) error {
	if err := validateLoopbackBind(s.Cfg.Bind); err != nil {
		return err
	}

	r := chi.NewRouter()
	r.Use(s.requestIDMiddleware)
	r.Use(s.loggingMiddleware)
	r.Use(s.metricsMiddleware)
	r.Use(s.recoveryMiddleware)

	r.NotFound(notFoundHandler)
	r.MethodNotAllowed(methodNotAllowedHandler)

	r.Get("/healthz", s.healthzHandler)
	r.Get("/readyz", s.readyzHandler)
	r.Handle("/metrics", promhttp.HandlerFor(s.Metrics.Registry, promhttp.HandlerOpts{}))

	r.Route("/api/v1", func(r chi.Router) {
		r.Use(s.Auth.Middleware)
		s.Ingest.Mount(r)
		s.Sessions.Mount(r)
	})

	s.router = r
	return nil
}

// Start spawns ListenAndServe in the background and returns immediately.
// Errors are logged; steward observes lifecycle via Stop.
func (s *Server) Start(_ context.Context) error {
	s.httpSrv = &http.Server{
		Addr:              s.Cfg.Bind,
		Handler:           s.router,
		ReadHeaderTimeout: 10 * time.Second,
	}
	go func() {
		if err := s.httpSrv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) {
			slog.Default().Error("server crashed", scribe.Err(err))
		}
	}()
	return nil
}

// Stop gracefully drains in-flight requests within ShutdownGrace, then
// closes the listener. Idempotent if Start was never called.
func (s *Server) Stop(ctx context.Context) error {
	if s.httpSrv == nil {
		return nil
	}
	grace := s.Cfg.ShutdownGrace
	if grace <= 0 {
		grace = 10 * time.Second
	}
	ctx2, cancel := context.WithTimeout(ctx, grace)
	defer cancel()
	return s.httpSrv.Shutdown(ctx2)
}

// validateLoopbackBind ensures s.Cfg.Bind resolves to a loopback IP.
// Anything else is rejected at startup with code CONFIG_INVALID.
func validateLoopbackBind(bind string) error {
	host, _, err := net.SplitHostPort(bind)
	if err != nil {
		return culpa.WithCode(culpa.Wrap(err, "parse bind address"), "CONFIG_INVALID")
	}
	ip := net.ParseIP(host)
	if ip == nil {
		return culpa.WithCode(culpa.Errorf("bind host %q is not an IP literal", host), "CONFIG_INVALID")
	}
	if !ip.IsLoopback() {
		return culpa.WithCode(culpa.Errorf("bind %q must resolve to a loopback IP", bind), "CONFIG_INVALID")
	}
	return nil
}

// healthzHandler is a constant-200 process-up probe.
func (s *Server) healthzHandler(w http.ResponseWriter, _ *http.Request) {
	w.Header().Set("Content-Type", "text/plain; charset=utf-8")
	w.WriteHeader(http.StatusOK)
	_, _ = w.Write([]byte("ok"))
}

// readyzResponse is the JSON body returned by /readyz.
type readyzResponse struct {
	Checks map[string]string `json:"checks"`
}

// readyzHandler runs every registered Checker with a 5s ceiling. Any
// failure flips the status to 503; the body always lists each check.
func (s *Server) readyzHandler(w http.ResponseWriter, r *http.Request) {
	ctx, cancel := context.WithTimeout(r.Context(), readyzTimeout)
	defer cancel()

	results, allOK := s.Health.Run(ctx)
	body := readyzResponse{Checks: make(map[string]string, len(results))}
	for name, err := range results {
		if err == nil {
			body.Checks[name] = "ok"
		} else {
			body.Checks[name] = err.Error()
		}
	}
	status := http.StatusOK
	if !allOK {
		status = http.StatusServiceUnavailable
	}
	if err := httputil.WriteJSON(w, status, body); err != nil {
		slog.Default().ErrorContext(r.Context(), "write readyz response", scribe.Err(err))
	}
}

// requestIDMiddleware mints a 32-char hex id, attaches it to the request
// context for the logger to pick up, and echoes it as X-Request-ID. The
// header is set before next.ServeHTTP so even slow handlers expose it.
func (s *Server) requestIDMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		var buf [16]byte
		// crypto/rand.Read in Go 1.24+ is documented infallible.
		_, _ = rand.Read(buf[:])
		id := hex.EncodeToString(buf[:])
		w.Header().Set("X-Request-ID", id)
		ctx := observability.WithRequestID(r.Context(), id)
		next.ServeHTTP(w, r.WithContext(ctx))
	})
}

// responseRecorder is a minimal ResponseWriter wrapper that captures the
// final status code so the logging and metrics middlewares can label their
// records correctly.
type responseRecorder struct {
	http.ResponseWriter
	status      int
	wroteHeader bool
}

// WriteHeader records the status before delegating.
func (rr *responseRecorder) WriteHeader(code int) {
	if rr.wroteHeader {
		return
	}
	rr.status = code
	rr.wroteHeader = true
	rr.ResponseWriter.WriteHeader(code)
}

// Write triggers an implicit 200 if the handler wrote without WriteHeader.
func (rr *responseRecorder) Write(b []byte) (int, error) {
	if !rr.wroteHeader {
		rr.status = http.StatusOK
		rr.wroteHeader = true
	}
	return rr.ResponseWriter.Write(b)
}

// loggingMiddleware emits one structured access log per request. The
// request_id attribute is added automatically by contextHandler.
func (s *Server) loggingMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()
		rr := &responseRecorder{ResponseWriter: w, status: http.StatusOK}
		next.ServeHTTP(rr, r)
		slog.Default().InfoContext(r.Context(), "request",
			slog.String("method", r.Method),
			slog.String("path", r.URL.Path),
			slog.Int("status", rr.status),
			slog.Int64("duration_ms", time.Since(start).Milliseconds()),
		)
	})
}

// metricsMiddleware increments the request counter and observes the
// duration histogram. The route label comes from chi's RoutePattern so
// cardinality stays bounded; unmatched paths fall back to "unknown".
func (s *Server) metricsMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		start := time.Now()
		rr := &responseRecorder{ResponseWriter: w, status: http.StatusOK}
		next.ServeHTTP(rr, r)

		route := "unknown"
		if rc := chi.RouteContext(r.Context()); rc != nil {
			if pat := rc.RoutePattern(); pat != "" {
				route = pat
			}
		}
		statusStr := fmt.Sprintf("%d", rr.status)
		s.Metrics.HTTPRequests.WithLabelValues(r.Method, route, statusStr).Inc()
		s.Metrics.HTTPDuration.WithLabelValues(r.Method, route).Observe(time.Since(start).Seconds())
	})
}

// recoveryMiddleware turns a panic into a 500 problem document. The
// stacktrace logging is handled by apierror.Render's 5xx branch, which
// receives the panic-as-error.
func (s *Server) recoveryMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		defer func() {
			if rec := recover(); rec != nil {
				err := culpa.WithCode(culpa.Errorf("panic: %v", rec), "INTERNAL")
				apierror.Render(w, r, err)
			}
		}()
		next.ServeHTTP(w, r)
	})
}

func notFoundHandler(w http.ResponseWriter, r *http.Request) {
	apierror.Render(w, r, culpa.WithCode(culpa.New("route not found"), "NOT_FOUND"))
}

func methodNotAllowedHandler(w http.ResponseWriter, r *http.Request) {
	apierror.Render(w, r, culpa.WithCode(culpa.New("method not allowed"), "METHOD_NOT_ALLOWED"))
}

A internal/server/server_test.go => internal/server/server_test.go +253 -0
@@ 0,0 1,253 @@
package server

import (
	"context"
	"encoding/json"
	"net/http"
	"net/http/httptest"
	"testing"
	"time"

	"github.com/go-chi/chi/v5"
	"go.bigb.es/auxilia/culpa"

	"sourcecraft.dev/bigbes/lethe/internal/config"
	"sourcecraft.dev/bigbes/lethe/internal/domain/ingest"
	"sourcecraft.dev/bigbes/lethe/internal/domain/session"
	"sourcecraft.dev/bigbes/lethe/internal/platform/health"
	"sourcecraft.dev/bigbes/lethe/internal/platform/observability"
	authpkg "sourcecraft.dev/bigbes/lethe/internal/server/auth"
)

// newTestServer wires a Server with hand-constructed dependencies so unit
// tests do not need to spin up the steward graph.
func newTestServer(t *testing.T, bind string) *Server {
	t.Helper()

	logger := &observability.Logger{Cfg: config.LoggingConfig{Level: "info", Format: "json"}}
	if err := logger.Init(context.Background()); err != nil {
		t.Fatalf("logger.Init: %v", err)
	}
	metrics := &observability.Metrics{}
	if err := metrics.Init(context.Background()); err != nil {
		t.Fatalf("metrics.Init: %v", err)
	}
	return &Server{
		Cfg: config.ServerConfig{
			Bind:          bind,
			ShutdownGrace: 5 * time.Second,
		},
		Log:      logger,
		Metrics:  metrics,
		Health:   &health.Set{},
		Auth:     &authpkg.Authenticator{},
		Ingest:   &ingest.Handler{},
		Sessions: &session.Handler{},
	}
}

func TestServerInit_RejectsNonLoopbackBind(t *testing.T) {
	s := newTestServer(t, "0.0.0.0:8080")
	err := s.Init(context.Background())
	if err == nil {
		t.Fatalf("Init: expected error for non-loopback bind")
	}
	var cd culpa.CodeDetail
	if !culpa.FindDetail(err, &cd) {
		t.Fatalf("Init: expected culpa CodeDetail; got %v", err)
	}
	if cd.Code != "CONFIG_INVALID" {
		t.Errorf("Init: code = %v; want CONFIG_INVALID", cd.Code)
	}
}

func TestServerInit_AcceptsLoopback(t *testing.T) {
	s := newTestServer(t, "127.0.0.1:0")
	if err := s.Init(context.Background()); err != nil {
		t.Fatalf("Init: %v", err)
	}
	if s.router == nil {
		t.Fatalf("Init: router not built")
	}
}

func TestRouter_RecoveryTurnsPanicInto500Problem(t *testing.T) {
	s := newTestServer(t, "127.0.0.1:0")
	if err := s.Init(context.Background()); err != nil {
		t.Fatalf("Init: %v", err)
	}
	// Mount a panicking route on the router *after* Init wired the
	// middleware chain so the recovery middleware sits in front of it.
	s.router.Get("/boom", func(http.ResponseWriter, *http.Request) {
		panic("kaboom")
	})

	req := httptest.NewRequest(http.MethodGet, "/boom", nil)
	rec := httptest.NewRecorder()
	s.router.ServeHTTP(rec, req)

	if rec.Code != http.StatusInternalServerError {
		t.Fatalf("status = %d; want 500", rec.Code)
	}
	if ct := rec.Header().Get("Content-Type"); ct != "application/problem+json" {
		t.Errorf("Content-Type = %q; want application/problem+json", ct)
	}
	var body map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &body); err != nil {
		t.Fatalf("unmarshal body: %v", err)
	}
	if body["status"].(float64) != 500 {
		t.Errorf("body.status = %v; want 500", body["status"])
	}
	if body["detail"] != "internal server error" {
		t.Errorf("body.detail = %v; want sanitized message", body["detail"])
	}
}

func TestRouter_RequestIDInResponseAndContext(t *testing.T) {
	s := newTestServer(t, "127.0.0.1:0")
	if err := s.Init(context.Background()); err != nil {
		t.Fatalf("Init: %v", err)
	}
	var fromCtx string
	s.router.Get("/probe", func(w http.ResponseWriter, r *http.Request) {
		fromCtx = observability.RequestIDFrom(r.Context())
		w.WriteHeader(http.StatusOK)
	})

	req := httptest.NewRequest(http.MethodGet, "/probe", nil)
	rec := httptest.NewRecorder()
	s.router.ServeHTTP(rec, req)

	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d; want 200", rec.Code)
	}
	headerID := rec.Header().Get("X-Request-ID")
	if headerID == "" {
		t.Fatalf("X-Request-ID header missing")
	}
	if fromCtx == "" {
		t.Fatalf("request id missing from context")
	}
	if headerID != fromCtx {
		t.Errorf("ctx id %q != header id %q", fromCtx, headerID)
	}
}

func TestRouter_HealthzReturnsOK(t *testing.T) {
	s := newTestServer(t, "127.0.0.1:0")
	if err := s.Init(context.Background()); err != nil {
		t.Fatalf("Init: %v", err)
	}
	req := httptest.NewRequest(http.MethodGet, "/healthz", nil)
	rec := httptest.NewRecorder()
	s.router.ServeHTTP(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d; want 200", rec.Code)
	}
	if rec.Body.String() != "ok" {
		t.Errorf("body = %q; want ok", rec.Body.String())
	}
}

func TestRouter_ReadyzAllOKWithEmptyChecks(t *testing.T) {
	s := newTestServer(t, "127.0.0.1:0")
	if err := s.Init(context.Background()); err != nil {
		t.Fatalf("Init: %v", err)
	}
	req := httptest.NewRequest(http.MethodGet, "/readyz", nil)
	rec := httptest.NewRecorder()
	s.router.ServeHTTP(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d; want 200", rec.Code)
	}
}

func TestRouter_MetricsExposesPrometheus(t *testing.T) {
	s := newTestServer(t, "127.0.0.1:0")
	if err := s.Init(context.Background()); err != nil {
		t.Fatalf("Init: %v", err)
	}
	req := httptest.NewRequest(http.MethodGet, "/metrics", nil)
	rec := httptest.NewRecorder()
	s.router.ServeHTTP(rec, req)
	if rec.Code != http.StatusOK {
		t.Fatalf("status = %d; want 200", rec.Code)
	}
	if got := rec.Body.String(); got == "" {
		t.Errorf("metrics body empty")
	}
}

// TestRouter_APIv1MountsAuthMiddleware is a smoke test: the auth middleware
// is the identity passthrough in Phase 5, but the /api/v1 group must still
// wire it in so Phase 6's replacement immediately takes effect on every
// route registered by the ingest/session handlers.
func TestRouter_APIv1MountsAuthMiddleware(t *testing.T) {
	s := newTestServer(t, "127.0.0.1:0")
	if err := s.Init(context.Background()); err != nil {
		t.Fatalf("Init: %v", err)
	}
	// Hang a probe route off the /api/v1 group via the same chi router.
	s.router.Route("/api/v1/probe", func(r chi.Router) {
		r.Get("/", func(w http.ResponseWriter, _ *http.Request) {
			w.WriteHeader(http.StatusTeapot)
		})
	})
	req := httptest.NewRequest(http.MethodGet, "/api/v1/probe", nil)
	rec := httptest.NewRecorder()
	s.router.ServeHTTP(rec, req)
	if rec.Code != http.StatusTeapot {
		t.Fatalf("status = %d; want 418", rec.Code)
	}
}

func TestRouter_NotFoundReturnsProblemJSON(t *testing.T) {
	srv := newTestServer(t, "127.0.0.1:0")
	if err := srv.Init(context.Background()); err != nil {
		t.Fatalf("Init: %v", err)
	}

	req := httptest.NewRequest(http.MethodGet, "/no-such-route", nil)
	rec := httptest.NewRecorder()
	srv.router.ServeHTTP(rec, req)

	if rec.Code != http.StatusNotFound {
		t.Fatalf("status: got %d, want 404", rec.Code)
	}
	if ct := rec.Header().Get("Content-Type"); ct != "application/problem+json" {
		t.Fatalf("Content-Type: got %q, want application/problem+json", ct)
	}
	var p map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &p); err != nil {
		t.Fatalf("body is not JSON: %v", err)
	}
	if got, _ := p["code"].(string); got != "NOT_FOUND" {
		t.Fatalf("code: got %q, want NOT_FOUND", got)
	}
}

func TestRouter_MethodNotAllowedReturnsProblemJSON(t *testing.T) {
	srv := newTestServer(t, "127.0.0.1:0")
	if err := srv.Init(context.Background()); err != nil {
		t.Fatalf("Init: %v", err)
	}

	req := httptest.NewRequest(http.MethodPost, "/healthz", nil)
	rec := httptest.NewRecorder()
	srv.router.ServeHTTP(rec, req)

	if rec.Code != http.StatusMethodNotAllowed {
		t.Fatalf("status: got %d, want 405", rec.Code)
	}
	if ct := rec.Header().Get("Content-Type"); ct != "application/problem+json" {
		t.Fatalf("Content-Type: got %q, want application/problem+json", ct)
	}
	var p map[string]any
	if err := json.Unmarshal(rec.Body.Bytes(), &p); err != nil {
		t.Fatalf("body is not JSON: %v", err)
	}
	if got, _ := p["code"].(string); got != "METHOD_NOT_ALLOWED" {
		t.Fatalf("code: got %q, want METHOD_NOT_ALLOWED", got)
	}
}