// Package server is the steward-managed HTTP layer for lethe. It owns the
// chi router, the standard middleware stack (request-id, logging, metrics,
// recovery), the unauthenticated probes (/healthz, /readyz, /metrics), and
// the authenticated /api/v1/* group inside which the ingest and session
// handlers register their routes.
//
// The Server is a steward.Root() service: it has no consumers and steward
// would otherwise prune it. main.go (Phase 9) registers it as the root.
//
// Bind validation is fail-fast: only loopback IPs are accepted. Phase 5
// keeps the system reachable by the local collector; exposing it on a
// public interface is a config error.
package server
import (
"context"
"crypto/rand"
"encoding/hex"
"errors"
"fmt"
"log/slog"
"net"
"net/http"
"time"
"github.com/go-chi/chi/v5"
"github.com/prometheus/client_golang/prometheus/promhttp"
"go.bigb.es/auxilia/culpa"
"go.bigb.es/auxilia/scribe"
"sourcecraft.dev/bigbes/lethe/internal/config"
"sourcecraft.dev/bigbes/lethe/internal/domain/ingest"
"sourcecraft.dev/bigbes/lethe/internal/domain/session"
"sourcecraft.dev/bigbes/lethe/internal/pkg/apierror"
"sourcecraft.dev/bigbes/lethe/internal/pkg/httputil"
"sourcecraft.dev/bigbes/lethe/internal/platform/health"
"sourcecraft.dev/bigbes/lethe/internal/platform/observability"
authpkg "sourcecraft.dev/bigbes/lethe/internal/server/auth"
)
// readyzTimeout is the upper bound on how long one /readyz request may spend
// executing health checks. It is generous enough to ride out a briefly slow
// probe, yet short enough that an orchestrator is not left hanging on a
// dependency that never answers.
const readyzTimeout time.Duration = 5 * time.Second
// Server is the HTTP service managed by steward. The exported fields are
// filled in from outside (see the `config` and `inject` struct tags); the
// unexported fields are lifecycle state owned by Init and Start.
type Server struct {
	// Cfg supplies the bind address (Bind) and the shutdown grace period
	// (ShutdownGrace).
	Cfg config.ServerConfig `config:""`

	// Log is the injected platform logger.
	Log *observability.Logger `inject:""`

	// Metrics provides the registry served at /metrics and the HTTP
	// request counter and duration histogram.
	Metrics *observability.Metrics `inject:""`

	// Health is the set of checks executed by /readyz.
	Health *health.Set `inject:""`

	// Auth supplies the middleware guarding the /api/v1 group.
	Auth *authpkg.Authenticator `inject:""`

	// Ingest mounts its routes inside the /api/v1 group.
	Ingest *ingest.Handler `inject:""`

	// Sessions mounts its routes inside the /api/v1 group.
	Sessions *session.Handler `inject:""`

	// router is assembled by Init and served by Start.
	router *chi.Mux

	// httpSrv stays nil until Start has run; Stop relies on that.
	httpSrv *http.Server
}
// Init rejects a bind address that is not a loopback IP literal and then
// assembles the chi router: the middleware chain, the custom 404/405
// handlers, the unauthenticated probes, and the authenticated /api/v1 group.
// No socket is opened here; listening is left to Start.
func (s *Server) Init(_ context.Context) error {
	if err := validateLoopbackBind(s.Cfg.Bind); err != nil {
		return err
	}

	mux := chi.NewRouter()

	// Registration order is execution order: the request ID is minted
	// first and panic recovery sits closest to the handlers.
	mux.Use(
		s.requestIDMiddleware,
		s.loggingMiddleware,
		s.metricsMiddleware,
		s.recoveryMiddleware,
	)
	mux.NotFound(notFoundHandler)
	mux.MethodNotAllowed(methodNotAllowedHandler)

	// Probes and metrics are registered outside the authenticated group.
	mux.Get("/healthz", s.healthzHandler)
	mux.Get("/readyz", s.readyzHandler)
	metricsHandler := promhttp.HandlerFor(s.Metrics.Registry, promhttp.HandlerOpts{})
	mux.Handle("/metrics", metricsHandler)

	// Everything under /api/v1 passes through the auth middleware.
	mux.Route("/api/v1", func(api chi.Router) {
		api.Use(s.Auth.Middleware)
		s.Ingest.Mount(api)
		s.Sessions.Mount(api)
	})

	s.router = mux
	return nil
}
// Start binds the listening socket synchronously and then serves requests in
// a background goroutine, returning as soon as the socket is open.
//
// Binding up front means a failure to listen (address already in use,
// permission denied, ...) is returned to the caller instead of being logged
// from a goroutine while Start reports success. Errors that occur later,
// while serving, are still only logged; http.ErrServerClosed is the normal
// result of Stop and is ignored.
func (s *Server) Start(_ context.Context) error {
	ln, err := net.Listen("tcp", s.Cfg.Bind)
	if err != nil {
		return fmt.Errorf("listen on %q: %w", s.Cfg.Bind, err)
	}

	srv := &http.Server{
		Addr:              s.Cfg.Bind,
		Handler:           s.router,
		ReadHeaderTimeout: 10 * time.Second,
	}
	s.httpSrv = srv

	// The goroutine ends when Serve returns, which happens once Stop shuts
	// the server down or the listener fails.
	go func() {
		if err := srv.Serve(ln); err != nil && !errors.Is(err, http.ErrServerClosed) {
			slog.Default().Error("server crashed", scribe.Err(err))
		}
	}()
	return nil
}
// Stop drains in-flight requests for at most Cfg.ShutdownGrace (10s when the
// configured value is zero or negative) and then makes sure nothing is left
// open. It is a no-op returning nil if Start was never called.
//
// If the graceful Shutdown fails — typically because the grace period or ctx
// expired while requests were still running — the server is force-closed so
// the remaining connections do not outlive Stop. The Shutdown error is still
// returned, joined with the Close error if closing fails as well.
func (s *Server) Stop(ctx context.Context) error {
	if s.httpSrv == nil {
		return nil
	}

	grace := s.Cfg.ShutdownGrace
	if grace <= 0 {
		grace = 10 * time.Second
	}
	drainCtx, cancel := context.WithTimeout(ctx, grace)
	defer cancel()

	err := s.httpSrv.Shutdown(drainCtx)
	if err == nil {
		return nil
	}

	// Shutdown gave up waiting; active connections are still open at this
	// point, so close them explicitly.
	if closeErr := s.httpSrv.Close(); closeErr != nil {
		return errors.Join(err, closeErr)
	}
	return err
}
// validateLoopbackBind accepts bind only when it has host:port form and the
// host is a loopback IP literal. Host names are not resolved: anything that
// net.ParseIP cannot parse (including an empty host) is rejected. Every
// failure is returned with code CONFIG_INVALID.
func validateLoopbackBind(bind string) error {
	const invalid = "CONFIG_INVALID"

	host, _, splitErr := net.SplitHostPort(bind)
	if splitErr != nil {
		return culpa.WithCode(culpa.Wrap(splitErr, "parse bind address"), invalid)
	}

	switch ip := net.ParseIP(host); {
	case ip == nil:
		return culpa.WithCode(culpa.Errorf("bind host %q is not an IP literal", host), invalid)
	case !ip.IsLoopback():
		return culpa.WithCode(culpa.Errorf("bind %q must resolve to a loopback IP", bind), invalid)
	default:
		return nil
	}
}
// healthzHandler answers every request with 200 and the plain-text body
// "ok"; it inspects neither the request nor any dependency.
func (s *Server) healthzHandler(w http.ResponseWriter, _ *http.Request) {
	const body = "ok"

	hdr := w.Header()
	hdr.Set("Content-Type", "text/plain; charset=utf-8")
	w.WriteHeader(http.StatusOK)

	// A failed write cannot be reported to the client, so it is dropped.
	_, _ = w.Write([]byte(body))
}
// readyzResponse is the JSON envelope written by /readyz.
type readyzResponse struct {
	// Checks maps each health check's name to "ok" or to that check's
	// error message; it is serialized under the "checks" key.
	Checks map[string]string `json:"checks"`
}
// readyzHandler executes the health set under a readyzTimeout deadline
// derived from the request context and reports the outcome as JSON. The body
// lists every check by name ("ok" or the error text); the status is 200 when
// the set reports all checks healthy and 503 otherwise. A failure to write
// the response is logged, not returned.
func (s *Server) readyzHandler(w http.ResponseWriter, r *http.Request) {
	checkCtx, stop := context.WithTimeout(r.Context(), readyzTimeout)
	defer stop()

	outcomes, healthy := s.Health.Run(checkCtx)

	report := make(map[string]string, len(outcomes))
	for check, checkErr := range outcomes {
		msg := "ok"
		if checkErr != nil {
			msg = checkErr.Error()
		}
		report[check] = msg
	}

	code := http.StatusServiceUnavailable
	if healthy {
		code = http.StatusOK
	}

	writeErr := httputil.WriteJSON(w, code, readyzResponse{Checks: report})
	if writeErr == nil {
		return
	}
	slog.Default().ErrorContext(r.Context(), "write readyz response", scribe.Err(writeErr))
}
// requestIDMiddleware generates a fresh request ID — 16 random bytes rendered
// as 32 hex characters — for every request. The ID is echoed in the
// X-Request-ID response header before the next handler runs, so it is
// present however long that handler takes, and it is stored on the request
// context via observability.WithRequestID.
func (s *Server) requestIDMiddleware(next http.Handler) http.Handler {
	withID := func(w http.ResponseWriter, r *http.Request) {
		var raw [16]byte
		// crypto/rand.Read in Go 1.24+ is documented infallible, so the
		// error result is intentionally discarded.
		_, _ = rand.Read(raw[:])
		requestID := hex.EncodeToString(raw[:])

		w.Header().Set("X-Request-ID", requestID)

		tagged := r.WithContext(observability.WithRequestID(r.Context(), requestID))
		next.ServeHTTP(w, tagged)
	}
	return http.HandlerFunc(withID)
}
// responseRecorder is a minimal http.ResponseWriter wrapper that captures the
// final status code so the logging and metrics middlewares can label their
// records correctly.
type responseRecorder struct {
	http.ResponseWriter

	// status is the final status code seen so far. Callers initialise it
	// to http.StatusOK so a handler that writes nothing still reports 200.
	status int

	// wroteHeader is set once a final (non-interim) status has been
	// recorded, either explicitly or implicitly by the first Write.
	wroteHeader bool
}

// WriteHeader records the final status code and forwards it to the wrapped
// writer. Once a final status has been recorded, later calls are ignored.
//
// Informational 1xx codes other than 101 are interim responses: they are
// forwarded but not recorded, so a subsequent final WriteHeader (for example
// 103 Early Hints followed by 404) is still recorded and delivered instead of
// being dropped.
func (rr *responseRecorder) WriteHeader(code int) {
	if rr.wroteHeader {
		return
	}
	if code >= 100 && code < 200 && code != http.StatusSwitchingProtocols {
		rr.ResponseWriter.WriteHeader(code)
		return
	}
	rr.status = code
	rr.wroteHeader = true
	rr.ResponseWriter.WriteHeader(code)
}

// Write records an implicit 200 if the handler wrote a body without calling
// WriteHeader first, then forwards the bytes to the wrapped writer.
func (rr *responseRecorder) Write(b []byte) (int, error) {
	if !rr.wroteHeader {
		rr.status = http.StatusOK
		rr.wroteHeader = true
	}
	return rr.ResponseWriter.Write(b)
}

// Unwrap exposes the wrapped writer so http.ResponseController can reach
// optional capabilities (flushing, hijacking, deadlines) that this wrapper
// does not itself implement.
func (rr *responseRecorder) Unwrap() http.ResponseWriter {
	return rr.ResponseWriter
}
// loggingMiddleware writes one structured "request" log line per request
// after the downstream handler returns, carrying the method, URL path, final
// status code, and elapsed time in milliseconds. The request context is
// passed to the logger; presumably that is how the request ID reaches the
// log line (confirm against the logger's handler).
func (s *Server) loggingMiddleware(next http.Handler) http.Handler {
	logged := func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()
		// status defaults to 200 for handlers that never write a header.
		rec := &responseRecorder{ResponseWriter: w, status: http.StatusOK}

		next.ServeHTTP(rec, r)

		slog.Default().InfoContext(
			r.Context(),
			"request",
			slog.String("method", r.Method),
			slog.String("path", r.URL.Path),
			slog.Int("status", rec.status),
			slog.Int64("duration_ms", time.Since(began).Milliseconds()),
		)
	}
	return http.HandlerFunc(logged)
}
// metricsMiddleware records two metrics per request once the downstream
// handler has returned: a request counter labelled by method, route, and
// status, and a duration observation (in seconds) labelled by method and
// route. The route label is chi's matched route pattern rather than the raw
// path, which keeps label cardinality bounded; when no pattern is available
// the label is "unknown".
func (s *Server) metricsMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		began := time.Now()
		// status defaults to 200 for handlers that never write a header.
		rec := &responseRecorder{ResponseWriter: w, status: http.StatusOK}

		next.ServeHTTP(rec, r)

		// The pattern is read after the handler ran, once routing has
		// filled in the chi route context.
		pattern := ""
		if rctx := chi.RouteContext(r.Context()); rctx != nil {
			pattern = rctx.RoutePattern()
		}
		if pattern == "" {
			pattern = "unknown"
		}

		code := fmt.Sprint(rec.status)
		s.Metrics.HTTPRequests.WithLabelValues(r.Method, pattern, code).Inc()
		s.Metrics.HTTPDuration.WithLabelValues(r.Method, pattern).Observe(time.Since(began).Seconds())
	})
}
// recoveryMiddleware converts a panic raised by a downstream handler into an
// error with code INTERNAL and hands it to apierror.Render. Per the original
// note, Render's 5xx branch is expected to take care of stack-trace logging
// (not visible from this file).
//
// http.ErrAbortHandler is deliberately not converted: it is net/http's
// sentinel for "abort this response without logging", so it is re-panicked
// and left for the server to handle.
func (s *Server) recoveryMiddleware(next http.Handler) http.Handler {
	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		defer func() {
			rec := recover()
			if rec == nil {
				return
			}
			// net/http compares the panic value by identity, so the
			// same comparison is used here.
			if rec == http.ErrAbortHandler {
				panic(rec)
			}
			err := culpa.WithCode(culpa.Errorf("panic: %v", rec), "INTERNAL")
			apierror.Render(w, r, err)
		}()
		next.ServeHTTP(w, r)
	})
}
func notFoundHandler(w http.ResponseWriter, r *http.Request) {
apierror.Render(w, r, culpa.WithCode(culpa.New("route not found"), "NOT_FOUND"))
}
func methodNotAllowedHandler(w http.ResponseWriter, r *http.Request) {
apierror.Render(w, r, culpa.WithCode(culpa.New("method not allowed"), "METHOD_NOT_ALLOWED"))
}