feat(gateway): 可观测性 —— Prometheus 指标 + 结构化日志 + 探针

往"生产可运维"推一步(网关前门):
- Prometheus /metrics:sundynix_http_requests_total{method,route,status}、
  request_duration_seconds 直方图、requests_in_flight。route 用 c.FullPath()
  路由模板(/tasks/:id/...)避免按真实路径高基数。
- 结构化访问日志:slog JSON 到 stderr(request_id/method/route/status/latency_ms/
  ip/uid/bytes),替代 gin 默认文本日志;gin.New()+Recovery 自管中间件链。
- RequestID 中间件:生成/透传 X-Request-ID,写上下文+响应头,供日志关联。
- 探针:/healthz(liveness,不查依赖)、/readyz(readiness,DB+Redis 就绪才 200,
  否则 503),供 k8s 等导流判断;/api/v1/health 深度聚合保留。
- 三个根端点不挂业务鉴权(/metrics 生产应由网络层限制抓取来源)。

验证:单测(计数 +1 / X-Request-ID 生成与透传);实跑 /healthz 200、/readyz 200
(db,redis ready)、/metrics 输出真实指标、访问日志 JSON 正常、X-Request-ID 回写。

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Blizzard
2026-06-19 10:38:31 +08:00
parent e05e6f5903
commit b6a6875795
7 changed files with 201 additions and 16 deletions
@@ -95,6 +95,22 @@ func (h *Handler) StreamTask(c *gin.Context) {
})
}
// Healthz handles GET /healthz, the liveness probe.
//
// It always answers 200 with {"status":"ok"} and consults no dependency, so a
// successful response only shows that the process can serve HTTP.
func (h *Handler) Healthz(c *gin.Context) {
	body := gin.H{"status": "ok"}
	c.JSON(http.StatusOK, body)
}
// Readyz handles GET /readyz, the readiness probe.
//
// It answers 200 with status "ready" only when both the database and the Redis
// cache report Enabled(); otherwise it answers 503 with status "not_ready". In
// both cases the body carries a "deps" object with the per-dependency result so
// an operator can see which one is holding the instance back.
//
// Each dependency is sampled exactly once per request. Sampling it once for the
// payload and again for the decision would allow the reported "deps" and the
// status code to disagree if a dependency changed state in between.
//
// NATS is not checked here; the original author's note says it is connected at
// startup and the process exits if that fails (not verifiable from this
// handler).
//
// NOTE(review): Enabled() is defined elsewhere. Confirm it reflects live
// connectivity rather than static configuration, otherwise this probe cannot
// detect a dependency outage after startup.
func (h *Handler) Readyz(c *gin.Context) {
	dbReady := h.db.Enabled()
	redisReady := h.cache.Enabled()
	deps := gin.H{"db": dbReady, "redis": redisReady}

	if !dbReady || !redisReady {
		c.JSON(http.StatusServiceUnavailable, gin.H{"status": "not_ready", "deps": deps})
		return
	}
	c.JSON(http.StatusOK, gin.H{"status": "ready", "deps": deps})
}
// Health: GET /api/v1/health —— 聚合各依赖子系统健康,供桌面端顶栏五盏灯实时点亮。
// gateway/db/redis/nats 网关本地可判;milvus/neo4j 经 mcp-go health 工具取(不可用则置否)。
func (h *Handler) Health(c *gin.Context) {
@@ -0,0 +1,94 @@
package middleware
import (
"crypto/rand"
"encoding/hex"
"log/slog"
"os"
"strconv"
"time"
"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
)
// CtxRequestID is the gin.Context key under which the RequestID middleware
// stores the current request's ID.
const CtxRequestID = "request_id"
// ---- Prometheus metrics ----
//
// All collectors are created through promauto, which registers them with the
// default Prometheus registry when the package is initialised.

// httpRequests counts finished HTTP requests, labelled by HTTP method, route
// template and response status code.
var httpRequests = promauto.NewCounterVec(
	prometheus.CounterOpts{
		Name: "sundynix_http_requests_total",
		Help: "HTTP 请求总数(按方法/路由模板/状态码)。",
	},
	[]string{"method", "route", "status"},
)

// httpDuration is a histogram of request latency in seconds, labelled by HTTP
// method and route template. It uses the client library's default buckets.
var httpDuration = promauto.NewHistogramVec(
	prometheus.HistogramOpts{
		Name:    "sundynix_http_request_duration_seconds",
		Help:    "HTTP 请求耗时(秒)。",
		Buckets: prometheus.DefBuckets,
	},
	[]string{"method", "route"},
)

// httpInFlight is a gauge of the number of requests currently being handled.
var httpInFlight = promauto.NewGauge(
	prometheus.GaugeOpts{
		Name: "sundynix_http_requests_in_flight",
		Help: "当前处理中的 HTTP 请求数。",
	},
)
// accessLogOptions sets the minimum level of the access logger to Info.
var accessLogOptions = &slog.HandlerOptions{Level: slog.LevelInfo}

// accessLogger is the structured access logger: it writes JSON records to
// standard error.
var accessLogger = slog.New(slog.NewJSONHandler(os.Stderr, accessLogOptions))
// RequestID returns middleware that gives every request an ID for log
// correlation.
//
// A usable X-Request-ID header sent by the client is propagated unchanged;
// otherwise a fresh ID is generated with newRequestID. The chosen ID is stored
// in the gin.Context under CtxRequestID and echoed back in the X-Request-ID
// response header before the rest of the chain runs.
//
// The incoming header is client-controlled and ends up in the response and in
// every access-log record, so it is only accepted when usableRequestID approves
// it. Anything else is replaced rather than echoed.
func RequestID() gin.HandlerFunc {
	return func(c *gin.Context) {
		id := c.GetHeader("X-Request-ID")
		if !usableRequestID(id) {
			id = newRequestID()
		}
		c.Set(CtxRequestID, id)
		c.Header("X-Request-ID", id)
		c.Next()
	}
}

// usableRequestID reports whether a client-supplied request ID may be
// propagated: it must be non-empty, at most 128 bytes long, and consist solely
// of visible ASCII characters (0x21-0x7E), which excludes spaces, control
// characters and non-ASCII bytes.
func usableRequestID(id string) bool {
	const maxLen = 128
	if id == "" || len(id) > maxLen {
		return false
	}
	for i := 0; i < len(id); i++ {
		if ch := id[i]; ch < 0x21 || ch > 0x7e {
			return false
		}
	}
	return true
}
// Observe returns middleware that records Prometheus metrics and writes one
// structured access-log record per request. It is meant to sit early in the
// middleware chain so that it wraps the rest of the handlers.
//
// For each request it:
//   - tracks the in-flight gauge for the duration of the downstream chain;
//   - increments httpRequests{method,route,status};
//   - observes the latency in httpDuration{method,route};
//   - logs request_id, method, route, path, status, latency_ms, ip, uid, bytes.
//
// The route label is the gin route template (c.FullPath()), not the concrete
// URL path, to keep label cardinality bounded; requests that match no route
// are labelled "unmatched".
//
// All bookkeeping runs in a deferred function so it also happens when a
// downstream handler panics. Without that, a panic unwinding through this
// middleware would leave the in-flight gauge permanently incremented and the
// request would be missing from the counter, histogram and access log. The
// deferred function does not recover: the panic keeps propagating to whatever
// recovery middleware is installed outside this one.
func Observe() gin.HandlerFunc {
	// Status recorded for a request that panicked before any response header
	// was written. It has the value of net/http's StatusInternalServerError.
	// NOTE(review): this assumes the outer recovery middleware answers such
	// requests with 500, as gin.Recovery does by default; confirm if a custom
	// recovery handler is installed.
	const panicStatus = 500

	return func(c *gin.Context) {
		start := time.Now()
		httpInFlight.Inc()

		// Stays true unless c.Next returns normally, which lets the deferred
		// bookkeeping tell a panic apart from a regular completion.
		panicked := true
		defer func() {
			httpInFlight.Dec()

			route := c.FullPath() // route template, e.g. /tasks/:id/...
			if route == "" {
				route = "unmatched"
			}
			status := c.Writer.Status()
			if panicked && !c.Writer.Written() {
				// Nothing has been sent yet, so the writer still reports its
				// initial status rather than the error the client will get.
				status = panicStatus
			}
			dur := time.Since(start)
			method := c.Request.Method

			httpRequests.WithLabelValues(method, route, strconv.Itoa(status)).Inc()
			httpDuration.WithLabelValues(method, route).Observe(dur.Seconds())

			// Both lookups yield nil when the key was never set.
			// CtxUserID is defined elsewhere in the package; presumably it is
			// populated by the auth middleware (verify against that code).
			uid, _ := c.Get(CtxUserID)
			rid, _ := c.Get(CtxRequestID)
			accessLogger.Info("http",
				"request_id", rid,
				"method", method,
				"route", route,
				"path", c.Request.URL.Path,
				"status", status,
				"latency_ms", dur.Milliseconds(),
				"ip", c.ClientIP(),
				"uid", uid,
				"bytes", c.Writer.Size(),
			)
		}()

		c.Next()
		panicked = false
	}
}
// newRequestID returns a new request ID: 8 bytes from crypto/rand rendered as
// 16 lowercase hexadecimal characters.
func newRequestID() string {
	raw := make([]byte, 8)
	// The error from rand.Read is deliberately ignored: ID generation is
	// best-effort and must not fail the request. Bytes that were not filled
	// stay zero, so after a failed read the ID degrades to (partly) zeros.
	_, _ = rand.Read(raw)
	return hex.EncodeToString(raw)
}
@@ -0,0 +1,48 @@
package middleware
import (
"net/http"
"net/http/httptest"
"testing"
"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus/testutil"
)
// newEngine builds a test-mode gin engine with the RequestID and Observe
// middleware installed (in that order) and a single GET /ping route that
// answers 200 "pong".
func newEngine() *gin.Engine {
	gin.SetMode(gin.TestMode)

	engine := gin.New()
	engine.Use(RequestID(), Observe())
	engine.GET("/ping", func(c *gin.Context) {
		c.String(http.StatusOK, "pong")
	})
	return engine
}
// TestObserve_CountsAndRequestID sends one GET /ping through the middleware
// chain and checks that:
//   - the response is 200;
//   - an X-Request-ID header was generated and written back;
//   - the request counter for {GET, /ping, 200} grew by exactly one;
//   - the in-flight gauge is back at its pre-request value.
//
// The collectors are package-level and shared between tests, so the metric
// assertions compare against values sampled before the request instead of
// assuming the collectors start at zero.
func TestObserve_CountsAndRequestID(t *testing.T) {
	r := newEngine()

	counter := httpRequests.WithLabelValues(http.MethodGet, "/ping", "200")
	before := testutil.ToFloat64(counter)
	inFlightBefore := testutil.ToFloat64(httpInFlight)

	w := httptest.NewRecorder()
	r.ServeHTTP(w, httptest.NewRequest(http.MethodGet, "/ping", nil))

	if w.Code != http.StatusOK {
		t.Fatalf("状态码=%d", w.Code)
	}
	if w.Header().Get("X-Request-ID") == "" {
		t.Error("应自动生成并回写 X-Request-ID")
	}
	if after := testutil.ToFloat64(counter); after != before+1 {
		t.Errorf("请求计数应 +1,before=%v after=%v", before, after)
	}
	if inFlightAfter := testutil.ToFloat64(httpInFlight); inFlightAfter != inFlightBefore {
		t.Errorf("in-flight gauge should return to %v after the request, got %v", inFlightBefore, inFlightAfter)
	}
}
// TestRequestID_PropagatesIncoming checks that an X-Request-ID supplied by the
// client is echoed back unchanged in the response header instead of being
// replaced by a generated one.
func TestRequestID_PropagatesIncoming(t *testing.T) {
	const incoming = "trace-abc-123"

	r := newEngine()
	w := httptest.NewRecorder()
	req := httptest.NewRequest(http.MethodGet, "/ping", nil)
	req.Header.Set("X-Request-ID", incoming)
	r.ServeHTTP(w, req)

	if got := w.Header().Get("X-Request-ID"); got != incoming {
		t.Errorf("应透传入站 X-Request-ID,got %q", got)
	}
}
+12 -2
View File
@@ -5,6 +5,7 @@ import (
"os"
"github.com/gin-gonic/gin"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/sundynix/sundynix-gateway/internal/blob"
"github.com/sundynix/sundynix-gateway/internal/handler"
@@ -15,13 +16,22 @@ import (
// New 构建带有 Guardrail / 限流中间件的 Gin 引擎。
func New(db *store.Postgres, cache *store.Redis, bus *nats.Bus, blobStore *blob.Store) *gin.Engine {
r := gin.Default()
r.Use(cors()) // 桌面端/浏览器跨源访问(开发期放开)
r := gin.New()
r.Use(gin.Recovery()) // panic 兜底
r.Use(middleware.RequestID()) // 生成/透传 X-Request-ID(日志关联)
r.Use(middleware.Observe()) // Prometheus 指标 + 结构化访问日志(替代 gin 默认文本日志)
r.Use(cors()) // 桌面端/浏览器跨源访问
r.Use(middleware.RateLimit(cache))
r.Use(middleware.Auth()) // 解析 Bearer JWT,注入已验证 userID(非阻断)
r.Use(middleware.Guardrail()) // Harness: Input Guardrail
h := handler.New(db, cache, bus, blobStore)
// 可观测性根端点:Prometheus 抓取 + k8s 存活/就绪探针(不挂业务中间件鉴权)。
r.GET("/metrics", gin.WrapH(promhttp.Handler()))
r.GET("/healthz", h.Healthz)
r.GET("/readyz", h.Readyz)
api := r.Group("/api/v1")
{
// —— 公开:鉴权端点 / 健康 / 按 task_id 寻址的 SSE 与导出(EventSource/下载无法带 Bearer)——