feat(dispatcher): LLM 自动化评测落地（规则 + LLM-as-judge）+ 单测

Evaluator 此前是空桩（Score 恒返回 0）且未接线。本次落地为真实的自动化评测并完成接入：

- 规则评测（always-on，纯函数）：空输出/过短/疑似拒答/重复啰嗦各扣分 → 0–1 分 + 标签。
- LLM-as-judge（模型就绪时）：让模型对（输入, 输出）按相关性/准确性/完整性 1–5 打分并给出理由，归一化后与规则分加权（0.4 规则 + 0.6 LLM）；解析失败/无模型则回退为纯规则分。
- 经注入 ready/chat 解耦 LLM 后端，便于单测（无需真实模型）。
- 接线：orchestrator 在答复产出后 `go o.evaluate(...)` 异步评分并记日志（不在热路径上，不影响响应与流式）；main.go 用 pool.Ready/pool.Chat 构造 Evaluator。

测试：规则各情形（正常/空/过短/拒答/重复）、纯规则模式、LLM-judge（带围栏 JSON 解析 + 归一化 + 加权）、坏 JSON 回退 —— 全过。

至此 Harness 三件：熔断降级 ✅ · 输入护栏 ✅ · LLM 自动化评测 ✅（输出护栏待 emit 层）。

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-17 15:32:02 +08:00
parent e63632adf5
commit 3ae009db38
5 changed files with 247 additions and 17 deletions
@@ -20,6 +20,10 @@ func main() {

 	pool := llm.NewPool()                  // LLM Pool: vLLM / Ollama 集群
 	breaker := harness.NewCircuitBreaker() // Harness: 熔断降级中心
+	// Harness: LLM 自动化评测（规则 + LLM-as-judge，模型就绪时启用）。
+	eval := harness.NewEvaluator(pool.Ready, func(ctx context.Context, sys, user string) (string, error) {
+		return pool.Chat(ctx, []llm.ChatMessage{{Role: "system", Content: sys}, {Role: "user", Content: user}})
+	})

 	sub := dnats.MustConnect(natsURL)
 	defer sub.Close()
@@ -37,7 +41,7 @@ func main() {
 	}

 	// sub 同时作为 Token 回流出口（TokenSink）、MCP 工具调用出口（ToolCaller）与执行事件出口（ExecSink）。
-	orch, err := eino.NewOrchestrator(pool, breaker, sub, sub, sub)
+	orch, err := eino.NewOrchestrator(pool, breaker, eval, sub, sub, sub)
 	if err != nil {
 		log.Fatalf("[dispatcher] build eino graph: %v", err)
 	}