3550a22557
入库从纯文本升级为多文件类型:解析(mcp-py 算法层)与切块/embedding 解耦。 上传文件 → Gateway 按类型路由 → mcp-py parse_document 解析为文本 → kb_ingest。 - mcp-py: parsers.py(docx=python-docx / xlsx=openpyxl / pdf=pypdf / csv / txt→文本); parse_document 工具做真(base64 文件→文本,线程池跑 CPU 密集解析);pyproject 加依赖 - gateway: POST /api/v1/kb/ingest_file(multipart);parseFile 文本类直读、office/pdf→mcp-py - nats-server.conf: max_payload 8MB(容纳 base64 文件经工具调用;大文件应走对象存储) - frontend: KbView 加文件上传(accept docx/xlsx/pdf/csv...);api.ingestFile - 验证: 全模块 build✓ + e2e PASS; live——4 类文件上传→mcp-py 解析→入库→检索命中: docx(营收报告)/xlsx(销量表行)/pdf(Q2计划)/csv(城市人口) 全部正确 - 边界: 扫描件/版面 OCR(MinerU/PaddleOCR)推迟;大文件 base64 走 NATS 受 max_payload 限,生产应走对象存储(MinIO) Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
131 lines
4.1 KiB
Go
131 lines
4.1 KiB
Go
package handler
|
||
|
||
import (
|
||
"context"
|
||
"encoding/base64"
|
||
"encoding/json"
|
||
"errors"
|
||
"io"
|
||
"net/http"
|
||
"path/filepath"
|
||
"strings"
|
||
|
||
"github.com/gin-gonic/gin"
|
||
|
||
"github.com/sundynix/sundynix-shared/contract"
|
||
)
|
||
|
||
// KbIngest: POST /api/v1/kb/ingest —— 把文本入库到知识库(→ mcp-go kb_ingest → 切块/embedding/Milvus)。
|
||
// 供知识库管理页/脚本调用。
|
||
func (h *Handler) KbIngest(c *gin.Context) {
|
||
var body struct {
|
||
KB string `json:"kb"`
|
||
Text string `json:"text"`
|
||
}
|
||
if err := c.ShouldBindJSON(&body); err != nil || body.Text == "" {
|
||
c.JSON(http.StatusBadRequest, gin.H{"error": "text required"})
|
||
return
|
||
}
|
||
res, err := h.bus.CallTool(c.Request.Context(), contract.ToolSubjectGo("kb_ingest"),
|
||
&contract.ToolCall{Tool: "kb_ingest", Args: map[string]any{"kb": body.KB, "text": body.Text}})
|
||
if err != nil {
|
||
c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
|
||
return
|
||
}
|
||
if !res.OK {
|
||
c.JSON(http.StatusUnprocessableEntity, gin.H{"error": res.Error})
|
||
return
|
||
}
|
||
c.JSON(http.StatusOK, gin.H{"status": "ok", "message": res.Content})
|
||
}
|
||
|
||
// KbIngestFile: POST /api/v1/kb/ingest_file(multipart)—— 上传文件入库。
|
||
// 按类型路由:文本直读;docx/xlsx/pdf/csv → mcp-py parse_document 解析为文本 → kb_ingest。
|
||
func (h *Handler) KbIngestFile(c *gin.Context) {
|
||
kb := c.PostForm("kb")
|
||
fh, err := c.FormFile("file")
|
||
if err != nil {
|
||
c.JSON(http.StatusBadRequest, gin.H{"error": "file required"})
|
||
return
|
||
}
|
||
f, err := fh.Open()
|
||
if err != nil {
|
||
c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
|
||
return
|
||
}
|
||
defer f.Close()
|
||
data, err := io.ReadAll(f)
|
||
if err != nil {
|
||
c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
|
||
return
|
||
}
|
||
text, err := h.parseFile(c.Request.Context(), fh.Filename, data)
|
||
if err != nil {
|
||
c.JSON(http.StatusUnprocessableEntity, gin.H{"error": "解析失败: " + err.Error()})
|
||
return
|
||
}
|
||
res, err := h.bus.CallTool(c.Request.Context(), contract.ToolSubjectGo("kb_ingest"),
|
||
&contract.ToolCall{Tool: "kb_ingest", Args: map[string]any{"kb": kb, "text": text}})
|
||
if err != nil {
|
||
c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
|
||
return
|
||
}
|
||
if !res.OK {
|
||
c.JSON(http.StatusUnprocessableEntity, gin.H{"error": res.Error})
|
||
return
|
||
}
|
||
c.JSON(http.StatusOK, gin.H{"status": "ok", "file": fh.Filename, "chars": len([]rune(text)), "message": res.Content})
|
||
}
|
||
|
||
// parseFile 把文件字节转为纯文本:文本类直读,其余经 mcp-py parse_document(算法层)。
|
||
func (h *Handler) parseFile(ctx context.Context, filename string, data []byte) (string, error) {
|
||
switch strings.ToLower(filepath.Ext(filename)) {
|
||
case ".txt", ".md", ".markdown", ".text":
|
||
return string(data), nil
|
||
}
|
||
res, err := h.bus.CallTool(ctx, contract.ToolSubjectPy("parse_document"),
|
||
&contract.ToolCall{Tool: "parse_document", Args: map[string]any{
|
||
"filename": filename, "content_b64": base64.StdEncoding.EncodeToString(data),
|
||
}})
|
||
if err != nil {
|
||
return "", err
|
||
}
|
||
if res == nil || !res.OK {
|
||
if res != nil && res.Error != "" {
|
||
return "", errors.New(res.Error)
|
||
}
|
||
return "", errors.New("parse_document 无响应(mcp-py 未运行?)")
|
||
}
|
||
return res.Content, nil
|
||
}
|
||
|
||
// KbSearch: POST /api/v1/kb/search —— 检索台:查某知识库,返回带分数的命中(→ mcp-go kb_search)。
|
||
func (h *Handler) KbSearch(c *gin.Context) {
|
||
var body struct {
|
||
KB string `json:"kb"`
|
||
Q string `json:"q"`
|
||
TopK int `json:"topK"`
|
||
}
|
||
if err := c.ShouldBindJSON(&body); err != nil || body.Q == "" {
|
||
c.JSON(http.StatusBadRequest, gin.H{"error": "q required"})
|
||
return
|
||
}
|
||
args := map[string]any{"kb": body.KB, "q": body.Q}
|
||
if body.TopK > 0 {
|
||
args["topK"] = body.TopK
|
||
}
|
||
res, err := h.bus.CallTool(c.Request.Context(), contract.ToolSubjectGo("kb_search"),
|
||
&contract.ToolCall{Tool: "kb_search", Args: args})
|
||
if err != nil {
|
||
c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
|
||
return
|
||
}
|
||
if !res.OK {
|
||
c.JSON(http.StatusUnprocessableEntity, gin.H{"error": res.Error})
|
||
return
|
||
}
|
||
var hits []map[string]any
|
||
_ = json.Unmarshal([]byte(res.Content), &hits)
|
||
c.JSON(http.StatusOK, gin.H{"hits": hits})
|
||
}
|