feat: 文件入库 — docx/xlsx/pdf/csv 经 mcp-py 解析 → RAG
入库从纯文本升级为多文件类型:解析(mcp-py 算法层)与切块/embedding 解耦。 上传文件 → Gateway 按类型路由 → mcp-py parse_document 解析为文本 → kb_ingest。 - mcp-py: parsers.py(docx=python-docx / xlsx=openpyxl / pdf=pypdf / csv / txt→文本); parse_document 工具做真(base64 文件→文本,线程池跑 CPU 密集解析);pyproject 加依赖 - gateway: POST /api/v1/kb/ingest_file(multipart);parseFile 文本类直读、office/pdf→mcp-py - nats-server.conf: max_payload 8MB(容纳 base64 文件经工具调用;大文件应走对象存储) - frontend: KbView 加文件上传(accept docx/xlsx/pdf/csv...);api.ingestFile - 验证: 全模块 build✓ + e2e PASS; live——4 类文件上传→mcp-py 解析→入库→检索命中: docx(营收报告)/xlsx(销量表行)/pdf(Q2计划)/csv(城市人口) 全部正确 - 边界: 扫描件/版面 OCR(MinerU/PaddleOCR)推迟;大文件 base64 走 NATS 受 max_payload 限,生产应走对象存储(MinIO) Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -1,8 +1,14 @@
|
||||
package handler
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/base64"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"net/http"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/gin-gonic/gin"
|
||||
|
||||
@@ -33,6 +39,66 @@ func (h *Handler) KbIngest(c *gin.Context) {
|
||||
c.JSON(http.StatusOK, gin.H{"status": "ok", "message": res.Content})
|
||||
}
|
||||
|
||||
// KbIngestFile: POST /api/v1/kb/ingest_file(multipart)—— 上传文件入库。
|
||||
// 按类型路由:文本直读;docx/xlsx/pdf/csv → mcp-py parse_document 解析为文本 → kb_ingest。
|
||||
func (h *Handler) KbIngestFile(c *gin.Context) {
|
||||
kb := c.PostForm("kb")
|
||||
fh, err := c.FormFile("file")
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": "file required"})
|
||||
return
|
||||
}
|
||||
f, err := fh.Open()
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
defer f.Close()
|
||||
data, err := io.ReadAll(f)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
text, err := h.parseFile(c.Request.Context(), fh.Filename, data)
|
||||
if err != nil {
|
||||
c.JSON(http.StatusUnprocessableEntity, gin.H{"error": "解析失败: " + err.Error()})
|
||||
return
|
||||
}
|
||||
res, err := h.bus.CallTool(c.Request.Context(), contract.ToolSubjectGo("kb_ingest"),
|
||||
&contract.ToolCall{Tool: "kb_ingest", Args: map[string]any{"kb": kb, "text": text}})
|
||||
if err != nil {
|
||||
c.JSON(http.StatusBadGateway, gin.H{"error": err.Error()})
|
||||
return
|
||||
}
|
||||
if !res.OK {
|
||||
c.JSON(http.StatusUnprocessableEntity, gin.H{"error": res.Error})
|
||||
return
|
||||
}
|
||||
c.JSON(http.StatusOK, gin.H{"status": "ok", "file": fh.Filename, "chars": len([]rune(text)), "message": res.Content})
|
||||
}
|
||||
|
||||
// parseFile 把文件字节转为纯文本:文本类直读,其余经 mcp-py parse_document(算法层)。
|
||||
func (h *Handler) parseFile(ctx context.Context, filename string, data []byte) (string, error) {
|
||||
switch strings.ToLower(filepath.Ext(filename)) {
|
||||
case ".txt", ".md", ".markdown", ".text":
|
||||
return string(data), nil
|
||||
}
|
||||
res, err := h.bus.CallTool(ctx, contract.ToolSubjectPy("parse_document"),
|
||||
&contract.ToolCall{Tool: "parse_document", Args: map[string]any{
|
||||
"filename": filename, "content_b64": base64.StdEncoding.EncodeToString(data),
|
||||
}})
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
if res == nil || !res.OK {
|
||||
if res != nil && res.Error != "" {
|
||||
return "", errors.New(res.Error)
|
||||
}
|
||||
return "", errors.New("parse_document 无响应(mcp-py 未运行?)")
|
||||
}
|
||||
return res.Content, nil
|
||||
}
|
||||
|
||||
// KbSearch: POST /api/v1/kb/search —— 检索台:查某知识库,返回带分数的命中(→ mcp-go kb_search)。
|
||||
func (h *Handler) KbSearch(c *gin.Context) {
|
||||
var body struct {
|
||||
|
||||
Reference in New Issue
Block a user