feat: 文件入库 — docx/xlsx/pdf/csv 经 mcp-py 解析 → RAG

入库从纯文本升级为多文件类型:解析(mcp-py 算法层)与切块/embedding 解耦。
上传文件 → Gateway 按类型路由 → mcp-py parse_document 解析为文本 → kb_ingest。

- mcp-py: parsers.py(docx=python-docx / xlsx=openpyxl / pdf=pypdf / csv / txt→文本);
  parse_document 工具由桩改为真实实现(base64 文件→文本,线程池跑 CPU 密集解析);pyproject 加依赖
- gateway: POST /api/v1/kb/ingest_file(multipart);parseFile 文本类直读、office/pdf→mcp-py
- nats-server.conf: max_payload 8MB(容纳 base64 文件经工具调用;大文件应走对象存储)
- frontend: KbView 加文件上传(accept docx/xlsx/pdf/csv...);api.ingestFile
- 验证: 全模块 build✓ + e2e PASS; live——4 类文件上传→mcp-py 解析→入库→检索命中:
  docx(营收报告)/xlsx(销量表行)/pdf(Q2计划)/csv(城市人口) 全部正确
- 边界: 扫描件/版面 OCR(MinerU/PaddleOCR)推迟;大文件 base64 走 NATS 受 max_payload
  限,生产应走对象存储(MinIO)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Blizzard
2026-06-11 10:10:07 +08:00
parent 85a5c2c1e7
commit 3550a22557
8 changed files with 198 additions and 14 deletions
+4 -1
View File
@@ -7,7 +7,10 @@ dependencies = [
"mcp>=1.2.0", # MCP 协议
"nats-py>=2.7.0", # 接入 NATS 骨干网
"docker>=7.1.0", # Docker 隔离沙箱 / Code Interpreter
# "magic-pdf", # MinerU 多模态解析 (PaddleOCR),按需安装
"python-docx>=1.1.0", # Word 解析
"openpyxl>=3.1.0", # Excel 解析
"pypdf>=4.0.0", # PDF 文本解析
# "magic-pdf", # MinerU 多模态解析 (PaddleOCR),扫描件 OCR,按需安装
]
[build-system]
@@ -98,9 +98,18 @@ class McpGateway:
return f"[run_code] Docker 隔离执行(桩) stdout={result.get('stdout','')!r}"
async def _parse_document(self, args: dict) -> str:
path = str(args.get("path", ""))
result = await self.parser.parse(path) # MinerU / PaddleOCR(桩)
return f"[parse_document] MinerU 解析(桩) path={result.get('path','')!r} blocks={len(result.get('blocks', []))}"
"""文件 → 纯文本。content_b64=文件内容(base64)filename 决定解析器。"""
import base64
from . import parsers
filename = str(args.get("filename", ""))
content_b64 = str(args.get("content_b64", ""))
if not content_b64:
return str(args.get("text", ""))
data = base64.b64decode(content_b64)
# 解析是 CPU 密集,丢到线程池避免阻塞事件循环。
return await asyncio.to_thread(parsers.parse, filename, data)
async def _secure_sandbox(self, args: dict) -> str:
code = str(args.get("code", ""))
@@ -0,0 +1,65 @@
"""文档解析:各类文件 → 纯文本(供 RAG 切块/embedding)。
按扩展名路由:txt/md 直读;csv 经 csv 模块按行解析;docx(python-docx)/ xlsx(openpyxl)/ pdf(pypdf)。
扫描件/版面 OCR(MinerU/PaddleOCR) 为后续。
"""
from __future__ import annotations
import csv
import io
def parse(filename: str, data: bytes) -> str:
    """Convert a file's raw bytes to plain text, picking a parser by extension.

    The extension is whatever follows the last ``.`` in ``filename``,
    compared case-insensitively. A name without any ``.`` is treated as
    plain text.

    Args:
        filename: Name of the uploaded file; only its extension is used.
        data: Raw file content.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the extension is not one of txt/md/markdown/text,
            csv, docx, xlsx or pdf.
    """
    name = filename.strip().lower()
    ext = name.rsplit(".", 1)[-1] if "." in name else ""
    if ext in ("txt", "md", "markdown", "text", ""):
        # utf-8-sig drops a leading BOM (common in files saved by Windows
        # editors) and otherwise decodes exactly like utf-8.
        return data.decode("utf-8-sig", errors="replace")
    if ext == "csv":
        return _csv(data)
    if ext == "docx":
        return _docx(data)
    if ext == "xlsx":
        return _xlsx(data)
    if ext == "pdf":
        return _pdf(data)
    raise ValueError(f"暂不支持的文件类型: .{ext}")
def _csv(data: bytes) -> str:
text = data.decode("utf-8", errors="replace")
rows = list(csv.reader(io.StringIO(text)))
return "\n".join(" | ".join(r) for r in rows if any(c.strip() for c in r))
def _docx(data: bytes) -> str:
    """Extract text from a .docx file held in memory.

    All non-blank paragraphs are emitted first, then every table row as one
    line with its stripped cell texts joined by " | ". Rows whose cells are
    all empty are skipped.

    Args:
        data: Raw .docx content.

    Returns:
        The collected lines joined by newlines.
    """
    from docx import Document  # python-docx

    document = Document(io.BytesIO(data))
    out: list[str] = []
    for para in document.paragraphs:
        # Blank-ness is tested on the stripped text, but the text is kept
        # as-is when emitted.
        if para.text.strip():
            out.append(para.text)
    for tbl in document.tables:
        for tr in tbl.rows:
            texts = [cell.text.strip() for cell in tr.cells]
            if any(texts):
                out.append(" | ".join(texts))
    return "\n".join(out)
def _xlsx(data: bytes) -> str:
    """Extract text from an .xlsx workbook held in memory.

    Each worksheet contributes a "# 工作表: <title>" header line followed by
    one line per non-empty row, with the row's non-None cell values
    stringified and joined by " | ".

    Args:
        data: Raw .xlsx content.

    Returns:
        The collected lines joined by newlines.
    """
    from openpyxl import load_workbook

    # data_only=True asks openpyxl for cell values rather than formula text.
    wb = load_workbook(io.BytesIO(data), read_only=True, data_only=True)
    lines: list[str] = []
    try:
        for ws in wb.worksheets:
            lines.append(f"# 工作表: {ws.title}")
            for row in ws.iter_rows(values_only=True):
                cells = [str(c) for c in row if c is not None]
                if cells:
                    lines.append(" | ".join(cells))
    finally:
        # openpyxl requires read-only workbooks to be closed explicitly.
        wb.close()
    return "\n".join(lines)
def _pdf(data: bytes) -> str:
    """Extract the text layer from a PDF held in memory.

    Each page's extracted text is stripped; pages that yield no text are
    skipped.

    Args:
        data: Raw PDF content.

    Returns:
        The remaining page texts separated by a blank line.
    """
    from pypdf import PdfReader

    reader = PdfReader(io.BytesIO(data))
    kept: list[str] = []
    for page in reader.pages:
        # A falsy extraction result is treated as an empty page.
        text = (page.extract_text() or "").strip()
        if text:
            kept.append(text)
    return "\n\n".join(kept)