feat: 文件入库 — docx/xlsx/pdf/csv 经 mcp-py 解析 → RAG

入库从纯文本升级为多文件类型:解析(mcp-py 算法层)与切块/embedding 解耦。
上传文件 → Gateway 按类型路由 → mcp-py parse_document 解析为文本 → kb_ingest。

- mcp-py: parsers.py(docx=python-docx / xlsx=openpyxl / pdf=pypdf / csv / txt→文本);
  parse_document 工具改为真实实现(base64 文件→文本,CPU 密集的解析放到线程池执行);pyproject 新增依赖
- gateway: POST /api/v1/kb/ingest_file(multipart);parseFile 对文本类文件直接读取,office/pdf 转交 mcp-py 解析
- nats-server.conf: max_payload 8MB(容纳 base64 文件经工具调用;大文件应走对象存储)
- frontend: KbView 加文件上传(accept docx/xlsx/pdf/csv...);api.ingestFile
- 验证: 全模块 build✓ + e2e PASS; live——4 类文件上传→mcp-py 解析→入库→检索命中:
  docx(营收报告)/xlsx(销量表行)/pdf(Q2计划)/csv(城市人口) 全部正确
- 边界: 扫描件/版面 OCR(MinerU/PaddleOCR)推迟;大文件 base64 走 NATS 受 max_payload
  限,生产应走对象存储(MinIO)

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
Blizzard
2026-06-11 10:10:07 +08:00
parent 85a5c2c1e7
commit 3550a22557
8 changed files with 198 additions and 14 deletions
+11
View File
@@ -59,6 +59,17 @@ export async function ingestKb(kb: string, text: string): Promise<string> {
return data.message ?? "ok";
}
/**
 * Uploads a file for knowledge-base ingestion via
 * `POST /api/v1/kb/ingest_file` (multipart form data).
 *
 * The form carries two fields: `kb` (target knowledge base) and `file`.
 * Per the accompanying change description, the gateway reads text-like files
 * directly and forwards docx/xlsx/pdf to mcp-py for parsing.
 *
 * @param kb - Knowledge-base identifier, sent as the `kb` form field.
 * @param file - File to upload, sent as the `file` form field.
 * @returns A summary line built from the file name and the response's
 *   `chars` (default 0) and `message` (default "ok") fields.
 * @throws Error carrying the response's `error` field when present, otherwise
 *   `ingest file failed: <status>` for non-OK responses; also thrown when a
 *   successful response does not contain a JSON body.
 */
export async function ingestFile(kb: string, file: File): Promise<string> {
  const fd = new FormData();
  fd.append("kb", kb);
  fd.append("file", file);
  const res = await fetch(`${GATEWAY}/api/v1/kb/ingest_file`, { method: "POST", body: fd });

  // Error responses from proxies (e.g. 413/502) are often HTML or plain text.
  // Parse defensively so the status-based error below is not masked by a
  // JSON SyntaxError.
  let data: { message?: string; chars?: number; error?: string } | null = null;
  try {
    data = (await res.json()) as { message?: string; chars?: number; error?: string } | null;
  } catch {
    data = null;
  }

  if (!res.ok) throw new Error(data?.error ?? `ingest file failed: ${res.status}`);
  if (data == null) throw new Error("ingest file failed: response is not valid JSON");
  return `${file.name}:解析 ${data.chars ?? 0} 字 → ${data.message ?? "ok"}`;
}
export interface KbHit {
text: string;
score: number;
+37 -9
View File
@@ -1,5 +1,5 @@
import { useState } from "react";
import { ingestKb, searchKb, type KbHit } from "../lib/api";
import { useRef, useState } from "react";
import { ingestKb, ingestFile, searchKb, type KbHit } from "../lib/api";
interface IngestLog {
t: string;
@@ -14,6 +14,8 @@ export function KbView() {
const [logs, setLogs] = useState<IngestLog[]>([]);
const [ingesting, setIngesting] = useState(false);
const fileRef = useRef<HTMLInputElement>(null);
const [q, setQ] = useState("");
const [topK, setTopK] = useState(5);
const [hits, setHits] = useState<KbHit[] | null>(null);
@@ -36,6 +38,20 @@ export function KbView() {
}
};
// onFile: handles a file picked in the upload input. Ingests it into the
// current knowledge base, prepends a success or failure entry to the log list,
// and finally clears the busy flag and the input's value.
const onFile = async (file?: File) => {
  if (!file) return;

  // Newest entries go first; the timestamp is taken when the updater runs.
  const prependLog = (msg: string, ok: boolean) =>
    setLogs((prev) => [{ t: stamp(), msg, ok }, ...prev]);

  setIngesting(true);
  try {
    const summary = await ingestFile(kb, file);
    prependLog(summary, true);
  } catch (err) {
    prependLog(`${file.name}: ${(err as Error).message}`, false);
  } finally {
    setIngesting(false);
    // Clear the input's value after every attempt, success or failure.
    const input = fileRef.current;
    if (input) input.value = "";
  }
};
const onSearch = async () => {
if (!q.trim()) return;
setSearching(true);
@@ -74,13 +90,25 @@ export function KbView() {
onChange={(e) => setText(e.target.value)}
placeholder={"每行一条知识,例如:\nsundynix 用 Milvus 做向量库\nsundynix 用 NATS 做消息总线"}
/>
<button
onClick={onIngest}
disabled={ingesting || !text.trim()}
className="mt-2 self-start rounded bg-emerald-600 px-3 py-1 text-sm text-white disabled:opacity-40"
>
{ingesting ? "入库中…" : "⬆ 入库"}
</button>
<div className="mt-2 flex items-center gap-2">
<button
onClick={onIngest}
disabled={ingesting || !text.trim()}
className="rounded bg-emerald-600 px-3 py-1 text-sm text-white disabled:opacity-40"
>
{ingesting ? "入库中…" : "⬆ 入库文本"}
</button>
<span className="text-[11px] text-gray-400"></span>
<input
ref={fileRef}
type="file"
accept=".txt,.md,.csv,.docx,.xlsx,.pdf"
onChange={(e) => onFile(e.target.files?.[0])}
disabled={ingesting}
className="text-xs file:mr-2 file:rounded file:border file:bg-gray-50 file:px-2 file:py-1 file:text-xs"
/>
</div>
<span className="mt-1 text-[10px] text-gray-400"> txt/md/csv/docx/xlsx/pdfdocx/xlsx/pdf mcp-py </span>
<h3 className="mb-1 mt-4 text-xs font-semibold text-gray-600"></h3>
<ul className="flex-1 space-y-1 overflow-auto">
{logs.length === 0 && <li className="text-xs text-gray-400"></li>}