feat: gitignore

This commit is contained in:
Blizzard
2026-04-28 10:46:56 +08:00
parent f62f95ec02
commit 473f9226d3
129 changed files with 695 additions and 10631 deletions
+62 -24
View File
@@ -328,20 +328,15 @@ async def get_material_content(file_id: str):
if file_type == "excel":
# Return structured table data for rich rendering
from parsers.excel_parser import _prepare_grid, _cell_str
from openpyxl import load_workbook
wb = load_workbook(file_path, data_only=True)
from parsers.excel_parser import _iter_sheets, _cell_str
sheets = []
for sheet_name in wb.sheetnames:
ws = wb[sheet_name]
grid = _prepare_grid(ws)
for sheet_name, grid, _ in _iter_sheets(file_path):
if not grid:
continue
rows = []
for row in grid:
rows.append([_cell_str(c) for c in row])
sheets.append({"name": sheet_name, "rows": rows})
wb.close()
return {"type": "excel", "sheets": sheets}
else:
# For non-excel: return cached markdown or re-parse
@@ -882,39 +877,82 @@ def _get_embedding_config() -> dict | None:
def _search_material_context(query: str, file_ids: list[str], top_k: int) -> str:
    """Hybrid retrieval over project materials: vector + keyword search.

    Vector search captures semantic similarity; keyword search captures
    exact term matches regardless of word order — together they fix the
    issue where '水域集体所有' hits but '集体所有水域' misses.

    Args:
        query: Free-text search query.
        file_ids: Restrict the search to these material file ids; an
            empty list (or None) means "search all files".
        top_k: Maximum number of chunks returned after merging.

    Returns:
        The matched chunk texts joined by a blank-line "---" separator,
        or "" when no project is selected or nothing matched.
    """
    pid = db.current_project_id
    if not pid:
        return ""

    vector_chunks: list[dict] = []
    keyword_chunks: list[dict] = []

    # ── Vector search ──
    emb_cfg = _get_embedding_config()
    if emb_cfg and vector_store.connected:
        try:
            import asyncio
            import concurrent.futures

            chunks_coro = rag_service.search_context(
                pid, query, top_k, emb_cfg, file_ids or None,
            )
            # This sync helper is called from async routes, so the caller's
            # event loop is already running; drive the coroutine on a fresh
            # loop in a worker thread via asyncio.run instead.
            with concurrent.futures.ThreadPoolExecutor() as pool:
                future = pool.submit(asyncio.run, chunks_coro)
                vector_chunks = future.result()
        except Exception as e:
            # Best-effort: the keyword search below still runs on failure.
            logger.warning("Vector search failed: %s", e)

    # ── Keyword search (always run in parallel) ──
    session = db.project_session()
    if session:
        with session as s:
            keyword_chunks = search_text_chunks_keyword(
                s, pid, query, file_ids or None, top_k,
            )

    # ── Merge and deduplicate ──
    merged = _merge_search_results(vector_chunks, keyword_chunks, top_k)
    if not merged:
        return ""
    return "\n\n---\n\n".join(c.get("text", "") for c in merged)
def _merge_search_results(vector_chunks: list[dict],
keyword_chunks: list[dict],
top_k: int) -> list[dict]:
"""Merge vector and keyword results, deduplicate by text content.
Priority: vector results first (semantically ranked), then keyword
results that weren't already found by vector search.
"""
seen_texts: set = set()
merged: list[dict] = []
def _text_key(text: str) -> str:
"""Normalize text for dedup: strip whitespace, take first 80 chars."""
return text.strip()[:80] if text else ""
# Vector results first (higher priority)
for c in vector_chunks:
key = _text_key(c.get("text", ""))
if key and key not in seen_texts:
seen_texts.add(key)
merged.append(c)
# Keyword results fill remaining slots
for c in keyword_chunks:
if len(merged) >= top_k:
break
key = _text_key(c.get("text", ""))
if key and key not in seen_texts:
seen_texts.add(key)
merged.append(c)
return merged[:top_k]
async def _parse_and_index(project_id: str, file_id: str, file_name: str,