feat: gitignore
+83 -10
@@ -121,7 +121,11 @@ class VectorStore:
         logger.info("Created collection %s (dim=%d)", name, dim)
 
     def insert(self, project_id: str, chunks: list[dict]):
-        """Insert chunks: [{'id': str, 'source_id': str, 'text': str, 'vector': list, 'metadata': dict?}]."""
+        """Insert chunks: [{'id': str, 'source_id': str, 'text': str, 'vector': list, 'metadata': dict?}].
+
+        Metadata keys 'sheet', 'row_number', 'file_name' are promoted to
+        payload top-level for Qdrant filter support.
+        """
         if not self._client:
             return
         name = self.collection_name(project_id)
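For context, a minimal sketch of the chunk shape this docstring describes. The `store` instance, IDs, embedding dimension, and metadata values below are illustrative, not taken from this commit:

```python
# Hypothetical caller; assumes `store` is a configured VectorStore and the
# embedding dimension matches the collection (768 here is only an example).
chunks = [
    {
        "id": "7c9e6679-7425-40de-944b-e07fc1f90ae7",   # point UUID
        "source_id": "file-123",                        # originating upload
        "text": "Foundation F-01, concrete grade C40",
        "vector": [0.01] * 768,                         # document embedding
        "metadata": {"sheet": "Sheet1", "row_number": 12, "file_name": "foundations.xlsx"},
    },
]
store.insert(project_id="demo-project", chunks=chunks)
```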
@@ -129,7 +133,12 @@ class VectorStore:
         for c in chunks:
             payload = {"text": c["text"], "source_id": c["source_id"]}
             if "metadata" in c:
-                payload["metadata"] = c["metadata"]
+                meta = c["metadata"]
+                # Promote key fields to top level for Qdrant filtering
+                for key in ("sheet", "row_number", "file_name"):
+                    if key in meta:
+                        payload[key] = meta[key]
+                payload["metadata"] = meta
             points.append(PointStruct(id=c["id"], vector=c["vector"], payload=payload))
         self._client.upsert(collection_name=name, points=points)
 
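Promoting `sheet`, `row_number`, and `file_name` to the payload top level is what makes them addressable by Qdrant payload filters. A minimal sketch of the benefit, assuming a locally running Qdrant and whatever name `collection_name(project_id)` produces; the collection name, dimension, and values are illustrative:

```python
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, Filter, MatchValue

client = QdrantClient(host="localhost", port=6333)

# Restrict the vector search to points that came from one worksheet.
# "sheet" is only matchable because insert() copied it to the payload top level.
hits = client.search(
    collection_name="proj_demo",                 # illustrative collection name
    query_vector=[0.01] * 768,                   # embedding of the user query
    query_filter=Filter(must=[
        FieldCondition(key="sheet", match=MatchValue(value="Sheet1")),
    ]),
    limit=5,
)
for h in hits:
    print(h.score, h.payload.get("file_name"), h.payload["text"][:40])
```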
@@ -250,11 +259,17 @@ class RAGService:
         return self.store.search(project_id, query_vec, top_k, file_ids)
 
 
-# ────────── SQLite fallback keyword search ──────────
+# ────────── SQLite keyword search (scored ranking) ──────────
 
 def search_text_chunks_keyword(session, project_id: str, query: str,
                                file_ids: list[str] | None, top_k: int) -> list[dict]:
-    """Keyword-based fallback when vector search is unavailable."""
+    """Keyword search with scored ranking.
+
+    1. Extract keywords from query (Chinese-aware n-gram splitting)
+    2. Fetch chunks matching ANY keyword (OR, wide net)
+    3. Score each chunk by how many keywords it contains
+    4. Return top_k sorted by score descending
+    """
     from models import TextChunk
 
     keywords = _extract_keywords(query)
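A hedged usage sketch of the reworked fallback, assuming a SQLAlchemy engine pointing at the app database and that chunks are already persisted as `models.TextChunk` rows; the database URL, project ID, and query are illustrative:

```python
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

engine = create_engine("sqlite:///app.db")   # illustrative path

with Session(engine) as session:
    results = search_text_chunks_keyword(
        session,
        project_id="demo-project",
        query="风机基础一共用了多少混凝土?",   # "How much concrete did the turbine foundations use in total?"
        file_ids=None,                         # or a list of upload IDs to restrict the search
        top_k=5,
    )
    for r in results:
        print(r["source_id"], r["text"][:60])
```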
@@ -266,19 +281,77 @@ def search_text_chunks_keyword(session, project_id: str, query: str,
         q = q.filter(TextChunk.source_id.in_(file_ids))
 
     from sqlalchemy import or_
-    conditions = [TextChunk.content.ilike(f"%{kw}%") for kw in keywords if len(kw) >= 2]
+    conditions = [TextChunk.content.ilike(f"%{kw}%") for kw in keywords]
     if not conditions:
         return []
-    q = q.filter(or_(*conditions)).order_by(TextChunk.chunk_idx).limit(top_k)
-
-    return [{"text": c.content, "source_id": c.source_id} for c in q.all()]
+    # Fetch wider pool, then rank by keyword hit count
+    fetch_limit = max(top_k * 4, 20)
+    candidates = q.filter(or_(*conditions)).limit(fetch_limit).all()
+
+    # Score each chunk: count how many keywords appear in its content
+    scored = []
+    for c in candidates:
+        text_lower = c.content.lower()
+        hits = sum(1 for kw in keywords if kw.lower() in text_lower)
+        scored.append((hits, c))
+
+    # Sort by hit count descending, take top_k
+    scored.sort(key=lambda x: x[0], reverse=True)
+    return [
+        {"text": c.content, "source_id": c.source_id}
+        for _, c in scored[:top_k]
+    ]
 
 
 def _extract_keywords(query: str) -> list[str]:
+    """Extract search keywords from a Chinese query (no jieba needed).
+
+    Strategy:
+    1. Remove stop words / particles
+    2. Split on punctuation and whitespace
+    3. For each segment, generate 2-4 char n-grams for Chinese text
+    4. Deduplicate and return
+    """
     import re
-    stop = {"的", "了", "是", "在", "和", "与", "对", "有", "不", "这", "那", "我", "你"}
-    parts = re.split(r'[,。、?!,.\?!\s\n\t]+', query)
-    return [p.strip() for p in parts if len(p.strip()) >= 2 and p.strip() not in stop]
+
+    # Common stop words / particles
+    stop_chars = set("的了是在和与对有不这那我你它们都吗呢吧啊哦呀嘛")
+    stop_words = {"多少", "什么", "怎么", "如何", "哪些", "哪个", "请问",
+                  "告诉", "可以", "一下", "一共", "总共", "分别"}
+
+    # Remove stop characters
+    cleaned = "".join(c for c in query if c not in stop_chars)
+
+    # Split on punctuation, whitespace, and non-CJK characters
+    segments = re.split(r'[,。、?!,.?!\s\n\t::;\-—（）()\[\]【】{}“”\']+', cleaned)
+
+    keywords: list[str] = []
+    seen: set = set()
+
+    def _add(kw: str):
+        if kw and len(kw) >= 2 and kw not in seen and kw not in stop_words:
+            seen.add(kw)
+            keywords.append(kw)
+
+    for seg in segments:
+        seg = seg.strip()
+        if not seg:
+            continue
+
+        # If segment is short enough, keep as-is
+        if len(seg) <= 4:
+            _add(seg)
+            continue
+
+        # For longer segments, generate overlapping n-grams (2, 3, 4 chars)
+        # Also keep the full segment for exact matching
+        _add(seg)
+        for n in (4, 3, 2):
+            for i in range(len(seg) - n + 1):
+                _add(seg[i:i + n])
+
+    return keywords
 
 
 # ── Singletons ──
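To see extraction and scoring interact without a database, the same hit-count ranking can be run over plain strings. The query and sample texts below are made up; the exact keyword list depends on the n-gram expansion above:

```python
query = "风机基础一共用了多少混凝土?"   # "How much concrete did the turbine foundations use?"
keywords = _extract_keywords(query)
print(keywords)   # long segments kept whole plus overlapping 2-4 char n-grams, stop words dropped

docs = [
    "本期工程共 25 台风机基础,混凝土总量约 9800 立方米",   # mentions foundations and concrete
    "升压站电气设备安装进度说明",                           # unrelated substation text
]
# Same scoring idea as search_text_chunks_keyword, applied to in-memory strings
scored = sorted(
    ((sum(1 for kw in keywords if kw.lower() in d.lower()), d) for d in docs),
    key=lambda x: x[0],
    reverse=True,
)
for hits, d in scored:
    print(hits, d)   # the foundation/concrete line should score higher
```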