feat: gitignore
+83 -10
@@ -121,7 +121,11 @@ class VectorStore:
         logger.info("Created collection %s (dim=%d)", name, dim)
 
     def insert(self, project_id: str, chunks: list[dict]):
-        """Insert chunks: [{'id': str, 'source_id': str, 'text': str, 'vector': list, 'metadata': dict?}]."""
+        """Insert chunks: [{'id': str, 'source_id': str, 'text': str, 'vector': list, 'metadata': dict?}].
+
+        Metadata keys 'sheet', 'row_number', 'file_name' are promoted to
+        payload top-level for Qdrant filter support.
+        """
         if not self._client:
             return
         name = self.collection_name(project_id)
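For context, a minimal sketch of the chunk shape this docstring describes. The `store` instance, IDs, embedding dimension, and metadata values below are illustrative, not taken from this commit:

```python
# Hypothetical caller; assumes `store` is a configured VectorStore and the
# embedding dimension matches the collection (768 here is only an example).
chunks = [
    {
        "id": "7c9e6679-7425-40de-944b-e07fc1f90ae7",   # point UUID
        "source_id": "file-123",                        # originating upload
        "text": "Foundation F-01, concrete grade C40",
        "vector": [0.01] * 768,                         # document embedding
        "metadata": {"sheet": "Sheet1", "row_number": 12, "file_name": "foundations.xlsx"},
    },
]
store.insert(project_id="demo-project", chunks=chunks)
```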
@@ -129,7 +133,12 @@ class VectorStore:
         for c in chunks:
             payload = {"text": c["text"], "source_id": c["source_id"]}
             if "metadata" in c:
-                payload["metadata"] = c["metadata"]
+                meta = c["metadata"]
+                # Promote key fields to top level for Qdrant filtering
+                for key in ("sheet", "row_number", "file_name"):
+                    if key in meta:
+                        payload[key] = meta[key]
+                payload["metadata"] = meta
             points.append(PointStruct(id=c["id"], vector=c["vector"], payload=payload))
         self._client.upsert(collection_name=name, points=points)
 
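Promoting `sheet`, `row_number`, and `file_name` to the payload top level is what makes them addressable by Qdrant payload filters. A minimal sketch of the benefit, assuming a locally running Qdrant and whatever name `collection_name(project_id)` produces; the collection name, dimension, and values are illustrative:

```python
from qdrant_client import QdrantClient
from qdrant_client.models import FieldCondition, Filter, MatchValue

client = QdrantClient(host="localhost", port=6333)

# Restrict the vector search to points that came from one worksheet.
# "sheet" is only matchable because insert() copied it to the payload top level.
hits = client.search(
    collection_name="proj_demo",                 # illustrative collection name
    query_vector=[0.01] * 768,                   # embedding of the user query
    query_filter=Filter(must=[
        FieldCondition(key="sheet", match=MatchValue(value="Sheet1")),
    ]),
    limit=5,
)
for h in hits:
    print(h.score, h.payload.get("file_name"), h.payload["text"][:40])
```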
@@ -250,11 +259,17 @@ class RAGService:
         return self.store.search(project_id, query_vec, top_k, file_ids)
 
 
-# ────────── SQLite fallback keyword search ──────────
+# ────────── SQLite keyword search (scored ranking) ──────────
 
 def search_text_chunks_keyword(session, project_id: str, query: str,
                                file_ids: list[str] | None, top_k: int) -> list[dict]:
-    """Keyword-based fallback when vector search is unavailable."""
+    """Keyword search with scored ranking.
+
+    1. Extract keywords from query (Chinese-aware n-gram splitting)
+    2. Fetch chunks matching ANY keyword (OR, wide net)
+    3. Score each chunk by how many keywords it contains
+    4. Return top_k sorted by score descending
+    """
     from models import TextChunk
 
     keywords = _extract_keywords(query)
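A hedged usage sketch of the reworked fallback, assuming a SQLAlchemy engine pointing at the app database and that chunks are already persisted as `models.TextChunk` rows; the database URL, project ID, and query are illustrative:

```python
from sqlalchemy import create_engine
from sqlalchemy.orm import Session

engine = create_engine("sqlite:///app.db")   # illustrative path

with Session(engine) as session:
    results = search_text_chunks_keyword(
        session,
        project_id="demo-project",
        query="风机基础一共用了多少混凝土?",   # "How much concrete did the turbine foundations use in total?"
        file_ids=None,                         # or a list of upload IDs to restrict the search
        top_k=5,
    )
    for r in results:
        print(r["source_id"], r["text"][:60])
```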
@@ -266,19 +281,77 @@ def search_text_chunks_keyword(session, project_id: str, query: str,
         q = q.filter(TextChunk.source_id.in_(file_ids))
 
     from sqlalchemy import or_
-    conditions = [TextChunk.content.ilike(f"%{kw}%") for kw in keywords if len(kw) >= 2]
+    conditions = [TextChunk.content.ilike(f"%{kw}%") for kw in keywords]
     if not conditions:
         return []
-    q = q.filter(or_(*conditions)).order_by(TextChunk.chunk_idx).limit(top_k)
-
-    return [{"text": c.content, "source_id": c.source_id} for c in q.all()]
+    # Fetch wider pool, then rank by keyword hit count
+    fetch_limit = max(top_k * 4, 20)
+    candidates = q.filter(or_(*conditions)).limit(fetch_limit).all()
+
+    # Score each chunk: count how many keywords appear in its content
+    scored = []
+    for c in candidates:
+        text_lower = c.content.lower()
+        hits = sum(1 for kw in keywords if kw.lower() in text_lower)
+        scored.append((hits, c))
+
+    # Sort by hit count descending, take top_k
+    scored.sort(key=lambda x: x[0], reverse=True)
+    return [
+        {"text": c.content, "source_id": c.source_id}
+        for _, c in scored[:top_k]
+    ]
 
 
 def _extract_keywords(query: str) -> list[str]:
+    """Extract search keywords from a Chinese query (no jieba needed).
+
+    Strategy:
+    1. Remove stop words / particles
+    2. Split on punctuation and whitespace
+    3. For each segment, generate 2-4 char n-grams for Chinese text
+    4. Deduplicate and return
+    """
     import re
-    stop = {"的", "了", "是", "在", "和", "与", "对", "有", "不", "这", "那", "我", "你"}
-    parts = re.split(r'[,。、?!,.\?!\s\n\t]+', query)
-    return [p.strip() for p in parts if len(p.strip()) >= 2 and p.strip() not in stop]
+
+    # Common stop words / particles
+    stop_chars = set("的了是在和与对有不这那我你它们都吗呢吧啊哦呀嘛")
+    stop_words = {"多少", "什么", "怎么", "如何", "哪些", "哪个", "请问",
+                  "告诉", "可以", "一下", "一共", "总共", "分别"}
+
+    # Remove stop characters
+    cleaned = "".join(c for c in query if c not in stop_chars)
+
+    # Split on punctuation, whitespace, and non-CJK characters
+    segments = re.split(r'[,。、?!,.?!\s\n\t::;\-—（）()\[\]【】{}“”\']+', cleaned)
+
+    keywords: list[str] = []
+    seen: set = set()
+
+    def _add(kw: str):
+        if kw and len(kw) >= 2 and kw not in seen and kw not in stop_words:
+            seen.add(kw)
+            keywords.append(kw)
+
+    for seg in segments:
+        seg = seg.strip()
+        if not seg:
+            continue
+
+        # If segment is short enough, keep as-is
+        if len(seg) <= 4:
+            _add(seg)
+            continue
+
+        # For longer segments, generate overlapping n-grams (2, 3, 4 chars)
+        # Also keep the full segment for exact matching
+        _add(seg)
+        for n in (4, 3, 2):
+            for i in range(len(seg) - n + 1):
+                _add(seg[i:i + n])
+
+    return keywords
 
 
 # ── Singletons ──
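To see extraction and scoring interact without a database, the same hit-count ranking can be run over plain strings. The query and sample texts below are made up; the exact keyword list depends on the n-gram expansion above:

```python
query = "风机基础一共用了多少混凝土?"   # "How much concrete did the turbine foundations use?"
keywords = _extract_keywords(query)
print(keywords)   # long segments kept whole plus overlapping 2-4 char n-grams, stop words dropped

docs = [
    "本期工程共 25 台风机基础,混凝土总量约 9800 立方米",   # mentions foundations and concrete
    "升压站电气设备安装进度说明",                           # unrelated substation text
]
# Same scoring idea as search_text_chunks_keyword, applied to in-memory strings
scored = sorted(
    ((sum(1 for kw in keywords if kw.lower() in d.lower()), d) for d in docs),
    key=lambda x: x[0],
    reverse=True,
)
for hits, d in scored:
    print(hits, d)   # the foundation/concrete line should score higher
```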