feat: gitignore

Blizzard
2026-04-28 10:46:56 +08:00
parent f62f95ec02
commit 473f9226d3
129 changed files with 695 additions and 10631 deletions
+83 -10
@@ -121,7 +121,11 @@ class VectorStore:
         logger.info("Created collection %s (dim=%d)", name, dim)
 
     def insert(self, project_id: str, chunks: list[dict]):
-        """Insert chunks: [{'id': str, 'source_id': str, 'text': str, 'vector': list, 'metadata': dict?}]."""
+        """Insert chunks: [{'id': str, 'source_id': str, 'text': str, 'vector': list, 'metadata': dict?}].
+
+        Metadata keys 'sheet', 'row_number', 'file_name' are promoted to
+        payload top-level for Qdrant filter support.
+        """
         if not self._client:
             return
         name = self.collection_name(project_id)
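
Note: the docstring's chunk contract can be exercised with a literal like the sketch below; every id and value, and the 768-dim zero vector, are illustrative stand-ins, and 'store' is assumed to be an already-constructed VectorStore:

    # Illustrative input for VectorStore.insert(); ids, dims, and values are made up
    chunk = {
        "id": "6f1c2b9e-8a41-4c6e-9b1f-3a7d2c90e511",  # Qdrant accepts UUID strings as point ids
        "source_id": "file-123",
        "text": "Sheet1 row 5: revenue = 1000",
        "vector": [0.0] * 768,  # embedding; length must match the collection dim
        "metadata": {"sheet": "Sheet1", "row_number": 5, "file_name": "sales.xlsx"},
    }
    store.insert("proj-demo", [chunk])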
@@ -129,7 +133,12 @@ class VectorStore:
         for c in chunks:
             payload = {"text": c["text"], "source_id": c["source_id"]}
             if "metadata" in c:
-                payload["metadata"] = c["metadata"]
+                meta = c["metadata"]
+                # Promote key fields to top level for Qdrant filtering
+                for key in ("sheet", "row_number", "file_name"):
+                    if key in meta:
+                        payload[key] = meta[key]
+                payload["metadata"] = meta
             points.append(PointStruct(id=c["id"], vector=c["vector"], payload=payload))
         self._client.upsert(collection_name=name, points=points)
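
With 'sheet', 'row_number', and 'file_name' promoted to the payload top level, a Qdrant filter can address them directly by key. A minimal sketch using qdrant_client (the endpoint, collection name, and vector dim are assumptions, not values from this repo):

    from qdrant_client import QdrantClient
    from qdrant_client.models import FieldCondition, Filter, MatchValue

    client = QdrantClient(url="http://localhost:6333")  # assumed local instance
    hits = client.search(
        collection_name="proj_demo",       # stand-in for collection_name(project_id)
        query_vector=[0.0] * 768,          # assumes a 768-dim collection
        query_filter=Filter(must=[
            FieldCondition(key="sheet", match=MatchValue(value="Sheet1")),
            FieldCondition(key="row_number", match=MatchValue(value=5)),
        ]),
        limit=5,
    )

Qdrant can also reach nested fields with dot notation (e.g. "metadata.sheet"), so the promotion is a convenience that keeps filter keys and payload indexes flat rather than a hard requirement.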
@@ -250,11 +259,17 @@ class RAGService:
         return self.store.search(project_id, query_vec, top_k, file_ids)
 
 
-# ────────── SQLite fallback keyword search ──────────
+# ────────── SQLite keyword search (scored ranking) ──────────
 def search_text_chunks_keyword(session, project_id: str, query: str,
                                file_ids: list[str] | None, top_k: int) -> list[dict]:
-    """Keyword-based fallback when vector search is unavailable."""
+    """Keyword search with scored ranking.
+
+    1. Extract keywords from query (Chinese-aware n-gram splitting)
+    2. Fetch chunks matching ANY keyword (OR, wide net)
+    3. Score each chunk by how many keywords it contains
+    4. Return top_k sorted by score descending
+    """
     from models import TextChunk
 
     keywords = _extract_keywords(query)
@@ -266,19 +281,77 @@ def search_text_chunks_keyword(session, project_id: str, query: str,
         q = q.filter(TextChunk.source_id.in_(file_ids))
 
     from sqlalchemy import or_
-    conditions = [TextChunk.content.ilike(f"%{kw}%") for kw in keywords if len(kw) >= 2]
+    conditions = [TextChunk.content.ilike(f"%{kw}%") for kw in keywords]
     if not conditions:
         return []
-    q = q.filter(or_(*conditions)).order_by(TextChunk.chunk_idx).limit(top_k)
-    return [{"text": c.content, "source_id": c.source_id} for c in q.all()]
+
+    # Fetch a wider pool, then rank by keyword hit count
+    fetch_limit = max(top_k * 4, 20)
+    candidates = q.filter(or_(*conditions)).limit(fetch_limit).all()
+
+    # Score each chunk: count how many keywords appear in its content
+    scored = []
+    for c in candidates:
+        text_lower = c.content.lower()
+        hits = sum(1 for kw in keywords if kw.lower() in text_lower)
+        scored.append((hits, c))
+
+    # Sort by hit count descending, take top_k
+    scored.sort(key=lambda x: x[0], reverse=True)
+    return [
+        {"text": c.content, "source_id": c.source_id}
+        for _, c in scored[:top_k]
+    ]
 
 
 def _extract_keywords(query: str) -> list[str]:
+    """Extract search keywords from a Chinese query (no jieba needed).
+
+    Strategy:
+    1. Remove stop words / particles
+    2. Split on punctuation and whitespace
+    3. For each segment, generate 2-4 char n-grams for Chinese text
+    4. Deduplicate and return
+    """
     import re
-    stop = {"", "", "", "", "", "", "", "", "", "", "", "", ""}
-    parts = re.split(r'[,。、?!,.\?!\s\n\t]+', query)
-    return [p.strip() for p in parts if len(p.strip()) >= 2 and p.strip() not in stop]
+
+    # Common stop words / particles
+    stop_chars = set("的了是在和与对有不这那我你它们都吗呢吧啊哦呀嘛")
+    stop_words = {"多少", "什么", "怎么", "如何", "哪些", "哪个", "请问",
+                  "告诉", "可以", "一下", "一共", "总共", "分别"}
+
+    # Remove stop characters
+    cleaned = "".join(c for c in query if c not in stop_chars)
+
+    # Split on punctuation, whitespace, and other separator characters
+    segments = re.split(r'[,。、?!,.?!\s\n\t:;\-—()()\[\]【】{}“”\']+', cleaned)
+
+    keywords: list[str] = []
+    seen: set = set()
+
+    def _add(kw: str):
+        if kw and len(kw) >= 2 and kw not in seen and kw not in stop_words:
+            seen.add(kw)
+            keywords.append(kw)
+
+    for seg in segments:
+        seg = seg.strip()
+        if not seg:
+            continue
+        # If the segment is short enough, keep it as-is
+        if len(seg) <= 4:
+            _add(seg)
+            continue
+        # For longer segments, generate overlapping n-grams (2, 3, 4 chars)
+        # and also keep the full segment for exact matching
+        _add(seg)
+        for n in (4, 3, 2):
+            for i in range(len(seg) - n + 1):
+                _add(seg[i:i + n])
+
+    return keywords
 
 
 # ── Singletons ──
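
As a quick sanity check on the splitter, the function can be run standalone; the expected list below is hand-traced from the code above (the query string itself is illustrative). The stop character 是 is stripped, the 2-gram 多少 is filtered as a stop word, and the surviving segment 合同金额多少 yields itself plus its 4/3/2-char n-grams:

    if __name__ == "__main__":
        print(_extract_keywords("合同金额是多少?"))
        # Hand-traced expectation:
        # ['合同金额多少', '合同金额', '同金额多', '金额多少',
        #  '合同金', '同金额', '金额多', '额多少',
        #  '合同', '同金', '金额', '额多']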