refactor: excel parse

2026-04-16 10:01:11 +08:00
parent 680ecc320f
commit f62f95ec02
7941 changed files with 2899112 additions and 0 deletions
@@ -0,0 +1,79 @@
+from typing import Optional
+
+from qdrant_client.http import models
+
+
# Default ``k`` in the RRF term ``1 / (k + rank)``; used whenever the caller
# does not pass ``ranking_constant_k``.
DEFAULT_RANKING_CONSTANT_K = 2


def reciprocal_rank_fusion(
    responses: list[list[models.ScoredPoint]],
    limit: int = 10,
    ranking_constant_k: Optional[int] = None,
) -> list[models.ScoredPoint]:
    """Fuse several ranked result lists with Reciprocal Rank Fusion (RRF).

    Every occurrence of a point contributes ``1 / (k + rank)`` to that point's
    fused score, where ``rank`` is its zero-based position inside the response
    it appears in. Only positions are used; the incoming ``score`` values are
    ignored.

    Args:
        responses: Ranked result lists, best match first.
        limit: Maximum number of fused points to return.
        ranking_constant_k: The constant ``k``. ``None`` selects
            ``DEFAULT_RANKING_CONSTANT_K``.

    Returns:
        Up to ``limit`` points ordered by descending fused score. Each one is a
        shallow copy of the first occurrence of that id, with ``score`` set to
        the fused value. The points in ``responses`` are left unmodified. Ties
        keep first-seen order.

    Raises:
        ZeroDivisionError: If ``k + rank`` equals zero (e.g. ``k == 0`` with a
            non-empty response).
    """
    import copy

    # Resolved once up front rather than on every rank lookup.
    ranking_constant = (
        ranking_constant_k if ranking_constant_k is not None else DEFAULT_RANKING_CONSTANT_K
    )  # mitigates the impact of high rankings by outlier systems

    scores: dict[models.ExtendedPointId, float] = {}
    first_seen: dict[models.ExtendedPointId, models.ScoredPoint] = {}
    for response in responses:
        for rank, scored_point in enumerate(response):
            contribution = 1 / (ranking_constant + rank)
            if scored_point.id in scores:
                scores[scored_point.id] += contribution
            else:
                scores[scored_point.id] = contribution
                first_seen[scored_point.id] = scored_point

    # sorted() is stable, so equal scores keep their first-seen order.
    ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)

    fused_points = []
    for point_id, fused_score in ranked[:limit]:
        # Copy before assigning the fused score so the caller's objects keep
        # their original retrieval scores.
        point = copy.copy(first_seen[point_id])
        point.score = fused_score
        fused_points.append(point)
    return fused_points
+
+
def distribution_based_score_fusion(
    responses: list[list[models.ScoredPoint]], limit: int
) -> list[models.ScoredPoint]:
    """Fuse several scored result lists with distribution-based score fusion.

    The scores of each non-empty response are rescaled linearly so that
    ``mean - 3 * std_dev`` maps to 0 and ``mean + 3 * std_dev`` maps to 1,
    using the sample standard deviation of that response. A response with a
    single point, or whose scores have zero variance, gives every point 0.5.
    The normalised scores of points sharing an id are then summed.

    Args:
        responses: Scored result lists; empty lists are skipped.
        limit: Maximum number of fused points to return.

    Returns:
        Up to ``limit`` points ordered by descending fused score. Each one is a
        shallow copy of the first occurrence of that id, with ``score`` set to
        the summed normalised score. The points in ``responses`` are left
        unmodified, so repeated calls on the same input give the same result.
        Ties keep first-seen order.
    """
    import copy

    def normalized_scores(response: list[models.ScoredPoint]) -> list[float]:
        """Return the normalised score of each point, in response order."""
        count = len(response)
        if count == 1:
            # A single sample has no spread to normalise against.
            return [0.5]

        raw_scores = [point.score for point in response]
        mean = sum(raw_scores) / count
        # Sample variance (n - 1 denominator).
        variance = sum((score - mean) ** 2 for score in raw_scores) / (count - 1)
        if variance == 0:
            # All scores identical: avoid dividing by a zero-width range.
            return [0.5] * count

        std_dev = variance**0.5
        low = mean - 3 * std_dev
        high = mean + 3 * std_dev
        return [(score - low) / (high - low) for score in raw_scores]

    totals: dict[models.ExtendedPointId, float] = {}
    first_seen: dict[models.ExtendedPointId, models.ScoredPoint] = {}
    for response in responses:
        if not response:
            continue
        for point, score in zip(response, normalized_scores(response)):
            if point.id in totals:
                totals[point.id] += score
            else:
                totals[point.id] = score
                first_seen[point.id] = point

    # sorted() is stable, so equal totals keep their first-seen order.
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)

    fused_points = []
    for point_id, total in ranked[:limit]:
        # Copy before assigning the fused score so the caller's objects keep
        # their original retrieval scores.
        point = copy.copy(first_seen[point_id])
        point.score = total
        fused_points.append(point)
    return fused_points