refactor: excel parse
This commit is contained in:
@@ -0,0 +1,370 @@
|
||||
import math
|
||||
|
||||
|
||||
from qdrant_client.conversions.common_types import get_args_subscribed
|
||||
from qdrant_client.http import models
|
||||
from typing import Union, Any, Tuple
|
||||
|
||||
from qdrant_client.local import datetime_utils
|
||||
from qdrant_client.local.geo import geo_distance
|
||||
from qdrant_client.local.payload_filters import check_condition
|
||||
from qdrant_client.local.payload_value_extractor import value_by_key
|
||||
|
||||
DEFAULT_SCORE = 0.0
|
||||
DEFAULT_DECAY_TARGET = 0.0
|
||||
DEFAULT_DECAY_MIDPOINT = 0.5
|
||||
DEFAULT_DECAY_SCALE = 1.0
|
||||
|
||||
|
||||
def evaluate_expression(
|
||||
expression: models.Expression,
|
||||
point_id: models.ExtendedPointId,
|
||||
scores: list[dict[models.ExtendedPointId, float]],
|
||||
payload: models.Payload,
|
||||
has_vector: dict[str, bool],
|
||||
defaults: dict[str, Any],
|
||||
) -> float:
|
||||
if isinstance(expression, (float, int)): # Constant
|
||||
return float(expression)
|
||||
|
||||
elif isinstance(expression, str): # Variable
|
||||
return evaluate_variable(expression, point_id, scores, payload, defaults)
|
||||
|
||||
elif isinstance(expression, get_args_subscribed(models.Condition)):
|
||||
if check_condition(expression, payload, point_id, has_vector): # type: ignore
|
||||
return 1.0
|
||||
return 0.0
|
||||
|
||||
elif isinstance(expression, models.MultExpression):
|
||||
factors: list[float] = []
|
||||
|
||||
for expr in expression.mult:
|
||||
factor = evaluate_expression(expr, point_id, scores, payload, has_vector, defaults)
|
||||
# Return early if any factor is zero
|
||||
if factor == 0.0:
|
||||
return factor
|
||||
|
||||
factors.append(factor)
|
||||
|
||||
return math.prod(factors)
|
||||
|
||||
elif isinstance(expression, models.SumExpression):
|
||||
return sum(
|
||||
evaluate_expression(expr, point_id, scores, payload, has_vector, defaults)
|
||||
for expr in expression.sum
|
||||
)
|
||||
|
||||
elif isinstance(expression, models.NegExpression):
|
||||
value = evaluate_expression(
|
||||
expression.neg, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
return -value
|
||||
|
||||
elif isinstance(expression, models.AbsExpression):
|
||||
return abs(
|
||||
evaluate_expression(expression.abs, point_id, scores, payload, has_vector, defaults)
|
||||
)
|
||||
|
||||
elif isinstance(expression, models.DivExpression):
|
||||
left = evaluate_expression(
|
||||
expression.div.left, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
|
||||
if left == 0.0:
|
||||
return left
|
||||
|
||||
right = evaluate_expression(
|
||||
expression.div.right, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
|
||||
if right == 0.0:
|
||||
if expression.div.by_zero_default is not None:
|
||||
return expression.div.by_zero_default
|
||||
raise_non_finite_error(f"{left}/{right}")
|
||||
|
||||
result = left / right
|
||||
if math.isfinite(result):
|
||||
return result
|
||||
|
||||
raise_non_finite_error(f"{left}/{right}")
|
||||
|
||||
elif isinstance(expression, models.SqrtExpression):
|
||||
value = evaluate_expression(
|
||||
expression.sqrt, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
|
||||
if value >= 0:
|
||||
return math.sqrt(value)
|
||||
|
||||
raise_non_finite_error(f"√{value}")
|
||||
|
||||
elif isinstance(expression, models.PowExpression):
|
||||
base = evaluate_expression(
|
||||
expression.pow.base, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
exponent = evaluate_expression(
|
||||
expression.pow.exponent, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
|
||||
# Check for valid input
|
||||
if base >= 0 or (base != 0 and exponent.is_integer()):
|
||||
try:
|
||||
return math.pow(base, exponent)
|
||||
except OverflowError:
|
||||
pass
|
||||
|
||||
raise_non_finite_error(f"{base}^{exponent}")
|
||||
|
||||
elif isinstance(expression, models.ExpExpression):
|
||||
value = evaluate_expression(
|
||||
expression.exp, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
|
||||
try:
|
||||
return math.exp(value)
|
||||
except OverflowError:
|
||||
raise_non_finite_error(f"exp({value})")
|
||||
|
||||
elif isinstance(expression, models.Log10Expression):
|
||||
value = evaluate_expression(
|
||||
expression.log10, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
|
||||
if value > 0:
|
||||
try:
|
||||
return math.log10(value)
|
||||
except OverflowError:
|
||||
pass
|
||||
|
||||
raise_non_finite_error(f"log10({value})")
|
||||
|
||||
elif isinstance(expression, models.LnExpression):
|
||||
value = evaluate_expression(expression.ln, point_id, scores, payload, has_vector, defaults)
|
||||
|
||||
if value > 0:
|
||||
try:
|
||||
return math.log(value)
|
||||
except OverflowError:
|
||||
pass
|
||||
|
||||
raise_non_finite_error(f"ln({value})")
|
||||
|
||||
elif isinstance(expression, models.GeoDistance):
|
||||
origin = expression.geo_distance.origin
|
||||
to = expression.geo_distance.to
|
||||
|
||||
# Get value from payload
|
||||
geo_value = try_extract_payload_value(to, payload, defaults)
|
||||
|
||||
if isinstance(geo_value, dict):
|
||||
# let this fail if it is not a valid geo point
|
||||
destination = models.GeoPoint(**geo_value)
|
||||
return geo_distance(origin.lon, origin.lat, destination.lon, destination.lat)
|
||||
|
||||
raise ValueError(
|
||||
f"Expected geo point for {to} in the payload and/or in the formula defaults."
|
||||
)
|
||||
|
||||
elif isinstance(expression, models.DatetimeExpression):
|
||||
# try to parse as datetime
|
||||
dt = datetime_utils.parse(expression.datetime)
|
||||
if dt is None:
|
||||
raise ValueError(f"Expected datetime in supported format for {expression.datetime}")
|
||||
|
||||
return dt.timestamp()
|
||||
|
||||
elif isinstance(expression, models.DatetimeKeyExpression):
|
||||
dt_str = try_extract_payload_value(expression.datetime_key, payload, defaults)
|
||||
dt = datetime_utils.parse(dt_str)
|
||||
if dt is None:
|
||||
raise ValueError(
|
||||
f"Expected datetime for {expression.datetime_key} in the payload and/or in the formula defaults."
|
||||
)
|
||||
|
||||
return dt.timestamp()
|
||||
|
||||
elif isinstance(expression, models.LinDecayExpression):
|
||||
x, target, midpoint, scale = evaluate_decay_params(
|
||||
expression.lin_decay, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
|
||||
lambda_factor = (1.0 - midpoint) / scale
|
||||
diff = abs(x - target)
|
||||
return max(0.0, -lambda_factor * diff + 1.0)
|
||||
|
||||
elif isinstance(expression, models.ExpDecayExpression):
|
||||
x, target, midpoint, scale = evaluate_decay_params(
|
||||
expression.exp_decay, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
|
||||
lambda_factor = math.log(midpoint) / scale
|
||||
diff = abs(x - target)
|
||||
return math.exp(lambda_factor * diff)
|
||||
|
||||
elif isinstance(expression, models.GaussDecayExpression):
|
||||
x, target, midpoint, scale = evaluate_decay_params(
|
||||
expression.gauss_decay, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
|
||||
lambda_factor = math.log(midpoint) / (scale * scale)
|
||||
diff = x - target
|
||||
return math.exp(lambda_factor * diff * diff)
|
||||
|
||||
raise ValueError(f"Unsupported expression type: {type(expression)}")
|
||||
|
||||
|
||||
def evaluate_decay_params(
|
||||
params: models.DecayParamsExpression,
|
||||
point_id: models.ExtendedPointId,
|
||||
scores: list[dict[models.ExtendedPointId, float]],
|
||||
payload: models.Payload,
|
||||
has_vector: dict[str, bool],
|
||||
defaults: dict[str, Any],
|
||||
) -> Tuple[float, float, float, float]:
|
||||
x = evaluate_expression(params.x, point_id, scores, payload, has_vector, defaults)
|
||||
|
||||
if params.target is None:
|
||||
target = DEFAULT_DECAY_TARGET
|
||||
else:
|
||||
target = evaluate_expression(
|
||||
params.target, point_id, scores, payload, has_vector, defaults
|
||||
)
|
||||
|
||||
midpoint = params.midpoint if params.midpoint is not None else DEFAULT_DECAY_MIDPOINT
|
||||
|
||||
if midpoint <= 0.0 or midpoint >= 1.0:
|
||||
raise ValueError(f"Midpoint must be between 0 and 1, got {midpoint}")
|
||||
|
||||
scale = params.scale if params.scale is not None else DEFAULT_DECAY_SCALE
|
||||
if scale <= 0.0:
|
||||
raise ValueError(f"Scale must be non-zero positive, got {scale}")
|
||||
|
||||
return x, target, midpoint, scale
|
||||
|
||||
|
||||
def try_extract_payload_value(key: str, payload: models.Payload, defaults: dict[str, Any]) -> Any:
|
||||
# Get value from payload
|
||||
value = value_by_key(payload, key)
|
||||
|
||||
if value is None or len(value) == 0:
|
||||
# Or from defaults
|
||||
value = defaults.get(key, None)
|
||||
# Consider it None if it is an empty list
|
||||
if isinstance(value, list) and len(value) == 0:
|
||||
value = None
|
||||
|
||||
# Consider it a single value if it's a list with one element
|
||||
if isinstance(value, list) and len(value) == 1:
|
||||
return value[0]
|
||||
|
||||
if value is None:
|
||||
raise ValueError(f"No value found for {key} in the payload nor the formula defaults")
|
||||
|
||||
return value
|
||||
|
||||
|
||||
def evaluate_variable(
|
||||
variable: str,
|
||||
point_id: models.ExtendedPointId,
|
||||
scores: list[dict[models.ExtendedPointId, float]],
|
||||
payload: models.Payload,
|
||||
defaults: dict[str, Any],
|
||||
) -> float:
|
||||
var = parse_variable(variable)
|
||||
if isinstance(var, str):
|
||||
value = try_extract_payload_value(var, payload, defaults)
|
||||
|
||||
if is_number(value):
|
||||
return value
|
||||
|
||||
raise ValueError(
|
||||
f"Expected number value for {var} in the payload and/or in the formula defaults. Error: Value is not a number"
|
||||
)
|
||||
|
||||
elif isinstance(var, int):
|
||||
# Get score from scores
|
||||
score = None
|
||||
if var < len(scores):
|
||||
score = scores[var].get(point_id, None)
|
||||
if score is not None:
|
||||
return score
|
||||
|
||||
defined_default = defaults.get(variable, None)
|
||||
if defined_default is not None:
|
||||
return defined_default
|
||||
|
||||
return DEFAULT_SCORE
|
||||
|
||||
raise ValueError(f"Invalid variable type: {type(var)}")
|
||||
|
||||
|
||||
def parse_variable(var: str) -> Union[str, int]:
|
||||
# Try to parse score pattern
|
||||
if not var.startswith("$score"):
|
||||
# Treat as payload path
|
||||
return var
|
||||
|
||||
remaining = var.replace("$score", "", 1)
|
||||
if remaining == "":
|
||||
# end of string, default idx is 0
|
||||
return 0
|
||||
|
||||
# it must proceed with brackets
|
||||
if not remaining.startswith("["):
|
||||
raise ValueError(f"Invalid score pattern: {var}")
|
||||
|
||||
remaining = remaining.replace("[", "", 1)
|
||||
bracket_end = remaining.find("]")
|
||||
if bracket_end == -1:
|
||||
raise ValueError(f"Invalid score pattern: {var}")
|
||||
|
||||
# try parsing the content in between brackets as integer
|
||||
try:
|
||||
idx = int(remaining[:bracket_end])
|
||||
except ValueError:
|
||||
raise ValueError(f"Invalid score pattern: {var}")
|
||||
|
||||
# make sure the string ends after the closing bracket
|
||||
if len(remaining) > bracket_end + 1:
|
||||
raise ValueError(f"Invalid score pattern: {var}")
|
||||
|
||||
return idx
|
||||
|
||||
|
||||
def raise_non_finite_error(expression: str) -> None:
|
||||
raise ValueError(f"The expression {expression} produced a non-finite number")
|
||||
|
||||
|
||||
def is_number(value: Any) -> bool:
|
||||
return isinstance(value, (int, float)) and not isinstance(value, bool)
|
||||
|
||||
|
||||
def test_parsing_variable() -> None:
|
||||
assert parse_variable("$score") == 0
|
||||
assert parse_variable("$score[0]") == 0
|
||||
assert parse_variable("$score[1]") == 1
|
||||
assert parse_variable("$score[2]") == 2
|
||||
|
||||
try:
|
||||
parse_variable("$score[invalid]")
|
||||
assert False
|
||||
except ValueError as e:
|
||||
assert str(e) == "Invalid score pattern: $score[invalid]"
|
||||
|
||||
try:
|
||||
parse_variable("$score[10].other")
|
||||
assert False
|
||||
except ValueError as e:
|
||||
assert str(e) == "Invalid score pattern: $score[10].other"
|
||||
|
||||
|
||||
def test_try_extract_payload_value() -> None:
|
||||
for payload_value, expected in [(1.2, 1.2), ([1.2], 1.2), ([1.2, 2.3], [1.2, 2.3])]:
|
||||
empty_defaults: dict[str, Any] = {}
|
||||
|
||||
payload = {"key": payload_value}
|
||||
assert try_extract_payload_value("key", payload, empty_defaults) == expected
|
||||
|
||||
defaults = {"key": payload_value}
|
||||
empty_payload: dict[str, Any] = {}
|
||||
assert try_extract_payload_value("key", empty_payload, defaults) == expected
|
||||
@@ -0,0 +1,79 @@
|
||||
from typing import Optional
|
||||
|
||||
from qdrant_client.http import models
|
||||
|
||||
|
||||
DEFAULT_RANKING_CONSTANT_K = 2
|
||||
|
||||
|
||||
def reciprocal_rank_fusion(
|
||||
responses: list[list[models.ScoredPoint]],
|
||||
limit: int = 10,
|
||||
ranking_constant_k: Optional[int] = None,
|
||||
) -> list[models.ScoredPoint]:
|
||||
def compute_score(pos: int) -> float:
|
||||
ranking_constant = (
|
||||
ranking_constant_k if ranking_constant_k is not None else DEFAULT_RANKING_CONSTANT_K
|
||||
) # mitigates the impact of high rankings by outlier systems
|
||||
return 1 / (ranking_constant + pos)
|
||||
|
||||
scores: dict[models.ExtendedPointId, float] = {}
|
||||
point_pile = {}
|
||||
for response in responses:
|
||||
for i, scored_point in enumerate(response):
|
||||
if scored_point.id in scores:
|
||||
scores[scored_point.id] += compute_score(i)
|
||||
else:
|
||||
point_pile[scored_point.id] = scored_point
|
||||
scores[scored_point.id] = compute_score(i)
|
||||
|
||||
sorted_scores = sorted(scores.items(), key=lambda item: item[1], reverse=True)
|
||||
sorted_points = []
|
||||
for point_id, score in sorted_scores[:limit]:
|
||||
point = point_pile[point_id]
|
||||
point.score = score
|
||||
sorted_points.append(point)
|
||||
return sorted_points
|
||||
|
||||
|
||||
def distribution_based_score_fusion(
|
||||
responses: list[list[models.ScoredPoint]], limit: int
|
||||
) -> list[models.ScoredPoint]:
|
||||
def normalize(response: list[models.ScoredPoint]) -> list[models.ScoredPoint]:
|
||||
if len(response) == 1:
|
||||
response[0].score = 0.5
|
||||
return response
|
||||
|
||||
total = sum([point.score for point in response])
|
||||
mean = total / len(response)
|
||||
variance = sum([(point.score - mean) ** 2 for point in response]) / (len(response) - 1)
|
||||
|
||||
if variance == 0:
|
||||
for point in response:
|
||||
point.score = 0.5
|
||||
return response
|
||||
|
||||
std_dev = variance**0.5
|
||||
low = mean - 3 * std_dev
|
||||
high = mean + 3 * std_dev
|
||||
|
||||
for point in response:
|
||||
point.score = (point.score - low) / (high - low)
|
||||
|
||||
return response
|
||||
|
||||
points_map: dict[models.ExtendedPointId, models.ScoredPoint] = {}
|
||||
for response in responses:
|
||||
if not response:
|
||||
continue
|
||||
normalized = normalize(response)
|
||||
for point in normalized:
|
||||
entry = points_map.get(point.id)
|
||||
if entry is None:
|
||||
points_map[point.id] = point
|
||||
else:
|
||||
entry.score += point.score
|
||||
|
||||
sorted_points = sorted(points_map.values(), key=lambda item: item.score, reverse=True)
|
||||
|
||||
return sorted_points[:limit]
|
||||
@@ -0,0 +1,117 @@
|
||||
import numpy as np
|
||||
|
||||
from qdrant_client.http import models
|
||||
from qdrant_client.hybrid.fusion import reciprocal_rank_fusion, distribution_based_score_fusion
|
||||
|
||||
|
||||
def test_reciprocal_rank_fusion() -> None:
|
||||
responses = [
|
||||
[
|
||||
models.ScoredPoint(id="1", score=0.1, version=1),
|
||||
models.ScoredPoint(id="2", score=0.2, version=1),
|
||||
models.ScoredPoint(id="3", score=0.3, version=1),
|
||||
],
|
||||
[
|
||||
models.ScoredPoint(id="5", score=12.0, version=1),
|
||||
models.ScoredPoint(id="6", score=8.0, version=1),
|
||||
models.ScoredPoint(id="7", score=5.0, version=1),
|
||||
models.ScoredPoint(id="2", score=3.0, version=1),
|
||||
],
|
||||
]
|
||||
|
||||
fused = reciprocal_rank_fusion(responses)
|
||||
|
||||
assert fused[0].id == "2"
|
||||
assert fused[1].id in ["1", "5"]
|
||||
assert np.isclose(fused[1].score, 1 / 2)
|
||||
assert fused[2].id in ["1", "5"]
|
||||
assert np.isclose(fused[2].score, 1 / 2)
|
||||
|
||||
|
||||
def test_distribution_based_score_fusion() -> None:
|
||||
responses = [
|
||||
[
|
||||
models.ScoredPoint(id=1, version=0, score=85.0),
|
||||
models.ScoredPoint(id=0, version=0, score=76.0),
|
||||
models.ScoredPoint(id=5, version=0, score=68.0),
|
||||
],
|
||||
[
|
||||
models.ScoredPoint(id=1, version=0, score=62.0),
|
||||
models.ScoredPoint(id=0, version=0, score=61.0),
|
||||
models.ScoredPoint(id=4, version=0, score=57.0),
|
||||
models.ScoredPoint(id=3, version=0, score=51.0),
|
||||
models.ScoredPoint(id=2, version=0, score=44.0),
|
||||
],
|
||||
]
|
||||
|
||||
fused = distribution_based_score_fusion(responses, limit=3)
|
||||
|
||||
assert fused[0].id == 1
|
||||
assert fused[1].id == 0
|
||||
assert fused[2].id == 4
|
||||
|
||||
|
||||
def test_reciprocal_rank_fusion_empty_responses() -> None:
|
||||
responses: list[list[models.ScoredPoint]] = [[]]
|
||||
fused = reciprocal_rank_fusion(responses)
|
||||
assert fused == []
|
||||
|
||||
responses = [
|
||||
[
|
||||
models.ScoredPoint(id="1", score=0.1, version=1),
|
||||
models.ScoredPoint(id="2", score=0.2, version=1),
|
||||
models.ScoredPoint(id="3", score=0.3, version=1),
|
||||
],
|
||||
[],
|
||||
]
|
||||
|
||||
fused = reciprocal_rank_fusion(responses)
|
||||
|
||||
assert fused[0].id == "1"
|
||||
assert np.isclose(fused[0].score, 1 / 2)
|
||||
assert fused[1].id == "2"
|
||||
assert np.isclose(fused[1].score, 1 / 3)
|
||||
assert fused[2].id == "3"
|
||||
assert np.isclose(fused[2].score, 1 / 4)
|
||||
|
||||
|
||||
def test_distribution_based_score_fusion_empty_response() -> None:
|
||||
responses: list[list[models.ScoredPoint]] = [[]]
|
||||
fused = distribution_based_score_fusion(responses, limit=3)
|
||||
assert fused == []
|
||||
|
||||
responses = [
|
||||
[
|
||||
models.ScoredPoint(id=1, version=0, score=85.0),
|
||||
models.ScoredPoint(id=0, version=0, score=76.0),
|
||||
models.ScoredPoint(id=5, version=0, score=68.0),
|
||||
],
|
||||
[],
|
||||
]
|
||||
|
||||
fused = distribution_based_score_fusion(responses, limit=3)
|
||||
|
||||
assert fused[0].id == 1
|
||||
assert fused[1].id == 0
|
||||
assert fused[2].id == 5
|
||||
|
||||
|
||||
def test_distribution_based_score_fusion_zero_variance() -> None:
|
||||
score = 85.0
|
||||
responses = [
|
||||
[
|
||||
models.ScoredPoint(id=1, version=0, score=score),
|
||||
models.ScoredPoint(id=0, version=0, score=score),
|
||||
models.ScoredPoint(id=5, version=0, score=score),
|
||||
],
|
||||
[],
|
||||
]
|
||||
fused = distribution_based_score_fusion(
|
||||
[[models.ScoredPoint(id=1, version=0, score=score)]], limit=3
|
||||
)
|
||||
assert fused[0].id == 1
|
||||
assert fused[0].score == 0.5
|
||||
|
||||
fused = distribution_based_score_fusion(responses, limit=3)
|
||||
assert len(fused) == 3
|
||||
assert all([p.score == 0.5 for p in fused])
|
||||
Reference in New Issue
Block a user