324 lines
11 KiB
Python
324 lines
11 KiB
Python
from typing import Any, Optional, Union
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
from qdrant_client.conversions.common_types import SparseVector
|
|
from qdrant_client.http import models
|
|
|
|
try:
|
|
from fastembed import (
|
|
TextEmbedding,
|
|
SparseTextEmbedding,
|
|
ImageEmbedding,
|
|
LateInteractionTextEmbedding,
|
|
LateInteractionMultimodalEmbedding,
|
|
)
|
|
from fastembed.common import OnnxProvider, ImageInput
|
|
except ImportError:
|
|
TextEmbedding = None
|
|
SparseTextEmbedding = None
|
|
ImageEmbedding = None
|
|
LateInteractionTextEmbedding = None
|
|
LateInteractionMultimodalEmbedding = None
|
|
OnnxProvider = None
|
|
ImageInput = None
|
|
|
|
|
|
# Pydantic model describing a single query result.
# `extra="forbid"` is handed to pydantic so that fields not declared below are rejected.
# NOTE: documented with `#` comments rather than a class docstring so the model's
# generated schema is left exactly as it was.
class QueryResponse(BaseModel, extra="forbid"):  # type: ignore
    # Identifier of the result; may be a string or an integer.
    id: Union[str, int]
    # Dense embedding as a list of floats; the type allows None
    # (presumably when vectors are not requested - confirm against callers).
    embedding: Optional[list[float]]
    # Sparse embedding; defaults to None when not supplied.
    sparse_embedding: Optional[SparseVector] = Field(default=None)
    # Arbitrary key/value data attached to the result.
    metadata: dict[str, Any]
    # Text of the document (presumably the text that was embedded - confirm).
    document: str
    # Score of this result (presumably the similarity score of the match - confirm).
    score: float
|
|
|
|
|
|
class FastEmbedMisc:
    """Helpers for detecting fastembed and for listing the models it reports.

    All state is kept on the class itself: ``IS_INSTALLED`` caches a successful
    import probe, and the ``_*_MODELS`` sets cache the lower-cased model names
    collected by the ``is_supported_*`` checks.
    """

    IS_INSTALLED: bool = False
    _TEXT_MODELS: set[str] = set()
    _IMAGE_MODELS: set[str] = set()
    _LATE_INTERACTION_TEXT_MODELS: set[str] = set()
    _LATE_INTERACTION_MULTIMODAL_MODELS: set[str] = set()
    _SPARSE_MODELS: set[str] = set()

    @classmethod
    def is_installed(cls) -> bool:
        """Report whether fastembed can be imported, caching a positive result.

        Only a successful probe is remembered; after a failed import the next
        call attempts the import again.

        Returns:
            bool: True if fastembed and its five embedding classes import,
                False if the import raises ImportError.

        Raises:
            AssertionError: If an embedding class reports no supported models.
                These sanity checks are ``assert`` statements, so they are
                skipped entirely when Python runs with ``-O``.
        """
        if cls.IS_INSTALLED:
            return cls.IS_INSTALLED

        try:
            from fastembed import (
                SparseTextEmbedding,
                TextEmbedding,
                ImageEmbedding,
                LateInteractionMultimodalEmbedding,
                LateInteractionTextEmbedding,
            )

            assert len(SparseTextEmbedding.list_supported_models()) > 0
            assert len(TextEmbedding.list_supported_models()) > 0
            assert len(ImageEmbedding.list_supported_models()) > 0
            assert len(LateInteractionTextEmbedding.list_supported_models()) > 0
            assert len(LateInteractionMultimodalEmbedding.list_supported_models()) > 0
            cls.IS_INSTALLED = True
        except ImportError:
            cls.IS_INSTALLED = False

        return cls.IS_INSTALLED

    @classmethod
    def import_fastembed(cls) -> None:
        """Ensure fastembed is available, raising a descriptive error if not.

        Raises:
            ImportError: If fastembed cannot be imported.
        """
        # Probe through is_installed() instead of reading the cached flag, so
        # the outcome does not depend on is_installed() having run beforehand.
        if cls.is_installed():
            return

        # If it's not, ask the user to install it
        raise ImportError(
            "fastembed is not installed."
            " Please install it to enable fast vector indexing with `pip install fastembed`."
        )

    @staticmethod
    def _list_dense_models(embedding_cls: Any) -> dict[str, tuple[int, models.Distance]]:
        """Map each model reported by ``embedding_cls`` to ``(dim, Distance.COSINE)``.

        Args:
            embedding_cls: A fastembed embedding class, or a falsy placeholder
                (the module binds None when fastembed could not be imported).

        Returns:
            dict[str, tuple[int, models.Distance]]: Model names mapped to their
                dimension and distance; empty when ``embedding_cls`` is falsy.
        """
        if not embedding_cls:
            return {}
        return {
            model["model"]: (model["dim"], models.Distance.COSINE)
            for model in embedding_cls.list_supported_models()
        }

    @classmethod
    def list_text_models(cls) -> dict[str, tuple[int, models.Distance]]:
        """Lists the supported dense text models.

        Requires invocation of TextEmbedding.list_supported_models() to support custom models.

        Returns:
            dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and
                distance metrics.
        """
        return cls._list_dense_models(TextEmbedding)

    @classmethod
    def list_image_models(cls) -> dict[str, tuple[int, models.Distance]]:
        """Lists the supported image dense models.

        Custom image models are not supported yet, but ImageEmbedding.list_supported_models()
        is called each time to preserve the same style as with TextEmbedding.

        Returns:
            dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and
                distance metrics.
        """
        return cls._list_dense_models(ImageEmbedding)

    @classmethod
    def list_late_interaction_text_models(cls) -> dict[str, tuple[int, models.Distance]]:
        """Lists the supported late interaction text models.

        Custom late interaction models are not supported yet, but
        LateInteractionTextEmbedding.list_supported_models() is called each time to preserve
        the same style as with TextEmbedding.

        Returns:
            dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and
                distance metrics.
        """
        return cls._list_dense_models(LateInteractionTextEmbedding)

    @classmethod
    def list_late_interaction_multimodal_models(cls) -> dict[str, tuple[int, models.Distance]]:
        """Lists the supported late interaction multimodal models.

        Custom late interaction multimodal models are not supported yet, but
        LateInteractionMultimodalEmbedding.list_supported_models() is called each time to
        preserve the same style as with TextEmbedding.

        Returns:
            dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and
                distance metrics.
        """
        return cls._list_dense_models(LateInteractionMultimodalEmbedding)

    @classmethod
    def list_sparse_models(cls) -> dict[str, dict[str, Any]]:
        """Lists the supported sparse models.

        Custom sparse models are not supported yet, but
        SparseTextEmbedding.list_supported_models() is called each time to preserve the same
        style as with TextEmbedding.

        Returns:
            dict[str, dict[str, Any]]: A dict of model names and their descriptions; each
                description omits the "model" key, which is used as the dict key instead.
        """
        if not SparseTextEmbedding:
            return {}
        # Build a fresh description without the "model" key rather than popping
        # it, so the dicts handed back by fastembed are never mutated.
        return {
            description["model"]: {
                key: value for key, value in description.items() if key != "model"
            }
            for description in SparseTextEmbedding.list_supported_models()
        }

    @classmethod
    def is_supported_text_model(cls, model_name: str) -> bool:
        """Checks if the dense text model is supported by fastembed.

        The comparison is case-insensitive.

        Args:
            model_name (str): The name of the model to check.

        Returns:
            bool: True if the model is supported, False otherwise.
        """
        name = model_name.lower()
        if name in cls._TEXT_MODELS:
            return True
        # update cached list in case custom models were added
        cls._TEXT_MODELS = {model.lower() for model in cls.list_text_models()}
        return name in cls._TEXT_MODELS

    @classmethod
    def is_supported_image_model(cls, model_name: str) -> bool:
        """Checks if the image model is supported by fastembed.

        The comparison is case-insensitive.

        Args:
            model_name (str): The name of the model to check.

        Returns:
            bool: True if the model is supported, False otherwise.
        """
        name = model_name.lower()
        if name in cls._IMAGE_MODELS:
            return True
        # update cached list in case custom models were added
        cls._IMAGE_MODELS = {model.lower() for model in cls.list_image_models()}
        return name in cls._IMAGE_MODELS

    @classmethod
    def is_supported_late_interaction_text_model(cls, model_name: str) -> bool:
        """Checks if the late interaction text model is supported by fastembed.

        The comparison is case-insensitive.

        Args:
            model_name (str): The name of the model to check.

        Returns:
            bool: True if the model is supported, False otherwise.
        """
        name = model_name.lower()
        if name in cls._LATE_INTERACTION_TEXT_MODELS:
            return True
        # update cached list in case custom models were added
        cls._LATE_INTERACTION_TEXT_MODELS = {
            model.lower() for model in cls.list_late_interaction_text_models()
        }
        return name in cls._LATE_INTERACTION_TEXT_MODELS

    @classmethod
    def is_supported_late_interaction_multimodal_model(cls, model_name: str) -> bool:
        """Checks if the late interaction multimodal model is supported by fastembed.

        The comparison is case-insensitive.

        Args:
            model_name (str): The name of the model to check.

        Returns:
            bool: True if the model is supported, False otherwise.
        """
        name = model_name.lower()
        if name in cls._LATE_INTERACTION_MULTIMODAL_MODELS:
            return True
        # update cached list in case custom models were added
        cls._LATE_INTERACTION_MULTIMODAL_MODELS = {
            model.lower() for model in cls.list_late_interaction_multimodal_models()
        }
        return name in cls._LATE_INTERACTION_MULTIMODAL_MODELS

    @classmethod
    def is_supported_sparse_model(cls, model_name: str) -> bool:
        """Checks if the sparse model is supported by fastembed.

        The comparison is case-insensitive.

        Args:
            model_name (str): The name of the model to check.

        Returns:
            bool: True if the model is supported, False otherwise.
        """
        name = model_name.lower()
        if name in cls._SPARSE_MODELS:
            return True
        # update cached list in case custom models were added
        cls._SPARSE_MODELS = {model.lower() for model in cls.list_sparse_models()}
        return name in cls._SPARSE_MODELS
|
|
|
|
|
|
# region deprecated
# prefer using methods builtin into QdrantClient, e.g. list_supported_text_models, list_supported_idf_models, etc.
# These constants are snapshots taken once, when this module is imported; each
# one is empty when the corresponding fastembed class is unavailable.

# Dense text models: name -> (dimension, cosine distance).
SUPPORTED_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
    FastEmbedMisc.list_text_models()
)

# Sparse models: name -> full description dict (the "model" key is kept in the value).
SUPPORTED_SPARSE_EMBEDDING_MODELS: dict[str, dict[str, Any]] = {}
if SparseTextEmbedding:
    SUPPORTED_SPARSE_EMBEDDING_MODELS = {
        description["model"]: description
        for description in SparseTextEmbedding.list_supported_models()
    }

# Names of sparse models whose description has a truthy "requires_idf" entry.
IDF_EMBEDDING_MODELS: set[str] = set()
if SparseTextEmbedding:
    IDF_EMBEDDING_MODELS = {
        description["model"]
        for description in SparseTextEmbedding.list_supported_models()
        if description.get("requires_idf")
    }

# Late interaction text models: name -> (dimension, cosine distance).
_LATE_INTERACTION_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
    FastEmbedMisc.list_late_interaction_text_models()
)

# Image models: name -> (dimension, cosine distance).
_IMAGE_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
    FastEmbedMisc.list_image_models()
)

# Late interaction multimodal models: name -> (dimension, cosine distance).
_LATE_INTERACTION_MULTIMODAL_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
    FastEmbedMisc.list_late_interaction_multimodal_models()
)
# endregion
|