refactor: excel parse
This commit is contained in:
@@ -0,0 +1,323 @@
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from qdrant_client.conversions.common_types import SparseVector
|
||||
from qdrant_client.http import models
|
||||
|
||||
try:
|
||||
from fastembed import (
|
||||
TextEmbedding,
|
||||
SparseTextEmbedding,
|
||||
ImageEmbedding,
|
||||
LateInteractionTextEmbedding,
|
||||
LateInteractionMultimodalEmbedding,
|
||||
)
|
||||
from fastembed.common import OnnxProvider, ImageInput
|
||||
except ImportError:
|
||||
TextEmbedding = None
|
||||
SparseTextEmbedding = None
|
||||
ImageEmbedding = None
|
||||
LateInteractionTextEmbedding = None
|
||||
LateInteractionMultimodalEmbedding = None
|
||||
OnnxProvider = None
|
||||
ImageInput = None
|
||||
|
||||
|
||||
class QueryResponse(BaseModel, extra="forbid"): # type: ignore
|
||||
id: Union[str, int]
|
||||
embedding: Optional[list[float]]
|
||||
sparse_embedding: Optional[SparseVector] = Field(default=None)
|
||||
metadata: dict[str, Any]
|
||||
document: str
|
||||
score: float
|
||||
|
||||
|
||||
class FastEmbedMisc:
|
||||
IS_INSTALLED: bool = False
|
||||
_TEXT_MODELS: set[str] = set()
|
||||
_IMAGE_MODELS: set[str] = set()
|
||||
_LATE_INTERACTION_TEXT_MODELS: set[str] = set()
|
||||
_LATE_INTERACTION_MULTIMODAL_MODELS: set[str] = set()
|
||||
_SPARSE_MODELS: set[str] = set()
|
||||
|
||||
@classmethod
|
||||
def is_installed(cls) -> bool:
|
||||
if cls.IS_INSTALLED:
|
||||
return cls.IS_INSTALLED
|
||||
|
||||
try:
|
||||
from fastembed import (
|
||||
SparseTextEmbedding,
|
||||
TextEmbedding,
|
||||
ImageEmbedding,
|
||||
LateInteractionMultimodalEmbedding,
|
||||
LateInteractionTextEmbedding,
|
||||
)
|
||||
|
||||
assert len(SparseTextEmbedding.list_supported_models()) > 0
|
||||
assert len(TextEmbedding.list_supported_models()) > 0
|
||||
assert len(ImageEmbedding.list_supported_models()) > 0
|
||||
assert len(LateInteractionTextEmbedding.list_supported_models()) > 0
|
||||
assert len(LateInteractionMultimodalEmbedding.list_supported_models()) > 0
|
||||
cls.IS_INSTALLED = True
|
||||
except ImportError:
|
||||
cls.IS_INSTALLED = False
|
||||
|
||||
return cls.IS_INSTALLED
|
||||
|
||||
@classmethod
|
||||
def import_fastembed(cls) -> None:
|
||||
if cls.IS_INSTALLED:
|
||||
return
|
||||
|
||||
# If it's not, ask the user to install it
|
||||
raise ImportError(
|
||||
"fastembed is not installed."
|
||||
" Please install it to enable fast vector indexing with `pip install fastembed`."
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def list_text_models(cls) -> dict[str, tuple[int, models.Distance]]:
|
||||
"""Lists the supported dense text models.
|
||||
|
||||
Requires invocation of TextEmbedding.list_supported_models() to support custom models.
|
||||
|
||||
Returns:
|
||||
dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and distance metrics.
|
||||
"""
|
||||
return (
|
||||
{
|
||||
model["model"]: (model["dim"], models.Distance.COSINE)
|
||||
for model in TextEmbedding.list_supported_models()
|
||||
}
|
||||
if TextEmbedding
|
||||
else {}
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def list_image_models(cls) -> dict[str, tuple[int, models.Distance]]:
|
||||
"""Lists the supported image dense models.
|
||||
|
||||
Custom image models are not supported yet, but calls to ImageEmbedding.list_supported_models() is done each
|
||||
time in order for preserving the same style as with TextEmbedding.
|
||||
|
||||
Returns:
|
||||
dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and distance metrics.
|
||||
"""
|
||||
return (
|
||||
{
|
||||
model["model"]: (model["dim"], models.Distance.COSINE)
|
||||
for model in ImageEmbedding.list_supported_models()
|
||||
}
|
||||
if ImageEmbedding
|
||||
else {}
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def list_late_interaction_text_models(cls) -> dict[str, tuple[int, models.Distance]]:
|
||||
"""Lists the supported late interaction text models.
|
||||
|
||||
Custom late interaction models are not supported yet, but calls to
|
||||
LateInteractionTextEmbedding.list_supported_models()
|
||||
is done each time in order for preserving the same style as with TextEmbedding.
|
||||
|
||||
Returns:
|
||||
dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and distance metrics.
|
||||
"""
|
||||
return (
|
||||
{
|
||||
model["model"]: (model["dim"], models.Distance.COSINE)
|
||||
for model in LateInteractionTextEmbedding.list_supported_models()
|
||||
}
|
||||
if LateInteractionTextEmbedding
|
||||
else {}
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def list_late_interaction_multimodal_models(cls) -> dict[str, tuple[int, models.Distance]]:
|
||||
"""Lists the supported late interaction multimodal models.
|
||||
|
||||
Custom late interaction multimodal models are not supported yet, but calls to
|
||||
LateInteractionMultimodalEmbedding.list_supported_models()
|
||||
is done each time in order for preserving the same style as with TextEmbedding.
|
||||
|
||||
Returns:
|
||||
dict[str, tuple[int, models.Distance]]: A dict of model names, their dimensions and distance metrics.
|
||||
"""
|
||||
return (
|
||||
{
|
||||
model["model"]: (model["dim"], models.Distance.COSINE)
|
||||
for model in LateInteractionMultimodalEmbedding.list_supported_models()
|
||||
}
|
||||
if LateInteractionMultimodalEmbedding
|
||||
else {}
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def list_sparse_models(cls) -> dict[str, dict[str, Any]]:
|
||||
"""Lists the supported sparse models.
|
||||
|
||||
Custom sparse models are not supported yet, but calls to
|
||||
SparseTextEmbedding.list_supported_models()
|
||||
is done each time in order for preserving the same style as with TextEmbedding.
|
||||
|
||||
Returns:
|
||||
dict[str, dict[str, Any]]: A dict of model names and their descriptions.
|
||||
"""
|
||||
descriptions = {}
|
||||
if SparseTextEmbedding:
|
||||
for description in SparseTextEmbedding.list_supported_models():
|
||||
descriptions[description.pop("model")] = description
|
||||
return descriptions
|
||||
|
||||
@classmethod
|
||||
def is_supported_text_model(cls, model_name: str) -> bool:
|
||||
"""Checks if the model is supported by fastembed.
|
||||
|
||||
Args:
|
||||
model_name (str): The name of the model to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the model is supported, False otherwise.
|
||||
"""
|
||||
if model_name.lower() in cls._TEXT_MODELS:
|
||||
return True
|
||||
# update cached list in case custom models were added
|
||||
cls._TEXT_MODELS = {model.lower() for model in cls.list_text_models()}
|
||||
if model_name.lower() in cls._TEXT_MODELS:
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def is_supported_image_model(cls, model_name: str) -> bool:
|
||||
"""Checks if the model is supported by fastembed.
|
||||
|
||||
Args:
|
||||
model_name (str): The name of the model to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the model is supported, False otherwise.
|
||||
"""
|
||||
if model_name.lower() in cls._IMAGE_MODELS:
|
||||
return True
|
||||
# update cached list in case custom models were added
|
||||
cls._IMAGE_MODELS = {model.lower() for model in cls.list_image_models()}
|
||||
if model_name.lower() in cls._IMAGE_MODELS:
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def is_supported_late_interaction_text_model(cls, model_name: str) -> bool:
|
||||
"""Checks if the model is supported by fastembed.
|
||||
|
||||
Args:
|
||||
model_name (str): The name of the model to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the model is supported, False otherwise.
|
||||
"""
|
||||
if model_name.lower() in cls._LATE_INTERACTION_TEXT_MODELS:
|
||||
return True
|
||||
# update cached list in case custom models were added
|
||||
cls._LATE_INTERACTION_TEXT_MODELS = {
|
||||
model.lower() for model in cls.list_late_interaction_text_models()
|
||||
}
|
||||
if model_name.lower() in cls._LATE_INTERACTION_TEXT_MODELS:
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def is_supported_late_interaction_multimodal_model(cls, model_name: str) -> bool:
|
||||
"""Checks if the model is supported by fastembed.
|
||||
|
||||
Args:
|
||||
model_name (str): The name of the model to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the model is supported, False otherwise.
|
||||
"""
|
||||
if model_name.lower() in cls._LATE_INTERACTION_MULTIMODAL_MODELS:
|
||||
return True
|
||||
# update cached list in case custom models were added
|
||||
cls._LATE_INTERACTION_MULTIMODAL_MODELS = {
|
||||
model.lower() for model in cls.list_late_interaction_multimodal_models()
|
||||
}
|
||||
if model_name.lower() in cls._LATE_INTERACTION_MULTIMODAL_MODELS:
|
||||
return True
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def is_supported_sparse_model(cls, model_name: str) -> bool:
|
||||
"""Checks if the model is supported by fastembed.
|
||||
|
||||
Args:
|
||||
model_name (str): The name of the model to check.
|
||||
|
||||
Returns:
|
||||
bool: True if the model is supported, False otherwise.
|
||||
"""
|
||||
if model_name.lower() in cls._SPARSE_MODELS:
|
||||
return True
|
||||
# update cached list in case custom models were added
|
||||
cls._SPARSE_MODELS = {model.lower() for model in cls.list_sparse_models()}
|
||||
if model_name.lower() in cls._SPARSE_MODELS:
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
# region deprecated
|
||||
# prefer using methods builtin into QdrantClient, e.g. list_supported_text_models, list_supported_idf_models, etc.
|
||||
|
||||
SUPPORTED_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
|
||||
{
|
||||
model["model"]: (model["dim"], models.Distance.COSINE)
|
||||
for model in TextEmbedding.list_supported_models()
|
||||
}
|
||||
if TextEmbedding
|
||||
else {}
|
||||
)
|
||||
|
||||
SUPPORTED_SPARSE_EMBEDDING_MODELS: dict[str, dict[str, Any]] = (
|
||||
{model["model"]: model for model in SparseTextEmbedding.list_supported_models()}
|
||||
if SparseTextEmbedding
|
||||
else {}
|
||||
)
|
||||
|
||||
IDF_EMBEDDING_MODELS: set[str] = (
|
||||
{
|
||||
model_config["model"]
|
||||
for model_config in SparseTextEmbedding.list_supported_models()
|
||||
if model_config.get("requires_idf", None)
|
||||
}
|
||||
if SparseTextEmbedding
|
||||
else set()
|
||||
)
|
||||
|
||||
_LATE_INTERACTION_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
|
||||
{
|
||||
model["model"]: (model["dim"], models.Distance.COSINE)
|
||||
for model in LateInteractionTextEmbedding.list_supported_models()
|
||||
}
|
||||
if LateInteractionTextEmbedding
|
||||
else {}
|
||||
)
|
||||
|
||||
_IMAGE_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
|
||||
{
|
||||
model["model"]: (model["dim"], models.Distance.COSINE)
|
||||
for model in ImageEmbedding.list_supported_models()
|
||||
}
|
||||
if ImageEmbedding
|
||||
else {}
|
||||
)
|
||||
|
||||
_LATE_INTERACTION_MULTIMODAL_EMBEDDING_MODELS: dict[str, tuple[int, models.Distance]] = (
|
||||
{
|
||||
model["model"]: (model["dim"], models.Distance.COSINE)
|
||||
for model in LateInteractionMultimodalEmbedding.list_supported_models()
|
||||
}
|
||||
if LateInteractionMultimodalEmbedding
|
||||
else {}
|
||||
)
|
||||
# endregion
|
||||
Reference in New Issue
Block a user