refactor: excel parse

Blizzard
2026-04-16 10:01:11 +08:00
parent 680ecc320f
commit f62f95ec02
7941 changed files with 2899112 additions and 0 deletions
@@ -0,0 +1,94 @@
from typing import Optional, Any
from qdrant_client.http import models
from qdrant_client.embed.models import NumericVector
class BuiltinEmbedder:
_SUPPORTED_MODELS = ("Qdrant/Bm25",)
def __init__(self, **kwargs: Any) -> None:
pass
def embed(
self,
model_name: str,
texts: Optional[list[str]] = None,
options: Optional[dict[str, Any]] = None,
**kwargs: Any,
) -> NumericVector:
if texts is None:
if "images" in kwargs:
raise ValueError(
"Image processing is only available with cloud inference of FastEmbed"
)
raise ValueError("Texts must be provided for the inference")
if not self.is_supported_sparse_model(model_name):
raise ValueError(
f"Model {model_name} is not supported in {self.__class__.__name__}. "
f"Did you forget to enable cloud inference or install FastEmbed for local inference?"
)
return [models.Document(text=text, options=options, model=model_name) for text in texts]
@classmethod
def is_supported_text_model(cls, model_name: str) -> bool:
"""Mock embedder interface, only sparse text model Qdrant/Bm25 is supported
Args:
model_name (str): The name of the model to check.
Returns:
bool: True if the model is supported, False otherwise.
"""
return False # currently only Qdrant/Bm25 is supported
@classmethod
def is_supported_image_model(cls, model_name: str) -> bool:
"""Mock embedder interface, only sparse text model Qdrant/Bm25 is supported
Args:
model_name (str): The name of the model to check.
Returns:
bool: True if the model is supported, False otherwise.
"""
return False # currently only Qdrant/Bm25 is supported
@classmethod
def is_supported_late_interaction_text_model(cls, model_name: str) -> bool:
"""Mock embedder interface, only sparse text model Qdrant/Bm25 is supported
Args:
model_name (str): The name of the model to check.
Returns:
bool: True if the model is supported, False otherwise.
"""
return False # currently only Qdrant/Bm25 is supported
@classmethod
def is_supported_late_interaction_multimodal_model(cls, model_name: str) -> bool:
"""Mock embedder interface, only sparse text model Qdrant/Bm25 is supported
Args:
model_name (str): The name of the model to check.
Returns:
bool: True if the model is supported, False otherwise.
"""
return False # currently only Qdrant/Bm25 is supported
@classmethod
def is_supported_sparse_model(cls, model_name: str) -> bool:
"""Checks if the model is supported. Only `Qdrant/Bm25` is supported
Args:
model_name (str): The name of the model to check.
Returns:
bool: True if the model is supported, False otherwise.
"""
return model_name.lower() in [model.lower() for model in cls._SUPPORTED_MODELS]
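
The builtin embedder never computes vectors locally; it wraps texts into models.Document objects so that inference can happen server-side. A minimal usage sketch (return value illustrative):

embedder = BuiltinEmbedder()
docs = embedder.embed(model_name="Qdrant/Bm25", texts=["hello world"])
# -> [models.Document(text="hello world", options=None, model="Qdrant/Bm25")]
# The supported-model check is case-insensitive:
assert embedder.is_supported_sparse_model("qdrant/bm25")
assert not embedder.is_supported_text_model("Qdrant/Bm25")  # dense text models are never supported here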
@@ -0,0 +1,6 @@
from typing import Union
from qdrant_client.http import models
INFERENCE_OBJECT_NAMES: set[str] = {"Document", "Image", "InferenceObject"}
INFERENCE_OBJECT_TYPES = Union[models.Document, models.Image, models.InferenceObject]
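
These two definitions are consumed together throughout the package: typing.get_args unpacks the Union into a plain tuple of classes, which isinstance accepts directly. A short sketch:

from typing import get_args
from qdrant_client.http import models

doc = models.Document(text="hi", model="Qdrant/Bm25")
assert isinstance(doc, get_args(INFERENCE_OBJECT_TYPES))  # (Document, Image, InferenceObject)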
@@ -0,0 +1,176 @@
from copy import copy
from typing import Union, Optional, Iterable, get_args
from pydantic import BaseModel
from qdrant_client._pydantic_compat import model_fields_set
from qdrant_client.embed.common import INFERENCE_OBJECT_TYPES
from qdrant_client.embed.schema_parser import ModelSchemaParser
from qdrant_client.embed.utils import convert_paths, FieldPath
class InspectorEmbed:
"""Inspector which collects paths to objects requiring inference in the received models
Attributes:
parser: ModelSchemaParser instance
"""
def __init__(self, parser: Optional[ModelSchemaParser] = None) -> None:
self.parser = ModelSchemaParser() if parser is None else parser
def inspect(self, points: Union[Iterable[BaseModel], BaseModel]) -> list[FieldPath]:
"""Looks for all the paths to objects requiring inference in the received models
Args:
points: models to inspect
Returns:
list of FieldPath objects
"""
paths = []
if isinstance(points, BaseModel):
self.parser.parse_model(points.__class__)
paths.extend(self._inspect_model(points))
elif isinstance(points, dict):
for value in points.values():
paths.extend(self.inspect(value))
elif isinstance(points, Iterable):
for point in points:
if isinstance(point, BaseModel):
self.parser.parse_model(point.__class__)
paths.extend(self._inspect_model(point))
paths = sorted(set(paths))
return convert_paths(paths)
def _inspect_model(
self, mod: BaseModel, paths: Optional[list[FieldPath]] = None, accum: Optional[str] = None
) -> list[str]:
"""Looks for all the paths to objects requiring inference in the received model
Args:
mod: model to inspect
paths: list of paths to the fields possibly containing objects for inference
accum: accumulator for the path. Path is a dot separated string of field names which we assemble recursively
Returns:
list of paths to the model fields containing objects for inference
"""
paths = self.parser.path_cache.get(mod.__class__.__name__, []) if paths is None else paths
found_paths = []
for path in paths:
found_paths.extend(
self._inspect_inner_models(
mod, path.current, path.tail if path.tail else [], accum
)
)
return found_paths
def _inspect_inner_models(
self,
original_model: BaseModel,
current_path: str,
tail: list[FieldPath],
accum: Optional[str] = None,
) -> list[str]:
"""Looks for all the paths to objects requiring inference in the received model
Args:
original_model: model to inspect
current_path: the field to inspect on the current iteration
tail: list of FieldPath objects to the fields possibly containing objects for inference
accum: accumulator for the path. Path is a dot separated string of field names which we assemble recursively
Returns:
list of paths to the model fields containing objects for inference
"""
found_paths = []
if accum is None:
accum = current_path
else:
accum += f".{current_path}"
def inspect_recursive(member: BaseModel, accumulator: str) -> list[str]:
"""Iterates over the set model fields, expand recursive ones and find paths to objects requiring inference
Args:
member: currently inspected model, which may or may not contain recursive fields
accumulator: accumulator for the path, which is a dot separated string assembled recursively
"""
recursive_paths = []
for field in model_fields_set(member):
if field in self.parser.name_recursive_ref_mapping:
mapped_field = self.parser.name_recursive_ref_mapping[field]
recursive_paths.extend(self.parser.path_cache[mapped_field])
return self._inspect_model(member, copy(recursive_paths), accumulator)
model = getattr(original_model, current_path, None)
if model is None:
return []
if isinstance(model, get_args(INFERENCE_OBJECT_TYPES)):
return [accum]
if isinstance(model, BaseModel):
found_paths.extend(inspect_recursive(model, accum))
for next_path in tail:
found_paths.extend(
self._inspect_inner_models(
model, next_path.current, next_path.tail if next_path.tail else [], accum
)
)
return found_paths
elif isinstance(model, list):
for current_model in model:
if not isinstance(current_model, BaseModel):
continue
if isinstance(current_model, get_args(INFERENCE_OBJECT_TYPES)):
found_paths.append(accum)
found_paths.extend(inspect_recursive(current_model, accum))
for next_path in tail:
for current_model in model:
found_paths.extend(
self._inspect_inner_models(
current_model,
next_path.current,
next_path.tail if next_path.tail else [],
accum,
)
)
return found_paths
elif isinstance(model, dict):
found_paths = []
for key, values in model.items():
values = [values] if not isinstance(values, list) else values
for current_model in values:
if not isinstance(current_model, BaseModel):
continue
if isinstance(current_model, get_args(INFERENCE_OBJECT_TYPES)):
found_paths.append(accum)
found_paths.extend(inspect_recursive(current_model, accum))
for next_path in tail:
for current_model in values:
found_paths.extend(
self._inspect_inner_models(
current_model,
next_path.current,
next_path.tail if next_path.tail else [],
accum,
)
)
return found_paths
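
A hedged usage sketch (the exact paths depend on the parsed schema of the model in question; the output shown is an assumption for illustration):

from qdrant_client.http import models

inspector = InspectorEmbed()
point = models.PointStruct(id=1, vector=models.Document(text="hello", model="Qdrant/Bm25"))
paths = inspector.inspect([point])
# Expected to be something like [FieldPath(current="vector", tail=None)], i.e. the
# dot-separated locations of objects that still require inference.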
@@ -0,0 +1,447 @@
from collections import defaultdict
from typing import Optional, Sequence, Any, TypeVar, Generic
from pydantic import BaseModel
from qdrant_client.http import models
from qdrant_client.embed.models import NumericVector
from qdrant_client.fastembed_common import (
OnnxProvider,
ImageInput,
TextEmbedding,
SparseTextEmbedding,
LateInteractionTextEmbedding,
LateInteractionMultimodalEmbedding,
ImageEmbedding,
FastEmbedMisc,
)
T = TypeVar("T")
class ModelInstance(BaseModel, Generic[T], arbitrary_types_allowed=True): # type: ignore[call-arg]
model: T
options: dict[str, Any]
deprecated: bool = False
class Embedder:
def __init__(self, threads: Optional[int] = None, **kwargs: Any) -> None:
self.embedding_models: dict[str, list[ModelInstance[TextEmbedding]]] = defaultdict(list)
self.sparse_embedding_models: dict[str, list[ModelInstance[SparseTextEmbedding]]] = (
defaultdict(list)
)
self.late_interaction_embedding_models: dict[
str, list[ModelInstance[LateInteractionTextEmbedding]]
] = defaultdict(list)
self.image_embedding_models: dict[str, list[ModelInstance[ImageEmbedding]]] = defaultdict(
list
)
self.late_interaction_multimodal_embedding_models: dict[
str, list[ModelInstance[LateInteractionMultimodalEmbedding]]
] = defaultdict(list)
self._threads = threads
def get_or_init_model(
self,
model_name: str,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
providers: Optional[Sequence["OnnxProvider"]] = None,
cuda: bool = False,
device_ids: Optional[list[int]] = None,
deprecated: bool = False,
**kwargs: Any,
) -> TextEmbedding:
if not FastEmbedMisc.is_supported_text_model(model_name):
raise ValueError(
f"Unsupported embedding model: {model_name}. Supported models: {FastEmbedMisc.list_text_models()}"
)
options = {
"cache_dir": cache_dir,
"threads": threads or self._threads,
"providers": providers,
"cuda": cuda,
"device_ids": device_ids,
**kwargs,
}
for instance in self.embedding_models[model_name]:
if (deprecated and instance.deprecated) or (
not deprecated and instance.options == options
):
return instance.model
model = TextEmbedding(model_name=model_name, **options)
model_instance: ModelInstance[TextEmbedding] = ModelInstance(
model=model, options=options, deprecated=deprecated
)
self.embedding_models[model_name].append(model_instance)
return model
def get_or_init_sparse_model(
self,
model_name: str,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
providers: Optional[Sequence["OnnxProvider"]] = None,
cuda: bool = False,
device_ids: Optional[list[int]] = None,
deprecated: bool = False,
**kwargs: Any,
) -> SparseTextEmbedding:
if not FastEmbedMisc.is_supported_sparse_model(model_name):
raise ValueError(
f"Unsupported embedding model: {model_name}. Supported models: {FastEmbedMisc.list_sparse_models()}"
)
options = {
"cache_dir": cache_dir,
"threads": threads or self._threads,
"providers": providers,
"cuda": cuda,
"device_ids": device_ids,
**kwargs,
}
for instance in self.sparse_embedding_models[model_name]:
if (deprecated and instance.deprecated) or (
not deprecated and instance.options == options
):
return instance.model
model = SparseTextEmbedding(model_name=model_name, **options)
model_instance: ModelInstance[SparseTextEmbedding] = ModelInstance(
model=model, options=options, deprecated=deprecated
)
self.sparse_embedding_models[model_name].append(model_instance)
return model
def get_or_init_late_interaction_model(
self,
model_name: str,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
providers: Optional[Sequence["OnnxProvider"]] = None,
cuda: bool = False,
device_ids: Optional[list[int]] = None,
**kwargs: Any,
) -> LateInteractionTextEmbedding:
if not FastEmbedMisc.is_supported_late_interaction_text_model(model_name):
raise ValueError(
f"Unsupported embedding model: {model_name}. "
f"Supported models: {FastEmbedMisc.list_late_interaction_text_models()}"
)
options = {
"cache_dir": cache_dir,
"threads": threads or self._threads,
"providers": providers,
"cuda": cuda,
"device_ids": device_ids,
**kwargs,
}
for instance in self.late_interaction_embedding_models[model_name]:
if instance.options == options:
return instance.model
model = LateInteractionTextEmbedding(model_name=model_name, **options)
model_instance: ModelInstance[LateInteractionTextEmbedding] = ModelInstance(
model=model, options=options
)
self.late_interaction_embedding_models[model_name].append(model_instance)
return model
def get_or_init_late_interaction_multimodal_model(
self,
model_name: str,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
providers: Optional[Sequence["OnnxProvider"]] = None,
cuda: bool = False,
device_ids: Optional[list[int]] = None,
**kwargs: Any,
) -> LateInteractionMultimodalEmbedding:
if not FastEmbedMisc.is_supported_late_interaction_multimodal_model(model_name):
raise ValueError(
f"Unsupported embedding model: {model_name}. "
f"Supported models: {FastEmbedMisc.list_late_interaction_multimodal_models()}"
)
options = {
"cache_dir": cache_dir,
"threads": threads or self._threads,
"providers": providers,
"cuda": cuda,
"device_ids": device_ids,
**kwargs,
}
for instance in self.late_interaction_multimodal_embedding_models[model_name]:
if instance.options == options:
return instance.model
model = LateInteractionMultimodalEmbedding(model_name=model_name, **options)
model_instance: ModelInstance[LateInteractionMultimodalEmbedding] = ModelInstance(
model=model, options=options
)
self.late_interaction_multimodal_embedding_models[model_name].append(model_instance)
return model
def get_or_init_image_model(
self,
model_name: str,
cache_dir: Optional[str] = None,
threads: Optional[int] = None,
providers: Optional[Sequence["OnnxProvider"]] = None,
cuda: bool = False,
device_ids: Optional[list[int]] = None,
**kwargs: Any,
) -> ImageEmbedding:
if not FastEmbedMisc.is_supported_image_model(model_name):
raise ValueError(
f"Unsupported embedding model: {model_name}. Supported models: {FastEmbedMisc.list_image_models()}"
)
options = {
"cache_dir": cache_dir,
"threads": threads or self._threads,
"providers": providers,
"cuda": cuda,
"device_ids": device_ids,
**kwargs,
}
for instance in self.image_embedding_models[model_name]:
if instance.options == options:
return instance.model
model = ImageEmbedding(model_name=model_name, **options)
model_instance: ModelInstance[ImageEmbedding] = ModelInstance(model=model, options=options)
self.image_embedding_models[model_name].append(model_instance)
return model
def embed(
self,
model_name: str,
texts: Optional[list[str]] = None,
images: Optional[list[ImageInput]] = None,
options: Optional[dict[str, Any]] = None,
is_query: bool = False,
batch_size: int = 8,
) -> NumericVector:
if (texts is None) is (images is None):
raise ValueError("Either documents or images should be provided")
embeddings: NumericVector # define type for a static type checker
if texts is not None:
if FastEmbedMisc.is_supported_text_model(model_name):
embeddings = self._embed_dense_text(
texts, model_name, options, is_query, batch_size
)
elif FastEmbedMisc.is_supported_sparse_model(model_name):
embeddings = self._embed_sparse_text(
texts, model_name, options, is_query, batch_size
)
elif FastEmbedMisc.is_supported_late_interaction_text_model(model_name):
embeddings = self._embed_late_interaction_text(
texts, model_name, options, is_query, batch_size
)
elif FastEmbedMisc.is_supported_late_interaction_multimodal_model(model_name):
embeddings = self._embed_late_interaction_multimodal_text(
texts, model_name, options, batch_size
)
else:
raise ValueError(f"Unsupported embedding model: {model_name}")
else:
assert (
images is not None
) # just to satisfy mypy which can't infer it from the previous conditions
if FastEmbedMisc.is_supported_image_model(model_name):
embeddings = self._embed_dense_image(images, model_name, options, batch_size)
elif FastEmbedMisc.is_supported_late_interaction_multimodal_model(model_name):
embeddings = self._embed_late_interaction_multimodal_image(
images, model_name, options, batch_size
)
else:
raise ValueError(f"Unsupported embedding model: {model_name}")
return embeddings
def _embed_dense_text(
self,
texts: list[str],
model_name: str,
options: Optional[dict[str, Any]],
is_query: bool,
batch_size: int,
) -> list[list[float]]:
embedding_model_inst = self.get_or_init_model(model_name=model_name, **options or {})
if not is_query:
embeddings = [
embedding.tolist()
for embedding in embedding_model_inst.embed(documents=texts, batch_size=batch_size)
]
else:
embeddings = [
embedding.tolist() for embedding in embedding_model_inst.query_embed(query=texts)
]
return embeddings
def _embed_sparse_text(
self,
texts: list[str],
model_name: str,
options: Optional[dict[str, Any]],
is_query: bool,
batch_size: int,
) -> list[models.SparseVector]:
embedding_model_inst = self.get_or_init_sparse_model(
model_name=model_name, **options or {}
)
if not is_query:
embeddings = [
models.SparseVector(
indices=sparse_embedding.indices.tolist(),
values=sparse_embedding.values.tolist(),
)
for sparse_embedding in embedding_model_inst.embed(
documents=texts, batch_size=batch_size
)
]
else:
embeddings = [
models.SparseVector(
indices=sparse_embedding.indices.tolist(),
values=sparse_embedding.values.tolist(),
)
for sparse_embedding in embedding_model_inst.query_embed(query=texts)
]
return embeddings
def _embed_late_interaction_text(
self,
texts: list[str],
model_name: str,
options: Optional[dict[str, Any]],
is_query: bool,
batch_size: int,
) -> list[list[list[float]]]:
embedding_model_inst = self.get_or_init_late_interaction_model(
model_name=model_name, **options or {}
)
if not is_query:
embeddings = [
embedding.tolist()
for embedding in embedding_model_inst.embed(documents=texts, batch_size=batch_size)
]
else:
embeddings = [
embedding.tolist() for embedding in embedding_model_inst.query_embed(query=texts)
]
return embeddings
def _embed_late_interaction_multimodal_text(
self,
texts: list[str],
model_name: str,
options: Optional[dict[str, Any]],
batch_size: int,
) -> list[list[list[float]]]:
embedding_model_inst = self.get_or_init_late_interaction_multimodal_model(
model_name=model_name, **options or {}
)
return [
embedding.tolist()
for embedding in embedding_model_inst.embed_text(
documents=texts, batch_size=batch_size
)
]
def _embed_late_interaction_multimodal_image(
self,
images: list[ImageInput],
model_name: str,
options: Optional[dict[str, Any]],
batch_size: int,
) -> list[list[list[float]]]:
embedding_model_inst = self.get_or_init_late_interaction_multimodal_model(
model_name=model_name, **options or {}
)
return [
embedding.tolist()
for embedding in embedding_model_inst.embed_image(images=images, batch_size=batch_size)
]
def _embed_dense_image(
self,
images: list[ImageInput],
model_name: str,
options: Optional[dict[str, Any]],
batch_size: int,
) -> list[list[float]]:
embedding_model_inst = self.get_or_init_image_model(model_name=model_name, **options or {})
embeddings = [
embedding.tolist()
for embedding in embedding_model_inst.embed(images=images, batch_size=batch_size)
]
return embeddings
@classmethod
def is_supported_text_model(cls, model_name: str) -> bool:
"""Check if model is supported by fastembed
Args:
model_name (str): The name of the model to check.
Returns:
bool: True if the model is supported, False otherwise.
"""
return FastEmbedMisc.is_supported_text_model(model_name)
@classmethod
def is_supported_image_model(cls, model_name: str) -> bool:
"""Check if model is supported by fastembed
Args:
model_name (str): The name of the model to check.
Returns:
bool: True if the model is supported, False otherwise.
"""
return FastEmbedMisc.is_supported_image_model(model_name)
@classmethod
def is_supported_late_interaction_text_model(cls, model_name: str) -> bool:
"""Check if model is supported by fastembed
Args:
model_name (str): The name of the model to check.
Returns:
bool: True if the model is supported, False otherwise.
"""
return FastEmbedMisc.is_supported_late_interaction_text_model(model_name)
@classmethod
def is_supported_late_interaction_multimodal_model(cls, model_name: str) -> bool:
"""Check if model is supported by fastembed
Args:
model_name (str): The name of the model to check.
Returns:
bool: True if the model is supported, False otherwise.
"""
return FastEmbedMisc.is_supported_late_interaction_multimodal_model(model_name)
@classmethod
def is_supported_sparse_model(cls, model_name: str) -> bool:
"""Check if model is supported by fastembed
Args:
model_name (str): The name of the model to check.
Returns:
bool: True if the model is supported, False otherwise.
"""
return FastEmbedMisc.is_supported_sparse_model(model_name)
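
Model instances are cached per model name and per options dict, so repeated calls with identical options reuse the already-loaded ONNX model instead of reloading it. A sketch, assuming fastembed is installed and this model name is available in its registry:

embedder = Embedder(threads=1)
m1 = embedder.get_or_init_model("sentence-transformers/all-MiniLM-L6-v2")
m2 = embedder.get_or_init_model("sentence-transformers/all-MiniLM-L6-v2")
assert m1 is m2  # identical options -> the cached instance is returned
m3 = embedder.get_or_init_model("sentence-transformers/all-MiniLM-L6-v2", cuda=True)
assert m3 is not m1  # different options -> a new instance is created and cached alongside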
@@ -0,0 +1,498 @@
import os
from collections import defaultdict
from copy import deepcopy
from multiprocessing import get_all_start_methods
from typing import Optional, Union, Iterable, Any, Type, get_args
from pydantic import BaseModel
from qdrant_client.embed.builtin_embedder import BuiltinEmbedder
from qdrant_client.http import models
from qdrant_client.embed.common import INFERENCE_OBJECT_TYPES
from qdrant_client.embed.embed_inspector import InspectorEmbed
from qdrant_client.embed.embedder import Embedder
from qdrant_client.embed.models import NumericVector, NumericVectorStruct
from qdrant_client.embed.schema_parser import ModelSchemaParser
from qdrant_client.embed.utils import FieldPath
from qdrant_client.fastembed_common import FastEmbedMisc
from qdrant_client.parallel_processor import ParallelWorkerPool, Worker
from qdrant_client.uploader.uploader import iter_batch
class ModelEmbedderWorker(Worker):
def __init__(self, batch_size: int, **kwargs: Any):
self.model_embedder = ModelEmbedder(**kwargs)
self.batch_size = batch_size
@classmethod
def start(cls, batch_size: int, **kwargs: Any) -> "ModelEmbedderWorker":
return cls(threads=1, batch_size=batch_size, **kwargs)
def process(self, items: Iterable[tuple[int, Any]]) -> Iterable[tuple[int, Any]]:
for idx, batch in items:
yield (
idx,
list(
self.model_embedder.embed_models_batch(
batch, inference_batch_size=self.batch_size
)
),
)
class ModelEmbedder:
MAX_INTERNAL_BATCH_SIZE = 64
def __init__(
self,
parser: Optional[ModelSchemaParser] = None,
is_local_mode: bool = False,
server_version: Optional[str] = None,
**kwargs: Any,
):
self._batch_accumulator: dict[str, list[INFERENCE_OBJECT_TYPES]] = {}
self._embed_storage: dict[str, list[NumericVector]] = {}
self._embed_inspector = InspectorEmbed(parser=parser)
self._is_builtin_embedder_available = self._check_builtin_embedder_availability(
is_local_mode, server_version
)
self.embedder = (
Embedder(**kwargs) if FastEmbedMisc.is_installed() else BuiltinEmbedder(**kwargs)
)
@staticmethod
def _check_builtin_embedder_availability(
is_local_mode: bool, server_version: Optional[str]
) -> bool:
if is_local_mode:
return False
if (
server_version is None
): # failed to detect the server version; this can happen due to security or network
# problems even on supported server versions, so we do not block usage of BuiltinEmbedder.
return True
try:
major, minor, patch = server_version.split(".")
patch = patch.split("-")[0]
if (int(major), int(minor), int(patch)) >= (1, 15, 3):
return True
return False
except Exception:
return True
def embed_models(
self,
raw_models: Union[BaseModel, Iterable[BaseModel]],
is_query: bool = False,
batch_size: int = 8,
) -> Iterable[BaseModel]:
"""Embed raw data fields in models and return models with vectors
If any of the model's fields require inference, a deepcopy of the model with computed embeddings is returned,
otherwise the original models are returned.
Args:
raw_models: Iterable[BaseModel] - models which can contain fields with raw data
is_query: bool - flag to determine which embed method to use. Defaults to False.
batch_size: int - batch size for inference
Returns:
list[BaseModel]: models with embedded fields
"""
if not self._is_builtin_embedder_available:
FastEmbedMisc.import_fastembed() # fail fast if fastembed is required
if isinstance(raw_models, BaseModel):
raw_models = [raw_models]
for raw_models_batch in iter_batch(raw_models, batch_size):
yield from self.embed_models_batch(
raw_models_batch, is_query, inference_batch_size=batch_size
)
def embed_models_strict(
self,
raw_models: Iterable[Union[dict[str, BaseModel], BaseModel]],
batch_size: int = 8,
parallel: Optional[int] = None,
) -> Iterable[Union[dict[str, BaseModel], BaseModel]]:
"""Embed raw data fields in models and return models with vectors
Requires every element of the input sequence to contain raw data fields for inference.
Does not accept ready-made vectors.
Args:
raw_models: Iterable[BaseModel] - models which contain fields with raw data to inference
batch_size: int - batch size for inference
parallel: int - number of parallel processes to use. Defaults to None.
Returns:
Iterable[Union[dict[str, BaseModel], BaseModel]]: models with embedded fields
"""
if not self._is_builtin_embedder_available:
FastEmbedMisc.import_fastembed() # fail fast if fastembed is required
is_small = False
if isinstance(raw_models, list):
if len(raw_models) < batch_size:
is_small = True
if (
isinstance(self.embedder, BuiltinEmbedder)
or parallel is None
or parallel == 1
or is_small
):
for batch in iter_batch(raw_models, batch_size):
yield from self.embed_models_batch(batch, inference_batch_size=batch_size)
else:
multiprocessing_batch_size = 1 # larger batch sizes do not help with data parallel
# on cpu. todo: adjust when multi-gpu is available
raw_models_batches = iter_batch(raw_models, size=multiprocessing_batch_size)
if parallel == 0:
parallel = os.cpu_count()
start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn"
assert parallel is not None # just a mypy complaint
pool = ParallelWorkerPool(
num_workers=parallel,
worker=self._get_worker_class(),
start_method=start_method,
max_internal_batch_size=self.MAX_INTERNAL_BATCH_SIZE,
)
for batch in pool.ordered_map(
raw_models_batches, batch_size=multiprocessing_batch_size
):
yield from batch
def embed_models_batch(
self,
raw_models: list[Union[dict[str, BaseModel], BaseModel]],
is_query: bool = False,
inference_batch_size: int = 8,
) -> Iterable[BaseModel]:
"""Embed a batch of models with raw data fields and return models with vectors
If any of the model's fields require inference, a deepcopy of the model with computed embeddings is returned,
otherwise the original models are returned.
Args:
raw_models: list[Union[dict[str, BaseModel], BaseModel]] - models which can contain fields with raw data
is_query: bool - flag to determine which embed method to use. Defaults to False.
inference_batch_size: int - batch size for inference
Returns:
Iterable[BaseModel]: models with embedded fields
"""
if not self._is_builtin_embedder_available:
FastEmbedMisc.import_fastembed() # fail fast if fastembed is required
for raw_model in raw_models:
self._process_model(raw_model, is_query=is_query, accumulating=True)
if not self._batch_accumulator:
yield from raw_models
else:
yield from (
self._process_model(
raw_model,
is_query=is_query,
accumulating=False,
inference_batch_size=inference_batch_size,
)
for raw_model in raw_models
)
def _process_model(
self,
model: Union[dict[str, BaseModel], BaseModel],
paths: Optional[list[FieldPath]] = None,
is_query: bool = False,
accumulating: bool = False,
inference_batch_size: Optional[int] = None,
) -> Union[dict[str, BaseModel], dict[str, NumericVector], BaseModel, NumericVector]:
"""Embed model's fields requiring inference
Args:
model: Qdrant http model containing fields to embed
paths: Path to fields to embed. E.g. [FieldPath(current="recommend", tail=[FieldPath(current="negative", tail=None)])]
is_query: Flag to determine which embed method to use. Defaults to False.
accumulating: Flag to determine if we are accumulating models for batch embedding. Defaults to False.
inference_batch_size: Optional[int] - batch size for inference
Returns:
A deepcopy of the model with embedded fields
"""
if isinstance(model, get_args(INFERENCE_OBJECT_TYPES)):
if accumulating:
self._accumulate(model) # type: ignore
else:
assert (
inference_batch_size is not None
), "inference_batch_size should be passed for inference"
return self._drain_accumulator(
model, # type: ignore
is_query=is_query,
inference_batch_size=inference_batch_size,
)
if paths is None:
model = deepcopy(model) if not accumulating else model
if isinstance(model, dict):
for key, value in model.items():
if accumulating:
self._process_model(value, paths, accumulating=True)
else:
model[key] = self._process_model(
value,
paths,
is_query=is_query,
accumulating=False,
inference_batch_size=inference_batch_size,
)
return model
paths = paths if paths is not None else self._embed_inspector.inspect(model)
for path in paths:
list_model = [model] if not isinstance(model, list) else model
for item in list_model:
current_model = getattr(item, path.current, None)
if current_model is None:
continue
if path.tail:
self._process_model(
current_model,
path.tail,
is_query=is_query,
accumulating=accumulating,
inference_batch_size=inference_batch_size,
)
else:
was_list = isinstance(current_model, list)
current_model = current_model if was_list else [current_model]
if not accumulating:
assert (
inference_batch_size is not None
), "inference_batch_size should be passed for inference"
embeddings = [
self._drain_accumulator(
data, is_query=is_query, inference_batch_size=inference_batch_size
)
for data in current_model
]
if was_list:
setattr(item, path.current, embeddings)
else:
setattr(item, path.current, embeddings[0])
else:
for data in current_model:
self._accumulate(data)
return model
def _accumulate(self, data: models.VectorStruct) -> None:
"""Add data to batch accumulator
Args:
data: models.VectorStruct - any vector struct data; instances of inference object types found in `data` are added
to the accumulator, everything else is ignored. `InferenceObject` instances are converted to proper types.
Returns:
None
"""
if isinstance(data, dict):
for value in data.values():
self._accumulate(value)
return None
if isinstance(data, list):
for value in data:
if not isinstance(value, get_args(INFERENCE_OBJECT_TYPES)): # if value is a vector
return None
self._accumulate(value)
if not isinstance(data, get_args(INFERENCE_OBJECT_TYPES)):
return None
data = self._resolve_inference_object(data)
if data.model not in self._batch_accumulator:
self._batch_accumulator[data.model] = []
self._batch_accumulator[data.model].append(data)
return None
def _drain_accumulator(
self, data: models.VectorStruct, is_query: bool, inference_batch_size: int = 8
) -> NumericVectorStruct:
"""Drain accumulator and replaces inference objects with computed embeddings
It is assumed objects are traversed in the same order as they were added to the accumulator
Args:
data: models.VectorStruct - any vector struct data; instances of inference object types found in `data` are
replaced with computed embeddings. If the embeddings have not been computed yet, they are computed first
and the inference objects are then substituted.
is_query: bool - flag to determine which embed method to use. Defaults to False.
inference_batch_size: int - batch size for inference
Returns:
NumericVectorStruct: data with replaced inference objects
"""
if isinstance(data, dict):
for key, value in data.items():
data[key] = self._drain_accumulator(
value, is_query=is_query, inference_batch_size=inference_batch_size
)
return data
if isinstance(data, list):
for i, value in enumerate(data):
if not isinstance(value, get_args(INFERENCE_OBJECT_TYPES)): # if value is vector
return data
data[i] = self._drain_accumulator(
value, is_query=is_query, inference_batch_size=inference_batch_size
)
return data
if not isinstance(
data, get_args(INFERENCE_OBJECT_TYPES)
): # ide type checker does not narrow on `not` and complains
return data # type: ignore
if not self._embed_storage or not self._embed_storage.get(data.model, None):
self._embed_accumulator(is_query=is_query, inference_batch_size=inference_batch_size)
return self._next_embed(data.model)
def _embed_accumulator(self, is_query: bool = False, inference_batch_size: int = 8) -> None:
"""Embed all accumulated objects for all models
Args:
is_query: bool - flag to determine which embed method to use. Defaults to False.
inference_batch_size: int - batch size for inference
Returns:
None
"""
def embed(
objects: list[INFERENCE_OBJECT_TYPES], model_name: str, batch_size: int
) -> list[NumericVector]:
"""
Assembles batches grouped by options and data type, embeds them and returns the embeddings in the original order
"""
unique_options: list[dict[str, Any]] = []
unique_options_is_text: list[bool] = []  # multimodal models can have both text
# and image data; we need to track which data we process to construct separate batches for texts and images
batches: list[Any] = []
group_indices: dict[int, list[int]] = defaultdict(list)
for i, obj in enumerate(objects):
is_text = isinstance(obj, models.Document)
for j, (options, options_is_text) in enumerate(
zip(unique_options, unique_options_is_text)
):
if options == obj.options and is_text == options_is_text:
group_indices[j].append(i)
batches[j].append(obj.text if is_text else obj.image)
break
else:
# Create a new group if no match was found
group_indices[len(unique_options)] = [i]
unique_options.append(obj.options)
unique_options_is_text.append(is_text)
batches.append([obj.text if is_text else obj.image])
embeddings = []
for i, (options, is_text) in enumerate(zip(unique_options, unique_options_is_text)):
embeddings.extend(
[
embedding
for embedding in self.embedder.embed(
model_name=model_name,
texts=batches[i] if is_text else None,
images=batches[i] if not is_text else None,
is_query=is_query,
options=options or {},
batch_size=batch_size,
)
]
)
iter_embeddings = iter(embeddings)
ordered_embeddings: list[NumericVector] = [[]] * len(objects)
for indices in group_indices.values():
for index in indices:
ordered_embeddings[index] = next(iter_embeddings)
return ordered_embeddings
for model in self._batch_accumulator:
if not any(
(
self.embedder.is_supported_text_model(model),
self.embedder.is_supported_sparse_model(model),
self.embedder.is_supported_late_interaction_text_model(model),
self.embedder.is_supported_image_model(model),
self.embedder.is_supported_late_interaction_multimodal_model(model),
)
):
if isinstance(self.embedder, BuiltinEmbedder):
raise ValueError(
f"{model} is not among supported models. "
f"Have you forgotten to set `cloud_inference` or install `fastembed` for local inference?"
)
else:
raise ValueError(f"{model} is not among supported models")
for model, data in self._batch_accumulator.items():
self._embed_storage[model] = embed(
objects=data, model_name=model, batch_size=inference_batch_size
)
self._batch_accumulator.clear()
def _next_embed(self, model_name: str) -> NumericVector:
"""Get next computed embedding from embedded batch
Args:
model_name: str - retrieve embedding from the storage by this model name
Returns:
NumericVector: computed embedding
"""
return self._embed_storage[model_name].pop(0)
def _resolve_inference_object(self, data: models.VectorStruct) -> models.VectorStruct:
"""Resolve inference object into a model
Args:
data: models.VectorStruct - data to resolve; if it is an inference object, it is converted to a proper type,
otherwise it is kept unchanged
Returns:
models.VectorStruct: resolved data
"""
if not isinstance(data, models.InferenceObject):
return data
model_name = data.model
value = data.object
options = data.options
if any(
(
self.embedder.is_supported_text_model(model_name),
self.embedder.is_supported_sparse_model(model_name),
self.embedder.is_supported_late_interaction_text_model(model_name),
)
):
return models.Document(model=model_name, text=value, options=options)
if self.embedder.is_supported_image_model(model_name):
return models.Image(model=model_name, image=value, options=options)
if self.embedder.is_supported_late_interaction_multimodal_model(model_name):
raise ValueError(f"{model_name} does not support `InferenceObject` interface")
raise ValueError(f"{model_name} is not among supported models")
@classmethod
def _get_worker_class(cls) -> Type[ModelEmbedderWorker]:
return ModelEmbedderWorker
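
End to end, ModelEmbedder collects raw Document/Image payloads found inside arbitrary models, runs inference once per model-name/options group, and yields deep copies with the computed vectors substituted in place. A hedged sketch, assuming fastembed is installed (with the BuiltinEmbedder fallback, the Document objects would instead pass through for server-side inference):

model_embedder = ModelEmbedder()
points = [
    models.PointStruct(id=i, vector=models.Document(text=text, model="Qdrant/Bm25"))
    for i, text in enumerate(["foo", "bar"])
]
embedded = list(model_embedder.embed_models(points, batch_size=8))
# Each returned PointStruct is a deepcopy whose `vector` field now holds a computed
# embedding (a models.SparseVector for Qdrant/Bm25) instead of a Document.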
@@ -0,0 +1,25 @@
from typing import Union
from pydantic import StrictFloat, StrictStr
from qdrant_client.http.models import ExtendedPointId, SparseVector
NumericVector = Union[
list[StrictFloat],
SparseVector,
list[list[StrictFloat]],
]
NumericVectorInput = Union[
list[StrictFloat],
SparseVector,
list[list[StrictFloat]],
ExtendedPointId,
]
NumericVectorStruct = Union[
list[StrictFloat],
list[list[StrictFloat]],
dict[StrictStr, NumericVector],
]
__all__ = ["NumericVector", "NumericVectorInput", "NumericVectorStruct"]
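
These aliases mirror the request-side vector types, but restricted to already-computed numeric data. For instance (values illustrative):

dense: NumericVector = [0.1, 0.2, 0.3]                           # a single dense vector
multi: NumericVector = [[0.1, 0.2], [0.3, 0.4]]                  # a late-interaction multi-vector
sparse: NumericVector = SparseVector(indices=[1, 5], values=[0.7, 0.3])
named: NumericVectorStruct = {"text-dense": dense}               # a named-vector mapping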
@@ -0,0 +1,305 @@
from copy import copy, deepcopy
from pathlib import Path
from typing import Type, Union, Any, Optional
from pydantic import BaseModel
from qdrant_client._pydantic_compat import model_json_schema
from qdrant_client.embed.utils import FieldPath, convert_paths
try:
from qdrant_client.embed._inspection_cache import (
DEFS,
CACHE_STR_PATH,
RECURSIVE_REFS,
EXCLUDED_RECURSIVE_REFS,
INCLUDED_RECURSIVE_REFS,
NAME_RECURSIVE_REF_MAPPING,
)
except ImportError:
DEFS = {}
CACHE_STR_PATH = {}
RECURSIVE_REFS = set() # type: ignore
EXCLUDED_RECURSIVE_REFS = {"Filter"} # type: ignore
INCLUDED_RECURSIVE_REFS = set() # type: ignore
NAME_RECURSIVE_REF_MAPPING = {}
class ModelSchemaParser:
"""Model schema parser. Parses json schemas to retrieve paths to objects requiring inference.
The parser is stateful; it accumulates the results of parsing in its internal structures.
Attributes:
_defs: definitions extracted from json schemas
_recursive_refs: set of recursive refs found in the processed schemas, e.g.:
{"Filter", "Prefetch"}
_excluded_recursive_refs: predefined time-consuming recursive refs which don't have inference objects, e.g.:
{"Filter"}
_included_recursive_refs: set of recursive refs which have inference objects, e.g.:
{"Prefetch"}
_cache: cache of string paths for models containing objects for inference, e.g.:
{"Prefetch": ['prefetch.query', 'prefetch.query.context.negative', ...]}
path_cache: cache of FieldPath objects for models containing objects for inference, e.g.:
{
"Prefetch": [
FieldPath(
current="prefetch",
tail=[
FieldPath(
current="query",
tail=[
FieldPath(
current="recommend",
tail=[
FieldPath(current="negative", tail=None),
FieldPath(current="positive", tail=None),
],
),
...,
],
),
],
)
]
}
name_recursive_ref_mapping: mapping of model field names to ref names, e.g.:
{"prefetch": "Prefetch"}
"""
CACHE_PATH = "_inspection_cache.py"
INFERENCE_OBJECT_NAMES = {"Document", "Image", "InferenceObject"}
def __init__(self) -> None:
# self._defs does not include the whole schema, but only the part with the structures used in $defs
self._defs: dict[str, Union[dict[str, Any], list[dict[str, Any]]]] = deepcopy(DEFS) # type: ignore[arg-type]
self._cache: dict[str, list[str]] = deepcopy(CACHE_STR_PATH)
self._recursive_refs: set[str] = set(RECURSIVE_REFS)
self._excluded_recursive_refs: set[str] = set(EXCLUDED_RECURSIVE_REFS)
self._included_recursive_refs: set[str] = set(INCLUDED_RECURSIVE_REFS)
self.name_recursive_ref_mapping: dict[str, str] = {
k: v for k, v in NAME_RECURSIVE_REF_MAPPING.items()
}
self.path_cache: dict[str, list[FieldPath]] = {
model: convert_paths(paths) for model, paths in self._cache.items()
}
self._processed_recursive_defs: dict[str, Any] = {}
def _replace_refs(
self,
schema: Union[dict[str, Any], list[dict[str, Any]]],
parent: Optional[str] = None,
seen_refs: Optional[set] = None,
) -> Union[dict[str, Any], list[dict[str, Any]]]:
"""Replace refs in schema with their definitions
Args:
schema: schema to parse
parent: previous level key
seen_refs: set of seen refs to spot recursive paths
Returns:
schema with replaced refs
"""
parent = parent if parent else None
seen_refs = seen_refs if seen_refs else set()
if isinstance(schema, dict):
if "$ref" in schema:
ref_path = schema["$ref"]
def_key = ref_path.split("/")[-1]
if def_key in self._processed_recursive_defs:
return self._processed_recursive_defs[def_key]
if def_key == parent or def_key in seen_refs:
self._recursive_refs.add(def_key)
self._processed_recursive_defs[def_key] = schema
return schema
seen_refs.add(def_key)
return self._replace_refs(
self._defs[def_key], parent=def_key, seen_refs=copy(seen_refs)
)
schemes = {}
if "properties" in schema:
for k, v in schema.items():
if k == "properties":
schemes[k] = self._replace_refs(
schema=v, parent=parent, seen_refs=copy(seen_refs)
)
else:
schemes[k] = v
else:
for k, v in schema.items():
parent_key = k if isinstance(v, dict) and "properties" in v else parent
schemes[k] = self._replace_refs(
schema=v, parent=parent_key, seen_refs=copy(seen_refs)
)
return schemes
elif isinstance(schema, list):
return [
self._replace_refs(schema=item, parent=parent, seen_refs=copy(seen_refs)) # type: ignore
for item in schema
]
else:
return schema
def _find_document_paths(
self,
schema: Union[dict[str, Any], list[dict[str, Any]]],
current_path: str = "",
after_properties: bool = False,
seen_refs: Optional[set] = None,
) -> list[str]:
"""Read a schema and find paths to objects requiring inference
Populates model fields names to ref names mapping
Args:
schema: schema to parse
current_path: current path in the schema
after_properties: flag indicating if the current path is after "properties" key
seen_refs: set of seen refs to spot recursive paths
Returns:
List of string dot separated paths to objects requiring inference
"""
document_paths: list[str] = []
seen_recursive_refs = seen_refs if seen_refs is not None else set()
parts = current_path.split(".")
if len(parts) != len(set(parts)): # check for recursive paths
return document_paths
if not isinstance(schema, dict):
return document_paths
if "title" in schema and schema["title"] in self.INFERENCE_OBJECT_NAMES:
document_paths.append(current_path)
return document_paths
for key, value in schema.items():
if key == "$defs":
continue
if key == "$ref":
model_name = value.split("/")[-1]
value = self._defs[model_name]
if model_name in seen_recursive_refs:
continue
if (
model_name in self._excluded_recursive_refs
): # on the first run it might be empty
continue
if (
model_name in self._recursive_refs
): # included and excluded refs might not be filled up yet, we're looking in all recursive refs
# we would need to clean up name recursive ref mapping later and delete excluded refs from there
seen_recursive_refs.add(model_name)
self.name_recursive_ref_mapping[current_path.split(".")[-1]] = model_name
if after_properties: # field name seen in pydantic models comes after "properties" key
if current_path:
new_path = f"{current_path}.{key}"
else:
new_path = key
else:
new_path = current_path
if isinstance(value, dict):
document_paths.extend(
self._find_document_paths(
value, new_path, key == "properties", seen_refs=seen_recursive_refs
)
)
elif isinstance(value, list):
for item in value:
if isinstance(item, dict):
document_paths.extend(
self._find_document_paths(
item,
new_path,
key == "properties",
seen_refs=seen_recursive_refs,
)
)
return sorted(set(document_paths))
def parse_model(self, model: Type[BaseModel]) -> None:
"""Parse model schema to retrieve paths to objects requiring inference.
Checks model json schema, extracts definitions and finds paths to objects requiring inference.
No parsing happens if model has already been processed.
Args:
model: model to parse
Returns:
None
"""
model_name = model.__name__
if model_name in self._cache:
return None
schema = model_json_schema(model)
for k, v in schema.get("$defs", {}).items():
if k not in self._defs:
self._defs[k] = v
if "$defs" in schema:
raw_refs = (
{"$ref": schema["$ref"]}
if "$ref" in schema
else {"properties": schema["properties"]}
)
refs = self._replace_refs(raw_refs)
self._cache[model_name] = self._find_document_paths(refs)
else:
self._cache[model_name] = []
for ref in self._recursive_refs:
if ref in self._excluded_recursive_refs or ref in self._included_recursive_refs:
continue
if self._find_document_paths(self._defs[ref]):
self._included_recursive_refs.add(ref)
else:
self._excluded_recursive_refs.add(ref)
self.name_recursive_ref_mapping = {
k: v
for k, v in self.name_recursive_ref_mapping.items()
if v not in self._excluded_recursive_refs
}
# convert str paths to FieldPath objects which group path parts and reduce the time of the traversal
self.path_cache = {model: convert_paths(paths) for model, paths in self._cache.items()}
def _persist(self, output_path: Union[Path, str] = CACHE_PATH) -> None:
"""Persist the parser state to a file
Args:
output_path: path to the file to save the parser state
Returns:
None
"""
with open(output_path, "w") as f:
f.write(f"CACHE_STR_PATH = {self._cache}\n")
f.write(f"DEFS = {self._defs}\n")
# `sorted` is required to use `diff` in comparisons
f.write(f"RECURSIVE_REFS = {sorted(self._recursive_refs)}\n")
f.write(f"INCLUDED_RECURSIVE_REFS = {sorted(self._included_recursive_refs)}\n")
f.write(f"EXCLUDED_RECURSIVE_REFS = {sorted(self._excluded_recursive_refs)}\n")
f.write(f"NAME_RECURSIVE_REF_MAPPING = {self.name_recursive_ref_mapping}\n")
@@ -0,0 +1,149 @@
from typing import Union, Optional, Iterable, get_args
from pydantic import BaseModel
from qdrant_client._pydantic_compat import model_fields_set
from qdrant_client.embed.common import INFERENCE_OBJECT_TYPES
from qdrant_client.embed.schema_parser import ModelSchemaParser
from qdrant_client.embed.utils import FieldPath
class Inspector:
"""Inspector which tries to find at least one occurrence of an object requiring inference
Inspector is stateful and accumulates parsed model schemas in its parser.
Attributes:
parser: ModelSchemaParser instance to inspect model json schemas
"""
def __init__(self, parser: Optional[ModelSchemaParser] = None) -> None:
self.parser = ModelSchemaParser() if parser is None else parser
def inspect(self, points: Union[Iterable[BaseModel], BaseModel]) -> bool:
"""Looks for at least one occurrence of an object requiring inference in the received models
Args:
points: models to inspect
Returns:
True if at least one object requiring inference is found, False otherwise
"""
if isinstance(points, BaseModel):
self.parser.parse_model(points.__class__)
return self._inspect_model(points)
elif isinstance(points, dict):
for value in points.values():
if self.inspect(value):
return True
elif isinstance(points, Iterable):
for point in points:
if isinstance(point, BaseModel):
self.parser.parse_model(point.__class__)
if self._inspect_model(point):
return True
else:
return False
return False
def _inspect_model(self, model: BaseModel, paths: Optional[list[FieldPath]] = None) -> bool:
if isinstance(model, get_args(INFERENCE_OBJECT_TYPES)):
return True
paths = (
self.parser.path_cache.get(model.__class__.__name__, []) if paths is None else paths
)
for path in paths:
type_found = self._inspect_inner_models(
model, path.current, path.tail if path.tail else []
)
if type_found:
return True
return False
def _inspect_inner_models(
self, original_model: BaseModel, current_path: str, tail: list[FieldPath]
) -> bool:
def inspect_recursive(member: BaseModel) -> bool:
recursive_paths = []
for field_name in model_fields_set(member):
if field_name in self.parser.name_recursive_ref_mapping:
mapped_model_name = self.parser.name_recursive_ref_mapping[field_name]
recursive_paths.extend(self.parser.path_cache[mapped_model_name])
if recursive_paths:
found = self._inspect_model(member, recursive_paths)
if found:
return True
return False
model = getattr(original_model, current_path, None)
if model is None:
return False
if isinstance(model, get_args(INFERENCE_OBJECT_TYPES)):
return True
if isinstance(model, BaseModel):
type_found = inspect_recursive(model)
if type_found:
return True
for next_path in tail:
type_found = self._inspect_inner_models(
model, next_path.current, next_path.tail if next_path.tail else []
)
if type_found:
return True
return False
elif isinstance(model, list):
for current_model in model:
if isinstance(current_model, get_args(INFERENCE_OBJECT_TYPES)):
return True
if not isinstance(current_model, BaseModel):
continue
type_found = inspect_recursive(current_model)
if type_found:
return True
for next_path in tail:
for current_model in model:
type_found = self._inspect_inner_models(
current_model, next_path.current, next_path.tail if next_path.tail else []
)
if type_found:
return True
return False
elif isinstance(model, dict):
for key, values in model.items():
values = [values] if not isinstance(values, list) else values
for current_model in values:
if isinstance(current_model, get_args(INFERENCE_OBJECT_TYPES)):
return True
if not isinstance(current_model, BaseModel):
continue
found_type = inspect_recursive(current_model)
if found_type:
return True
for next_path in tail:
for current_model in values:
found_type = self._inspect_inner_models(
current_model,
next_path.current,
next_path.tail if next_path.tail else [],
)
if found_type:
return True
return False
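
Unlike InspectorEmbed, which collects every matching path, this inspector short-circuits on the first hit, which makes it cheap to use as a guard before deciding whether inference is needed at all. A sketch:

from qdrant_client.http import models

inspector = Inspector()
plain = models.PointStruct(id=1, vector=[0.1, 0.2, 0.3])
raw = models.PointStruct(id=2, vector=models.Document(text="hi", model="Qdrant/Bm25"))
assert not inspector.inspect([plain])  # nothing to infer
assert inspector.inspect([raw])        # a Document was found -> inference is needed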
@@ -0,0 +1,79 @@
import base64
from pathlib import Path
from typing import Optional, Union
from pydantic import BaseModel, Field
class FieldPath(BaseModel):
current: str
tail: Optional[list["FieldPath"]] = Field(default=None)
def as_str_list(self) -> list[str]:
"""
>>> FieldPath(current='a', tail=[FieldPath(current='b', tail=[FieldPath(current='c'), FieldPath(current='d')])]).as_str_list()
['a.b.c', 'a.b.d']
"""
# Recursive function to collect all paths
def collect_paths(path: FieldPath, prefix: str = "") -> list[str]:
current_path = prefix + path.current
if not path.tail:
return [current_path]
else:
paths = []
for sub_path in path.tail:
paths.extend(collect_paths(sub_path, current_path + "."))
return paths
# Collect all paths starting from this object
return collect_paths(self)
def convert_paths(paths: list[str]) -> list[FieldPath]:
"""Convert string paths into FieldPath objects
Paths which share the same root are grouped together.
Args:
paths: List[str]: List of str paths containing "." as separator
Returns:
List[FieldPath]: List of FieldPath objects
"""
sorted_paths = sorted(paths)
prev_root = None
converted_paths = []
for path in sorted_paths:
parts = path.split(".")
root = parts[0]
if root != prev_root:
converted_paths.append(FieldPath(current=root))
prev_root = root
current = converted_paths[-1]
for part in parts[1:]:
if current.tail is None:
current.tail = []
found = False
for tail in current.tail:
if tail.current == part:
current = tail
found = True
break
if not found:
new_tail = FieldPath(current=part)
assert current.tail is not None
current.tail.append(new_tail)
current = new_tail
return converted_paths
def read_base64(file_path: Union[str, Path]) -> str:
"""Convert a file path to a base64 encoded string."""
path = Path(file_path)
if not path.exists():
raise FileNotFoundError(f"The file {path} does not exist.")
with open(path, "rb") as file:
file_content = file.read()
return base64.b64encode(file_content).decode("utf-8")
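
convert_paths groups dot-separated strings by shared prefixes, and FieldPath.as_str_list is its inverse. A small round trip, directly following the implementations above:

paths = convert_paths(["prefetch.query", "prefetch.prefetch", "query"])
# -> [FieldPath(current="prefetch", tail=[FieldPath(current="prefetch"), FieldPath(current="query")]),
#     FieldPath(current="query")]
assert sorted(p for fp in paths for p in fp.as_str_list()) == [
    "prefetch.prefetch",
    "prefetch.query",
    "query",
]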