Epsilla

EpsillaVectorStore #

基类: BasePydanticVectorStore

Epsilla 向量存储。

在此向量存储中，我们将文本、其嵌入向量及其一些元数据存储在 Epsilla collection 中。此实现允许使用已有的 collection。如果 collection 不存在或设置了 overwrite 为 True，它也支持创建新的 collection。

作为先决条件，您需要安装 pyepsilla 包并运行 Epsilla 向量数据库（例如，通过我们的 docker 镜像）。有关如何运行 Epsilla 向量数据库的文档，请参阅：https://epsilla-inc.gitbook.io/epsilladb/quick-start

参数

名称	类型	描述	默认值
`client`	`Any`	要连接的 Epsilla 客户端。	必需
`collection_name`	`Optional[str]`	要使用的 collection。默认为 "llama_collection"。	`'llama_collection'`
`db_path`	`Optional[str]`	数据库将持久化的路径。默认为 DEFAULT_PERSIST_DIR。	`DEFAULT_PERSIST_DIR`
`db_name`	`Optional[str]`	为加载的数据库指定名称。默认为 "llama_db"。	`'llama_db'`
`dimension`	`Optional[int]`	嵌入向量的维度。如果未提供，将在第一次插入时创建 collection。默认为 None。	`None`
`overwrite`	`Optional[bool]`	是否覆盖同名现有 collection。默认为 False。	`False`

返回值

名称	类型	描述
`EpsillaVectorStore`		支持添加、删除和查询的向量存储。

示例

pip install llama-index-vector-stores-epsilla

from llama_index.vector_stores.epsilla import EpsillaVectorStore
from pyepsilla import vectordb

client = vectordb.Client()
vector_store = EpsillaVectorStore(client=client, db_path="/tmp/llamastore")

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-epsilla/llama_index/vector_stores/epsilla/base.py

class EpsillaVectorStore(BasePydanticVectorStore):
    """
    The Epsilla Vector Store.

    In this vector store we store the text, its embedding and
    a few pieces of its metadata in an Epsilla collection. This implementation
    allows the use of an already existing collection.
    It also supports creating a new one if the collection does not
    exist or if `overwrite` is set to True.

    As a prerequisite, you need to install ``pyepsilla`` package
    and have a running Epsilla vector database (for example, through our docker image)
    See the following documentation for how to run an Epsilla vector database:
    https://epsilla-inc.gitbook.io/epsilladb/quick-start

    Args:
        client (Any): Epsilla client to connect to. Must be an instance of
                    ``pyepsilla.vectordb.Client``.
        collection_name (Optional[str]): Which collection to use.
                    Defaults to "llama_collection".
        db_path (Optional[str]): The path where the database will be persisted.
                    Defaults to ``DEFAULT_PERSIST_DIR``.
        db_name (Optional[str]): Give a name to the loaded database.
                    Defaults to "llama_db".
        dimension (Optional[int]): The dimension of the embeddings. If not provided,
                    collection creation will be done on first insert. Defaults to None.
        overwrite (Optional[bool]): Whether to overwrite existing collection with same
                    name. Any falsy value (including None) keeps the existing
                    collection. Defaults to False.

    Returns:
        EpsillaVectorStore: Vectorstore that supports add, delete, and query.

    Raises:
        TypeError: If ``client`` is not a ``pyepsilla.vectordb.Client``.
        Exception: If listing, dropping or creating the collection fails.

    Examples:
        `pip install llama-index-vector-stores-epsilla`

        ```python
        from llama_index.vector_stores.epsilla import EpsillaVectorStore
        from pyepsilla import vectordb

        client = vectordb.Client()
        vector_store = EpsillaVectorStore(client=client, db_path="/tmp/llamastore")
        ```

    """

    stores_text: bool = True
    flat_metadata: bool = False

    _client: vectordb.Client = PrivateAttr()
    _collection_name: str = PrivateAttr()
    # True once the backing table is known to exist (found or created).
    _collection_created: bool = PrivateAttr()

    def __init__(
        self,
        client: Any,
        collection_name: str = "llama_collection",
        db_path: Optional[str] = DEFAULT_PERSIST_DIR,  # sub folder
        db_name: Optional[str] = "llama_db",
        dimension: Optional[int] = None,
        overwrite: bool = False,
        **kwargs: Any,
    ) -> None:
        """Init params."""
        super().__init__()

        if not isinstance(client, vectordb.Client):
            raise TypeError(
                f"client should be an instance of pyepsilla.vectordb.Client, "
                f"got {type(client)}"
            )

        self._client = client
        self._collection_name = collection_name
        # NOTE: the return values of load_db/use_db are not inspected here.
        self._client.load_db(db_name, db_path)
        self._client.use_db(db_name)
        self._collection_created = False

        status_code, response = self._client.list_tables()
        if status_code != 200:
            self._handle_error(msg=response["message"])
        collection_exists = self._collection_name in response["result"]

        # Reuse the existing collection unless the caller asked to overwrite it.
        # Truthiness (rather than `is False` / `is True`) is used so that
        # overwrite=None is treated as "do not overwrite" instead of leaving
        # an existing collection flagged as not created.
        if collection_exists and not overwrite:
            self._collection_created = True
            return

        if collection_exists:
            status_code, response = self._client.drop_table(
                table_name=self._collection_name
            )
            if status_code != 200:
                self._handle_error(msg=response["message"])
            logger.debug(
                f"Successfully removed old collection: {self._collection_name}"
            )

        # Without a dimension the collection is created lazily on first add().
        if dimension is not None:
            self._create_collection(dimension)

    @classmethod
    def class_name(cls) -> str:
        """Return the name identifying this vector store class."""
        return "EpsillaVectorStore"

    @property
    def client(self) -> Any:
        """Return the Epsilla client."""
        return self._client

    def _handle_error(self, msg: str) -> None:
        """
        Log a failed Epsilla request and raise.

        Args:
            msg (str): The error message reported by the Epsilla client.

        Raises:
            Exception: Always, carrying ``msg``.

        """
        # This helper is shared by list/drop/create/insert/query, so the log
        # line must not claim a specific operation.
        logger.error(f"Epsilla request failed: {msg}")
        raise Exception(f"Error: {msg}.")

    def _create_collection(self, dimension: int) -> None:
        """
        Create collection.

        Args:
            dimension (int): The dimension of the embeddings.

        Raises:
            Exception: If the client reports a non-200 status.

        """
        fields: List[dict] = [
            {"name": "id", "dataType": "STRING", "primaryKey": True},
            {"name": DEFAULT_DOC_ID_KEY, "dataType": "STRING"},
            {"name": DEFAULT_TEXT_KEY, "dataType": "STRING"},
            {
                "name": DEFAULT_EMBEDDING_KEY,
                "dataType": "VECTOR_FLOAT",
                "dimensions": dimension,
            },
            {"name": "metadata", "dataType": "JSON"},
        ]
        status_code, response = self._client.create_table(
            table_name=self._collection_name, table_fields=fields
        )
        if status_code != 200:
            self._handle_error(msg=response["message"])
        self._collection_created = True
        logger.debug(f"Successfully created collection: {self._collection_name}")

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """
        Add nodes to Epsilla vector store.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        Returns:
            List[str]: List of ids inserted. Empty when ``nodes`` is empty.

        Raises:
            Exception: If collection creation or the insert fails.

        """
        # Nothing to insert: do not create a collection or hit the server.
        if len(nodes) == 0:
            return []

        # If the collection doesn't exist yet, create it with the dimension
        # of the first node's embedding.
        if not self._collection_created:
            dimension = len(nodes[0].get_embedding())
            self._create_collection(dimension)

        ids = []
        records = []
        for node in nodes:
            ids.append(node.node_id)
            text = node.get_content(metadata_mode=MetadataMode.NONE)
            metadata_dict = node_to_metadata_dict(node, remove_text=True)
            # Only the serialized node content is stored in the metadata field.
            metadata = metadata_dict["_node_content"]
            record = {
                "id": node.node_id,
                DEFAULT_DOC_ID_KEY: node.ref_doc_id,
                DEFAULT_TEXT_KEY: text,
                DEFAULT_EMBEDDING_KEY: node.get_embedding(),
                "metadata": metadata,
            }
            records.append(record)

        status_code, response = self._client.insert(
            table_name=self._collection_name, records=records
        )
        if status_code != 200:
            self._handle_error(msg=response["message"])

        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        Raises:
            NotImplementedError: Always; deletion is not implemented yet.

        """
        raise NotImplementedError("Delete with filtering will be coming soon.")

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """
        Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): query.

        Returns:
            Vector store query result. The ``similarities`` list holds the
            ``@distance`` value returned by Epsilla for each hit.

        Raises:
            ValueError: If no collection has been found or created yet.
            NotImplementedError: If a non-default query mode, metadata
                filters or doc id filters are requested.
            Exception: If the client reports a non-200 status.

        """
        if not self._collection_created:
            raise ValueError("Please initialize a collection first.")

        if query.mode != VectorStoreQueryMode.DEFAULT:
            raise NotImplementedError(f"Epsilla does not support {query.mode} yet.")

        if query.filters is not None:
            raise NotImplementedError("Epsilla does not support Metadata filters yet.")

        if query.doc_ids is not None and len(query.doc_ids) > 0:
            raise NotImplementedError("Epsilla does not support filters yet.")

        status_code, response = self._client.query(
            table_name=self._collection_name,
            query_field=DEFAULT_EMBEDDING_KEY,
            query_vector=query.query_embedding,
            limit=query.similarity_top_k,
            with_distance=True,
        )
        if status_code != 200:
            self._handle_error(msg=response["message"])

        results = response["result"]
        logger.debug(
            f"Successfully searched embedding in collection: {self._collection_name}"
            f" Num Results: {len(results)}"
        )

        nodes = []
        similarities = []
        ids = []
        for res in results:
            try:
                node = metadata_dict_to_node({"_node_content": res["metadata"]})
                node.text = res[DEFAULT_TEXT_KEY]
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                metadata, node_info, relationships = legacy_metadata_dict_to_node(
                    res["metadata"]
                )
                node = TextNode(
                    id=res["id"],
                    text=res[DEFAULT_TEXT_KEY],
                    metadata=metadata,
                    start_char_idx=node_info.get("start", None),
                    end_char_idx=node_info.get("end", None),
                    relationships=relationships,
                )
            nodes.append(node)
            similarities.append(res["@distance"])
            ids.append(res["id"])

        return VectorStoreQueryResult(nodes=nodes, similarities=similarities, ids=ids)

client `property` #

client: Any

返回 Epsilla 客户端。

add #

add(nodes: List[BaseNode], **add_kwargs: Any) -> List[str]

将节点添加到 Epsilla 向量存储。

参数

名称	类型	描述	默认值
`nodes`	`List[BaseNode]`	List[BaseNode]: 带有嵌入向量的节点列表	必需

返回值

类型	描述
`List[str]`	List[str]: 插入的 ID 列表。

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-epsilla/llama_index/vector_stores/epsilla/base.py

def add(
    self,
    nodes: List[BaseNode],
    **add_kwargs: Any,
) -> List[str]:
    """
    Insert the given nodes into the Epsilla collection.

    When no collection has been created yet, one is created first using the
    embedding length of the first node as its dimension.

    Args:
        nodes (List[BaseNode]): Nodes carrying embeddings.

    Returns:
        List[str]: Ids of the inserted nodes, in input order. An empty list
        is returned when ``nodes`` is empty.

    """
    # Guard clause: with nothing to insert there is nothing to create either.
    if len(nodes) == 0:
        return []

    if not self._collection_created:
        self._create_collection(len(nodes[0].get_embedding()))

    inserted_ids = []
    rows = []
    for item in nodes:
        inserted_ids.append(item.node_id)
        content = item.get_content(metadata_mode=MetadataMode.NONE)
        # Only the serialized node content goes into the metadata field.
        node_content = node_to_metadata_dict(item, remove_text=True)["_node_content"]
        rows.append(
            {
                "id": item.node_id,
                DEFAULT_DOC_ID_KEY: item.ref_doc_id,
                DEFAULT_TEXT_KEY: content,
                DEFAULT_EMBEDDING_KEY: item.get_embedding(),
                "metadata": node_content,
            }
        )

    status_code, response = self._client.insert(
        table_name=self._collection_name, records=rows
    )
    if status_code != 200:
        self._handle_error(msg=response["message"])

    return inserted_ids

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

使用 ref_doc_id 删除节点。

参数

名称	类型	描述	默认值
`ref_doc_id`	`str`	要删除的文档的 doc_id。	必需

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-epsilla/llama_index/vector_stores/epsilla/base.py

def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
    """
    Remove the nodes that belong to a source document (not supported yet).

    Args:
        ref_doc_id (str): The doc_id of the document to delete.

    Raises:
        NotImplementedError: Always; deletion is not implemented.

    """
    unsupported_message = "Delete with filtering will be coming soon."
    raise NotImplementedError(unsupported_message)

query #

query(query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult

查询索引以获取前 k 个最相似的节点。

参数

名称	类型	描述	默认值
`query`	`VectorStoreQuery`	查询对象。	必需

返回值

类型	描述
`VectorStoreQueryResult`	向量存储查询结果。

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-epsilla/llama_index/vector_stores/epsilla/base.py

def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
    """
    Run a top-k similarity search against the Epsilla collection.

    Args:
        query (VectorStoreQuery): The query; its embedding and
            ``similarity_top_k`` are forwarded to the client.

    Returns:
        VectorStoreQueryResult: The matched nodes, their ids, and the
        ``@distance`` value of each hit as ``similarities``.

    Raises:
        ValueError: If no collection has been created yet.
        NotImplementedError: For non-default query modes, metadata filters
            or doc id filters.

    """
    # Reject everything this store cannot serve before calling the server.
    if not self._collection_created:
        raise ValueError("Please initialize a collection first.")
    if query.mode != VectorStoreQueryMode.DEFAULT:
        raise NotImplementedError(f"Epsilla does not support {query.mode} yet.")
    if query.filters is not None:
        raise NotImplementedError("Epsilla does not support Metadata filters yet.")
    if query.doc_ids is not None and len(query.doc_ids) > 0:
        raise NotImplementedError("Epsilla does not support filters yet.")

    status_code, response = self._client.query(
        table_name=self._collection_name,
        query_field=DEFAULT_EMBEDDING_KEY,
        query_vector=query.query_embedding,
        limit=query.similarity_top_k,
        with_distance=True,
    )
    if status_code != 200:
        self._handle_error(msg=response["message"])

    hits = response["result"]
    logger.debug(
        f"Successfully searched embedding in collection: {self._collection_name}"
        f" Num Results: {len(hits)}"
    )

    found_nodes = []
    distances = []
    found_ids = []
    for hit in hits:
        try:
            found = metadata_dict_to_node({"_node_content": hit["metadata"]})
            found.text = hit[DEFAULT_TEXT_KEY]
        except Exception:
            # NOTE: deprecated legacy logic for backward compatibility
            legacy_meta, legacy_info, legacy_rels = legacy_metadata_dict_to_node(
                hit["metadata"]
            )
            found = TextNode(
                id=hit["id"],
                text=hit[DEFAULT_TEXT_KEY],
                metadata=legacy_meta,
                start_char_idx=legacy_info.get("start", None),
                end_char_idx=legacy_info.get("end", None),
                relationships=legacy_rels,
            )
        found_nodes.append(found)
        distances.append(hit["@distance"])
        found_ids.append(hit["id"])

    return VectorStoreQueryResult(
        nodes=found_nodes, similarities=distances, ids=found_ids
    )

Epsilla

EpsillaVectorStore #

client property #

add #

delete #

query #

client `property` #