Txtai

TxtaiVectorStore #

基类: BasePydanticVectorStore

txtai 向量存储。

Embedding 存储在 txtai 索引中。

查询时，索引使用 txtai 检索最相近的 top k 个 Embedding，并返回对应的索引。

参数

名称	类型	描述	默认值
`txtai_index`	`ANN`	txtai 索引实例	必需

示例

`pip install llama-index-vector-stores-txtai`

```python
import txtai
from llama_index.vector_stores.txtai import TxtaiVectorStore

# 创建 txtai ann 索引
txtai_index = txtai.ann.ANNFactory.create({"backend": "numpy"})

vector_store = TxtaiVectorStore(txtai_index=txtai_index)
```

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-txtai/llama_index/vector_stores/txtai/base.py

class TxtaiVectorStore(BasePydanticVectorStore):
    """
    txtai Vector Store.

    Embeddings are stored within a txtai index.

    During query time, the index uses txtai to query for the top
    k embeddings, and returns the corresponding indices.

    Args:
        txtai_index (txtai.ann.ANN): txtai index instance

    Examples:
        `pip install llama-index-vector-stores-txtai`

        ```python
        import txtai
        from llama_index.vector_stores.txtai import TxtaiVectorStore

        # Create txtai ann index
        txtai_index = txtai.ann.ANNFactory.create({"backend": "numpy"})

        vector_store = TxtaiVectorStore(txtai_index=txtai_index)
        ```

    """

    # Only embeddings go into the txtai index; node text is not stored here.
    stores_text: bool = False

    # Wrapped txtai ANN index, kept as a private (non-field) attribute.
    _txtai_index = PrivateAttr()

    def __init__(
        self,
        txtai_index: Any,
    ) -> None:
        """
        Initialize params.

        Args:
            txtai_index (Any): txtai ANN index instance to wrap.

        Raises:
            ImportError: If the ``txtai`` package cannot be imported.

        """
        try:
            import txtai
        except ImportError as err:
            # Chain the original failure so the root cause stays visible.
            raise ImportError(IMPORT_ERROR_MSG) from err

        super().__init__()

        self._txtai_index = cast(txtai.ann.ANN, txtai_index)

    @classmethod
    def from_persist_dir(
        cls,
        persist_dir: str = DEFAULT_PERSIST_DIR,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> "TxtaiVectorStore":
        """
        Load a vector store from the default file name inside ``persist_dir``.

        Args:
            persist_dir (str): Directory that contains the persisted index.
            fs (Optional[fsspec.AbstractFileSystem]): Must be ``None`` or a
                ``LocalFileSystem``.

        Raises:
            NotImplementedError: If ``fs`` is a non-local filesystem.

        """
        persist_path = os.path.join(
            persist_dir,
            f"{DEFAULT_VECTOR_STORE}{NAMESPACE_SEP}{DEFAULT_PERSIST_FNAME}",
        )
        # only support local storage for now
        if fs and not isinstance(fs, LocalFileSystem):
            raise NotImplementedError("txtai only supports local storage for now.")
        return cls.from_persist_path(persist_path=persist_path, fs=None)

    @classmethod
    def from_persist_path(
        cls,
        persist_path: str,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> "TxtaiVectorStore":
        """
        Load a vector store from ``persist_path``.

        The index configuration is read from ``config.json`` next to
        ``persist_path`` when that file exists, otherwise from a pickled
        ``config`` file in the same directory.

        Args:
            persist_path (str): Path of the persisted txtai index.
            fs (Optional[fsspec.AbstractFileSystem]): Must be ``None`` or a
                ``LocalFileSystem``.

        Raises:
            ImportError: If the ``txtai`` package cannot be imported.
            NotImplementedError: If ``fs`` is a non-local filesystem.
            ValueError: If ``persist_path`` does not exist.

        """
        try:
            import txtai
        except ImportError as err:
            raise ImportError(IMPORT_ERROR_MSG) from err

        if fs and not isinstance(fs, LocalFileSystem):
            raise NotImplementedError("txtai only supports local storage for now.")

        if not os.path.exists(persist_path):
            raise ValueError(f"No existing {__name__} found at {persist_path}.")

        logger.info(f"Loading {__name__} config from {persist_path}.")
        parent_directory = Path(persist_path).parent
        config_path = parent_directory / "config.json"
        jsonconfig = config_path.exists()
        # Determine if config is json or pickle
        config_path = config_path if jsonconfig else parent_directory / "config"
        # Load configuration
        # SECURITY: the pickle branch executes arbitrary code embedded in the
        # file; only load persisted stores that come from a trusted source.
        with open(config_path, "r" if jsonconfig else "rb") as f:
            config = json.load(f) if jsonconfig else pickle.load(f)

        logger.info(f"Loading {__name__} from {persist_path}.")
        txtai_index = txtai.ann.ANNFactory.create(config)
        txtai_index.load(persist_path)
        return cls(txtai_index=txtai_index)

    def add(
        self,
        nodes: List[BaseNode],
        **add_kwargs: Any,
    ) -> List[str]:
        """
        Add nodes to index.

        Args:
            nodes (List[BaseNode]): list of nodes with embeddings
            **add_kwargs (Any): Accepted but not used here.

        Returns:
            List[str]: One stringified id per added node; an empty list when
            ``nodes`` is empty (the index is left untouched in that case).

        """
        # Nothing to add: avoid handing an empty, shapeless array to the index.
        if not nodes:
            return []

        text_embedding_np = np.array(
            [node.get_embedding() for node in nodes], dtype="float32"
        )

        # Check if the ann index is already created
        # If not create the index with node embeddings
        if self._txtai_index.backend is None:
            self._txtai_index.index(text_embedding_np)
        else:
            self._txtai_index.append(text_embedding_np)

        # NOTE(review): the ids below run from (count - len(nodes) + 1) to
        # count inclusive, i.e. they are 1-based. Confirm this matches the id
        # space txtai reports from search().
        indx_size = self._txtai_index.count()
        return [str(idx) for idx in range(indx_size - len(nodes) + 1, indx_size + 1)]

    @property
    def client(self) -> Any:
        """Return the txtai index."""
        return self._txtai_index

    def persist(
        self,
        persist_path: str = DEFAULT_PERSIST_PATH,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> None:
        """
        Save to file.

        This method saves the vector store to disk. The index configuration
        is written next to ``persist_path`` as ``config.json`` when the index
        config has ``format == "json"``, otherwise as a pickled ``config``.

        Args:
            persist_path (str): The save_path of the file.
            fs (Optional[fsspec.AbstractFileSystem]): Must be ``None`` or a
                ``LocalFileSystem``.

        Raises:
            NotImplementedError: If ``fs`` is a non-local filesystem.

        """
        if fs and not isinstance(fs, LocalFileSystem):
            raise NotImplementedError("txtai only supports local storage for now.")

        dirpath = Path(persist_path).parent
        # parents=True so a nested, not-yet-existing target directory works.
        dirpath.mkdir(parents=True, exist_ok=True)

        jsonconfig = self._txtai_index.config.get("format", "pickle") == "json"
        # Determine if config is json or pickle
        config_path = dirpath / "config.json" if jsonconfig else dirpath / "config"

        # Write configuration
        with open(
            config_path,
            "w" if jsonconfig else "wb",
            encoding="utf-8" if jsonconfig else None,
        ) as f:
            if jsonconfig:
                # Write config as JSON
                json.dump(self._txtai_index.config, f, default=str)
            else:
                from txtai.version import __pickle__

                # Write config as pickle format
                pickle.dump(self._txtai_index.config, f, protocol=__pickle__)

        self._txtai_index.save(persist_path)

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes using with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete. It is
                converted with ``int()`` before being passed to the index.
            **delete_kwargs (Any): Accepted but not used here.

        """
        self._txtai_index.delete([int(ref_doc_id)])

    def query(
        self,
        query: VectorStoreQuery,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """
        Query index for top k most similar nodes.

        Args:
            query (VectorStoreQuery): query to search for in the index
            **kwargs (Any): Accepted but not used here.

        Returns:
            VectorStoreQueryResult: ``similarities`` and stringified ``ids``
            taken from the search hits.

        Raises:
            ValueError: If ``query.filters`` is set.

        """
        if query.filters is not None:
            raise ValueError("Metadata filters not implemented for txtai yet.")

        query_embedding = cast(List[float], query.query_embedding)
        # Add a leading axis so the single query vector becomes a batch of one.
        query_embedding_np = np.array(query_embedding, dtype="float32")[np.newaxis, :]
        search_result = self._txtai_index.search(
            query_embedding_np, query.similarity_top_k
        )[0]
        # if empty, then return an empty response
        if len(search_result) == 0:
            return VectorStoreQueryResult(similarities=[], ids=[])

        # NOTE(review): each hit is unpacked as (dist, idx). Confirm against
        # the txtai ANN.search contract that hits are not (id, score) pairs.
        filtered_dists = []
        filtered_node_idxs = []
        for dist, idx in search_result:
            if idx < 0:
                continue
            filtered_dists.append(dist)
            filtered_node_idxs.append(str(idx))

        return VectorStoreQueryResult(
            similarities=filtered_dists, ids=filtered_node_idxs
        )

client `property` #

client: Any

返回 txtai 索引。

add #

add(nodes: List[BaseNode], **add_kwargs: Any) -> List[str]

向索引添加节点。

参数

名称	类型	描述	默认值
`nodes`	`List[BaseNode]`	带 Embedding 的节点列表	必需

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-txtai/llama_index/vector_stores/txtai/base.py

def add(
    self,
    nodes: List[BaseNode],
    **add_kwargs: Any,
) -> List[str]:
    """
    Add nodes to index.

    Args:
        nodes (List[BaseNode]): list of nodes with embeddings
        **add_kwargs (Any): Accepted but not used here.

    Returns:
        List[str]: One stringified id per added node; an empty list when
        ``nodes`` is empty (the index is left untouched in that case).

    """
    # Nothing to add: avoid handing an empty, shapeless array to the index.
    if not nodes:
        return []

    text_embedding_np = np.array(
        [node.get_embedding() for node in nodes], dtype="float32"
    )

    # Check if the ann index is already created
    # If not create the index with node embeddings
    if self._txtai_index.backend is None:
        self._txtai_index.index(text_embedding_np)
    else:
        self._txtai_index.append(text_embedding_np)

    # NOTE(review): the ids below run from (count - len(nodes) + 1) to count
    # inclusive, i.e. they are 1-based. Confirm this matches the id space
    # txtai reports from search().
    indx_size = self._txtai_index.count()
    return [str(idx) for idx in range(indx_size - len(nodes) + 1, indx_size + 1)]

persist #

persist(persist_path: str = DEFAULT_PERSIST_PATH, fs: Optional[AbstractFileSystem] = None) -> None

保存到文件。

此方法将向量存储保存到磁盘。

参数

名称	类型	描述	默认值
`persist_path`	`str`	文件的保存路径。	`DEFAULT_PERSIST_PATH`

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-txtai/llama_index/vector_stores/txtai/base.py

def persist(
    self,
    persist_path: str = DEFAULT_PERSIST_PATH,
    fs: Optional[fsspec.AbstractFileSystem] = None,
) -> None:
    """
    Save to file.

    This method saves the vector store to disk. The index configuration is
    written next to ``persist_path`` as ``config.json`` when the index config
    has ``format == "json"``, otherwise as a pickled ``config`` file.

    Args:
        persist_path (str): The save_path of the file.
        fs (Optional[fsspec.AbstractFileSystem]): Must be ``None`` or a
            ``LocalFileSystem``.

    Raises:
        NotImplementedError: If ``fs`` is a non-local filesystem.

    """
    if fs and not isinstance(fs, LocalFileSystem):
        raise NotImplementedError("txtai only supports local storage for now.")

    dirpath = Path(persist_path).parent
    # parents=True so a nested, not-yet-existing target directory works.
    dirpath.mkdir(parents=True, exist_ok=True)

    jsonconfig = self._txtai_index.config.get("format", "pickle") == "json"
    # Determine if config is json or pickle
    config_path = dirpath / "config.json" if jsonconfig else dirpath / "config"

    # Write configuration
    with open(
        config_path,
        "w" if jsonconfig else "wb",
        encoding="utf-8" if jsonconfig else None,
    ) as f:
        if jsonconfig:
            # Write config as JSON
            json.dump(self._txtai_index.config, f, default=str)
        else:
            from txtai.version import __pickle__

            # Write config as pickle format
            pickle.dump(self._txtai_index.config, f, protocol=__pickle__)

    self._txtai_index.save(persist_path)

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

使用 ref_doc_id 删除节点。

参数

名称	类型	描述	默认值
`ref_doc_id`	`str`	要删除的文档的 doc_id。	必需

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-txtai/llama_index/vector_stores/txtai/base.py

def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
    """
    Remove a single entry from the txtai index.

    Args:
        ref_doc_id (str): Identifier of the entry to drop; it is converted
            with ``int()`` before being handed to the index.
        **delete_kwargs (Any): Accepted but not used here.

    """
    # The index delete call takes a list of integer ids.
    target_id = int(ref_doc_id)
    self._txtai_index.delete([target_id])

query #

query(query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult

查询索引以获取相似度最高的 k 个节点。

参数

名称	类型	描述	默认值
`query`	`VectorStoreQuery`	用于在索引中搜索的查询	必需

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-txtai/llama_index/vector_stores/txtai/base.py

def query(
    self,
    query: VectorStoreQuery,
    **kwargs: Any,
) -> VectorStoreQueryResult:
    """
    Search the txtai index for the entries closest to the query embedding.

    Args:
        query (VectorStoreQuery): Carries the query embedding, the number of
            hits requested (``similarity_top_k``) and optional filters.
        **kwargs (Any): Accepted but not used here.

    Returns:
        VectorStoreQueryResult: ``similarities`` and stringified ``ids``
        taken from the search hits; both lists are empty when nothing is
        found.

    Raises:
        ValueError: If ``query.filters`` is set.

    """
    if query.filters is not None:
        raise ValueError("Metadata filters not implemented for txtai yet.")

    embedding = cast(List[float], query.query_embedding)
    # Add a leading axis so the single query vector becomes a batch of one.
    batch = np.array(embedding, dtype="float32")[np.newaxis, :]
    # Only one query was submitted, so keep the first result row.
    hits = self._txtai_index.search(batch, query.similarity_top_k)[0]

    if len(hits) == 0:
        return VectorStoreQueryResult(similarities=[], ids=[])

    # NOTE(review): each hit is unpacked as (dist, idx). Confirm against the
    # txtai ANN.search contract that hits are not (id, score) pairs.
    kept = [(dist, idx) for dist, idx in hits if not (idx < 0)]

    return VectorStoreQueryResult(
        similarities=[dist for dist, _ in kept],
        ids=[str(idx) for _, idx in kept],
    )

Txtai

TxtaiVectorStore #

创建 txtai ann 索引#

client property #

add #

persist #

delete #

query #

client `property` #