Tair

TairVectorStore #

基类: BasePydanticVectorStore

初始化 TairVectorStore。

有两种索引类型可用：FLAT 和 HNSW。

HNSW 的索引参数

ef_construct
M
ef_search

有关这些参数的详细信息请参见此处：https://www.alibabacloud.com/help/en/tair/latest/tairvector#section-c76-ull-5mk

参数

名称	类型	描述	默认值
`index_name`	`str`	索引的名称。	必需
`index_type`	`str`	索引的类型。默认为 'HNSW'。	`'HNSW'`
`index_args`	`Dict[str, Any]`	索引的参数。默认为 None。	`None`
`tair_url`	`str`	Tair 实例的 URL。	必需
`overwrite`	`bool`	如果索引已存在是否覆盖。默认为 False。	`False`
`kwargs`	`Any`	传递给 Tair 客户端的额外参数。	`{}`

抛出

类型	描述
`ValueError`	如果未安装 tair-py
`ValueError`	如果连接到 Tair 实例失败

示例

pip install llama-index-vector-stores-tair

# The integration package installs under ``llama_index.vector_stores.tair``
# (see the source path of base.py), not under ``llama_index.core``.
from llama_index.vector_stores.tair import TairVectorStore

# Create a TairVectorStore
vector_store = TairVectorStore(
    tair_url="redis://{username}:{password}@r-bp****************.redis.rds.aliyuncs.com:{port}",
    index_name="my_index",
    index_type="HNSW",
    # HNSW build parameters; any omitted key falls back to the class defaults.
    index_args={"M": 16, "ef_construct": 200},
    overwrite=True
)

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tair/llama_index/vector_stores/tair/base.py

class TairVectorStore(BasePydanticVectorStore):
    """
    Vector store that keeps nodes and their embeddings in a Tair vector index.

    Two index types are available: FLAT & HNSW.

    index args for HNSW:
        - ef_construct (defaults to 500)
        - M (defaults to 24)
        - ef_search (defaults to 400; can be overridden per ``query`` call)

    Detailed info for these arguments can be found here:
    https://www.alibabacloud.com/help/en/tair/latest/tairvector#section-c76-ull-5mk

    Args:
        tair_url (str): URL for the Tair instance.
        index_name (str): Name of the index.
        index_type (str): Type of the index. Defaults to 'HNSW'.
        index_args (Dict[str, Any]): Arguments for the index. Only read when
            ``index_type`` is 'HNSW'. Defaults to None.
        overwrite (bool): Whether to overwrite the index if it already exists.
            Defaults to False.
        kwargs (Any): Additional arguments to pass to the Tair client.

    Raises:
        ValueError: If ``Tair.from_url`` raises a ``ValueError`` while the
            client is being created (re-raised as "Tair failed to connect").

    Examples:
        `pip install llama-index-vector-stores-tair`

        ```python
        from llama_index.vector_stores.tair import TairVectorStore

        # Create a TairVectorStore
        vector_store = TairVectorStore(
            tair_url="redis://{username}:{password}@r-bp****************.redis.rds.aliyuncs.com:{port}",
            index_name="my_index",
            index_type="HNSW",
            index_args={"M": 16, "ef_construct": 200},
            overwrite=True
        )
        ```

    """

    stores_text: bool = True
    stores_node: bool = True
    flat_metadata: bool = False

    # Client created from ``tair_url`` in ``__init__``.
    _tair_client: Tair = PrivateAttr()
    _index_name: str = PrivateAttr()
    _index_type: str = PrivateAttr()
    # Distance metric used when the index is created; always "L2" here.
    _metric_type: str = PrivateAttr()
    _overwrite: bool = PrivateAttr()
    # Keyword arguments forwarded to index creation / knn search respectively.
    _index_args: Dict[str, Any] = PrivateAttr()
    _query_args: Dict[str, Any] = PrivateAttr()
    # Embedding dimension; set from the first node on every ``add`` call.
    _dim: int = PrivateAttr()

    def __init__(
        self,
        tair_url: str,
        index_name: str,
        index_type: str = "HNSW",
        index_args: Optional[Dict[str, Any]] = None,
        overwrite: bool = False,
        **kwargs: Any,
    ) -> None:
        """Create the Tair client and record the index configuration."""
        super().__init__()
        try:
            self._tair_client = Tair.from_url(tair_url, **kwargs)
        except ValueError as e:
            # Chain the cause so the original traceback is preserved.
            raise ValueError(f"Tair failed to connect: {e}") from e

        # index identifiers
        self._index_name = index_name
        self._index_type = index_type
        self._metric_type = "L2"
        self._overwrite = overwrite
        self._index_args = {}
        self._query_args = {}
        if index_type == "HNSW":
            # Missing keys (or no index_args at all) fall back to the defaults.
            hnsw_args = index_args if index_args is not None else {}
            self._index_args = {
                "ef_construct": hnsw_args.get("ef_construct", 500),
                "M": hnsw_args.get("M", 24),
            }
            self._query_args = {"ef_search": hnsw_args.get("ef_search", 400)}

    @classmethod
    def class_name(cls) -> str:
        """Class name."""
        return "TairVectorStore"

    @property
    def client(self) -> "Tair":
        """Return the Tair client instance."""
        return self._tair_client

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """
        Add nodes to the index.

        The index is created on demand. If it already exists and the store was
        built with ``overwrite=True``, it is deleted and recreated first.

        Args:
            nodes (List[BaseNode]): List of nodes with embeddings

        Returns:
            List[str]: List of ids of the documents added to the index.

        """
        # check to see if empty document list was passed
        if not nodes:
            return []

        # set vector dim for creation if index doesn't exist
        self._dim = len(nodes[0].get_embedding())

        if self._index_exists():
            if self._overwrite:
                self.delete_index()
                self._create_index()
            else:
                # Use the module logger like every other log call in this class.
                _logger.info(f"Adding document to existing index {self._index_name}")
        else:
            self._create_index()

        ids = []
        for node in nodes:
            attributes = {
                "id": node.node_id,
                "doc_id": node.ref_doc_id,
                "text": node.get_content(metadata_mode=MetadataMode.NONE),
            }
            metadata_dict = node_to_metadata_dict(
                node, remove_text=True, flat_metadata=self.flat_metadata
            )
            attributes.update(metadata_dict)

            ids.append(node.node_id)
            # Key layout "<ref_doc_id>#<node_id>" is what ``delete`` scans for.
            self._tair_client.tvs_hset(
                self._index_name,
                f"{node.ref_doc_id}#{node.node_id}",
                vector=node.get_embedding(),
                is_binary=False,
                **attributes,
            )

        _logger.info(f"Added {len(ids)} documents to index {self._index_name}")
        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete every entry stored for a source document.

        Args:
            ref_doc_id (str): id of the source document whose nodes are removed

        """
        # Entries are keyed "<ref_doc_id>#<node_id>", so scan by that prefix.
        matching_keys = self._tair_client.tvs_scan(
            self._index_name, f"{ref_doc_id}#*"
        )
        for key in matching_keys:
            self._tair_client.tvs_del(self._index_name, key)

    def delete_index(self) -> None:
        """Delete the index and all documents."""
        _logger.info(f"Deleting index {self._index_name}")
        self._tair_client.tvs_del_index(self._index_name)

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """
        Query the index.

        Args:
            query (VectorStoreQuery): query object
            kwargs (Any): for HNSW indexes, ``ef_search`` overrides the
                configured value for this call only.

        Returns:
            VectorStoreQueryResult: query result

        Raises:
            ValueError: If query.query_embedding is missing or empty.

        """
        filter_expr = None
        if query.filters is not None:
            filter_expr = _to_filter_expr(query.filters)

        if not query.query_embedding:
            raise ValueError("Query embedding is required for querying.")

        _logger.info(f"Querying index {self._index_name}")

        # Work on a copy: a per-call override must not leak into the stored
        # defaults and silently affect later queries.
        query_args = dict(self._query_args)
        if self._index_type == "HNSW" and "ef_search" in kwargs:
            query_args["ef_search"] = kwargs["ef_search"]

        results = self._tair_client.tvs_knnsearch(
            self._index_name,
            query.similarity_top_k,
            query.query_embedding,
            False,
            filter_str=filter_expr,
            **query_args,
        )
        results = [(k.decode(), float(s)) for k, s in results]

        # Fetch the stored fields for all hits in one pipelined round trip.
        scores = []
        pipe = self._tair_client.pipeline(transaction=False)
        for key, score in results:
            scores.append(score)
            pipe.tvs_hmget(self._index_name, key, "id", "doc_id", "text")
        metadatas = pipe.execute()

        ids = []
        nodes = []
        for m in metadatas:
            # TODO: properly get the _node_content
            # Field order follows the tvs_hmget call: id, doc_id, text.
            node_id = m[0].decode()
            node = TextNode(
                text=m[2].decode(),
                id_=node_id,
                embedding=None,
                relationships={
                    NodeRelationship.SOURCE: RelatedNodeInfo(node_id=m[1].decode())
                },
            )
            ids.append(node_id)
            nodes.append(node)
        _logger.info(f"Found {len(nodes)} results for query with id {ids}")

        return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores)

    def _create_index(self) -> None:
        """Create the index using the stored dimension, metric and index args."""
        _logger.info(f"Creating index {self._index_name}")
        self._tair_client.tvs_create_index(
            self._index_name,
            self._dim,
            distance_type=self._metric_type,
            index_type=self._index_type,
            data_type=tairvector.DataType.Float32,
            **self._index_args,
        )

    def _index_exists(self) -> bool:
        """Return True when the client reports an index with this name."""
        index = self._tair_client.tvs_get_index(self._index_name)
        return index is not None

client `property` #

client: Tair

返回 Tair 客户端实例。

class_name `classmethod` #

class_name() -> str

类名。

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tair/llama_index/vector_stores/tair/base.py

@classmethod
def class_name(cls) -> str:
    """Return the fixed identifier string for this vector store class."""
    name = "TairVectorStore"
    return name

add #

add(nodes: List[BaseNode], **add_kwargs: Any) -> List[str]

向索引添加节点。

参数

名称	类型	描述	默认值
`nodes`	`List[BaseNode]`	带有嵌入的节点列表	必需

返回

类型	描述
`List[str]`	添加到索引中的文档 ID 列表。

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tair/llama_index/vector_stores/tair/base.py

def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
    """
    Add nodes to the index.

    The index is created on demand. If it already exists and the store was
    built with ``overwrite=True``, it is deleted and recreated first.

    Args:
        nodes (List[BaseNode]): List of nodes with embeddings

    Returns:
        List[str]: List of ids of the documents added to the index.

    """
    # check to see if empty document list was passed
    if not nodes:
        return []

    # set vector dim for creation if index doesn't exist
    self._dim = len(nodes[0].get_embedding())

    if self._index_exists():
        if self._overwrite:
            self.delete_index()
            self._create_index()
        else:
            # Use the module logger, consistent with the log call below.
            _logger.info(f"Adding document to existing index {self._index_name}")
    else:
        self._create_index()

    ids = []
    for node in nodes:
        attributes = {
            "id": node.node_id,
            "doc_id": node.ref_doc_id,
            "text": node.get_content(metadata_mode=MetadataMode.NONE),
        }
        metadata_dict = node_to_metadata_dict(
            node, remove_text=True, flat_metadata=self.flat_metadata
        )
        attributes.update(metadata_dict)

        ids.append(node.node_id)
        # Entries are keyed "<ref_doc_id>#<node_id>".
        self._tair_client.tvs_hset(
            self._index_name,
            f"{node.ref_doc_id}#{node.node_id}",
            vector=node.get_embedding(),
            is_binary=False,
            **attributes,
        )

    _logger.info(f"Added {len(ids)} documents to index {self._index_name}")
    return ids

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

删除文档。

参数

名称	类型	描述	默认值
`ref_doc_id`	`str`	文档 ID	必需

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tair/llama_index/vector_stores/tair/base.py

def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
    """
    Delete every entry stored for a source document.

    Args:
        ref_doc_id (str): id of the source document whose entries are removed

    """
    # Scan for keys matching "<ref_doc_id>#*" and delete each one found.
    matching_keys = self._tair_client.tvs_scan(self._index_name, f"{ref_doc_id}#*")
    for key in matching_keys:
        self._tair_client.tvs_del(self._index_name, key)

delete_index #

delete_index() -> None

删除索引及所有文档。

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tair/llama_index/vector_stores/tair/base.py

def delete_index(self) -> None:
    """Drop the whole index together with every document stored in it."""
    # The log line is emitted before the request is sent to the server.
    _logger.info(f"Deleting index {self._index_name}")
    tair = self._tair_client
    tair.tvs_del_index(self._index_name)

query #

query(query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult

查询索引。

参数

名称	类型	描述	默认值
`query`	`VectorStoreQuery`	查询对象	必需

返回

名称	类型	描述
`VectorStoreQueryResult`	`VectorStoreQueryResult`	查询结果

抛出

类型	描述
`ValueError`	如果 query.query_embedding 为 None。

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tair/llama_index/vector_stores/tair/base.py

def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
    """
    Query the index.

    Args:
        query (VectorStoreQuery): query object
        kwargs (Any): for HNSW indexes, ``ef_search`` overrides the
            configured value for this call only.

    Returns:
        VectorStoreQueryResult: query result

    Raises:
        ValueError: If query.query_embedding is missing or empty.

    """
    filter_expr = None
    if query.filters is not None:
        filter_expr = _to_filter_expr(query.filters)

    if not query.query_embedding:
        raise ValueError("Query embedding is required for querying.")

    _logger.info(f"Querying index {self._index_name}")

    # Work on a copy: a per-call override must not leak into the stored
    # defaults and silently affect later queries.
    query_args = dict(self._query_args)
    if self._index_type == "HNSW" and "ef_search" in kwargs:
        query_args["ef_search"] = kwargs["ef_search"]

    results = self._tair_client.tvs_knnsearch(
        self._index_name,
        query.similarity_top_k,
        query.query_embedding,
        False,
        filter_str=filter_expr,
        **query_args,
    )
    results = [(k.decode(), float(s)) for k, s in results]

    # Fetch the stored fields for all hits in one pipelined round trip.
    scores = []
    pipe = self._tair_client.pipeline(transaction=False)
    for key, score in results:
        scores.append(score)
        pipe.tvs_hmget(self._index_name, key, "id", "doc_id", "text")
    metadatas = pipe.execute()

    ids = []
    nodes = []
    for m in metadatas:
        # TODO: properly get the _node_content
        # Field order follows the tvs_hmget call: id, doc_id, text.
        node_id = m[0].decode()
        node = TextNode(
            text=m[2].decode(),
            id_=node_id,
            embedding=None,
            relationships={
                NodeRelationship.SOURCE: RelatedNodeInfo(node_id=m[1].decode())
            },
        )
        ids.append(node_id)
        nodes.append(node)
    _logger.info(f"Found {len(nodes)} results for query with id {ids}")

    return VectorStoreQueryResult(nodes=nodes, ids=ids, similarities=scores)

Tair

TairVectorStore #

client property #

class_name classmethod #

add #

delete #

delete_index #

query #

client `property` #

class_name `classmethod` #