跳过内容

Vertex AI

VertexAIIndex #

基类:BaseManagedIndex

Vertex AI 索引。

Vertex AI RAG 索引实现了使用 Vertex AI 作为后端的托管索引。Vertex AI 在后端执行了传统索引中的许多功能: - 将文档分解成块(节点) - 为每个块(节点)创建嵌入 - 执行搜索以查找与查询最相似的 top k 个节点 - 可选地可以对 top k 个节点进行摘要

参数

名称 类型 描述 默认值
show_progress bool

是否显示 tqdm 进度条。默认为 False。

False
源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
class VertexAIIndex(BaseManagedIndex):
    """
    Vertex AI Index.

    The Vertex AI RAG index implements a managed index that uses Vertex AI as the backend.
    Vertex AI performs a lot of the functions in traditional indexes in the backend:
    - breaks down a document into chunks (nodes)
    - Creates the embedding for each chunk (node)
    - Performs the search for the top k most similar nodes to a query
    - Optionally can perform summarization of the top k nodes

    Args:
        show_progress (bool): Whether to show tqdm progress bars. Defaults to False.

    """

    def __init__(
        self,
        project_id: str,
        location: Optional[str] = None,
        corpus_id: Optional[str] = None,
        corpus_display_name: Optional[str] = None,
        corpus_description: Optional[str] = None,
        show_progress: bool = False,
        **kwargs: Any,
    ) -> None:
        """
        Initialize the Vertex AI API.

        Args:
            project_id (str): Google Cloud project that owns the RAG corpus.
            location (Optional[str]): Vertex AI region passed to ``vertexai.init``.
            corpus_id (Optional[str]): Existing corpus to attach to. Mutually
                exclusive with corpus_display_name / corpus_description.
            corpus_display_name (Optional[str]): Display name used when a new
                corpus is created.
            corpus_description (Optional[str]): Description used when a new
                corpus is created.
            show_progress (bool): Whether to show tqdm progress bars.

        Raises:
            ValueError: If corpus_id is combined with corpus_display_name or
                corpus_description.

        """
        if corpus_id and (corpus_display_name or corpus_description):
            raise ValueError(
                "Cannot specify both corpus_id and corpus_display_name or corpus_description"
            )

        self.project_id = project_id
        self.location = location
        self.show_progress = show_progress
        self._user_agent = get_user_agent("vertexai-rag")

        vertexai.init(project=self.project_id, location=self.location)

        # All RAG calls run inside the telemetry context so requests carry
        # this integration's user agent.
        with telemetry.tool_context_manager(self._user_agent):
            if corpus_id:
                # Attach to an existing corpus; get_corpus also validates
                # that the corpus actually exists.
                self.corpus_name = rag.get_corpus(name=corpus_id).name
            else:
                # No corpus specified: create a new one.
                self.corpus_name = rag.create_corpus(
                    display_name=corpus_display_name, description=corpus_description
                ).name

    def import_files(
        self,
        uris: Sequence[str],
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
        timeout: Optional[int] = None,
        **kwargs: Any,
    ) -> ImportRagFilesResponse:
        """Import Google Cloud Storage or Google Drive files into the index."""
        # Convert https://storage.googleapis.com URLs to gs:// format
        uris = [
            re.sub(r"^https://storage\.googleapis\.com/", "gs://", uri) for uri in uris
        ]

        with telemetry.tool_context_manager(self._user_agent):
            return rag.import_files(
                self.corpus_name,
                paths=uris,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                timeout=timeout,
                **kwargs,
            )

    def insert_file(
        self,
        file_path: str,
        metadata: Optional[dict] = None,
        **insert_kwargs: Any,
    ) -> Optional[str]:
        """
        Insert a local file into the index.

        Args:
            file_path (str): Path of the local file to upload.
            metadata (Optional[dict]): Optional mapping; the "display_name"
                and "description" keys, if present, are forwarded to the
                uploaded RAG file.

        Returns:
            Optional[str]: Resource name of the uploaded file, or None.

        """
        # Fix: previously display_name/description were only assigned when
        # metadata was truthy, raising NameError for calls without metadata.
        metadata = metadata or {}
        display_name = metadata.get("display_name")
        description = metadata.get("description")

        with telemetry.tool_context_manager(self._user_agent):
            rag_file = rag.upload_file(
                corpus_name=self.corpus_name,
                path=file_path,
                display_name=display_name,
                description=description,
                **insert_kwargs,
            )

        return rag_file.name if rag_file else None

    def list_files(self) -> Sequence[str]:
        """List all files in the index."""
        with telemetry.tool_context_manager(self._user_agent):
            return [file.name for file in rag.list_files(corpus_name=self.corpus_name)]

    def delete_file(self, file_name: str) -> None:
        """Delete file from the index."""
        with telemetry.tool_context_manager(self._user_agent):
            rag.delete_file(name=file_name, corpus_name=self.corpus_name)

    def as_query_engine(self, **kwargs: Any) -> BaseQueryEngine:
        """Return a query engine whose retriever is backed by this index."""
        from llama_index.core.query_engine.retriever_query_engine import (
            RetrieverQueryEngine,
        )

        # kwargs are shared between the retriever and the query engine;
        # retriever-specific keys are consumed by as_retriever.
        kwargs["retriever"] = self.as_retriever(**kwargs)
        return RetrieverQueryEngine.from_args(**kwargs)

    def as_retriever(self, **kwargs: Any) -> BaseRetriever:
        """Return a Retriever for this managed index."""
        from llama_index.indices.managed.vertexai.retriever import (
            VertexAIRetriever,
        )

        similarity_top_k = kwargs.pop("similarity_top_k", None)
        vector_distance_threshold = kwargs.pop("vector_distance_threshold", None)

        return VertexAIRetriever(
            self.corpus_name,
            similarity_top_k,
            vector_distance_threshold,
            self._user_agent,
            **kwargs,
        )

    def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
        """Insert a set of documents (each a node)."""
        raise NotImplementedError("Node insertion is not supported.")

    def delete_ref_doc(
        self, ref_doc_id: str, delete_from_docstore: bool = False, **delete_kwargs: Any
    ) -> None:
        """
        Delete a document and its nodes by using ref_doc_id.

        Note: a no-op unless delete_from_docstore is True.
        """
        if delete_from_docstore:
            with telemetry.tool_context_manager(self._user_agent):
                rag.delete_file(
                    name=ref_doc_id,
                    corpus_name=self.corpus_name,
                )

    def update_ref_doc(self, document: Document, **update_kwargs: Any) -> None:
        """Update a document and its corresponding nodes."""
        raise NotImplementedError("Document update is not supported.")

导入文件 #

import_files(uris: Sequence[str], chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None, timeout: Optional[int] = None, **kwargs: Any) -> ImportRagFilesResponse

将 Google Cloud Storage 或 Google Drive 文件导入索引。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def import_files(
    self,
    uris: Sequence[str],
    chunk_size: Optional[int] = None,
    chunk_overlap: Optional[int] = None,
    timeout: Optional[int] = None,
    **kwargs: Any,
) -> ImportRagFilesResponse:
    """Import Google Cloud Storage or Google Drive files into the index."""
    # Normalize https://storage.googleapis.com/... URLs to the gs:// scheme
    # expected by the RAG import API.
    gs_prefix = re.compile(r"^https://storage\.googleapis\.com/")
    normalized_uris = [gs_prefix.sub("gs://", uri) for uri in uris]

    # Run inside the telemetry context so the call carries our user agent.
    with telemetry.tool_context_manager(self._user_agent):
        return rag.import_files(
            self.corpus_name,
            paths=normalized_uris,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            timeout=timeout,
            **kwargs,
        )

插入文件 #

insert_file(file_path: str, metadata: Optional[dict] = None, **insert_kwargs: Any) -> Optional[str]

将本地文件插入索引。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def insert_file(
    self,
    file_path: str,
    metadata: Optional[dict] = None,
    **insert_kwargs: Any,
) -> Optional[str]:
    """
    Insert a local file into the index.

    Args:
        file_path (str): Path of the local file to upload.
        metadata (Optional[dict]): Optional mapping; the "display_name" and
            "description" keys, if present, are forwarded to the uploaded
            RAG file.

    Returns:
        Optional[str]: Resource name of the uploaded file, or None.

    """
    # Fix: previously display_name/description were only assigned when
    # metadata was truthy, raising NameError for calls without metadata.
    metadata = metadata or {}
    display_name = metadata.get("display_name")
    description = metadata.get("description")

    with telemetry.tool_context_manager(self._user_agent):
        rag_file = rag.upload_file(
            corpus_name=self.corpus_name,
            path=file_path,
            display_name=display_name,
            description=description,
            **insert_kwargs,
        )

    return rag_file.name if rag_file else None

列出文件 #

list_files() -> Sequence[str]

列出索引中的所有文件。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
142
143
144
145
146
147
148
def list_files(self) -> Sequence[str]:
    """Return the names of every file currently stored in the index."""
    # Telemetry context tags the underlying RAG API call with our user agent.
    with telemetry.tool_context_manager(self._user_agent):
        return [rag_file.name for rag_file in rag.list_files(corpus_name=self.corpus_name)]

删除文件 #

delete_file(file_name: str) -> None

从索引中删除文件。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
150
151
152
153
def delete_file(self, file_name: str) -> None:
    """Remove the file identified by ``file_name`` from the index."""
    # Telemetry context tags the underlying RAG API call with our user agent.
    with telemetry.tool_context_manager(self._user_agent):
        rag.delete_file(name=file_name, corpus_name=self.corpus_name)

作为检索器 #

as_retriever(**kwargs: Any) -> BaseRetriever

为此托管索引返回一个检索器。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def as_retriever(self, **kwargs: Any) -> BaseRetriever:
    """Return a Retriever for this managed index."""
    # Imported lazily to avoid a circular import at module load time.
    from llama_index.indices.managed.vertexai.retriever import (
        VertexAIRetriever,
    )

    # Consume retriever-specific options; everything left in kwargs is
    # forwarded to the retriever constructor unchanged.
    top_k = kwargs.pop("similarity_top_k", None)
    distance_threshold = kwargs.pop("vector_distance_threshold", None)

    return VertexAIRetriever(
        self.corpus_name,
        top_k,
        distance_threshold,
        self._user_agent,
        **kwargs,
    )

删除参考文档 #

delete_ref_doc(ref_doc_id: str, delete_from_docstore: bool = False, **delete_kwargs: Any) -> None

使用 ref_doc_id 删除文档及其节点。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
184
185
186
187
188
189
190
191
192
193
def delete_ref_doc(
    self, ref_doc_id: str, delete_from_docstore: bool = False, **delete_kwargs: Any
) -> None:
    """Delete a document and its nodes by using ref_doc_id."""
    # Deliberately a no-op unless deletion from the backing store is requested.
    if not delete_from_docstore:
        return

    with telemetry.tool_context_manager(self._user_agent):
        rag.delete_file(name=ref_doc_id, corpus_name=self.corpus_name)

更新参考文档 #

update_ref_doc(document: Document, **update_kwargs: Any) -> None

更新文档及其对应的节点。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
195
196
197
def update_ref_doc(self, document: Document, **update_kwargs: Any) -> None:
    """
    Update a document and its corresponding nodes.

    Raises:
        NotImplementedError: Always; document update is not supported by
            the Vertex AI managed index.
    """
    raise NotImplementedError("Document update is not supported.")