跳过内容

Vertex AI

VertexAIIndex #

基类:BaseManagedIndex

Vertex AI 索引。

Vertex AI RAG 索引实现了使用 Vertex AI 作为后端的托管索引。Vertex AI 在后端执行了传统索引中的许多功能: - 将文档分解成块(节点) - 为每个块(节点)创建嵌入 - 执行搜索以查找与查询最相似的 top k 个节点 - 可选地可以对 top k 个节点进行摘要

参数

名称 类型 描述 默认值
show_progress bool

是否显示 tqdm 进度条。默认为 False。

False
源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
class VertexAIIndex(BaseManagedIndex):
    """
    Vertex AI Index.

    The Vertex AI RAG index implements a managed index that uses Vertex AI as the backend.
    Vertex AI performs a lot of the functions in traditional indexes in the backend:
    - breaks down a document into chunks (nodes)
    - Creates the embedding for each chunk (node)
    - Performs the search for the top k most similar nodes to a query
    - Optionally can perform summarization of the top k nodes

    Args:
        show_progress (bool): Whether to show tqdm progress bars. Defaults to False.

    """

    def __init__(
        self,
        project_id: str,
        location: Optional[str] = None,
        corpus_id: Optional[str] = None,
        corpus_display_name: Optional[str] = None,
        corpus_description: Optional[str] = None,
        show_progress: bool = False,
        **kwargs: Any,
    ) -> None:
        """
        Initialize the Vertex AI API.

        Args:
            project_id (str): Google Cloud project that owns the RAG corpus.
            location (Optional[str]): Vertex AI region passed to ``vertexai.init``.
            corpus_id (Optional[str]): Existing corpus to attach to. Mutually
                exclusive with corpus_display_name / corpus_description.
            corpus_display_name (Optional[str]): Display name used when a new
                corpus is created.
            corpus_description (Optional[str]): Description used when a new
                corpus is created.
            show_progress (bool): Whether to show tqdm progress bars.

        Raises:
            ValueError: If corpus_id is combined with corpus_display_name or
                corpus_description.

        """
        if corpus_id and (corpus_display_name or corpus_description):
            raise ValueError(
                "Cannot specify both corpus_id and corpus_display_name or corpus_description"
            )

        self.project_id = project_id
        self.location = location
        self.show_progress = show_progress
        self._user_agent = get_user_agent("vertexai-rag")

        vertexai.init(project=self.project_id, location=self.location)

        # All RAG calls run inside the telemetry context so requests carry
        # this integration's user agent.
        with telemetry.tool_context_manager(self._user_agent):
            if corpus_id:
                # Attach to an existing corpus; get_corpus also validates
                # that the corpus actually exists.
                self.corpus_name = rag.get_corpus(name=corpus_id).name
            else:
                # No corpus specified: create a new one.
                self.corpus_name = rag.create_corpus(
                    display_name=corpus_display_name, description=corpus_description
                ).name

    def import_files(
        self,
        uris: Sequence[str],
        chunk_size: Optional[int] = None,
        chunk_overlap: Optional[int] = None,
        timeout: Optional[int] = None,
        **kwargs: Any,
    ) -> ImportRagFilesResponse:
        """Import Google Cloud Storage or Google Drive files into the index."""
        # Convert https://storage.googleapis.com URLs to gs:// format
        uris = [
            re.sub(r"^https://storage\.googleapis\.com/", "gs://", uri) for uri in uris
        ]

        with telemetry.tool_context_manager(self._user_agent):
            return rag.import_files(
                self.corpus_name,
                paths=uris,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                timeout=timeout,
                **kwargs,
            )

    def insert_file(
        self,
        file_path: str,
        metadata: Optional[dict] = None,
        **insert_kwargs: Any,
    ) -> Optional[str]:
        """
        Insert a local file into the index.

        Args:
            file_path (str): Path of the local file to upload.
            metadata (Optional[dict]): Optional mapping; the "display_name"
                and "description" keys, if present, are forwarded to the
                uploaded RAG file.

        Returns:
            Optional[str]: Resource name of the uploaded file, or None.

        """
        # Fix: previously display_name/description were only assigned when
        # metadata was truthy, raising NameError for calls without metadata.
        metadata = metadata or {}
        display_name = metadata.get("display_name")
        description = metadata.get("description")

        with telemetry.tool_context_manager(self._user_agent):
            rag_file = rag.upload_file(
                corpus_name=self.corpus_name,
                path=file_path,
                display_name=display_name,
                description=description,
                **insert_kwargs,
            )

        return rag_file.name if rag_file else None

    def list_files(self) -> Sequence[str]:
        """List all files in the index."""
        with telemetry.tool_context_manager(self._user_agent):
            return [file.name for file in rag.list_files(corpus_name=self.corpus_name)]

    def delete_file(self, file_name: str) -> None:
        """Delete file from the index."""
        with telemetry.tool_context_manager(self._user_agent):
            rag.delete_file(name=file_name, corpus_name=self.corpus_name)

    def as_query_engine(self, **kwargs: Any) -> BaseQueryEngine:
        """Return a query engine whose retriever is backed by this index."""
        from llama_index.core.query_engine.retriever_query_engine import (
            RetrieverQueryEngine,
        )

        # kwargs are shared between the retriever and the query engine;
        # retriever-specific keys are consumed by as_retriever.
        kwargs["retriever"] = self.as_retriever(**kwargs)
        return RetrieverQueryEngine.from_args(**kwargs)

    def as_retriever(self, **kwargs: Any) -> BaseRetriever:
        """Return a Retriever for this managed index."""
        from llama_index.indices.managed.vertexai.retriever import (
            VertexAIRetriever,
        )

        similarity_top_k = kwargs.pop("similarity_top_k", None)
        vector_distance_threshold = kwargs.pop("vector_distance_threshold", None)

        return VertexAIRetriever(
            self.corpus_name,
            similarity_top_k,
            vector_distance_threshold,
            self._user_agent,
            **kwargs,
        )

    def _insert(self, nodes: Sequence[BaseNode], **insert_kwargs: Any) -> None:
        """Insert a set of documents (each a node)."""
        raise NotImplementedError("Node insertion is not supported.")

    def delete_ref_doc(
        self, ref_doc_id: str, delete_from_docstore: bool = False, **delete_kwargs: Any
    ) -> None:
        """
        Delete a document and its nodes by using ref_doc_id.

        Note: a no-op unless delete_from_docstore is True.
        """
        if delete_from_docstore:
            with telemetry.tool_context_manager(self._user_agent):
                rag.delete_file(
                    name=ref_doc_id,
                    corpus_name=self.corpus_name,
                )

    def update_ref_doc(self, document: Document, **update_kwargs: Any) -> None:
        """Update a document and its corresponding nodes."""
        raise NotImplementedError("Document update is not supported.")

导入文件 #

import_files(uris: Sequence[str], chunk_size: Optional[int] = None, chunk_overlap: Optional[int] = None, timeout: Optional[int] = None, **kwargs: Any) -> ImportRagFilesResponse

将 Google Cloud Storage 或 Google Drive 文件导入索引。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
def import_files(
    self,
    uris: Sequence[str],
    chunk_size: Optional[int] = None,
    chunk_overlap: Optional[int] = None,
    timeout: Optional[int] = None,
    **kwargs: Any,
) -> ImportRagFilesResponse:
    """Import Google Cloud Storage or Google Drive files into the index."""
    # Normalize https://storage.googleapis.com/... URLs to the gs:// scheme
    # expected by the RAG import API.
    gs_prefix = re.compile(r"^https://storage\.googleapis\.com/")
    normalized_uris = [gs_prefix.sub("gs://", uri) for uri in uris]

    # Run inside the telemetry context so the call carries our user agent.
    with telemetry.tool_context_manager(self._user_agent):
        return rag.import_files(
            self.corpus_name,
            paths=normalized_uris,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            timeout=timeout,
            **kwargs,
        )

插入文件 #

insert_file(file_path: str, metadata: Optional[dict] = None, **insert_kwargs: Any) -> Optional[str]

将本地文件插入索引。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def insert_file(
    self,
    file_path: str,
    metadata: Optional[dict] = None,
    **insert_kwargs: Any,
) -> Optional[str]:
    """
    Insert a local file into the index.

    Args:
        file_path (str): Path of the local file to upload.
        metadata (Optional[dict]): Optional mapping; the "display_name" and
            "description" keys, if present, are forwarded to the uploaded
            RAG file.

    Returns:
        Optional[str]: Resource name of the uploaded file, or None.

    """
    # Fix: previously display_name/description were only assigned when
    # metadata was truthy, raising NameError for calls without metadata.
    metadata = metadata or {}
    display_name = metadata.get("display_name")
    description = metadata.get("description")

    with telemetry.tool_context_manager(self._user_agent):
        rag_file = rag.upload_file(
            corpus_name=self.corpus_name,
            path=file_path,
            display_name=display_name,
            description=description,
            **insert_kwargs,
        )

    return rag_file.name if rag_file else None

列出文件 #

list_files() -> Sequence[str]

列出索引中的所有文件。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
142
143
144
145
146
147
148
def list_files(self) -> Sequence[str]:
    """Return the names of every file currently stored in the index."""
    # Telemetry context tags the underlying RAG API call with our user agent.
    with telemetry.tool_context_manager(self._user_agent):
        return [rag_file.name for rag_file in rag.list_files(corpus_name=self.corpus_name)]

删除文件 #

delete_file(file_name: str) -> None

从索引中删除文件。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
150
151
152
153
def delete_file(self, file_name: str) -> None:
    """Remove the file identified by ``file_name`` from the index."""
    # Telemetry context tags the underlying RAG API call with our user agent.
    with telemetry.tool_context_manager(self._user_agent):
        rag.delete_file(name=file_name, corpus_name=self.corpus_name)

作为检索器 #

as_retriever(**kwargs: Any) -> BaseRetriever

为此托管索引返回一个检索器。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
def as_retriever(self, **kwargs: Any) -> BaseRetriever:
    """Return a Retriever for this managed index."""
    # Imported lazily to avoid a circular import at module load time.
    from llama_index.indices.managed.vertexai.retriever import (
        VertexAIRetriever,
    )

    # Consume retriever-specific options; everything left in kwargs is
    # forwarded to the retriever constructor unchanged.
    top_k = kwargs.pop("similarity_top_k", None)
    distance_threshold = kwargs.pop("vector_distance_threshold", None)

    return VertexAIRetriever(
        self.corpus_name,
        top_k,
        distance_threshold,
        self._user_agent,
        **kwargs,
    )

删除参考文档 #

delete_ref_doc(ref_doc_id: str, delete_from_docstore: bool = False, **delete_kwargs: Any) -> None

使用 ref_doc_id 删除文档及其节点。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
184
185
186
187
188
189
190
191
192
193
def delete_ref_doc(
    self, ref_doc_id: str, delete_from_docstore: bool = False, **delete_kwargs: Any
) -> None:
    """Delete a document and its nodes by using ref_doc_id."""
    # Deliberately a no-op unless deletion from the backing store is requested.
    if not delete_from_docstore:
        return

    with telemetry.tool_context_manager(self._user_agent):
        rag.delete_file(name=ref_doc_id, corpus_name=self.corpus_name)

更新参考文档 #

update_ref_doc(document: Document, **update_kwargs: Any) -> None

更新文档及其对应的节点。

源码位于 llama-index-integrations/indices/llama-index-indices-managed-vertexai/llama_index/indices/managed/vertexai/base.py
195
196
197
def update_ref_doc(self, document: Document, **update_kwargs: Any) -> None:
    """
    Update a document and its corresponding nodes.

    Raises:
        NotImplementedError: Always; document update is not supported by
            the Vertex AI managed index.
    """
    raise NotImplementedError("Document update is not supported.")