
Vertex AI Vector Search

VertexAIVectorStore #

Bases: BasePydanticVectorStore

Vertex AI Vector Search vector store.

In this vector store, embeddings are stored in Vertex AI Vector Search and documents are stored in a Cloud Storage bucket.

At query time, the index uses Vertex AI Vector Search to find the top k most similar nodes.

Parameters

project_id (str): The Google Cloud project ID.
region (str): The default location for making API calls. It must be the same location where the Vector Search index was created, and it must be regional.
index_id (str): The fully qualified resource name of the index created in Vertex AI Vector Search.
endpoint_id (str): The fully qualified resource name of the index endpoint created in Vertex AI Vector Search.
gcs_bucket_name (Optional[str]): The location where the vectors will be staged when the index is updated in batch mode. Default: None.
credentials_path (Optional[str]): The path to the Google credentials file on the local file system. Default: None.

Examples

`pip install llama-index-vector-stores-vertexaivectorsearch`

```python
from llama_index.vector_stores.vertexaivectorsearch import VertexAIVectorStore

vector_store = VertexAIVectorStore(
    project_id=PROJECT_ID,
    region=REGION,
    index_id="<index_resource_name>",
    endpoint_id="<index_endpoint_resource_name>",
)
```
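
A minimal end-to-end sketch on top of the constructed store. This is illustrative: it assumes an embedding model is already configured (e.g. via `llama_index.core.Settings`), and the document text is a placeholder.

```python
from llama_index.core import Document, StorageContext, VectorStoreIndex

# Wrap the vector store in a storage context and build an index over it.
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    [Document(text="hello world")],  # placeholder document
    storage_context=storage_context,
)

# Retrieve the most similar nodes for a query string.
retriever = index.as_retriever(similarity_top_k=2)
results = retriever.retrieve("hello")
```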
Source code in llama-index-integrations/vector_stores/llama-index-vector-stores-vertexaivectorsearch/llama_index/vector_stores/vertexaivectorsearch/base.py
class VertexAIVectorStore(BasePydanticVectorStore):
    """
    Vertex AI Vector Search vector store.

    In this vector store, embeddings are stored in Vertex AI Vector Store and
    docs are stored within Cloud Storage bucket.

    During query time, the index uses Vertex AI Vector Search to query for the
    top k most similar nodes.

    Args:
        project_id (str) : The Google Cloud Project ID.
        region (str)     : The default location making the API calls.
                           It must be the same location as where Vector Search
                           index created and must be regional.
        index_id (str)   : The fully qualified resource name of the created
                           index in Vertex AI Vector Search.
        endpoint_id (str): The fully qualified resource name of the created
                           index endpoint in Vertex AI Vector Search.
        gcs_bucket_name (Optional[str]):
                           The location where the vectors will be stored for
                           the index to be created in batch mode.
        credentials_path (Optional[str]):
                           The path of the Google credentials on the local file
                           system.

    Examples:
        `pip install llama-index-vector-stores-vertexaivectorsearch`

        ```python
        from llama_index.vector_stores.vertexaivectorsearch import VertexAIVectorStore

        vector_store = VertexAIVectorStore(
            project_id=PROJECT_ID,
            region=REGION,
            index_id="<index_resource_name>",
            endpoint_id="<index_endpoint_resource_name>",
        )
        ```

    """

    stores_text: bool = True
    remove_text_from_metadata: bool = True
    flat_metadata: bool = False

    text_key: str

    project_id: str
    region: str
    index_id: str
    endpoint_id: str
    gcs_bucket_name: Optional[str] = None
    credentials_path: Optional[str] = None

    _index: MatchingEngineIndex = PrivateAttr()
    _endpoint: MatchingEngineIndexEndpoint = PrivateAttr()
    _index_metadata: dict = PrivateAttr()
    _stream_update: bool = PrivateAttr()
    _staging_bucket: storage.Bucket = PrivateAttr()
    # _document_storage: GCSDocumentStorage = PrivateAttr()

    def __init__(
        self,
        project_id: Optional[str] = None,
        region: Optional[str] = None,
        index_id: Optional[str] = None,
        endpoint_id: Optional[str] = None,
        gcs_bucket_name: Optional[str] = None,
        credentials_path: Optional[str] = None,
        text_key: str = DEFAULT_TEXT_KEY,
        remove_text_from_metadata: bool = True,
        **kwargs: Any,
    ) -> None:
        super().__init__(
            project_id=project_id,
            region=region,
            index_id=index_id,
            endpoint_id=endpoint_id,
            gcs_bucket_name=gcs_bucket_name,
            credentials_path=credentials_path,
            text_key=text_key,
            remove_text_from_metadata=remove_text_from_metadata,
        )

        """Initialize params."""
        _sdk_manager = VectorSearchSDKManager(
            project_id=project_id, region=region, credentials_path=credentials_path
        )

        # get index and endpoint resource names including metadata
        self._index = _sdk_manager.get_index(index_id=index_id)
        self._endpoint = _sdk_manager.get_endpoint(endpoint_id=endpoint_id)
        self._index_metadata = self._index.to_dict()

        # get index update method from index metadata
        self._stream_update = False
        if self._index_metadata["indexUpdateMethod"] == "STREAM_UPDATE":
            self._stream_update = True

        # get bucket object when available
        if self.gcs_bucket_name:
            self._staging_bucket = _sdk_manager.get_gcs_bucket(
                bucket_name=gcs_bucket_name
            )
        else:
            self._staging_bucket = None

    @classmethod
    def from_params(
        cls,
        project_id: Optional[str] = None,
        region: Optional[str] = None,
        index_id: Optional[str] = None,
        endpoint_id: Optional[str] = None,
        gcs_bucket_name: Optional[str] = None,
        credentials_path: Optional[str] = None,
        text_key: str = DEFAULT_TEXT_KEY,
        **kwargs: Any,
    ) -> "VertexAIVectorStore":
        """Create VertexAIVectorStore from config."""
        return cls(
            project_id=project_id,
            region=region,
            index_id=index_id,
            endpoint_id=endpoint_id,
            gcs_bucket_name=gcs_bucket_name,
            credentials_path=credentials_path,
            text_key=text_key,
            **kwargs,
        )

    @classmethod
    def class_name(cls) -> str:
        return "VertexAIVectorStore"

    @property
    def client(self) -> Any:
        """Get client."""
        return self._index

    @property
    def index(self) -> Any:
        """Get client."""
        return self._index

    @property
    def endpoint(self) -> Any:
        """Get client."""
        return self._endpoint

    @property
    def staging_bucket(self) -> Any:
        """Get client."""
        return self._staging_bucket

    def add(
        self,
        nodes: List[BaseNode],
        is_complete_overwrite: bool = False,
        **add_kwargs: Any,
    ) -> List[str]:
        """
        Add nodes to index.

        Args:
            nodes: List[BaseNode]: list of nodes with embeddings

        """
        ids = []
        embeddings = []
        metadatas = []
        for node in nodes:
            node_id = node.node_id
            metadata = node_to_metadata_dict(
                node, remove_text=False, flat_metadata=False
            )
            embedding = node.get_embedding()

            ids.append(node_id)
            embeddings.append(embedding)
            metadatas.append(metadata)

        data_points = utils.to_data_points(ids, embeddings, metadatas)
        # self._document_storage.add_documents(list(zip(ids, nodes)))

        if self._stream_update:
            utils.stream_update_index(index=self._index, data_points=data_points)
        else:
            if self._staging_bucket is None:
                raise ValueError(
                    "To update a Vector Search index a staging bucket must"
                    " be defined."
                )
            utils.batch_update_index(
                index=self._index,
                data_points=data_points,
                staging_bucket=self._staging_bucket,
                is_complete_overwrite=is_complete_overwrite,
            )
        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete nodes with ref_doc_id.

        Args:
            ref_doc_id (str): The doc_id of the document to delete.

        """
        # get datapoint ids by filter
        filter = {"ref_doc_id": ref_doc_id}
        ids = utils.get_datapoints_by_filter(
            index=self.index, endpoint=self.endpoint, metadata=filter
        )
        # remove datapoints
        self._index.remove_datapoints(datapoint_ids=ids)

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """Query index for top k most similar nodes."""
        query_embedding = None
        if query.mode == VectorStoreQueryMode.DEFAULT:
            query_embedding = [cast(List[float], query.query_embedding)]

        if query.filters is not None:
            if "filter" in kwargs and kwargs["filter"] is not None:
                raise ValueError(
                    "Cannot specify filter via both query and kwargs. "
                    "Use kwargs only for Vertex AI Vector Search specific items that are "
                    "not supported via the generic query interface such as numeric filters."
                )
            filter, num_filter = utils.to_vectorsearch_filter(query.filters)
        else:
            filter = None
            num_filter = None

        matches = utils.find_neighbors(
            index=self._index,
            endpoint=self._endpoint,
            embeddings=query_embedding,
            top_k=query.similarity_top_k,
            filter=filter,
            numeric_filter=num_filter,
        )

        top_k_nodes = []
        top_k_ids = []
        top_k_scores = []

        for match in matches:
            node = utils.to_node(match, self.text_key)
            top_k_ids.append(match.id)
            top_k_scores.append(match.distance)
            top_k_nodes.append(node)

        return VectorStoreQueryResult(
            nodes=top_k_nodes, similarities=top_k_scores, ids=top_k_ids
        )

client property #

client: Any

Get client.

index property #

index: Any

Get index.

endpoint property #

endpoint: Any

Get endpoint.

staging_bucket property #

staging_bucket: Any

Get staging bucket.

from_params classmethod #

from_params(project_id: Optional[str] = None, region: Optional[str] = None, index_id: Optional[str] = None, endpoint_id: Optional[str] = None, gcs_bucket_name: Optional[str] = None, credentials_path: Optional[str] = None, text_key: str = DEFAULT_TEXT_KEY, **kwargs: Any) -> VertexAIVectorStore

Create a VertexAIVectorStore from config.
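
For illustration, a hedged sketch of constructing the store via `from_params`; all ID values below are placeholders:

```python
vector_store = VertexAIVectorStore.from_params(
    project_id="my-project",  # placeholder
    region="us-central1",     # placeholder; must match the index's region
    index_id="projects/my-project/locations/us-central1/indexes/123",            # placeholder
    endpoint_id="projects/my-project/locations/us-central1/indexEndpoints/456",  # placeholder
)
```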

Source code in llama-index-integrations/vector_stores/llama-index-vector-stores-vertexaivectorsearch/llama_index/vector_stores/vertexaivectorsearch/base.py
@classmethod
def from_params(
    cls,
    project_id: Optional[str] = None,
    region: Optional[str] = None,
    index_id: Optional[str] = None,
    endpoint_id: Optional[str] = None,
    gcs_bucket_name: Optional[str] = None,
    credentials_path: Optional[str] = None,
    text_key: str = DEFAULT_TEXT_KEY,
    **kwargs: Any,
) -> "VertexAIVectorStore":
    """Create VertexAIVectorStore from config."""
    return cls(
        project_id=project_id,
        region=region,
        index_id=index_id,
        endpoint_id=endpoint_id,
        gcs_bucket_name=gcs_bucket_name,
        credentials_path=credentials_path,
        text_key=text_key,
        **kwargs,
    )

add #

add(nodes: List[BaseNode], is_complete_overwrite: bool = False, **add_kwargs: Any) -> List[str]

Add nodes to the index.

Parameters

nodes (List[BaseNode]): List of nodes with embeddings. Required.
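
A short sketch of calling `add` directly; it assumes the nodes already carry embeddings, and the 768-dimension vectors are illustrative. For an index created in batch mode, `gcs_bucket_name` must have been supplied at construction, since datapoints are staged through that bucket.

```python
from llama_index.core.schema import TextNode

nodes = [
    TextNode(text="doc one", embedding=[0.1] * 768),  # illustrative embedding
    TextNode(text="doc two", embedding=[0.2] * 768),
]
ids = vector_store.add(nodes)  # returns the IDs of the upserted datapoints
```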
Source code in llama-index-integrations/vector_stores/llama-index-vector-stores-vertexaivectorsearch/llama_index/vector_stores/vertexaivectorsearch/base.py
def add(
    self,
    nodes: List[BaseNode],
    is_complete_overwrite: bool = False,
    **add_kwargs: Any,
) -> List[str]:
    """
    Add nodes to index.

    Args:
        nodes: List[BaseNode]: list of nodes with embeddings

    """
    ids = []
    embeddings = []
    metadatas = []
    for node in nodes:
        node_id = node.node_id
        metadata = node_to_metadata_dict(
            node, remove_text=False, flat_metadata=False
        )
        embedding = node.get_embedding()

        ids.append(node_id)
        embeddings.append(embedding)
        metadatas.append(metadata)

    data_points = utils.to_data_points(ids, embeddings, metadatas)
    # self._document_storage.add_documents(list(zip(ids, nodes)))

    if self._stream_update:
        utils.stream_update_index(index=self._index, data_points=data_points)
    else:
        if self._staging_bucket is None:
            raise ValueError(
                "To update a Vector Search index a staging bucket must"
                " be defined."
            )
        utils.batch_update_index(
            index=self._index,
            data_points=data_points,
            staging_bucket=self._staging_bucket,
            is_complete_overwrite=is_complete_overwrite,
        )
    return ids

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

Delete nodes with ref_doc_id.

Parameters

ref_doc_id (str): The doc_id of the document to delete. Required.
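
A short sketch: `delete` looks up every datapoint whose `ref_doc_id` metadata matches and removes it from the index. The document ID below is a placeholder.

```python
# Remove all nodes ingested from a given source document.
vector_store.delete(ref_doc_id="my-source-doc")  # placeholder ID
```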
Source code in llama-index-integrations/vector_stores/llama-index-vector-stores-vertexaivectorsearch/llama_index/vector_stores/vertexaivectorsearch/base.py
def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
    """
    Delete nodes with ref_doc_id.

    Args:
        ref_doc_id (str): The doc_id of the document to delete.

    """
    # get datapoint ids by filter
    filter = {"ref_doc_id": ref_doc_id}
    ids = utils.get_datapoints_by_filter(
        index=self.index, endpoint=self.endpoint, metadata=filter
    )
    # remove datapoints
    self._index.remove_datapoints(datapoint_ids=ids)

query #

query(query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult

Query the index for the top k most similar nodes.
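
A hedged sketch of querying with a metadata filter; the embedding values, dimension, and filter key are illustrative:

```python
from llama_index.core.vector_stores.types import (
    MetadataFilter,
    MetadataFilters,
    VectorStoreQuery,
)

query = VectorStoreQuery(
    query_embedding=[0.1] * 768,  # illustrative 768-dim embedding
    similarity_top_k=3,
    filters=MetadataFilters(filters=[MetadataFilter(key="author", value="alice")]),
)
result = vector_store.query(query)
for node, score in zip(result.nodes, result.similarities):
    print(node.node_id, score)
```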

Source code in llama-index-integrations/vector_stores/llama-index-vector-stores-vertexaivectorsearch/llama_index/vector_stores/vertexaivectorsearch/base.py
def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
    """Query index for top k most similar nodes."""
    query_embedding = None
    if query.mode == VectorStoreQueryMode.DEFAULT:
        query_embedding = [cast(List[float], query.query_embedding)]

    if query.filters is not None:
        if "filter" in kwargs and kwargs["filter"] is not None:
            raise ValueError(
                "Cannot specify filter via both query and kwargs. "
                "Use kwargs only for Vertex AI Vector Search specific items that are "
                "not supported via the generic query interface such as numeric filters."
            )
        filter, num_filter = utils.to_vectorsearch_filter(query.filters)
    else:
        filter = None
        num_filter = None

    matches = utils.find_neighbors(
        index=self._index,
        endpoint=self._endpoint,
        embeddings=query_embedding,
        top_k=query.similarity_top_k,
        filter=filter,
        numeric_filter=num_filter,
    )

    top_k_nodes = []
    top_k_ids = []
    top_k_scores = []

    for match in matches:
        node = utils.to_node(match, self.text_key)
        top_k_ids.append(match.id)
        top_k_scores.append(match.distance)
        top_k_nodes.append(node)

    return VectorStoreQueryResult(
        nodes=top_k_nodes, similarities=top_k_scores, ids=top_k_ids
    )