跳到内容

Tidbvector

TiDBVectorStore #

Bases: BasePydanticVectorStore

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tidbvector/llama_index/vector_stores/tidbvector/base.py
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
class TiDBVectorStore(BasePydanticVectorStore):
    stores_text: bool = True
    flat_metadata: bool = False

    _connection_string: str = PrivateAttr()
    _engine_args: Dict[str, Any] = PrivateAttr()
    _tidb: Any = PrivateAttr()

    def __init__(
        self,
        connection_string: str,
        table_name: str = DEFAULT_VECTOR_TABLE_NAME,
        distance_strategy: str = DEFAULT_DISTANCE_STRATEGY,
        vector_dimension: int = 1536,
        *,
        engine_args: Optional[Dict[str, Any]] = None,
        drop_existing_table: bool = False,
        **kwargs: Any,
    ) -> None:
        """
        Initialize a TiDB Vector Store in Llama Index with a flexible
        and standardized table structure for storing vector data
        which remains fixed regardless of the dynamic table name setting.

        The vector table schema includes:
        - 'id': a UUID for each entry.
        - 'embedding': stores vector data in a VectorType column.
        - 'document': a Text column for the original data or additional information.
        - 'meta': a JSON column for flexible metadata storage.
        - 'create_time' and 'update_time': timestamp columns for tracking data changes.

        This table structure caters to general use cases and
        complex scenarios where the table serves as a semantic layer for advanced
        data integration and analysis, leveraging SQL for join queries.

        Args:
            connection_string (str): The connection string for the TiDB database.
                format: "mysql+pymysql://[email protected]:4000/test".
            table_name (str, optional): The name of the table that will be used to
                store vector data. If you do not provide a table name,
                a default table named `llama_index_vector_store` will be created automatically
            distance_strategy: The strategy used for similarity search,
                defaults to "cosine", valid values: "l2", "cosine".
            vector_dimension: The dimension of the vector, defaults to 1536.
            engine_args (Optional[Dict[str, Any]], optional): Additional engine arguments. Defaults to None.
            drop_existing_table: Drop the existing TiDB table before initializing,
                defaults to False.
            **kwargs (Any): Additional keyword arguments.

        Raises:
            ImportError: If the tidbvec python package is not installed.

        """
        super().__init__(**kwargs)
        self._connection_string = connection_string
        self._engine_args = engine_args or {}
        try:
            from tidb_vector.integrations import TiDBVectorClient
        except ImportError:
            raise ImportError(
                "Could not import tidbvec python package. "
                "Please install it with `pip install tidb-vector`."
            )

        self._tidb = TiDBVectorClient(
            connection_string=connection_string,
            table_name=table_name,
            distance_strategy=distance_strategy,
            vector_dimension=vector_dimension,
            engine_args=engine_args,
            drop_existing_table=drop_existing_table,
            **kwargs,
        )

    @property
    def client(self) -> Any:
        """Get client."""
        return self._tidb

    def drop_vectorstore(self) -> None:
        """
        Drop the tidb vector store from the TiDB database.
        """
        self._tidb.drop_table()

    @classmethod
    def class_name(cls) -> str:
        return "TiDBVectorStore"

    def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
        """
        Add nodes to the vector store.

        Args:
            nodes (List[BaseNode]): List of nodes to be added.
            **add_kwargs: Additional keyword arguments to be passed to the underlying storage.

        Returns:
            List[str]: List of node IDs that were added.

        """
        ids = []
        metadatas = []
        embeddings = []
        texts = []
        for node in nodes:
            ids.append(node.node_id)
            metadatas.append(node_to_metadata_dict(node, remove_text=True))
            embeddings.append(node.get_embedding())
            texts.append(node.get_content(metadata_mode=MetadataMode.NONE))

        self._tidb.insert(
            texts=texts,
            embeddings=embeddings,
            metadatas=metadatas,
            ids=ids,
            **add_kwargs,
        )

        return ids

    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
        """
        Delete all nodes of a document from the vector store.

        Args:
            ref_doc_id (str): The reference document ID to be deleted.
            **delete_kwargs: Additional keyword arguments to be passed to the delete method.

        Returns:
            None

        """
        delete_kwargs["filter"] = {"doc_id": ref_doc_id}
        self._tidb.delete(**delete_kwargs)

    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
        """
        Perform a similarity search with the given query embedding.

        Args:
            query (VectorStoreQuery): The query object containing the query data.
            **kwargs: Additional keyword arguments.

        Returns:
            VectorStoreQueryResult: The result of the similarity search.

        Raises:
            ValueError: If the query embedding is not provided.

        """
        if query.query_embedding is None:
            raise ValueError("Query embedding must be provided.")
        return self._similarity_search_with_score(
            query.query_embedding,
            query.similarity_top_k,
            query.filters,
            **kwargs,
        )

    def _similarity_search_with_score(
        self,
        embedding: List[float],
        limit: int = 10,
        metadata_filters: Optional[MetadataFilters] = None,
        **kwargs: Any,
    ) -> VectorStoreQueryResult:
        """
        Performs a similarity search with scores based on the given condition.

        Args:
            embedding (List[float]): The embedding vector for similarity search.
            limit (int, optional): The maximum number of results to return. Defaults to 10.
            metadata_filters (Optional[MetadataFilters], optional): Filters to apply on metadata. Defaults to None.
            **kwargs (Any): Additional keyword arguments.

        Returns:
            VectorStoreQueryResult: The result of the similarity search, including nodes, similarities, and ids.

        """
        filters = self._to_tidb_filters(metadata_filters)
        results = self._tidb.query(
            query_vector=embedding, k=limit, filter=filters, **kwargs
        )

        nodes = []
        similarities = []
        ids = []
        for row in results:
            try:
                node = metadata_dict_to_node(row.metadata)
                node.set_content(str(row.document))
            except Exception:
                # NOTE: deprecated legacy logic for backward compatibility
                _logger.warning(
                    "Failed to parse metadata dict, falling back to legacy logic."
                )
                node = TextNode(
                    id_=row.id,
                    text=row.document,
                    metadata=row.metadata,
                )
            similarities.append((1 - row.distance) if row.distance is not None else 0)
            ids.append(row.id)
            nodes.append(node)

        return VectorStoreQueryResult(
            nodes=nodes,
            similarities=similarities,
            ids=ids,
        )

    def _to_tidb_filters(
        self, metadata_filters: Optional[MetadataFilters] = None
    ) -> Optional[Dict[str, Any]]:
        """
        Converts metadata filters to TiDB filters.

        Args:
            metadata_filters (Optional[MetadataFilters]): The metadata filters to be converted.

        Returns:
            Optional[Dict[str, Any]]: The converted TiDB filters.

        Raises:
            ValueError: If an unsupported operator is encountered.

        """
        if metadata_filters is None:
            return None

        condition = "$and"
        if metadata_filters.condition == FilterCondition.OR:
            condition = "$or"

        filters = []
        for filter in metadata_filters.filters:
            if filter.operator == FilterOperator.EQ:
                filters.append({filter.key: {"$eq": filter.value}})
            elif filter.operator == FilterOperator.NE:
                filters.append({filter.key: {"$ne": filter.value}})
            elif filter.operator == FilterOperator.GT:
                filters.append({filter.key: {"$gt": filter.value}})
            elif filter.operator == FilterOperator.GTE:
                filters.append({filter.key: {"$gte": filter.value}})
            elif filter.operator == FilterOperator.LT:
                filters.append({filter.key: {"$lt": filter.value}})
            elif filter.operator == FilterOperator.LTE:
                filters.append({filter.key: {"$lte": filter.value}})
            elif filter.operator == FilterOperator.IN:
                filters.append({filter.key: {"$in": filter.value}})
            elif filter.operator == FilterOperator.NIN:
                filters.append({filter.key: {"$nin": filter.value}})
            else:
                raise ValueError(f"Unsupported operator: {filter.operator}")

        return {condition: filters}

client 属性 #

client: Any

获取客户端。

drop_vectorstore #

drop_vectorstore() -> None

从 TiDB 数据库中删除 tidb vector store。

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tidbvector/llama_index/vector_stores/tidbvector/base.py
104
105
106
107
108
def drop_vectorstore(self) -> None:
    """
    Drop the tidb vector store from the TiDB database.
    """
    self._tidb.drop_table()

add #

add(nodes: List[BaseNode], **add_kwargs: Any) -> List[str]

向 vector store 添加节点。

参数

名称 类型 描述 默认值
nodes List[BaseNode]

要添加的节点列表。

必需
**add_kwargs 任意类型

传递给底层存储的附加关键字参数。

{}

返回值

类型 描述
List[str]

List[str]: 已添加的节点 ID 列表。

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tidbvector/llama_index/vector_stores/tidbvector/base.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def add(self, nodes: List[BaseNode], **add_kwargs: Any) -> List[str]:
    """
    Add nodes to the vector store.

    Args:
        nodes (List[BaseNode]): List of nodes to be added.
        **add_kwargs: Additional keyword arguments to be passed to the underlying storage.

    Returns:
        List[str]: List of node IDs that were added.

    """
    ids = []
    metadatas = []
    embeddings = []
    texts = []
    for node in nodes:
        ids.append(node.node_id)
        metadatas.append(node_to_metadata_dict(node, remove_text=True))
        embeddings.append(node.get_embedding())
        texts.append(node.get_content(metadata_mode=MetadataMode.NONE))

    self._tidb.insert(
        texts=texts,
        embeddings=embeddings,
        metadatas=metadatas,
        ids=ids,
        **add_kwargs,
    )

    return ids

delete #

delete(ref_doc_id: str, **delete_kwargs: Any) -> None

从 vector store 中删除文档的所有节点。

参数

名称 类型 描述 默认值
ref_doc_id str

要删除的参考文档 ID。

必需
**delete_kwargs 任意类型

传递给 delete 方法的附加关键字参数。

{}

返回值

类型 描述

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tidbvector/llama_index/vector_stores/tidbvector/base.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
    """
    Delete all nodes of a document from the vector store.

    Args:
        ref_doc_id (str): The reference document ID to be deleted.
        **delete_kwargs: Additional keyword arguments to be passed to the delete method.

    Returns:
        None

    """
    delete_kwargs["filter"] = {"doc_id": ref_doc_id}
    self._tidb.delete(**delete_kwargs)

query #

query(query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult

使用给定的查询嵌入执行相似性搜索。

参数

名称 类型 描述 默认值
query VectorStoreQuery

包含查询数据的查询对象。

必需
**kwargs 任意类型

附加关键字参数。

{}

返回值

名称 类型 描述
VectorStoreQueryResult VectorStoreQueryResult

相似性搜索结果。

抛出异常

类型 描述
ValueError

如果未提供查询嵌入。

源代码位于 llama-index-integrations/vector_stores/llama-index-vector-stores-tidbvector/llama_index/vector_stores/tidbvector/base.py
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
    """
    Perform a similarity search with the given query embedding.

    Args:
        query (VectorStoreQuery): The query object containing the query data.
        **kwargs: Additional keyword arguments.

    Returns:
        VectorStoreQueryResult: The result of the similarity search.

    Raises:
        ValueError: If the query embedding is not provided.

    """
    if query.query_embedding is None:
        raise ValueError("Query embedding must be provided.")
    return self._similarity_search_with_score(
        query.query_embedding,
        query.similarity_top_k,
        query.filters,
        **kwargs,
    )