Databricks Vector Search
Databricks Vector Search is a vector database built into the Databricks Intelligence Platform and integrated with its governance and productivity tools. Full docs here: https://docs.databricks.com/en/generative-ai/vector-search.html
Install llama-index and databricks-vectorsearch. You must be running inside a Databricks Runtime to use the Vector Search Python client.
%pip install llama-index llama-index-vector-stores-databricks
%pip install databricks-vectorsearch
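After running the %pip installs in a Databricks notebook, it is common to restart the Python process so the newly installed packages become importable. A minimal sketch (dbutils is available by default in Databricks notebooks):

# Restart the Python process so the freshly installed packages are picked up
dbutils.library.restartPython()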
Import databricks dependencies
from databricks.vector_search.client import (
VectorSearchIndex,
VectorSearchClient,
)
Import LlamaIndex dependencies
from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    StorageContext,
)
from llama_index.vector_stores.databricks import DatabricksVectorSearch
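By default, LlamaIndex embeds nodes with OpenAI's text-embedding-ada-002, which produces 1536-dimensional vectors and therefore matches the embedding_dimension used when the index is created below. A minimal sketch of setting the embedding model explicitly, assuming the OpenAI embedding integration bundled with the llama-index package and an OPENAI_API_KEY in the environment; if you swap in another model, adjust embedding_dimension accordingly:

from llama_index.core import Settings
from llama_index.embeddings.openai import OpenAIEmbedding

# text-embedding-ada-002 returns 1536-dimensional vectors, matching the
# embedding_dimension of the Direct Access index created further down
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")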
Load example data
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
Read the data
# load documents
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
print(f"Total documents: {len(documents)}")
print(f"First document, id: {documents[0].doc_id}")
print(f"First document, hash: {documents[0].hash}")
print(
"First document, text"
f" ({len(documents[0].text)} characters):\n{'='*20}\n{documents[0].text[:360]} ..."
)
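VectorStoreIndex.from_documents (used further down) splits each document into smaller nodes before embedding and upserting them. If you want to control that chunking, here is a sketch of overriding the defaults globally; SentenceSplitter is LlamaIndex's standard splitter, and the sizes chosen here are arbitrary:

from llama_index.core import Settings
from llama_index.core.node_parser import SentenceSplitter

# Split each document into ~512-token chunks with a small overlap before embedding
Settings.node_parser = SentenceSplitter(chunk_size=512, chunk_overlap=64)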
Create a Databricks Vector Search endpoint which will serve the index
# Create a vector search endpoint
client = VectorSearchClient()
client.create_endpoint(
name="llamaindex_dbx_vector_store_test_endpoint", endpoint_type="STANDARD"
)
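Endpoint provisioning is asynchronous and can take several minutes. A rough sketch of waiting until the endpoint is online, assuming the client exposes get_endpoint and that the returned dict reports the state under endpoint_status (true for recent databricks-vectorsearch releases, but worth verifying against your SDK version):

import time

# Poll until the endpoint reports ONLINE; inspect the raw response if the
# schema differs in your databricks-vectorsearch version
while True:
    endpoint = client.get_endpoint(name="llamaindex_dbx_vector_store_test_endpoint")
    state = endpoint.get("endpoint_status", {}).get("state", "UNKNOWN")
    print(f"Endpoint state: {state}")
    if state == "ONLINE":
        break
    time.sleep(30)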
Create the Databricks Vector Search index and build it from the documents
# Create a vector search index
# it must be placed inside a Unity Catalog-enabled schema
# We'll use self-managed embeddings (i.e. managed by LlamaIndex) rather than a Databricks-managed index
databricks_index = client.create_direct_access_index(
    endpoint_name="llamaindex_dbx_vector_store_test_endpoint",
    index_name="my_catalog.my_schema.my_test_table",
    primary_key="my_primary_key_name",
    embedding_dimension=1536,  # match the dimension of the embedding model you're going to use
    embedding_vector_column="my_embedding_vector_column_name",  # you can name this anything you want - it'll be picked up by the LlamaIndex class
    schema={
        "my_primary_key_name": "string",
        "my_embedding_vector_column_name": "array<double>",
        "text": "string",  # one column must match the text_column in the DatabricksVectorSearch instance created below; this will hold the raw node text
        "doc_id": "string",  # one column must contain the reference document ID (this will be populated by LlamaIndex automatically)
        # add any other metadata you may have in your nodes (Databricks Vector Search supports metadata filtering)
        # note that these fields must be added explicitly to be used for metadata filtering
    },
)
databricks_vector_store = DatabricksVectorSearch(
    index=databricks_index,
    text_column="text",  # required for self-managed embeddings
    columns=None,  # you must also record your metadata field names here
)
storage_context = StorageContext.from_defaults(
    vector_store=databricks_vector_store
)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
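If the index already exists from an earlier session, you can reattach to it instead of recreating it. A sketch using the endpoint and index names from above; the resulting index object is queryable exactly like the one built from documents:

# Reattach to the existing Direct Access index instead of recreating it
existing_databricks_index = client.get_index(
    endpoint_name="llamaindex_dbx_vector_store_test_endpoint",
    index_name="my_catalog.my_schema.my_test_table",
)
existing_vector_store = DatabricksVectorSearch(
    index=existing_databricks_index,
    text_column="text",
    columns=None,  # list the same metadata field names as before
)
index = VectorStoreIndex.from_vector_store(vector_store=existing_vector_store)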
Query the index
query_engine = index.as_query_engine()
response = query_engine.query("Why did the author choose to work on AI?")
print(response.response)
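Because Databricks Vector Search supports metadata filtering on the extra schema columns, you can also restrict retrieval by metadata. A sketch assuming a hypothetical "category" field that was added to your nodes, to the index schema, and to the columns argument of DatabricksVectorSearch (none of which is part of the example above):

from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

# "category" is a hypothetical metadata field; filtering only works for
# columns declared in the index schema and passed via `columns`
filters = MetadataFilters(filters=[ExactMatchFilter(key="category", value="essay")])
filtered_query_engine = index.as_query_engine(filters=filters)
print(filtered_query_engine.query("Why did the author choose to work on AI?").response)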