Azure Cosmos DB NoSQL 向量存储¶
在本 Notebook 中,我们将展示如何使用 AzureCosmosDBNoSqlVectorSearch 在 LlamaIndex 中执行向量搜索的快速演示。
如果您正在 colab 上打开此 Notebook,则可能需要安装 LlamaIndex 🦙。
In [ ]
已复制!
%pip install llama-index-embeddings-openai
%pip install llama-index-llms-azure-openai
%pip install llama-index-embeddings-openai %pip install llama-index-llms-azure-openai
In [ ]
已复制!
!pip install llama-index
!pip install llama-index
In [ ]
已复制!
import os
import json
import openai
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
import os import json import openai from llama_index.llms.azure_openai import AzureOpenAI from llama_index.embeddings.openai import OpenAIEmbedding from llama_index.core import VectorStoreIndex, SimpleDirectoryReader from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
设置 Azure OpenAI¶
第一步是配置 llm 和 embedding 模型。这些模型将用于为加载到数据库中的文档创建 embedding,并用于 llm 补全。
In [ ]
已复制!
# Configure the Azure OpenAI chat model used for LLM completions.
# NOTE(review): every string below is a placeholder — substitute your real
# Azure OpenAI endpoint, API key, API version, and deployment details.
llm = AzureOpenAI(
    api_key="AZURE_OPENAI_KEY",
    api_version="AZURE_OPENAI_VERSION",
    azure_endpoint="AZURE_OPENAI_BASE",
    deployment_name="AZURE_OPENAI_DEPLOYMENT_NAME",
    model="AZURE_OPENAI_MODEL",
)

# Configure the Azure OpenAI embedding model used to embed documents
# before they are written to the vector store.
embed_model = AzureOpenAIEmbedding(
    api_key="AZURE_OPENAI_KEY",
    api_version="AZURE_OPENAI_VERSION",
    azure_endpoint="AZURE_OPENAI_BASE",
    deployment_name="AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME",
    model="AZURE_OPENAI_EMBEDDING_MODEL",
)
llm = AzureOpenAI( model="AZURE_OPENAI_MODEL", deployment_name="AZURE_OPENAI_DEPLOYMENT_NAME", azure_endpoint="AZURE_OPENAI_BASE", api_key="AZURE_OPENAI_KEY", api_version="AZURE_OPENAI_VERSION", ) embed_model = AzureOpenAIEmbedding( model="AZURE_OPENAI_EMBEDDING_MODEL", deployment_name="AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME", azure_endpoint="AZURE_OPENAI_BASE", api_key="AZURE_OPENAI_KEY", api_version="AZURE_OPENAI_VERSION", )
In [ ]
已复制!
from llama_index.core import Settings

# Register the Azure OpenAI models as LlamaIndex's global defaults so that
# indexing and querying below pick them up without explicit wiring.
Settings.embed_model = embed_model
Settings.llm = llm
from llama_index.core import Settings Settings.llm = llm Settings.embed_model = embed_model
加载文档¶
在此示例中,我们将使用 paul_graham 的文章,该文章将由 SimpleDirectoryReader 处理。
In [ ]
已复制!
from llama_index.core import SimpleDirectoryReader

# Load the Paul Graham essay used throughout this example.
# Fix: the original snippet used a Windows-style, root-anchored path
# (r"\docs\examples\..."), which fails on POSIX systems where backslashes
# are literal filename characters. A relative forward-slash path is
# portable — Python's I/O accepts "/" on Windows as well.
documents = SimpleDirectoryReader(
    input_files=["./docs/examples/data/paul_graham/paul_graham_essay.txt"]
).load_data()

print("Document ID:", documents[0].doc_id)
from llama_index.core import SimpleDirectoryReader documents = SimpleDirectoryReader( input_files=[r"\docs\examples\data\paul_graham\paul_graham_essay.txt"] ).load_data() print("Document ID:", documents[0].doc_id)
创建索引¶
在这里,我们建立与 cosmos db nosql 的连接并创建一个向量存储索引。
In [ ]
已复制!
from azure.cosmos import CosmosClient, PartitionKey
from llama_index.vector_stores.azurecosmosnosql import (
    AzureCosmosDBNoSqlVectorSearch,
)
from llama_index.core import StorageContext

# Connect to the Cosmos DB account.
# NOTE(review): placeholders — replace with the real account URI and key.
URI = "AZURE_COSMOSDB_URI"
KEY = "AZURE_COSMOSDB_KEY"
cosmos_client = CosmosClient(URI, credential=KEY)

# Vector embedding policy: documents carry one embedding at /embedding —
# 3072-dim float32 vectors compared by cosine distance.
# NOTE(review): 3072 must match the dimensionality of the embedding model
# configured above — confirm before running.
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": "/embedding",
            "dataType": "float32",
            "distanceFunction": "cosine",
            "dimensions": 3072,
        }
    ]
}

# Indexing policy: index every property except the _etag system property,
# and build a quantizedFlat vector index over the embedding path.
indexing_policy = {
    "indexingMode": "consistent",
    "includedPaths": [{"path": "/*"}],
    "excludedPaths": [{"path": '/"_etag"/?'}],
    "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
}

# Container/database creation options: partition documents by their id.
container_properties = {"partition_key": PartitionKey(path="/id")}
database_properties = {}

# Build the vector store (creating the container on first use) and index
# the loaded documents through a storage context backed by it.
store = AzureCosmosDBNoSqlVectorSearch(
    cosmos_client=cosmos_client,
    vector_embedding_policy=vector_embedding_policy,
    indexing_policy=indexing_policy,
    cosmos_container_properties=container_properties,
    cosmos_database_properties=database_properties,
    create_container=True,
)

storage_context = StorageContext.from_defaults(vector_store=store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
from azure.cosmos import CosmosClient, PartitionKey from llama_index.vector_stores.azurecosmosnosql import ( AzureCosmosDBNoSqlVectorSearch, ) from llama_index.core import StorageContext # 创建 cosmos 客户端 URI = "AZURE_COSMOSDB_URI" KEY = "AZURE_COSMOSDB_KEY" client = CosmosClient(URI, credential=KEY) # 指定向量存储属性 indexing_policy = { "indexingMode": "consistent", "includedPaths": [{"path": "/*"}], "excludedPaths": [{"path": '/"_etag"/?'}], "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}], } vector_embedding_policy = { "vectorEmbeddings": [ { "path": "/embedding", "dataType": "float32", "distanceFunction": "cosine", "dimensions": 3072, } ] } partition_key = PartitionKey(path="/id") cosmos_container_properties_test = {"partition_key": partition_key} cosmos_database_properties_test = {} # 创建向量存储 store = AzureCosmosDBNoSqlVectorSearch( cosmos_client=client, vector_embedding_policy=vector_embedding_policy, indexing_policy=indexing_policy, cosmos_container_properties=cosmos_container_properties_test, cosmos_database_properties=cosmos_database_properties_test, create_container=True, ) storage_context = StorageContext.from_defaults(vector_store=store) index = VectorStoreIndex.from_documents( documents, storage_context=storage_context )
查询索引¶
我们现在可以使用我们的索引提问。
In [ ]
已复制!
# Turn the index into a query engine and ask a natural-language question;
# retrieval from Cosmos DB and LLM answer synthesis happen under the hood.
query_engine = index.as_query_engine()
question = "What did the author love working on?"
response = query_engine.query(question)
query_engine = index.as_query_engine() response = query_engine.query("What did the author love working on?")
In [ ]
已复制!
import textwrap

# Pretty-print the answer, re-wrapped to 100 columns for readability.
wrapped = textwrap.fill(str(response), width=100)
print(wrapped)
import textwrap print(textwrap.fill(str(response), 100))