Azure Cosmos DB NoSQL 向量存储¶
在本 Notebook 中,我们将展示如何使用 AzureCosmosDBNoSqlVectorSearch 在 LlamaIndex 中执行向量搜索的快速演示。
如果您正在 colab 上打开此 Notebook,则可能需要安装 LlamaIndex 🦙。
In [ ]
已复制!
%pip install llama-index-embeddings-openai
%pip install llama-index-llms-azure-openai
%pip install llama-index-embeddings-openai %pip install llama-index-llms-azure-openai
In [ ]
已复制!
!pip install llama-index
!pip install llama-index
In [ ]
已复制!
import os
import json
import openai
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
import os import json import openai from llama_index.llms.azure_openai import AzureOpenAI from llama_index.embeddings.openai import OpenAIEmbedding from llama_index.core import VectorStoreIndex, SimpleDirectoryReader from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
设置 Azure OpenAI¶
第一步是配置 llm 和 embedding 模型。这些模型将用于为加载到数据库中的文档创建 embedding,并用于 llm 补全。
In [ ]
已复制!
# Configure the Azure OpenAI chat model used for LLM completions.
# NOTE(review): every string below is a placeholder — substitute your real
# Azure OpenAI endpoint, API key, API version, and deployment details.
llm = AzureOpenAI(
    api_key="AZURE_OPENAI_KEY",
    api_version="AZURE_OPENAI_VERSION",
    azure_endpoint="AZURE_OPENAI_BASE",
    deployment_name="AZURE_OPENAI_DEPLOYMENT_NAME",
    model="AZURE_OPENAI_MODEL",
)

# Configure the Azure OpenAI embedding model used to embed documents
# before they are written to the vector store.
embed_model = AzureOpenAIEmbedding(
    api_key="AZURE_OPENAI_KEY",
    api_version="AZURE_OPENAI_VERSION",
    azure_endpoint="AZURE_OPENAI_BASE",
    deployment_name="AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME",
    model="AZURE_OPENAI_EMBEDDING_MODEL",
)
llm = AzureOpenAI( model="AZURE_OPENAI_MODEL", deployment_name="AZURE_OPENAI_DEPLOYMENT_NAME", azure_endpoint="AZURE_OPENAI_BASE", api_key="AZURE_OPENAI_KEY", api_version="AZURE_OPENAI_VERSION", ) embed_model = AzureOpenAIEmbedding( model="AZURE_OPENAI_EMBEDDING_MODEL", deployment_name="AZURE_OPENAI_EMBEDDING_MODEL_DEPLOYMENT_NAME", azure_endpoint="AZURE_OPENAI_BASE", api_key="AZURE_OPENAI_KEY", api_version="AZURE_OPENAI_VERSION", )
In [ ]
已复制!
from llama_index.core import Settings

# Register the Azure OpenAI models as LlamaIndex's global defaults so that
# indexing and querying below pick them up without explicit wiring.
Settings.embed_model = embed_model
Settings.llm = llm
from llama_index.core import Settings Settings.llm = llm Settings.embed_model = embed_model
加载文档¶
在此示例中,我们将使用 paul_graham 的文章,该文章将由 SimpleDirectoryReader 处理。
In [ ]
已复制!
from llama_index.core import SimpleDirectoryReader

# Load the Paul Graham essay used throughout this example.
# Fix: the original snippet used a Windows-style, root-anchored path
# (r"\docs\examples\..."), which fails on POSIX systems where backslashes
# are literal filename characters. A relative forward-slash path is
# portable — Python's I/O accepts "/" on Windows as well.
documents = SimpleDirectoryReader(
    input_files=["./docs/examples/data/paul_graham/paul_graham_essay.txt"]
).load_data()

print("Document ID:", documents[0].doc_id)
from llama_index.core import SimpleDirectoryReader documents = SimpleDirectoryReader( input_files=[r"\docs\examples\data\paul_graham\paul_graham_essay.txt"] ).load_data() print("Document ID:", documents[0].doc_id)
创建索引¶
在这里,我们建立与 cosmos db nosql 的连接并创建一个向量存储索引。
In [ ]
已复制!
from azure.cosmos import CosmosClient, PartitionKey
from llama_index.vector_stores.azurecosmosnosql import (
    AzureCosmosDBNoSqlVectorSearch,
)
from llama_index.core import StorageContext

# Connect to the Cosmos DB account.
# NOTE(review): placeholders — replace with the real account URI and key.
URI = "AZURE_COSMOSDB_URI"
KEY = "AZURE_COSMOSDB_KEY"
cosmos_client = CosmosClient(URI, credential=KEY)

# Vector embedding policy: documents carry one embedding at /embedding —
# 3072-dim float32 vectors compared by cosine distance.
# NOTE(review): 3072 must match the dimensionality of the embedding model
# configured above — confirm before running.
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path": "/embedding",
            "dataType": "float32",
            "distanceFunction": "cosine",
            "dimensions": 3072,
        }
    ]
}

# Indexing policy: index every property except the _etag system property,
# and build a quantizedFlat vector index over the embedding path.
indexing_policy = {
    "indexingMode": "consistent",
    "includedPaths": [{"path": "/*"}],
    "excludedPaths": [{"path": '/"_etag"/?'}],
    "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}],
}

# Container/database creation options: partition documents by their id.
container_properties = {"partition_key": PartitionKey(path="/id")}
database_properties = {}

# Build the vector store (creating the container on first use) and index
# the loaded documents through a storage context backed by it.
store = AzureCosmosDBNoSqlVectorSearch(
    cosmos_client=cosmos_client,
    vector_embedding_policy=vector_embedding_policy,
    indexing_policy=indexing_policy,
    cosmos_container_properties=container_properties,
    cosmos_database_properties=database_properties,
    create_container=True,
)

storage_context = StorageContext.from_defaults(vector_store=store)
index = VectorStoreIndex.from_documents(
    documents, storage_context=storage_context
)
from azure.cosmos import CosmosClient, PartitionKey from llama_index.vector_stores.azurecosmosnosql import ( AzureCosmosDBNoSqlVectorSearch, ) from llama_index.core import StorageContext # 创建 cosmos 客户端 URI = "AZURE_COSMOSDB_URI" KEY = "AZURE_COSMOSDB_KEY" client = CosmosClient(URI, credential=KEY) # 指定向量存储属性 indexing_policy = { "indexingMode": "consistent", "includedPaths": [{"path": "/*"}], "excludedPaths": [{"path": '/"_etag"/?'}], "vectorIndexes": [{"path": "/embedding", "type": "quantizedFlat"}], } vector_embedding_policy = { "vectorEmbeddings": [ { "path": "/embedding", "dataType": "float32", "distanceFunction": "cosine", "dimensions": 3072, } ] } partition_key = PartitionKey(path="/id") cosmos_container_properties_test = {"partition_key": partition_key} cosmos_database_properties_test = {} # 创建向量存储 store = AzureCosmosDBNoSqlVectorSearch( cosmos_client=client, vector_embedding_policy=vector_embedding_policy, indexing_policy=indexing_policy, cosmos_container_properties=cosmos_container_properties_test, cosmos_database_properties=cosmos_database_properties_test, create_container=True, ) storage_context = StorageContext.from_defaults(vector_store=store) index = VectorStoreIndex.from_documents( documents, storage_context=storage_context )
查询索引¶
我们现在可以使用我们的索引提问。
In [ ]
已复制!
# Turn the index into a query engine and ask a natural-language question;
# retrieval from Cosmos DB and LLM answer synthesis happen under the hood.
query_engine = index.as_query_engine()
question = "What did the author love working on?"
response = query_engine.query(question)
query_engine = index.as_query_engine() response = query_engine.query("What did the author love working on?")
In [ ]
已复制!
import textwrap

# Pretty-print the answer, re-wrapped to 100 columns for readability.
wrapped = textwrap.fill(str(response), width=100)
print(wrapped)
import textwrap print(textwrap.fill(str(response), 100))