Tablestore 演示¶
本指南将向您展示如何直接使用由 Tablestore 支持的 DocumentStore
抽象。通过将节点放入文档存储中,这允许您在同一个底层文档存储上定义多个索引,而不是在索引之间复制数据。
In [ ]
已复制!
# Install the Tablestore docstore/index-store/vector-store integrations,
# the DashScope LLM/embedding integrations, llama-index itself, and matplotlib.
%pip install llama-index-storage-docstore-tablestore
%pip install llama-index-storage-index-store-tablestore
%pip install llama-index-vector-stores-tablestore
%pip install llama-index-llms-dashscope
%pip install llama-index-embeddings-dashscope
%pip install llama-index
%pip install matplotlib
%pip install llama-index-storage-docstore-tablestore %pip install llama-index-storage-index-store-tablestore %pip install llama-index-vector-stores-tablestore %pip install llama-index-llms-dashscope %pip install llama-index-embeddings-dashscope %pip install llama-index %pip install matplotlib
In [ ]
已复制!
# Jupyter already runs an asyncio event loop; nest_asyncio allows
# llama-index's async calls to run inside it.
import nest_asyncio
nest_asyncio.apply()
import nest_asyncio nest_asyncio.apply()
In [ ]
已复制!
# Route INFO-level log records to stdout so progress is visible in the notebook.
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
import logging import sys logging.basicConfig(stream=sys.stdout, level=logging.INFO) logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
In [ ]
已复制!
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex, SimpleKeywordTableIndex
from llama_index.core import SummaryIndex
from llama_index.core.response.notebook_utils import display_response
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader, StorageContext from llama_index.core import VectorStoreIndex, SimpleKeywordTableIndex from llama_index.core import SummaryIndex from llama_index.core.response.notebook_utils import display_response from llama_index.core import Settings
配置 Tablestore¶
接下来,我们将使用 Tablestore 的文档存储进行演示。
In [ ]
已复制!
# Prompt interactively for the Tablestore connection settings; getpass hides
# the typed values.  They are read back below via os.getenv().
import getpass
import os
os.environ["tablestore_end_point"] = getpass.getpass("tablestore end_point:")
os.environ["tablestore_instance_name"] = getpass.getpass(
    "tablestore instance_name:"
)
os.environ["tablestore_access_key_id"] = getpass.getpass(
    "tablestore access_key_id:"
)
os.environ["tablestore_access_key_secret"] = getpass.getpass(
    "tablestore access_key_secret:"
)
import getpass import os os.environ["tablestore_end_point"] = getpass.getpass("tablestore end_point:") os.environ["tablestore_instance_name"] = getpass.getpass( "tablestore instance_name:" ) os.environ["tablestore_access_key_id"] = getpass.getpass( "tablestore access_key_id:" ) os.environ["tablestore_access_key_secret"] = getpass.getpass( "tablestore access_key_secret:" )
配置 DashScope LLM¶
接下来,我们将使用 DashScope 的 LLM 进行演示。
In [ ]
已复制!
# Prompt for the DashScope API key and expose it through the environment.
import os
import getpass
os.environ["DASHSCOPE_API_KEY"] = getpass.getpass("DashScope api key:")
import os import getpass os.environ["DASHSCOPE_API_KEY"] = getpass.getpass("DashScope api key:")
下载数据¶
In [ ]
已复制!
# Download the Paul Graham essay sample dataset used throughout the docs.
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
!mkdir -p 'data/paul_graham/' !wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
加载文档¶
In [ ]
已复制!
# Load the downloaded essay into llama-index Document objects.
reader = SimpleDirectoryReader("./data/paul_graham/")
documents = reader.load_data()
reader = SimpleDirectoryReader("./data/paul_graham/") documents = reader.load_data()
解析为节点¶
In [ ]
已复制!
# Split the documents into sentence-based chunks (nodes); the same nodes
# are shared by every index built later in this notebook.
from llama_index.core.node_parser import SentenceSplitter
nodes = SentenceSplitter().get_nodes_from_documents(documents)
from llama_index.core.node_parser import SentenceSplitter nodes = SentenceSplitter().get_nodes_from_documents(documents)
初始化存储/嵌入/LLM/存储上下文¶
In [ ]
已复制!
from llama_index.storage.docstore.tablestore import TablestoreDocumentStore
from llama_index.storage.index_store.tablestore import TablestoreIndexStore
from llama_index.vector_stores.tablestore import TablestoreVectorStore
from llama_index.embeddings.dashscope import (
    DashScopeEmbedding,
    DashScopeTextEmbeddingModels,
    DashScopeTextEmbeddingType,
)
from llama_index.llms.dashscope import DashScope, DashScopeGenerationModels
# DashScope text-embedding-v3 embedder; its output size must match the
# vector_dimension passed to TablestoreVectorStore below.
embedder = DashScopeEmbedding(
    model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V3,  # default dimension is 1024
    text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
)
# Qwen-Max via DashScope, used as the LLM for all indexes/query engines.
dashscope_llm = DashScope(
    model_name=DashScopeGenerationModels.QWEN_MAX,
    api_key=os.environ["DASHSCOPE_API_KEY"],
)
Settings.llm = dashscope_llm
# All three stores connect to the same Tablestore instance using the
# credentials collected via getpass earlier in this notebook.
docstore = TablestoreDocumentStore.from_config(
    endpoint=os.getenv("tablestore_end_point"),
    instance_name=os.getenv("tablestore_instance_name"),
    access_key_id=os.getenv("tablestore_access_key_id"),
    access_key_secret=os.getenv("tablestore_access_key_secret"),
)
index_store = TablestoreIndexStore.from_config(
    endpoint=os.getenv("tablestore_end_point"),
    instance_name=os.getenv("tablestore_instance_name"),
    access_key_id=os.getenv("tablestore_access_key_id"),
    access_key_secret=os.getenv("tablestore_access_key_secret"),
)
vector_store = TablestoreVectorStore(
    endpoint=os.getenv("tablestore_end_point"),
    instance_name=os.getenv("tablestore_instance_name"),
    access_key_id=os.getenv("tablestore_access_key_id"),
    access_key_secret=os.getenv("tablestore_access_key_secret"),
    vector_dimension=1024,  # embedder dimension is 1024
)
# Create the backing table and search index if this is the first run.
vector_store.create_table_if_not_exist()
vector_store.create_search_index_if_not_exist()
# One StorageContext wired to all three Tablestore-backed stores.
storage_context = StorageContext.from_defaults(
    docstore=docstore, index_store=index_store, vector_store=vector_store
)
from llama_index.storage.docstore.tablestore import TablestoreDocumentStore from llama_index.storage.index_store.tablestore import TablestoreIndexStore from llama_index.vector_stores.tablestore import TablestoreVectorStore from llama_index.embeddings.dashscope import ( DashScopeEmbedding, DashScopeTextEmbeddingModels, DashScopeTextEmbeddingType, ) from llama_index.llms.dashscope import DashScope, DashScopeGenerationModels embedder = DashScopeEmbedding( model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V3, # default dimension is 1024 text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT, ) dashscope_llm = DashScope( model_name=DashScopeGenerationModels.QWEN_MAX, api_key=os.environ["DASHSCOPE_API_KEY"], ) Settings.llm = dashscope_llm docstore = TablestoreDocumentStore.from_config( endpoint=os.getenv("tablestore_end_point"), instance_name=os.getenv("tablestore_instance_name"), access_key_id=os.getenv("tablestore_access_key_id"), access_key_secret=os.getenv("tablestore_access_key_secret"), ) index_store = TablestoreIndexStore.from_config( endpoint=os.getenv("tablestore_end_point"), instance_name=os.getenv("tablestore_instance_name"), access_key_id=os.getenv("tablestore_access_key_id"), access_key_secret=os.getenv("tablestore_access_key_secret"), ) vector_store = TablestoreVectorStore( endpoint=os.getenv("tablestore_end_point"), instance_name=os.getenv("tablestore_instance_name"), access_key_id=os.getenv("tablestore_access_key_id"), access_key_secret=os.getenv("tablestore_access_key_secret"), vector_dimension=1024, # embedder dimension is 1024 ) vector_store.create_table_if_not_exist() vector_store.create_search_index_if_not_exist() storage_context = StorageContext.from_defaults( docstore=docstore, index_store=index_store, vector_store=vector_store )
添加到文档存储¶
In [ ]
已复制!
# Register the parsed nodes in the Tablestore-backed docstore so the
# indexes defined below can all reference the same underlying nodes.
storage_context.docstore.add_documents(nodes)
storage_context.docstore.add_documents(nodes)
定义并添加多个索引¶
每个索引都使用相同的底层节点。
In [ ]
已复制!
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/list.html
# Build a summary (list) index over the shared nodes.
summary_index = SummaryIndex(nodes, storage_context=storage_context)
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/list.html summary_index = SummaryIndex(nodes, storage_context=storage_context)
In [ ]
已复制!
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/vector_store.html
# Build a vector index over the same nodes, embedding them with the
# DashScope embedder in batches of 20.
vector_index = VectorStoreIndex(
    nodes,
    insert_batch_size=20,
    embed_model=embedder,
    storage_context=storage_context,
)
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/vector_store.html vector_index = VectorStoreIndex( nodes, insert_batch_size=20, embed_model=embedder, storage_context=storage_context, )
In [ ]
已复制!
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/table.html
# Build a keyword-table index over the same nodes, using the DashScope LLM.
keyword_table_index = SimpleKeywordTableIndex(
    nodes=nodes,
    storage_context=storage_context,
    llm=dashscope_llm,
)
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/table.html keyword_table_index = SimpleKeywordTableIndex( nodes=nodes, storage_context=storage_context, llm=dashscope_llm, )
In [ ]
已复制!
# NOTE: the docstore still has the same nodes — building three indexes
# did not duplicate the stored data.
len(storage_context.docstore.docs)
# NOTE: the docstore still has the same nodes len(storage_context.docstore.docs)
Out [ ]
44
测试保存和加载¶
In [ ]
已复制!
# NOTE: the docstore and index_store are persisted in Tablestore by default;
# persist() only needs to write the remaining simple (local) stores to disk.
storage_context.persist()
# NOTE: docstore and index_store is persisted in Tablestore by default # NOTE: here only need to persist simple vector store to disk storage_context.persist()
In [ ]
已复制!
# Note down the index IDs so each index can be reloaded from storage later.
list_id = summary_index.index_id
vector_id = vector_index.index_id
keyword_id = keyword_table_index.index_id
print(list_id, vector_id, keyword_id)
# note down index IDs list_id = summary_index.index_id vector_id = vector_index.index_id keyword_id = keyword_table_index.index_id print(list_id, vector_id, keyword_id)
c05fec2a-ac87-4761-beeb-0901f9e6530e d0b021ed-3427-46ad-927d-12d72752dbc4 2e9bfc3a-5e69-408a-9430-7b0c8baf3d77
In [ ]
已复制!
from llama_index.core import load_index_from_storage
# Re-create the storage context from the same Tablestore-backed stores;
# the indexes can then be reloaded by ID alone.
storage_context = StorageContext.from_defaults(
    docstore=docstore, index_store=index_store, vector_store=vector_store
)
summary_index = load_index_from_storage(
    storage_context=storage_context,
    index_id=list_id,
)
keyword_table_index = load_index_from_storage(
    llm=dashscope_llm,
    storage_context=storage_context,
    index_id=keyword_id,
)
# You need to add "vector_store=xxx" to StorageContext to load vector index from Tablestore
vector_index = load_index_from_storage(
    insert_batch_size=20,
    embed_model=embedder,
    storage_context=storage_context,
    index_id=vector_id,
)
from llama_index.core import load_index_from_storage # re-create storage context storage_context = StorageContext.from_defaults( docstore=docstore, index_store=index_store, vector_store=vector_store ) summary_index = load_index_from_storage( storage_context=storage_context, index_id=list_id, ) keyword_table_index = load_index_from_storage( llm=dashscope_llm, storage_context=storage_context, index_id=keyword_id, ) # You need to add "vector_store=xxx" to StorageContext to load vector index from Tablestore vector_index = load_index_from_storage( insert_batch_size=20, embed_model=embedder, storage_context=storage_context, index_id=vector_id, )
测试一些查询¶
In [ ]
已复制!
# Global query-time settings: DashScope LLM and a chunk size of 1024.
Settings.llm = dashscope_llm
Settings.chunk_size = 1024
Settings.llm = dashscope_llm Settings.chunk_size = 1024
In [ ]
已复制!
# Ask the summary index for an overall summary of the document.
query_engine = summary_index.as_query_engine()
list_response = query_engine.query("What is a summary of this document?")
query_engine = summary_index.as_query_engine() list_response = query_engine.query("What is a summary of this document?")
In [ ]
已复制!
# Pretty-print the summary-index response in the notebook.
display_response(list_response)
display_response(list_response)
In [ ]
已复制!
# Query the vector index about the essay's content.
query_engine = vector_index.as_query_engine()
vector_response = query_engine.query("What did the author do growing up?")
query_engine = vector_index.as_query_engine() vector_response = query_engine.query("What did the author do growing up?")
In [ ]
已复制!
# Pretty-print the vector-index response in the notebook.
display_response(vector_response)
display_response(vector_response)
最终回复:
作者在成长过程中,除了学校的学习之外,还涉足写作和编程。最初,他写了一些短篇小说,现在他认为这些小说写得不太好,因为情节不多,更多地侧重于人物的情感。在编程方面,作者在初中时就开始接触 IBM 1401 计算机,尝试使用打孔卡用 Fortran 编写基础程序。后来,在获得 TRS-80 微型计算机后,作者深入学习编程,创作了简单的游戏、一个预测模型火箭飞行高度的程序,甚至还有一个他的父亲用来写作的文字处理器。
In [ ]
已复制!
# Query the keyword-table index.
query_engine = keyword_table_index.as_query_engine()
keyword_response = query_engine.query(
    "What did the author do after his time at YC?"
)
query_engine = keyword_table_index.as_query_engine() keyword_response = query_engine.query( "What did the author do after his time at YC?" )
In [ ]
已复制!
# Pretty-print the keyword-table-index response in the notebook.
display_response(keyword_response)
display_response(keyword_response)
最终回复:
在 YC 工作结束后,作者决定开始画画,并全身心投入其中,想看看自己能变得多好。他将 2014 年的大部分时间都花在了这件事上。然而,到 11 月份,他失去了兴趣并停止了。在此之后,他重新开始写文章,甚至涉足创业以外的主题。2015 年 3 月,他还再次开始研究 Lisp。