Tablestore 演示¶
本指南将向您展示如何直接使用由 Tablestore 支持的 DocumentStore
抽象。通过将节点放入文档存储中,这允许您在同一个底层文档存储上定义多个索引,而不是在索引之间复制数据。
In [ ]
已复制!
# Install the Tablestore docstore/index-store/vector-store integrations,
# the DashScope LLM/embedding integrations, llama-index itself, and matplotlib.
%pip install llama-index-storage-docstore-tablestore
%pip install llama-index-storage-index-store-tablestore
%pip install llama-index-vector-stores-tablestore
%pip install llama-index-llms-dashscope
%pip install llama-index-embeddings-dashscope
%pip install llama-index
%pip install matplotlib
%pip install llama-index-storage-docstore-tablestore %pip install llama-index-storage-index-store-tablestore %pip install llama-index-vector-stores-tablestore %pip install llama-index-llms-dashscope %pip install llama-index-embeddings-dashscope %pip install llama-index %pip install matplotlib
In [ ]
已复制!
# Jupyter already runs an asyncio event loop; nest_asyncio allows
# llama-index's async calls to run inside it.
import nest_asyncio
nest_asyncio.apply()
import nest_asyncio nest_asyncio.apply()
In [ ]
已复制!
# Route INFO-level log records to stdout so progress is visible in the notebook.
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
import logging import sys logging.basicConfig(stream=sys.stdout, level=logging.INFO) logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
In [ ]
已复制!
from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex, SimpleKeywordTableIndex
from llama_index.core import SummaryIndex
from llama_index.core.response.notebook_utils import display_response
from llama_index.core import Settings
from llama_index.core import SimpleDirectoryReader, StorageContext from llama_index.core import VectorStoreIndex, SimpleKeywordTableIndex from llama_index.core import SummaryIndex from llama_index.core.response.notebook_utils import display_response from llama_index.core import Settings
配置 Tablestore¶
接下来,我们将使用 Tablestore 的文档存储进行演示。
In [ ]
已复制!
# Prompt interactively for the Tablestore connection settings; getpass hides
# the typed values.  They are read back below via os.getenv().
import getpass
import os
os.environ["tablestore_end_point"] = getpass.getpass("tablestore end_point:")
os.environ["tablestore_instance_name"] = getpass.getpass(
    "tablestore instance_name:"
)
os.environ["tablestore_access_key_id"] = getpass.getpass(
    "tablestore access_key_id:"
)
os.environ["tablestore_access_key_secret"] = getpass.getpass(
    "tablestore access_key_secret:"
)
import getpass import os os.environ["tablestore_end_point"] = getpass.getpass("tablestore end_point:") os.environ["tablestore_instance_name"] = getpass.getpass( "tablestore instance_name:" ) os.environ["tablestore_access_key_id"] = getpass.getpass( "tablestore access_key_id:" ) os.environ["tablestore_access_key_secret"] = getpass.getpass( "tablestore access_key_secret:" )
配置 DashScope LLM¶
接下来,我们将使用 DashScope 的 LLM 进行演示。
In [ ]
已复制!
# Prompt for the DashScope API key and expose it through the environment.
import os
import getpass
os.environ["DASHSCOPE_API_KEY"] = getpass.getpass("DashScope api key:")
import os import getpass os.environ["DASHSCOPE_API_KEY"] = getpass.getpass("DashScope api key:")
下载数据¶
In [ ]
已复制!
# Download the Paul Graham essay sample dataset used throughout the docs.
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
!mkdir -p 'data/paul_graham/' !wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
加载文档¶
In [ ]
已复制!
# Load the downloaded essay into llama-index Document objects.
reader = SimpleDirectoryReader("./data/paul_graham/")
documents = reader.load_data()
reader = SimpleDirectoryReader("./data/paul_graham/") documents = reader.load_data()
解析为节点¶
In [ ]
已复制!
# Split the documents into sentence-based chunks (nodes); the same nodes
# are shared by every index built later in this notebook.
from llama_index.core.node_parser import SentenceSplitter
nodes = SentenceSplitter().get_nodes_from_documents(documents)
from llama_index.core.node_parser import SentenceSplitter nodes = SentenceSplitter().get_nodes_from_documents(documents)
初始化存储/嵌入/LLM/存储上下文¶
In [ ]
已复制!
from llama_index.storage.docstore.tablestore import TablestoreDocumentStore
from llama_index.storage.index_store.tablestore import TablestoreIndexStore
from llama_index.vector_stores.tablestore import TablestoreVectorStore
from llama_index.embeddings.dashscope import (
    DashScopeEmbedding,
    DashScopeTextEmbeddingModels,
    DashScopeTextEmbeddingType,
)
from llama_index.llms.dashscope import DashScope, DashScopeGenerationModels
# DashScope text-embedding-v3 embedder; its output size must match the
# vector_dimension passed to TablestoreVectorStore below.
embedder = DashScopeEmbedding(
    model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V3,  # default dimension is 1024
    text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT,
)
# Qwen-Max via DashScope, used as the LLM for all indexes/query engines.
dashscope_llm = DashScope(
    model_name=DashScopeGenerationModels.QWEN_MAX,
    api_key=os.environ["DASHSCOPE_API_KEY"],
)
Settings.llm = dashscope_llm
# All three stores connect to the same Tablestore instance using the
# credentials collected via getpass earlier in this notebook.
docstore = TablestoreDocumentStore.from_config(
    endpoint=os.getenv("tablestore_end_point"),
    instance_name=os.getenv("tablestore_instance_name"),
    access_key_id=os.getenv("tablestore_access_key_id"),
    access_key_secret=os.getenv("tablestore_access_key_secret"),
)
index_store = TablestoreIndexStore.from_config(
    endpoint=os.getenv("tablestore_end_point"),
    instance_name=os.getenv("tablestore_instance_name"),
    access_key_id=os.getenv("tablestore_access_key_id"),
    access_key_secret=os.getenv("tablestore_access_key_secret"),
)
vector_store = TablestoreVectorStore(
    endpoint=os.getenv("tablestore_end_point"),
    instance_name=os.getenv("tablestore_instance_name"),
    access_key_id=os.getenv("tablestore_access_key_id"),
    access_key_secret=os.getenv("tablestore_access_key_secret"),
    vector_dimension=1024,  # embedder dimension is 1024
)
# Create the backing table and search index if this is the first run.
vector_store.create_table_if_not_exist()
vector_store.create_search_index_if_not_exist()
# One StorageContext wired to all three Tablestore-backed stores.
storage_context = StorageContext.from_defaults(
    docstore=docstore, index_store=index_store, vector_store=vector_store
)
from llama_index.storage.docstore.tablestore import TablestoreDocumentStore from llama_index.storage.index_store.tablestore import TablestoreIndexStore from llama_index.vector_stores.tablestore import TablestoreVectorStore from llama_index.embeddings.dashscope import ( DashScopeEmbedding, DashScopeTextEmbeddingModels, DashScopeTextEmbeddingType, ) from llama_index.llms.dashscope import DashScope, DashScopeGenerationModels embedder = DashScopeEmbedding( model_name=DashScopeTextEmbeddingModels.TEXT_EMBEDDING_V3, # default dimension is 1024 text_type=DashScopeTextEmbeddingType.TEXT_TYPE_DOCUMENT, ) dashscope_llm = DashScope( model_name=DashScopeGenerationModels.QWEN_MAX, api_key=os.environ["DASHSCOPE_API_KEY"], ) Settings.llm = dashscope_llm docstore = TablestoreDocumentStore.from_config( endpoint=os.getenv("tablestore_end_point"), instance_name=os.getenv("tablestore_instance_name"), access_key_id=os.getenv("tablestore_access_key_id"), access_key_secret=os.getenv("tablestore_access_key_secret"), ) index_store = TablestoreIndexStore.from_config( endpoint=os.getenv("tablestore_end_point"), instance_name=os.getenv("tablestore_instance_name"), access_key_id=os.getenv("tablestore_access_key_id"), access_key_secret=os.getenv("tablestore_access_key_secret"), ) vector_store = TablestoreVectorStore( endpoint=os.getenv("tablestore_end_point"), instance_name=os.getenv("tablestore_instance_name"), access_key_id=os.getenv("tablestore_access_key_id"), access_key_secret=os.getenv("tablestore_access_key_secret"), vector_dimension=1024, # embedder dimension is 1024 ) vector_store.create_table_if_not_exist() vector_store.create_search_index_if_not_exist() storage_context = StorageContext.from_defaults( docstore=docstore, index_store=index_store, vector_store=vector_store )
添加到文档存储¶
In [ ]
已复制!
# Register the parsed nodes in the Tablestore-backed docstore so the
# indexes defined below can all reference the same underlying nodes.
storage_context.docstore.add_documents(nodes)
storage_context.docstore.add_documents(nodes)
定义并添加多个索引¶
每个索引都使用相同的底层节点。
In [ ]
已复制!
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/list.html
# Build a summary (list) index over the shared nodes.
summary_index = SummaryIndex(nodes, storage_context=storage_context)
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/list.html summary_index = SummaryIndex(nodes, storage_context=storage_context)
In [ ]
已复制!
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/vector_store.html
# Build a vector index over the same nodes, embedding them with the
# DashScope embedder in batches of 20.
vector_index = VectorStoreIndex(
    nodes,
    insert_batch_size=20,
    embed_model=embedder,
    storage_context=storage_context,
)
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/vector_store.html vector_index = VectorStoreIndex( nodes, insert_batch_size=20, embed_model=embedder, storage_context=storage_context, )
In [ ]
已复制!
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/table.html
# Build a keyword-table index over the same nodes, using the DashScope LLM.
keyword_table_index = SimpleKeywordTableIndex(
    nodes=nodes,
    storage_context=storage_context,
    llm=dashscope_llm,
)
# https://gpt-index.readthedocs.io/en/stable/api_reference/indices/table.html keyword_table_index = SimpleKeywordTableIndex( nodes=nodes, storage_context=storage_context, llm=dashscope_llm, )
In [ ]
已复制!
# NOTE: the docstore still has the same nodes — building three indexes
# did not duplicate the stored data.
len(storage_context.docstore.docs)
# NOTE: the docstore still has the same nodes len(storage_context.docstore.docs)
Out [ ]
44
测试保存和加载¶
In [ ]
已复制!
# NOTE: the docstore and index_store are persisted in Tablestore by default;
# persist() only needs to write the remaining simple (local) stores to disk.
storage_context.persist()
# NOTE: docstore and index_store is persisted in Tablestore by default # NOTE: here only need to persist simple vector store to disk storage_context.persist()
In [ ]
已复制!
# Note down the index IDs so each index can be reloaded from storage later.
list_id = summary_index.index_id
vector_id = vector_index.index_id
keyword_id = keyword_table_index.index_id
print(list_id, vector_id, keyword_id)
# note down index IDs list_id = summary_index.index_id vector_id = vector_index.index_id keyword_id = keyword_table_index.index_id print(list_id, vector_id, keyword_id)
c05fec2a-ac87-4761-beeb-0901f9e6530e d0b021ed-3427-46ad-927d-12d72752dbc4 2e9bfc3a-5e69-408a-9430-7b0c8baf3d77
In [ ]
已复制!
from llama_index.core import load_index_from_storage
# Re-create the storage context from the same Tablestore-backed stores;
# the indexes can then be reloaded by ID alone.
storage_context = StorageContext.from_defaults(
    docstore=docstore, index_store=index_store, vector_store=vector_store
)
summary_index = load_index_from_storage(
    storage_context=storage_context,
    index_id=list_id,
)
keyword_table_index = load_index_from_storage(
    llm=dashscope_llm,
    storage_context=storage_context,
    index_id=keyword_id,
)
# You need to add "vector_store=xxx" to StorageContext to load vector index from Tablestore
vector_index = load_index_from_storage(
    insert_batch_size=20,
    embed_model=embedder,
    storage_context=storage_context,
    index_id=vector_id,
)
from llama_index.core import load_index_from_storage # re-create storage context storage_context = StorageContext.from_defaults( docstore=docstore, index_store=index_store, vector_store=vector_store ) summary_index = load_index_from_storage( storage_context=storage_context, index_id=list_id, ) keyword_table_index = load_index_from_storage( llm=dashscope_llm, storage_context=storage_context, index_id=keyword_id, ) # You need to add "vector_store=xxx" to StorageContext to load vector index from Tablestore vector_index = load_index_from_storage( insert_batch_size=20, embed_model=embedder, storage_context=storage_context, index_id=vector_id, )
测试一些查询¶
In [ ]
已复制!
# Global query-time settings: DashScope LLM and a chunk size of 1024.
Settings.llm = dashscope_llm
Settings.chunk_size = 1024
Settings.llm = dashscope_llm Settings.chunk_size = 1024
In [ ]
已复制!
# Ask the summary index for an overall summary of the document.
query_engine = summary_index.as_query_engine()
list_response = query_engine.query("What is a summary of this document?")
query_engine = summary_index.as_query_engine() list_response = query_engine.query("What is a summary of this document?")
In [ ]
已复制!
# Pretty-print the summary-index response in the notebook.
display_response(list_response)
display_response(list_response)
In [ ]
已复制!
# Query the vector index about the essay's content.
query_engine = vector_index.as_query_engine()
vector_response = query_engine.query("What did the author do growing up?")
query_engine = vector_index.as_query_engine() vector_response = query_engine.query("What did the author do growing up?")
In [ ]
已复制!
# Pretty-print the vector-index response in the notebook.
display_response(vector_response)
display_response(vector_response)
最终回复:
作者在成长过程中,除了学校的学习之外,还涉足写作和编程。最初,他写了一些短篇小说,现在他认为这些小说写得不太好,因为情节不多,更多地侧重于人物的情感。在编程方面,作者在初中时就开始接触 IBM 1401 计算机,尝试使用打孔卡用 Fortran 编写基础程序。后来,在获得 TRS-80 微型计算机后,作者深入学习编程,创作了简单的游戏、一个预测模型火箭飞行高度的程序,甚至还有一个他的父亲用来写作的文字处理器。
In [ ]
已复制!
# Query the keyword-table index.
query_engine = keyword_table_index.as_query_engine()
keyword_response = query_engine.query(
    "What did the author do after his time at YC?"
)
query_engine = keyword_table_index.as_query_engine() keyword_response = query_engine.query( "What did the author do after his time at YC?" )
In [ ]
已复制!
# Pretty-print the keyword-table-index response in the notebook.
display_response(keyword_response)
display_response(keyword_response)
最终回复:
在 YC 工作结束后,作者决定开始画画,并全身心投入其中,想看看自己能变得多好。他将 2014 年的大部分时间都花在了这件事上。然而,到 11 月份,他失去了兴趣并停止了。在此之后,他重新开始写文章,甚至涉足创业以外的主题。2015 年 3 月,他还再次开始研究 Lisp。