如果你在 colab 上打开此 Notebook,你可能需要安装 LlamaIndex 🦙。
In [ ]
已复制!
%pip install llama-index-vector-stores-docarray
%pip install llama-index-vector-stores-docarray
In [ ]
已复制!
!pip install llama-index
!pip install llama-index
In [ ]
已复制!
import os
import sys
import logging
import textwrap
import warnings
warnings.filterwarnings("ignore")
# stop huggingface warnings
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Uncomment to see debug logs
# logging.basicConfig(stream=sys.stdout, level=logging.INFO)
# logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))
from llama_index.core import (
GPTVectorStoreIndex,
SimpleDirectoryReader,
Document,
)
from llama_index.vector_stores.docarray import DocArrayInMemoryVectorStore
from IPython.display import Markdown, display
import os import sys import logging import textwrap import warnings warnings.filterwarnings("ignore") # 停止 huggingface 警告 os.environ["TOKENIZERS_PARALLELISM"] = "false" # 取消注释以查看调试日志 # logging.basicConfig(stream=sys.stdout, level=logging.INFO) # logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout)) from llama_index.core import ( GPTVectorStoreIndex, SimpleDirectoryReader, Document, ) from llama_index.vector_stores.docarray import DocArrayInMemoryVectorStore from IPython.display import Markdown, display
In [ ]
已复制!
import os
os.environ["OPENAI_API_KEY"] = "<your openai key>"
import os os.environ["OPENAI_API_KEY"] = """
下载数据
In [ ]
已复制!
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
!mkdir -p 'data/paul_graham/' !wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'
In [ ]
已复制!
# load documents
documents = SimpleDirectoryReader("./data/paul_graham/").load_data()
print(
"Document ID:",
documents[0].doc_id,
"Document Hash:",
documents[0].doc_hash,
)
# 加载文档 documents = SimpleDirectoryReader("./data/paul_graham/").load_data() print( "文档 ID:", documents[0].doc_id, "文档哈希:", documents[0].doc_hash, )
Document ID: 1c21062a-50a3-4133-a0b1-75f837a953e5 Document Hash: 77ae91ab542f3abb308c4d7c77c9bc4c9ad0ccd63144802b7cbe7e1bb3a4094e
初始化和索引¶
In [ ]
已复制!
from llama_index.core import StorageContext
vector_store = DocArrayInMemoryVectorStore()
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = GPTVectorStoreIndex.from_documents(
documents, storage_context=storage_context
)
from llama_index.core import StorageContext vector_store = DocArrayInMemoryVectorStore() storage_context = StorageContext.from_defaults(vector_store=vector_store) index = GPTVectorStoreIndex.from_documents( documents, storage_context=storage_context )
查询¶
In [ ]
已复制!
# set Logging to DEBUG for more detailed outputs
query_engine = index.as_query_engine()
response = query_engine.query("What did the author do growing up?")
print(textwrap.fill(str(response), 100))
# 将日志设置为 DEBUG 以获取更详细的输出 query_engine = index.as_query_engine() response = query_engine.query("作者成长过程中做了什么?") print(textwrap.fill(str(response), 100))
Token indices sequence length is longer than the specified maximum sequence length for this model (1830 > 1024). Running this sequence through the model will result in indexing errors
Growing up, the author wrote short stories, programmed on an IBM 1401, and nagged his father to buy him a TRS-80 microcomputer. He wrote simple games, a program to predict how high his model rockets would fly, and a word processor. He also studied philosophy in college, but switched to AI after becoming bored with it. He then took art classes at Harvard and applied to art schools, eventually attending RISD.
In [ ]
已复制!
response = query_engine.query("What was a hard moment for the author?")
print(textwrap.fill(str(response), 100))
response = query_engine.query("作者遇到的一个困难时刻是什么?") print(textwrap.fill(str(response), 100))
A hard moment for the author was when he realized that the AI programs of the time were a hoax and that there was an unbridgeable gap between what they could do and actually understanding natural language. He had invested a lot of time and energy into learning about AI and was disappointed to find out that it was not going to get him the results he had hoped for.
带过滤器的查询¶
In [ ]
已复制!
from llama_index.core.schema import TextNode
nodes = [
TextNode(
text="The Shawshank Redemption",
metadata={
"author": "Stephen King",
"theme": "Friendship",
},
),
TextNode(
text="The Godfather",
metadata={
"director": "Francis Ford Coppola",
"theme": "Mafia",
},
),
TextNode(
text="Inception",
metadata={
"director": "Christopher Nolan",
},
),
]
from llama_index.core.schema import TextNode nodes = [ TextNode( text="肖申克的救赎", metadata={ "作者": "斯蒂芬·金", "主题": "友谊", }, ), TextNode( text="教父", metadata={ "导演": "弗朗西斯·福特·科波拉", "主题": "黑手党", }, ), TextNode( text="盗梦空间", metadata={ "导演": "克里斯托弗·诺兰", }, ), ]
In [ ]
已复制!
from llama_index.core import StorageContext
vector_store = DocArrayInMemoryVectorStore()
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = GPTVectorStoreIndex(nodes, storage_context=storage_context)
from llama_index.core import StorageContext vector_store = DocArrayInMemoryVectorStore() storage_context = StorageContext.from_defaults(vector_store=vector_store) index = GPTVectorStoreIndex(nodes, storage_context=storage_context)
In [ ]
已复制!
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
filters = MetadataFilters(
filters=[ExactMatchFilter(key="theme", value="Mafia")]
)
retriever = index.as_retriever(filters=filters)
retriever.retrieve("What is inception about?")
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters filters = MetadataFilters( filters=[ExactMatchFilter(key="theme", value="黑手党")] ) retriever = index.as_retriever(filters=filters) retriever.retrieve("《盗梦空间》是关于什么的?")
Out[ ]
[NodeWithScore(node=Node(text='director: Francis Ford Coppola\ntheme: Mafia\n\nThe Godfather', doc_id='41c99963-b200-4ce6-a9c4-d06ffeabdbc5', embedding=None, doc_hash='b770e43e6a94854a22dc01421d3d9ef6a94931c2b8dbbadf4fdb6eb6fbe41010', extra_info=None, node_info=None, relationships={<DocumentRelationship.SOURCE: '1'>: 'None'}), score=0.7681788983417586)]