创建种子数据¶
输入 [ ]
已复制!
# Install the optional docstore backends (Redis, MongoDB) and the
# HuggingFace embedding integration used by this notebook.
%pip install llama-index-storage-docstore-redis
%pip install llama-index-storage-docstore-mongodb
%pip install llama-index-embeddings-huggingface
%pip install llama-index-storage-docstore-redis %pip install llama-index-storage-docstore-mongodb %pip install llama-index-embeddings-huggingface
输入 [ ]
已复制!
# Make some test data: two small text files under ./data that the
# ingestion pipeline below will read.
!mkdir -p data
!echo "This is a test file: one!" > data/test1.txt
!echo "This is a test file: two!" > data/test2.txt
# 创建一些测试数据 !mkdir -p data !echo "This is a test file: one!" > data/test1.txt !echo "This is a test file: two!" > data/test2.txt
输入 [ ]
已复制!
from llama_index.core import SimpleDirectoryReader

# load documents with deterministic IDs — filename_as_id=True makes each
# document's ID derive from its file path, so re-loading the same files
# yields the same IDs (needed for docstore-based upsert/dedup below).
documents = SimpleDirectoryReader("./data", filename_as_id=True).load_data()
from llama_index.core import SimpleDirectoryReader # 加载带有确定性 ID 的文档 documents = SimpleDirectoryReader("./data", filename_as_id=True).load_data()
/home/loganm/.cache/pypoetry/virtualenvs/llama-index-4a-wkI5X-py3.11/lib/python3.11/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.8.9) is available. It's recommended that you update to the latest version using `pip install -U deeplake`. warnings.warn(
使用文档存储创建管道¶
输入 [ ]
已复制!
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.storage.docstore import SimpleDocumentStore
# NOTE(review): RedisDocumentStore / MongoDocumentStore are imported but not
# used in this cell — presumably shown as drop-in alternatives to
# SimpleDocumentStore (matching the pip installs above).
from llama_index.storage.docstore.redis import RedisDocumentStore
from llama_index.storage.docstore.mongodb import MongoDocumentStore
from llama_index.core.node_parser import SentenceSplitter

# Build an ingestion pipeline that splits documents into sentence chunks,
# embeds them, and attaches a docstore so repeated runs can detect
# already-ingested documents (see the "duplicates_only" message below).
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(),
        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    ],
    docstore=SimpleDocumentStore(),
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding from llama_index.core.ingestion import IngestionPipeline from llama_index.core.storage.docstore import SimpleDocumentStore from llama_index.storage.docstore.redis import RedisDocumentStore from llama_index.storage.docstore.mongodb import MongoDocumentStore from llama_index.core.node_parser import SentenceSplitter pipeline = IngestionPipeline( transformations=[ SentenceSplitter(), HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"), ], docstore=SimpleDocumentStore(), )
输入 [ ]
已复制!
# Run the pipeline over the loaded documents; returns the transformed nodes.
nodes = pipeline.run(documents=documents)
nodes = pipeline.run(documents=documents)
Docstore strategy set to upserts, but no vector store. Switching to duplicates_only strategy.
输入 [ ]
已复制!
# Report how many nodes the run produced (2 — one per input file).
print(f"Ingested {len(nodes)} Nodes")
print(f"已摄取 {len(nodes)} 个节点")
Ingested 2 Nodes
输入 [ ]
已复制!
# Save the pipeline's state (including its docstore) to disk so it can be
# restored in a later session.
pipeline.persist("./pipeline_storage")
pipeline.persist("./pipeline_storage")
输入 [ ]
已复制!
# Recreate the pipeline with the same transformations (no docstore passed
# here — it is restored from disk by load() below).
pipeline = IngestionPipeline(
    transformations=[
        SentenceSplitter(),
        HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"),
    ]
)
# restore the pipeline state persisted earlier
pipeline.load("./pipeline_storage")
pipeline = IngestionPipeline( transformations=[ SentenceSplitter(), HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5"), ] ) # 恢复管道 pipeline.load("./pipeline_storage")
输入 [ ]
已复制!
# Add a brand-new file and overwrite an existing one, so the next run has
# one new document and one changed document (test2.txt is untouched).
!echo "This is a test file: three!" > data/test3.txt
!echo "This is a NEW test file: one!" > data/test1.txt
!echo "This is a test file: three!" > data/test3.txt !echo "This is a NEW test file: one!" > data/test1.txt
输入 [ ]
已复制!
# Re-load all three files with the same deterministic IDs as before.
documents = SimpleDirectoryReader("./data", filename_as_id=True).load_data()
documents = SimpleDirectoryReader("./data", filename_as_id=True).load_data()
输入 [ ]
已复制!
# Re-run: the restored docstore lets the pipeline skip unchanged documents,
# so only the new and modified files should be processed.
nodes = pipeline.run(documents=documents)
nodes = pipeline.run(documents=documents)
Docstore strategy set to upserts, but no vector store. Switching to duplicates_only strategy.
输入 [ ]
已复制!
# Only 2 nodes this time — the unchanged test2.txt was skipped.
print(f"Ingested {len(nodes)} Nodes")
print(f"已摄取 {len(nodes)} 个节点")
Ingested 2 Nodes
让我们确认哪些节点被摄取了
输入 [ ]
已复制!
# Show exactly which nodes were (re)ingested on the second run.
for node in nodes:
    print(f"Node: {node.text}")
for node in nodes: print(f"节点: {node.text}")
Node: This is a NEW test file: one! Node: This is a test file: three!
我们还可以验证文档存储只跟踪了三个文档
输入 [ ]
已复制!
# Verify the docstore tracks exactly three documents (one per input file),
# i.e. the overwritten file was upserted rather than duplicated.
print(len(pipeline.docstore.docs))
print(len(pipeline.docstore.docs))
3