已复制!
%pip install llama-index-llms-openai %pip install llama-index-extractors-marvin
# !pip install marvin
%pip install llama-index-llms-openai
%pip install llama-index-extractors-marvin
from llama_index.core import SimpleDirectoryReader from llama_index.llms.openai import OpenAI from llama_index.core.node_parser import TokenTextSplitter from llama_index.extractors.marvin import MarvinMetadataExtractor
%pip install llama-index-llms-openai %pip install llama-index-extractors-marvin
# !pip install marvin
# !pip install marvin
import nest_asyncio nest_asyncio.apply()
%pip install llama-index-llms-openai %pip install llama-index-extractors-marvin
# !pip install marvin
from llama_index.core import SimpleDirectoryReader
from llama_index.llms.openai import OpenAI
from llama_index.core.node_parser import TokenTextSplitter
from llama_index.extractors.marvin import MarvinMetadataExtractor
import os import openai os.environ["OPENAI_API_KEY"] = "sk-..."
%pip install llama-index-llms-openai %pip install llama-index-extractors-marvin
# !pip install marvin
import nest_asyncio
nest_asyncio.apply()
documents = SimpleDirectoryReader("data").load_data()
# Limit the document text length: keep only the first 10,000 characters of
# the first document.
documents[0].text = documents[0].text[:10000]
%pip install llama-index-llms-openai %pip install llama-index-extractors-marvin
# !pip install marvin
import os
import openai
os.environ["OPENAI_API_KEY"] = "sk-..."
import marvin from pydantic import BaseModel, Field marvin.settings.openai.api_key = os.environ["OPENAI_API_KEY"] marvin.settings.openai.chat.completions.model = "gpt-4o" class SportsSupplement(BaseModel): name: str = Field(..., description="The name of the sports supplement") description: str = Field( ..., description="A description of the sports supplement" ) pros_cons: str = Field( ..., description="The pros and cons of the sports supplement" )
%pip install llama-index-llms-openai %pip install llama-index-extractors-marvin
# !pip install marvin
# Load the documents that SimpleDirectoryReader finds in the "data" directory.
documents = SimpleDirectoryReader("data").load_data()
# Keep only the first 10,000 characters of the first document.
first_document = documents[0]
first_document.text = first_document.text[:10000]
# Build a text splitter that breaks the text into chunks for processing.
# This takes a while to run; a larger chunk_size should shorten it (fewer
# chunks to process), and file size is of course a factor too.
# node_parser = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)
# Create the metadata extractor.
# metadata_extractor = MarvinMetadataExtractor(marvin_model=SportsSupplement)
# Extract the custom entities for each node.
# Use node_parser to get the nodes from the documents.
# from llama_index.core.ingestion import IngestionPipeline
# pipeline = IngestionPipeline(transformations=[node_parser, metadata_extractor])
# nodes = pipeline.run(documents=documents, show_progress=True)
%pip install llama-index-llms-openai %pip install llama-index-extractors-marvin
# !pip install marvin
import marvin
from pydantic import BaseModel, Field
marvin.settings.openai.api_key = os.environ["OPENAI_API_KEY"]
marvin.settings.openai.chat.completions.model = "gpt-4o"
# Schema handed to MarvinMetadataExtractor as `marvin_model`. All three
# fields are required strings (`...` marks a field as required); each
# `description` is part of the field definition, so the wording is kept
# exactly as is.
class SportsSupplement(BaseModel):
    # Name of the supplement.
    name: str = Field(..., description="The name of the sports supplement")
    # Free-text description of the supplement.
    description: str = Field(
        ..., description="A description of the sports supplement"
    )
    # Pros and cons, combined in a single string.
    pros_cons: str = Field(
        ..., description="The pros and cons of the sports supplement"
    )
from pprint import pprint

# Pretty-print the metadata of the first five nodes. Slicing instead of
# indexing with range(5) avoids an IndexError when fewer than five nodes
# were produced.
for node in nodes[:5]:
    pprint(node.metadata)
%pip install llama-index-llms-openai %pip install llama-index-extractors-marvin
# !pip install marvin
from llama_index.core.ingestion import IngestionPipeline

# Text splitter that cuts the documents into chunks for processing, using a
# space as the separator (chunk_size=512, chunk_overlap=128).
node_parser = TokenTextSplitter(separator=" ", chunk_size=512, chunk_overlap=128)

# Metadata extractor driven by the SportsSupplement model.
metadata_extractor = MarvinMetadataExtractor(marvin_model=SportsSupplement)

# Run the splitter and then the extractor over the documents, so the custom
# entities are extracted for each node.
# This takes a while to run; a larger chunk_size should shorten it (fewer
# chunks to extract from), and file size is a factor too.
pipeline = IngestionPipeline(
    transformations=[node_parser, metadata_extractor],
)
nodes = pipeline.run(documents=documents, show_progress=True)
回到顶部
Parsing nodes: 100%|██████████| 1/1 [00:00<00:00, 41.49it/s] Extracting marvin metadata: 100%|██████████| 9/9 [00:22<00:00, 2.46s/it]
{'creation_date': '2024-08-07', 'file_name': 'Sports Supplements.csv', 'file_path': '/data001/home/dongwoo.jeong/llama_index/docs/docs/examples/metadata_extraction/data/Sports ' 'Supplements.csv', 'file_size': 62403, 'file_type': 'text/csv', 'last_modified_date': '2024-08-07', 'marvin_metadata': {'description': 'L-arginine alpha-ketoglutarate is a ' 'supplement often used to improve peak ' 'power output and strength–power during ' 'weight training. A 2006 study by Campbell ' 'et al. found that AAKG supplementation ' 'improved maximum effort 1-repetition ' 'bench press and Wingate peak power ' 'performance.', 'name': 'AAKG', 'pros_cons': 'Pros: Improves peak power output and ' 'strength–power. Cons: No significant effect ' 'on body composition, aerobic capacity, or ' 'muscle endurance.'}} {'creation_date': '2024-08-07', 'file_name': 'Sports Supplements.csv', 'file_path': '/data001/home/dongwoo.jeong/llama_index/docs/docs/examples/metadata_extraction/data/Sports ' 'Supplements.csv', 'file_size': 62403, 'file_type': 'text/csv', 'last_modified_date': '2024-08-07', 'marvin_metadata': {'description': 'Baking soda, also known as bicarbonate of ' 'soda or sodium bicarbonate (NaHCO3), is ' 'used to enhance high-intensity ' 'performance in anaerobic activities such ' 'as rowing, cycling, swimming, and ' 'running. It works by making the blood ' 'more alkaline, which can improve ' 'performance in lactic-acid-fueled events ' 'like the 800m sprint.', 'name': 'Baking soda', 'pros_cons': 'Pros: Improves performance in ' 'high-intensity, anaerobic activities. 
Cons: ' 'Can cause a badly upset stomach.'}} {'creation_date': '2024-08-07', 'file_name': 'Sports Supplements.csv', 'file_path': '/data001/home/dongwoo.jeong/llama_index/docs/docs/examples/metadata_extraction/data/Sports ' 'Supplements.csv', 'file_size': 62403, 'file_type': 'text/csv', 'last_modified_date': '2024-08-07', 'marvin_metadata': {'description': 'Branched-chain amino acids (BCAAs) are ' 'essential nutrients that the body obtains ' 'from proteins found in food, especially ' 'meat, dairy products, and legumes. They ' 'include leucine, isoleucine, and valine.', 'name': 'BCAAs', 'pros_cons': 'Pros: May help with fatigue resistance, ' 'aerobic endurance, and performance in ' 'activities like cycling and circuit ' 'training. Cons: Limited evidence on ' 'long-term benefits and potential side ' 'effects.'}} {'creation_date': '2024-08-07', 'file_name': 'Sports Supplements.csv', 'file_path': '/data001/home/dongwoo.jeong/llama_index/docs/docs/examples/metadata_extraction/data/Sports ' 'Supplements.csv', 'file_size': 62403, 'file_type': 'text/csv', 'last_modified_date': '2024-08-07', 'marvin_metadata': {'description': 'Branched-chain amino acids (BCAAs) are ' 'essential nutrients that the body obtains ' 'from proteins found in food, especially ' 'meat, dairy products, and legumes. They ' 'include leucine, isoleucine, and valine. ' 'BCAAs are commonly used to improve ' 'exercise performance and reduce protein ' 'and muscle breakdown during intense ' 'exercise.', 'name': 'BCAAs', 'pros_cons': 'Pros: \n' '1. May improve aerobic performance, ' 'endurance, power, and strength.\n' '2. Can enhance immune defenses in athletes ' 'and general fitness.\n' '3. Useful for various types of exercise ' 'including cycling and running.\n' '\n' 'Cons: \n' '1. Limited evidence on long-term benefits.\n' '2. Potential for overconsumption leading to ' 'imbalances.\n' '3. 
Some studies show no significant ' 'benefit.'}} {'creation_date': '2024-08-07', 'file_name': 'Sports Supplements.csv', 'file_path': '/data001/home/dongwoo.jeong/llama_index/docs/docs/examples/metadata_extraction/data/Sports ' 'Supplements.csv', 'file_size': 62403, 'file_type': 'text/csv', 'last_modified_date': '2024-08-07', 'marvin_metadata': {'description': 'Branched-chain amino acids (BCAAs) are ' 'essential nutrients that the body obtains ' 'from proteins found in food, especially ' 'meat, dairy products, and legumes. They ' 'include leucine, isoleucine, and valine.', 'name': 'BCAAs', 'pros_cons': 'Pros: May support immune defenses in ' 'athletes, aid in general fitness, assist in ' 'running, swimming, and rowing, and help ' 'with body composition, fat burning, muscle ' 'building, muscle damage, soreness, ' 'recovery, and injury prevention. Cons: ' 'Effectiveness can vary based on individual ' 'response and specific use case.'}}