比较用于构建知识图谱的 LLM 路径提取器¶

在本 Notebook 中，我们将比较 llama_index 提供的三种不同的 LLM 路径提取器：

SimpleLLMPathExtractor
SchemaLLMPathExtractor
DynamicLLMPathExtractor（新）

我们将使用维基百科页面作为测试数据，并使用 Pyvis 可视化生成的知识图谱。

设置和导入¶

In [ ]

已复制！

!pip install llama_index pyvis wikipedia
!pip install llama_index pyvis wikipedia

In [ ]

已复制！





from llama_index.core import Document, PropertyGraphIndex
from llama_index.core.indices.property_graph import (
    SimpleLLMPathExtractor,
    SchemaLLMPathExtractor,
    DynamicLLMPathExtractor,
)
from llama_index.llms.openai import OpenAI
from llama_index.core import Settings

import wikipedia

import os
from llama_index.core import Document, PropertyGraphIndex from llama_index.core.indices.property_graph import ( SimpleLLMPathExtractor, SchemaLLMPathExtractor, DynamicLLMPathExtractor, ) from llama_index.llms.openai import OpenAI from llama_index.core import Settings import wikipedia import os

In [ ]

已复制！

import nest_asyncio

# Patch asyncio so that an already-running event loop can be re-entered.
# NOTE(review): presumably required because the notebook kernel already runs
# a loop while the extractors below are driven with async workers — confirm.
nest_asyncio.apply()
import nest_asyncio nest_asyncio.apply()

设置 LLM 后端¶

In [ ]

已复制！

# Placeholder key: replace with a real key before running.
# NOTE(review): this assignment overwrites any OPENAI_API_KEY already present
# in the process environment; avoid committing a real key in a notebook.
os.environ["OPENAI_API_KEY"] = "sk-proj-..."

# Set up global configurations
# A single LLM instance is created here and reused by every extractor cell below.
llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo")

# Register the LLM and chunking parameters on the global Settings object.
# NOTE(review): chunk_size / chunk_overlap units are presumably tokens — confirm.
Settings.llm = llm
Settings.chunk_size = 2048
Settings.chunk_overlap = 20
os.environ["OPENAI_API_KEY"] = "sk-proj-..." # 设置全局配置 llm = OpenAI(temperature=0.0, model="gpt-3.5-turbo") Settings.llm = llm Settings.chunk_size = 2048 Settings.chunk_overlap = 20

从维基百科获取原始文本¶

In [ ]

已复制！





def get_wikipedia_content(title):
    """Return the text content of the Wikipedia page called *title*.

    Lookup problems are reported on stdout rather than raised:

    * a disambiguation page prints the candidate options;
    * a missing page prints a "does not exist" message.

    In both of those cases the function returns ``None``. Any other
    exception raised by the ``wikipedia`` package propagates to the caller.
    """
    body = None
    try:
        # Keep the attribute access inside the ``try`` so that it is covered
        # by the same handlers as the page lookup itself.
        body = wikipedia.page(title).content
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"Disambiguation page. Options: {e.options}")
    except wikipedia.exceptions.PageError:
        print(f"Page '{title}' does not exist.")
    return body
def get_wikipedia_content(title): try: page = wikipedia.page(title) return page.content except wikipedia.exceptions.DisambiguationError as e: print(f"消歧义页面。选项：{e.options}") except wikipedia.exceptions.PageError: print(f"页面 '{title}' 不存在。") return None

In [ ]

已复制！





# Fetch the article that every extractor below is run against.
wiki_title = "Barack Obama"
content = get_wikipedia_content(wiki_title)

# A missing (None) or empty body is treated as a failed fetch.
if not content:
    print("Failed to fetch Wikipedia content.")
else:
    # NOTE: `document` is bound only on this branch; the later cells that pass
    # `[document]` to PropertyGraphIndex rely on the fetch having succeeded.
    document = Document(text=content, metadata={"title": wiki_title})
    print(
        f"Fetched content for '{wiki_title}' (length: {len(content)} characters)"
    )
wiki_title = "Barack Obama" content = get_wikipedia_content(wiki_title) if content: document = Document(text=content, metadata={"title": wiki_title}) print( f"已获取 '{wiki_title}' 的内容（长度：{len(content)} 字符）" ) else: print("未能获取维基百科内容。")

Fetched content for 'Barack Obama' (length: 83977 characters)

1. SimpleLLMPathExtractor¶

In [ ]

已复制！





# Schema-free extraction: no entity or relation types are supplied here.
kg_extractor = SimpleLLMPathExtractor(
    llm=llm, max_paths_per_chunk=20, num_workers=4
)

# Build the property graph from the single Wikipedia document using only the
# extractor configured above.
# NOTE(review): embed_kg_nodes=False presumably skips embedding the graph
# nodes, so no embedding model is exercised in this comparison — confirm.
simple_index = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

# Write the graph to an HTML file for visual inspection.
simple_index.property_graph_store.save_networkx_graph(
    name="./SimpleGraph.html"
)
# Last expression of the cell: the notebook displays the first five triplets
# returned for the two name variants of the subject.
simple_index.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]
kg_extractor = SimpleLLMPathExtractor( llm=llm, max_paths_per_chunk=20, num_workers=4 ) simple_index = PropertyGraphIndex.from_documents( [document], llm=llm, embed_kg_nodes=False, kg_extractors=[kg_extractor], show_progress=True, ) simple_index.property_graph_store.save_networkx_graph( name="./SimpleGraph.html" ) simple_index.property_graph_store.get_triplets( entity_names=["Barack Obama", "Obama"] )[:5]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting paths from text: 100%|██████████| 11/11 [00:09<00:00,  1.19it/s]

Out[ ]

[(EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Obama'),
  Relation(label='Has', source_id='Obama', target_id='Half-sister', properties={'title': 'Barack Obama', 'triplet_source_id': 'bd93d2e0-ab20-4f4c-a412-bb42f93ae56f'}),
  EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'bd93d2e0-ab20-4f4c-a412-bb42f93ae56f'}, name='Half-sister')),
 (EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Obama'),
  Relation(label='Selected', source_id='Obama', target_id='Joe biden as his vice presidential running mate', properties={'title': 'Barack Obama', 'triplet_source_id': 'bc18ad10-3040-41a8-b595-4dd8ddb31a0b'}),
  EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'bc18ad10-3040-41a8-b595-4dd8ddb31a0b'}, name='Joe biden as his vice presidential running mate')),
 (EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Obama'),
  Relation(label='Made', source_id='Obama', target_id='First public speech', properties={'title': 'Barack Obama', 'triplet_source_id': '6c89e860-215d-4f5b-8b1c-3183fe71bb6c'}),
  EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '6c89e860-215d-4f5b-8b1c-3183fe71bb6c'}, name='First public speech')),
 (EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Obama'),
  Relation(label='Banned', source_id='Obama', target_id='New offshore oil and gas drilling', properties={'title': 'Barack Obama', 'triplet_source_id': '62942a1e-18ae-4f45-9c73-ea39934f5519'}),
  EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '62942a1e-18ae-4f45-9c73-ea39934f5519'}, name='New offshore oil and gas drilling')),
 (EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Obama'),
  Relation(label='Met with', source_id='Obama', target_id='Australian prime minister', properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}),
  EntityNode(label='entity', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'c4bbe9b8-ccd0-464c-b34c-37ede77f2717'}, name='Australian prime minister'))]

2. DynamicLLMPathExtractor¶

没有初始本体：¶

在这里，我们让 LLM 即时定义本体，让它完全自由地以其认为最合适的方式为节点打标签。

In [ ]

已复制！





# Dynamic extraction with no initial ontology: every allowed_* argument is
# left open so the LLM chooses labels, relations and properties itself.
kg_extractor = DynamicLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    num_workers=4,
    # Let the LLM infer entities and their labels (types) on the fly
    allowed_entity_types=None,
    # Let the LLM infer relationships on the fly
    allowed_relation_types=None,
    # LLM will generate any relation properties, set `None` to skip property generation (will be faster without)
    allowed_relation_props=[],
    # LLM will generate any entity properties, set `None` to skip property generation (will be faster without)
    allowed_entity_props=[],
)

# Build the property graph from the same document using only this extractor.
dynamic_index = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

# Write the graph to an HTML file for visual inspection.
dynamic_index.property_graph_store.save_networkx_graph(
    name="./DynamicGraph.html"
)

# Last expression of the cell: the notebook displays the first five triplets
# returned for the two name variants of the subject.
dynamic_index.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]
kg_extractor = DynamicLLMPathExtractor( llm=llm, max_triplets_per_chunk=20, num_workers=4, # 让 LLM 即时推断实体及其标签（类型） allowed_entity_types=None, # 让 LLM 即时推断关系 allowed_relation_types=None, # LLM 将生成任何实体属性，设置为 `None` 以跳过属性生成（这样会更快） allowed_relation_props=[], # LLM 将生成任何关系属性，设置为 `None` 以跳过属性生成（这样会更快） allowed_entity_props=[], ) dynamic_index = PropertyGraphIndex.from_documents( [document], llm=llm, embed_kg_nodes=False, kg_extractors=[kg_extractor], show_progress=True, ) dynamic_index.property_graph_store.save_networkx_graph( name="./DynamicGraph.html" ) dynamic_index.property_graph_store.get_triplets( entity_names=["Barack Obama", "Obama"] )[:5]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting and inferring knowledge graph from text: 100%|██████████| 11/11 [00:50<00:00,  4.59s/it]

Out[ ]

[(EntityNode(label='PERSON', embedding=None, properties={'approval_rating': '63 percent', 'title': 'Barack Obama', 'triplet_source_id': '425eced4-ff34-49c2-b4ce-64ac96bf8d43'}, name='Obama'),
  Relation(label='MOVED_TO', source_id='Obama', target_id='Afghanistan', properties={'action': 'moved to bolster', 'quantity': 'U.S. troop strength in Afghanistan', 'title': 'Barack Obama', 'triplet_source_id': 'ff7b416e-2885-4296-b7e2-156cb3578bb1'}),
  EntityNode(label='COUNTRY', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'ff7b416e-2885-4296-b7e2-156cb3578bb1'}, name='Afghanistan')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Barack Obama'),
  Relation(label='RECEIVED', source_id='Barack Obama', target_id='Our Great National Parks', properties={'award': 'Primetime Emmy Award', 'category': 'Outstanding Narrator', 'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}),
  EntityNode(label='TV SHOW', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Our Great National Parks')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Barack Obama'),
  Relation(label='PUBLISHED', source_id='Barack Obama', target_id='A Promised Land', properties={'title': 'Barack Obama', 'triplet_source_id': '43848a0a-858e-4552-b820-b8831931f63f'}),
  EntityNode(label='BOOK', embedding=None, properties={'release_date': 'November 17', 'title': 'Barack Obama', 'triplet_source_id': 'caf64843-39ce-4992-9c40-e7b1166af804'}, name='A Promised Land')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Barack Obama'),
  Relation(label='RECEIVED', source_id='Barack Obama', target_id='Shoah Foundation Institute for Visual History and Education', properties={'award': 'Ambassador of Humanity Award', 'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}),
  EntityNode(label='ORGANIZATION', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Shoah Foundation Institute for Visual History and Education')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5137cb5e-04a8-4a71-bc1d-200783ec4628'}, name='Barack Obama'),
  Relation(label='SUPPORTED', source_id='Barack Obama', target_id='payday loan regulations', properties={'title': 'Barack Obama', 'triplet_source_id': '13073b9d-68e7-4973-9f70-bd65912d9604'}),
  EntityNode(label='POLICY', embedding=None, properties={'target': 'low-income workers', 'title': 'Barack Obama', 'triplet_source_id': '13073b9d-68e7-4973-9f70-bd65912d9604'}, name='payday loan regulations'))]

带有用于引导式 KG 提取的初始本体：¶

在这里，我们对想要检测的内容有部分了解，我们知道这篇文章是关于巴拉克·奥巴马的，因此我们定义了一些实体和关系，这些实体和关系可以在 LLM 检测实体和关系时帮助引导其标注过程。这不保证 LLM 会使用它们，它只是引导并提供一些想法。最终是否使用我们提供的实体和关系仍取决于 LLM。

In [ ]

已复制！





# Dynamic extraction seeded with a partial ontology. According to the
# notebook text these lists only guide the LLM and are not enforced; the
# captured output of this cell contains labels outside them (e.g. PERSON,
# COUNTRY, MOVED_TO).
kg_extractor = DynamicLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    num_workers=4,
    # Suggested entity labels
    allowed_entity_types=["POLITICIAN", "POLITICAL_PARTY"],
    # Suggested relation labels
    allowed_relation_types=["PRESIDENT_OF", "MEMBER_OF"],
    # Suggested property keys for relations and for entities respectively
    allowed_relation_props=["description"],
    allowed_entity_props=["description"],
)

# Build a second, independent index so it can be compared with the
# ontology-free run.
dynamic_index_2 = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

# Write the graph to an HTML file for visual inspection.
dynamic_index_2.property_graph_store.save_networkx_graph(
    name="./DynamicGraph_2.html"
)
# Last expression of the cell: the notebook displays the first five triplets
# returned for the two name variants of the subject.
dynamic_index_2.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]
kg_extractor = DynamicLLMPathExtractor( llm=llm, max_triplets_per_chunk=20, num_workers=4, allowed_entity_types=["POLITICIAN", "POLITICAL_PARTY"], allowed_relation_types=["PRESIDENT_OF", "MEMBER_OF"], allowed_relation_props=["description"], allowed_entity_props=["description"], ) dynamic_index_2 = PropertyGraphIndex.from_documents( [document], llm=llm, embed_kg_nodes=False, kg_extractors=[kg_extractor], show_progress=True, ) dynamic_index_2.property_graph_store.save_networkx_graph( name="./DynamicGraph_2.html" ) dynamic_index_2.property_graph_store.get_triplets( entity_names=["Barack Obama", "Obama"] )[:5]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting and inferring knowledge graph from text: 100%|██████████| 11/11 [00:47<00:00,  4.29s/it]

Out[ ]

[(EntityNode(label='PERSON', embedding=None, properties={'description': '44th President of the United States', 'title': 'Barack Obama', 'triplet_source_id': 'd286a836-a5ad-43af-b6de-bd43f072512c'}, name='Obama'),
  Relation(label='MOVED_TO', source_id='Obama', target_id='Afghanistan', properties={'description': 'moved to bolster U.S. troop strength', 'title': 'Barack Obama', 'triplet_source_id': '23c1750d-de01-4a75-814e-b56b81b9bbb4'}),
  EntityNode(label='COUNTRY', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '23c1750d-de01-4a75-814e-b56b81b9bbb4'}, name='Afghanistan')),
 (EntityNode(label='POLITICIAN', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '8f9dc0b3-ff33-46e9-ad3f-040755d33fc7'}, name='Barack Obama'),
  Relation(label='ESTABLISHED', source_id='Barack Obama', target_id='White House Task Force to Protect Students from Sexual Assault', properties={'title': 'Barack Obama', 'triplet_source_id': '8af352da-b50d-4043-8002-870991473cf6'}),
  EntityNode(label='ORGANIZATION', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '8af352da-b50d-4043-8002-870991473cf6'}, name='White House Task Force to Protect Students from Sexual Assault')),
 (EntityNode(label='POLITICIAN', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '8f9dc0b3-ff33-46e9-ad3f-040755d33fc7'}, name='Barack Obama'),
  Relation(label='BECAME_CHAIRMAN_OF', source_id='Barack Obama', target_id="Illinois Senate\\'s Health and Human Services Committee", properties={'title': 'Barack Obama', 'triplet_source_id': '5bf11d65-0078-48bb-97b5-109b4469d46a'}),
  EntityNode(label='COMMITTEE', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '5bf11d65-0078-48bb-97b5-109b4469d46a'}, name="Illinois Senate\\'s Health and Human Services Committee")),
 (EntityNode(label='PERSON', embedding=None, properties={'description': '44th President of the United States', 'title': 'Barack Obama', 'triplet_source_id': 'd286a836-a5ad-43af-b6de-bd43f072512c'}, name='Obama'),
  Relation(label='USED', source_id='Obama', target_id='last day in office', properties={'description': 'used phrase "thanks, Obama"', 'title': 'Barack Obama', 'triplet_source_id': 'd286a836-a5ad-43af-b6de-bd43f072512c'}),
  EntityNode(label='EVENT', embedding=None, properties={'description': 'final day in office', 'title': 'Barack Obama', 'triplet_source_id': 'd286a836-a5ad-43af-b6de-bd43f072512c'}, name='last day in office')),
 (EntityNode(label='PERSON', embedding=None, properties={'description': '44th President of the United States', 'title': 'Barack Obama', 'triplet_source_id': 'd286a836-a5ad-43af-b6de-bd43f072512c'}, name='Obama'),
  Relation(label='SAID', source_id='Obama', target_id='34,000 U.S. troops', properties={'description': 'said the U.S. military would reduce the troop level in Afghanistan', 'title': 'Barack Obama', 'triplet_source_id': '23c1750d-de01-4a75-814e-b56b81b9bbb4'}),
  EntityNode(label='MILITARY_FORCE', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '23c1750d-de01-4a75-814e-b56b81b9bbb4'}, name='34,000 U.S. troops'))]

3. SchemaLLMPathExtractor¶

In [ ]

已复制！





# Schema-guided extraction using the extractor's default entity and relation
# sets, with one optional property key ("extra_description") on each.
kg_extractor = SchemaLLMPathExtractor(
    llm=llm,
    max_triplets_per_chunk=20,
    strict=False,  # Set to False to showcase why it's not going to be the same as DynamicLLMPathExtractor
    possible_entities=None,  # USE DEFAULT ENTITIES (PERSON, ORGANIZATION... etc)
    possible_relations=None,  # USE DEFAULT RELATIONSHIPS
    possible_relation_props=[
        "extra_description"
    ],  # Set to `None` to skip property generation
    possible_entity_props=[
        "extra_description"
    ],  # Set to `None` to skip property generation
    num_workers=4,
)

# Build the property graph from the same document using only this extractor.
schema_index = PropertyGraphIndex.from_documents(
    [document],
    llm=llm,
    embed_kg_nodes=False,
    kg_extractors=[kg_extractor],
    show_progress=True,
)

# Write the graph to an HTML file for visual inspection.
schema_index.property_graph_store.save_networkx_graph(
    name="./SchemaGraph.html"
)
# Last expression of the cell: the notebook displays the first five triplets
# returned for the two name variants of the subject.
schema_index.property_graph_store.get_triplets(
    entity_names=["Barack Obama", "Obama"]
)[:5]
kg_extractor = SchemaLLMPathExtractor( llm=llm, max_triplets_per_chunk=20, strict=False, # 设置为 False，以展示为什么它不会与 DynamicLLMPathExtractor 相同 possible_entities=None, # 使用默认实体（PERSON, ORGANIZATION... 等） possible_relations=None, # 使用默认关系 possible_relation_props=[ "extra_description" ], # 设置为 `None` 以跳过属性生成 possible_entity_props=[ "extra_description" ], # 设置为 `None` 以跳过属性生成 num_workers=4, ) schema_index = PropertyGraphIndex.from_documents( [document], llm=llm, embed_kg_nodes=False, kg_extractors=[kg_extractor], show_progress=True, ) schema_index.property_graph_store.save_networkx_graph( name="./SchemaGraph.html" ) schema_index.property_graph_store.get_triplets( entity_names=["Barack Obama", "Obama"] )[:5]

Parsing nodes:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting paths from text with schema: 100%|██████████| 11/11 [00:52<00:00,  4.81s/it]

Out[ ]

[(EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='Barack Obama'),
  Relation(label='HAS', source_id='Barack Obama', target_id='References', properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}),
  EntityNode(label='CONCEPT', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='References')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='Barack Obama'),
  Relation(label='INTERCEPTED', source_id='Barack Obama', target_id='pipe bomb', properties={'title': 'Barack Obama', 'triplet_source_id': 'ada0abff-9671-4156-b06c-bf5067e6d54c'}),
  EntityNode(label='PRODUCT', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': 'ada0abff-9671-4156-b06c-bf5067e6d54c'}, name='pipe bomb')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='Barack Obama'),
  Relation(label='HAS', source_id='Barack Obama', target_id='end of 2015', properties={'title': 'Barack Obama', 'triplet_source_id': '2b64d219-d19b-4346-a6a0-4369599af5d1'}),
  EntityNode(label='TIME', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '2b64d219-d19b-4346-a6a0-4369599af5d1'}, name='end of 2015')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='Barack Obama'),
  Relation(label='GRADUATED_FROM', source_id='Barack Obama', target_id='Columbia University', properties={'title': 'Barack Obama', 'triplet_source_id': '65be5ae1-bc74-43ee-9655-855daf81f74f'}),
  EntityNode(label='ORGANIZATION', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '65be5ae1-bc74-43ee-9655-855daf81f74f'}, name='Columbia University')),
 (EntityNode(label='PERSON', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '87af3360-fa63-40c2-8440-f4114a7093fd'}, name='Barack Obama'),
  Relation(label='EDUCATION', source_id='Barack Obama', target_id='Schools and Universities', properties={'extra_description': 'Attended schools and universities', 'title': 'Barack Obama', 'triplet_source_id': '1f495d28-7df4-44dc-a3e3-bfc6161d3d2d'}),
  EntityNode(label='ORGANIZATION', embedding=None, properties={'title': 'Barack Obama', 'triplet_source_id': '1f495d28-7df4-44dc-a3e3-bfc6161d3d2d'}, name='Schools and Universities'))]

比较与分析¶

让我们比较这三种提取器的结果

SimpleLLMPathExtractor：此提取器创建一个不带任何预定义 Schema 的基本知识图谱。它可能会生成更多样化的关系，但在实体和关系命名方面可能缺乏一致性。
DynamicLLMPathExtractor:
- 这个新的提取器结合了 SimpleLLMPathExtractor 的灵活性和来自 Schema 的一些初始指导。它可以扩展超出初始实体和关系类型，可能产生丰富多样化的图谱，同时保持一定程度的一致性。
- 在输入中不提供任何初始实体或关系，使 LLM 可以完全自由地即时推断出其认为最合适的 Schema。这将根据所使用的 LLM 和温度而有所不同。
SchemaLLMPathExtractor：使用预定义的 Schema，此提取器生成更结构化的图谱。实体和关系仅限于 Schema 中指定的那些，这可能导致图谱更具一致性，但可能不够全面。即使我们将 "strict" 设置为 false，提取的 KG 图谱也无法反映 LLM 尝试寻找超出输入 Schema 范围的新实体和类型的努力。

主要观察结果：¶

SimpleLLMPathExtractor 生成的图谱可能拥有最多样化的实体和关系集合。
SchemaLLMPathExtractor 生成的图谱应该最一致，但可能会遗漏许多不符合预定义 Schema 的关系，即使我们不严格验证 Schema。
DynamicLLMPathExtractor 生成的图谱应该在多样性和一致性之间取得平衡，可能捕获基于 Schema 的方法可能遗漏的重要关系，同时仍然保持一定的结构。

这些提取器之间的选择取决于具体的用例：¶

对于探索性分析，如果您想捕获 RAG 应用的广泛潜在关系，并且不关心实体类型，请使用 SimpleLLMPathExtractor。
当您有一个明确定义的领域并希望确保提取的知识一致时，请使用 SchemaLLMPathExtractor。
当您想要在结构和灵活性之间取得平衡，允许模型发现新的实体和关系类型，同时仍然提供一些初始指导时，请使用 DynamicLLMPathExtractor。如果您想要一个带有标注（类型化）实体的 KG，但没有输入 Schema（或者您已部分定义了 Schema 作为起始基础），则此提取器特别有用。