跳过内容

Relik

RelikPathExtractor #

基类: TransformComponent

一个将文档转换为图结构的 Transformer 类,基于 Relik 库及其模型实现。此类利用 Relik 模型从文本文档中提取节点和关系,并将其转换为图格式;置信度低于指定阈值的关系会被过滤掉。有关 Relik 库的更多详细信息,请访问其 GitHub 仓库:https://github.com/SapienzaNLP/relik 。参数:model (str):要使用的预训练 Relik 模型的名称,默认值为 "relik-ie/relik-relation-extraction-small"。relationship_confidence_threshold (float):用于过滤关系的置信度阈值,默认值为 0.1。skip_errors (bool):是否跳过提取过程中出现的错误,默认值为 False。

源代码位于 llama-index-integrations/extractors/llama-index-extractors-relik/llama_index/extractors/relik/base.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
class RelikPathExtractor(TransformComponent):
    """
    Transform component that extracts graph entities and relations with Relik.

    Each node's content is passed to a pretrained Relik model. Every span the
    model returns becomes an ``EntityNode`` and every triplet that passes the
    filters becomes a ``Relation``. Both lists are written to the node's
    metadata under ``KG_NODES_KEY`` and ``KG_RELATIONS_KEY``, appended to any
    entries already stored under those keys.

    For more details on the Relik library, visit their GitHub repository:
      https://github.com/SapienzaNLP/relik

    Args:
        model (str): The name of the pretrained Relik model to use.
          Default is "relik-ie/relik-relation-extraction-small".
        relationship_confidence_threshold (float): Triplets whose confidence
          is below this value are dropped. Default is 0.1.
        skip_errors (bool): If True, a node whose extraction raises is returned
          with its existing (or empty) graph metadata instead of propagating
          the error. Defaults to False.
        num_workers (int): Stored on the instance; not read by the methods
          defined in this class. Default is 4.
        model_config (Dict[str, Any]): Extra keyword arguments forwarded to
          ``relik.Relik.from_pretrained``. Default is an empty dict.
        ignore_self_loops (bool): If True, triplets whose subject and object
          have the same text are dropped. Default is True.
    """

    relik_model: Any
    relationship_confidence_threshold: float
    num_workers: int
    skip_errors: bool
    ignore_self_loops: bool

    def __init__(
        self,
        model: str = "relik-ie/relik-relation-extraction-small",
        relationship_confidence_threshold: float = 0.1,
        skip_errors: bool = False,
        num_workers: int = 4,
        # The shared default dict is only unpacked below, never mutated.
        model_config: Dict[str, Any] = {},
        ignore_self_loops: bool = True,
    ) -> None:
        """
        Load the Relik model and initialise the component fields.

        Raises:
            ImportError: If the ``relik`` package is not installed.
        """
        # Keep the try body limited to the statement that can fail to import.
        try:
            import relik  # type: ignore
        except ImportError as err:
            raise ImportError(
                "Could not import relik python package. "
                "Please install it with `pip install relik`."
            ) from err

        # Remove default INFO logging
        logging.getLogger("relik").setLevel(logging.WARNING)

        relik_model = relik.Relik.from_pretrained(model, **model_config)

        super().__init__(
            relik_model=relik_model,
            relationship_confidence_threshold=relationship_confidence_threshold,
            num_workers=num_workers,
            skip_errors=skip_errors,
            ignore_self_loops=ignore_self_loops,
        )

    @classmethod
    def class_name(cls) -> str:
        """Return the identifying name of this component."""
        return "RelikPathExtractor"

    def __call__(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """
        Extract triples from nodes.

        Nodes are processed one at a time, in order. ``show_progress`` toggles
        the tqdm progress bar; ``kwargs`` are accepted but not used here.

        Returns:
            The processed nodes, in input order.
        """
        result_nodes = []
        for node in tqdm.tqdm(
            nodes, desc="Extracting triples", disable=not show_progress
        ):
            result_nodes.append(self._extract(node))

        return result_nodes

    def _extract(self, node: BaseNode) -> BaseNode:
        """
        Extract triples from a node and store them in its metadata.

        Returns:
            The same node object, with ``KG_NODES_KEY`` and
            ``KG_RELATIONS_KEY`` set in ``node.metadata``.

        Raises:
            ValueError: If the Relik model call fails and ``skip_errors`` is
              False. The original exception is attached as the cause.
        """
        assert hasattr(node, "text")

        text = node.get_content(metadata_mode="llm")
        try:
            relik_out = self.relik_model(text)
        except Exception as e:
            if self.skip_errors:
                # Best effort: make sure both keys exist, keep what is there.
                node.metadata.setdefault(KG_NODES_KEY, [])
                node.metadata.setdefault(KG_RELATIONS_KEY, [])
                return node
            raise ValueError(f"Failed to extract triples from text: {e}") from e

        # Pop the graph keys first so the metadata snapshot attached to each
        # entity/relation does not contain the graph lists themselves.
        existing_nodes = node.metadata.pop(KG_NODES_KEY, [])
        existing_relations = node.metadata.pop(KG_RELATIONS_KEY, [])

        metadata = node.metadata.copy()
        # Extract nodes (spans are not filtered by the confidence threshold)
        for n in relik_out.spans:
            span_label = n.label.strip()
            existing_nodes.append(
                EntityNode(
                    name=n.text,
                    # "--NME--" span labels are replaced by the default type.
                    label=DEFAULT_NODE_TYPE if span_label == "--NME--" else span_label,
                    properties=metadata,
                )
            )
        # Extract relationships
        for triple in relik_out.triplets:
            # Ignore relationship if below confidence threshold
            if triple.confidence < self.relationship_confidence_threshold:
                continue
            # Ignore self loops
            if self.ignore_self_loops and triple.subject.text == triple.object.text:
                continue
            rel_node = Relation(
                # Normalise the label, e.g. "place of birth" -> "PLACE_OF_BIRTH"
                label=triple.label.replace(" ", "_").upper(),
                source_id=triple.subject.text,
                target_id=triple.object.text,
                properties=metadata,
            )

            existing_relations.append(rel_node)

        node.metadata[KG_NODES_KEY] = existing_nodes
        node.metadata[KG_RELATIONS_KEY] = existing_relations

        return node

    async def acall(
        self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
    ) -> List[BaseNode]:
        """
        Extract triples from nodes async.

        Delegates directly to the synchronous ``__call__``; nothing is awaited.
        """
        return self.__call__(nodes, show_progress=show_progress, **kwargs)

acall async #

acall(nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any) -> List[BaseNode]

异步从节点提取三元组。

源代码位于 llama-index-integrations/extractors/llama-index-extractors-relik/llama_index/extractors/relik/base.py
139
140
141
142
143
async def acall(
    self, nodes: List[BaseNode], show_progress: bool = False, **kwargs: Any
) -> List[BaseNode]:
    """
    Async entry point for triple extraction.

    Forwards all arguments to the synchronous ``__call__`` and returns its
    result unchanged; nothing is awaited inside this coroutine.
    """
    processed = self.__call__(nodes, show_progress=show_progress, **kwargs)
    return processed