Html

节点解析器.

HTMLNodeParser #

基类: NodeParser

HTML 节点解析器.

使用自定义 HTML 分割逻辑将文档分割为节点.

参数

名称	类型	描述	默认值
`include_metadata`	`bool`	是否在节点中包含元数据	必需
`include_prev_next_rel`	`bool`	是否包含上一个/下一个关系	必需
`tags`	`List[str]`	要从中提取文本的 HTML 标签.	`['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'b', 'i', 'u', 'section']`

源代码位于 llama-index-core/llama_index/core/node_parser/file/html.py

class HTMLNodeParser(NodeParser):
    """
    HTML node parser.

    Splits a document into Nodes using custom HTML splitting logic: text is
    extracted from the configured HTML tags, and consecutive matches that
    share the same tag name are merged into a single node.

    Args:
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships
        tags (List[str]): HTML tags to extract text from

    """

    tags: List[str] = Field(
        default=DEFAULT_TAGS, description="HTML tags to extract text from."
    )

    @classmethod
    def from_defaults(
        cls,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
        tags: Optional[List[str]] = DEFAULT_TAGS,
    ) -> "HTMLNodeParser":
        """
        Build a parser, filling in defaults for omitted arguments.

        Args:
            include_metadata (bool): whether to include metadata in nodes
            include_prev_next_rel (bool): whether to include prev/next
                relationships
            callback_manager (Optional[CallbackManager]): callback manager to
                attach; a new ``CallbackManager([])`` is created when falsy
            tags (Optional[List[str]]): HTML tags to extract text from;
                ``None`` selects ``DEFAULT_TAGS``

        Returns:
            HTMLNodeParser: the configured parser.

        """
        callback_manager = callback_manager or CallbackManager([])
        # The signature admits None, but None does not satisfy the declared
        # ``List[str]`` field type, so map it to the default tag list.
        if tags is None:
            tags = DEFAULT_TAGS

        return cls(
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
            tags=tags,
        )

    @classmethod
    def class_name(cls) -> str:
        """Get class name."""
        return "HTMLNodeParser"

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """
        Split every input node and return all resulting nodes in order.

        Args:
            nodes (Sequence[BaseNode]): nodes whose content is parsed as HTML
            show_progress (bool): forwarded to ``get_tqdm_iterable``
            **kwargs: accepted but not used here

        Returns:
            List[BaseNode]: the concatenated output of
            ``get_nodes_from_node`` for each input node.

        """
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            # Use a distinct name so the ``nodes`` parameter is not rebound
            # while it is being iterated.
            split_nodes = self.get_nodes_from_node(node)
            all_nodes.extend(split_nodes)

        return all_nodes

    def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
        """
        Get nodes from document.

        The node's content (read with ``MetadataMode.NONE``) is parsed with
        BeautifulSoup's ``html.parser``. Every element matching ``self.tags``
        is visited in the order ``find_all`` returns it; runs of elements
        with the same tag name are accumulated into one section, and each
        section is turned into a node via ``_build_node_from_split`` with
        ``{"tag": <tag name>}`` passed as metadata.

        Args:
            node (BaseNode): source node holding HTML text

        Returns:
            List[TextNode]: one node per section.

        Raises:
            ImportError: if ``bs4`` cannot be imported.

        """
        try:
            from bs4 import BeautifulSoup, Tag
        except ImportError as err:
            # Chain explicitly so the original import failure stays visible.
            raise ImportError("bs4 is required to read HTML files.") from err

        text = node.get_content(metadata_mode=MetadataMode.NONE)
        soup = BeautifulSoup(text, "html.parser")
        html_nodes = []
        last_tag = None
        current_section = ""

        tags = soup.find_all(self.tags)
        for tag in tags:
            tag_text = self._extract_text_from_tag(tag)
            if isinstance(tag, Tag) and (tag.name == last_tag or last_tag is None):
                # Same tag name as the running section (or the very first
                # match): keep accumulating.
                last_tag = tag.name
                current_section += f"{tag_text.strip()}\n"
            else:
                # Tag name changed: emit the finished section, then start a
                # new one with the current element's text.
                html_nodes.append(
                    self._build_node_from_split(
                        current_section.strip(), node, {"tag": last_tag}
                    )
                )
                if isinstance(tag, Tag):
                    last_tag = tag.name
                current_section = f"{tag_text}\n"

        # Emit whatever is still buffered after the last element.
        if current_section:
            html_nodes.append(
                self._build_node_from_split(
                    current_section.strip(), node, {"tag": last_tag}
                )
            )

        return html_nodes

    def _extract_text_from_tag(
        self, tag: Union["Tag", "NavigableString", "PageElement"]
    ) -> str:
        """
        Collect the text of ``tag`` as newline-joined, stripped fragments.

        For a ``Tag``, only direct children are inspected: non-blank strings
        are kept, child tags whose name is in ``self.tags`` are skipped, and
        any other child contributes its ``get_text()``. For a non-``Tag``
        element, its own ``get_text()`` is used.

        Args:
            tag: the BeautifulSoup element to read

        Returns:
            str: the fragments joined with ``"\\n"``.

        """
        from bs4 import NavigableString, Tag, PageElement

        texts = []
        if isinstance(tag, Tag):
            for elem in tag.children:
                # NavigableString and Tag are tested before the more general
                # PageElement so the specific handling wins.
                if isinstance(elem, NavigableString):
                    if elem.strip():
                        texts.append(elem.strip())
                elif isinstance(elem, Tag):
                    # Child tags listed in self.tags are left out here;
                    # presumably they are handled as their own matches by the
                    # caller's find_all -- TODO confirm.
                    if elem.name not in self.tags:
                        texts.append(elem.get_text().strip())
                elif isinstance(elem, PageElement):
                    texts.append(elem.get_text().strip())
        else:
            texts.append(tag.get_text().strip())
        return "\n".join(texts)

    def _build_node_from_split(
        self,
        text_split: str,
        node: BaseNode,
        metadata: dict,
    ) -> TextNode:
        """
        Build node from single text split.

        Args:
            text_split (str): text for the new node
            node (BaseNode): source node passed to ``build_nodes_from_splits``
            metadata (dict): extra entries merged over the new node's
                metadata when ``self.include_metadata`` is truthy

        Returns:
            TextNode: the first node returned by ``build_nodes_from_splits``.

        """
        # Separate name so the ``node`` parameter (the source) is not rebound.
        built_node = build_nodes_from_splits(
            [text_split], node, id_func=self.id_func
        )[0]

        if self.include_metadata:
            # Keys in ``metadata`` override same-named keys already present.
            built_node.metadata = {**built_node.metadata, **metadata}

        return built_node

class_name `类方法` #

class_name() -> str

获取类名.

源代码位于 llama-index-core/llama_index/core/node_parser/file/html.py

@classmethod
def class_name(cls) -> str:
    """Return the string name that identifies this parser class."""
    name = "HTMLNodeParser"
    return name

get_nodes_from_node #

get_nodes_from_node(node: BaseNode) -> List[TextNode]

从文档获取节点.

源代码位于 llama-index-core/llama_index/core/node_parser/file/html.py

def get_nodes_from_node(self, node: BaseNode) -> List[TextNode]:
    """
    Get nodes from document.

    The node's content (read with ``MetadataMode.NONE``) is parsed with
    BeautifulSoup's ``html.parser``. Every element matching ``self.tags`` is
    visited in the order ``find_all`` returns it; runs of elements with the
    same tag name are accumulated into one section, and each section is
    turned into a node via ``_build_node_from_split`` with
    ``{"tag": <tag name>}`` passed as metadata.

    Args:
        node (BaseNode): source node holding HTML text

    Returns:
        List[TextNode]: one node per section.

    Raises:
        ImportError: if ``bs4`` cannot be imported.

    """
    try:
        from bs4 import BeautifulSoup, Tag
    except ImportError as err:
        # Chain explicitly so the original import failure stays visible.
        raise ImportError("bs4 is required to read HTML files.") from err

    text = node.get_content(metadata_mode=MetadataMode.NONE)
    soup = BeautifulSoup(text, "html.parser")
    html_nodes = []
    last_tag = None
    current_section = ""

    tags = soup.find_all(self.tags)
    for tag in tags:
        tag_text = self._extract_text_from_tag(tag)
        if isinstance(tag, Tag) and (tag.name == last_tag or last_tag is None):
            # Same tag name as the running section (or the very first match):
            # keep accumulating.
            last_tag = tag.name
            current_section += f"{tag_text.strip()}\n"
        else:
            # Tag name changed: emit the finished section, then start a new
            # one with the current element's text.
            html_nodes.append(
                self._build_node_from_split(
                    current_section.strip(), node, {"tag": last_tag}
                )
            )
            if isinstance(tag, Tag):
                last_tag = tag.name
            current_section = f"{tag_text}\n"

    # Emit whatever is still buffered after the last element.
    if current_section:
        html_nodes.append(
            self._build_node_from_split(
                current_section.strip(), node, {"tag": last_tag}
            )
        )

    return html_nodes

Html

HTMLNodeParser #

class_name 类方法 #

get_nodes_from_node #

class_name `类方法` #