
SentenceWindowNodeParser

Bases: NodeParser

Sentence window node parser.

Splits a document into nodes, with each node being a single sentence. Each node contains a "window" of the surrounding sentences in its metadata.

Parameters:

| Name | Type | Description | Default |
| ---- | ---- | ----------- | ------- |
| sentence_splitter | Optional[Callable] | Splits text into sentences | <function split_by_sentence_tokenizer.<locals>.<lambda>> |
| include_metadata | bool | Whether to include metadata in nodes | required |
| include_prev_next_rel | bool | Whether to include prev/next node relationships | required |
| window_size | int | The number of sentences on each side of a sentence to capture | 3 |
| window_metadata_key | str | The metadata key to store the sentence window under | 'window' |
| original_text_metadata_key | str | The metadata key to store the original sentence in | 'original_text' |
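
A minimal usage sketch (the sample text is illustrative; get_nodes_from_documents is inherited from the NodeParser base class):

from llama_index.core import Document
from llama_index.core.node_parser import SentenceWindowNodeParser

parser = SentenceWindowNodeParser.from_defaults(
    window_size=3,
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)

doc = Document(text="The cat sat. The dog ran. The bird flew. The fish swam.")
nodes = parser.get_nodes_from_documents([doc])

# Each node's text is a single sentence; the surrounding sentences
# (up to 3 on each side) are joined under the "window" metadata key.
for node in nodes:
    print(node.metadata["original_text"])
    print(node.metadata["window"])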
Source code in llama-index-core/llama_index/core/node_parser/text/sentence_window.py
class SentenceWindowNodeParser(NodeParser):
    """
    Sentence window node parser.

    Splits a document into Nodes, with each node being a sentence.
    Each node contains a window from the surrounding sentences in the metadata.

    Args:
        sentence_splitter (Optional[Callable]): splits text into sentences
        include_metadata (bool): whether to include metadata in nodes
        include_prev_next_rel (bool): whether to include prev/next relationships

    """

    sentence_splitter: Callable[[str], List[str]] = Field(
        default_factory=split_by_sentence_tokenizer,
        description="The text splitter to use when splitting documents.",
        exclude=True,
    )
    window_size: int = Field(
        default=DEFAULT_WINDOW_SIZE,
        description="The number of sentences on each side of a sentence to capture.",
        gt=0,
    )
    window_metadata_key: str = Field(
        default=DEFAULT_WINDOW_METADATA_KEY,
        description="The metadata key to store the sentence window under.",
    )
    original_text_metadata_key: str = Field(
        default=DEFAULT_OG_TEXT_METADATA_KEY,
        description="The metadata key to store the original sentence in.",
    )

    @classmethod
    def class_name(cls) -> str:
        return "SentenceWindowNodeParser"

    @classmethod
    def from_defaults(
        cls,
        sentence_splitter: Optional[Callable[[str], List[str]]] = None,
        window_size: int = DEFAULT_WINDOW_SIZE,
        window_metadata_key: str = DEFAULT_WINDOW_METADATA_KEY,
        original_text_metadata_key: str = DEFAULT_OG_TEXT_METADATA_KEY,
        include_metadata: bool = True,
        include_prev_next_rel: bool = True,
        callback_manager: Optional[CallbackManager] = None,
        id_func: Optional[Callable[[int, Document], str]] = None,
    ) -> "SentenceWindowNodeParser":
        callback_manager = callback_manager or CallbackManager([])

        sentence_splitter = sentence_splitter or split_by_sentence_tokenizer()

        id_func = id_func or default_id_func

        return cls(
            sentence_splitter=sentence_splitter,
            window_size=window_size,
            window_metadata_key=window_metadata_key,
            original_text_metadata_key=original_text_metadata_key,
            include_metadata=include_metadata,
            include_prev_next_rel=include_prev_next_rel,
            callback_manager=callback_manager,
            id_func=id_func,
        )

    def _parse_nodes(
        self,
        nodes: Sequence[BaseNode],
        show_progress: bool = False,
        **kwargs: Any,
    ) -> List[BaseNode]:
        """Parse document into nodes."""
        all_nodes: List[BaseNode] = []
        nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")

        for node in nodes_with_progress:
            nodes = self.build_window_nodes_from_documents([node])
            all_nodes.extend(nodes)

        return all_nodes

    def build_window_nodes_from_documents(
        self, documents: Sequence[Document]
    ) -> List[BaseNode]:
        """Build window nodes from documents."""
        all_nodes: List[BaseNode] = []
        for doc in documents:
            text = doc.text
            text_splits = self.sentence_splitter(text)
            nodes = build_nodes_from_splits(
                text_splits,
                doc,
                id_func=self.id_func,
            )

            # add window to each node
            for i, node in enumerate(nodes):
                window_nodes = nodes[
                    max(0, i - self.window_size) : min(
                        i + self.window_size + 1, len(nodes)
                    )
                ]

                node.metadata[self.window_metadata_key] = " ".join(
                    [n.text for n in window_nodes]
                )
                node.metadata[self.original_text_metadata_key] = node.text

                # exclude window metadata from embed and llm
                node.excluded_embed_metadata_keys.extend(
                    [self.window_metadata_key, self.original_text_metadata_key]
                )
                node.excluded_llm_metadata_keys.extend(
                    [self.window_metadata_key, self.original_text_metadata_key]
                )

            all_nodes.extend(nodes)

        return all_nodes
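
At query time, the stored window is typically substituted for the single-sentence node text before response synthesis, so retrieval matches on precise sentences while the LLM sees the surrounding context. A sketch using MetadataReplacementPostProcessor (embedding and LLM configuration omitted; assumes the default "window" key):

from llama_index.core import VectorStoreIndex
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

index = VectorStoreIndex(nodes)  # nodes produced by the parser above
query_engine = index.as_query_engine(
    similarity_top_k=2,
    # Replace each retrieved node's text with its "window" metadata
    # before the response is synthesized.
    node_postprocessors=[
        MetadataReplacementPostProcessor(target_metadata_key="window")
    ],
)
response = query_engine.query("What did the dog do?")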

build_window_nodes_from_documents

build_window_nodes_from_documents(documents: Sequence[Document]) -> List[BaseNode]

Build window nodes from documents.
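
The window for sentence i is the slice nodes[max(0, i - window_size) : i + window_size + 1], joined with spaces, so windows are truncated rather than padded at document boundaries. A direct-call sketch (reusing the parser and doc from the example above):

window_nodes = parser.build_window_nodes_from_documents([doc])
for n in window_nodes:
    print(n.metadata["original_text"], "->", n.metadata["window"])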

Source code in llama-index-core/llama_index/core/node_parser/text/sentence_window.py
def build_window_nodes_from_documents(
    self, documents: Sequence[Document]
) -> List[BaseNode]:
    """Build window nodes from documents."""
    all_nodes: List[BaseNode] = []
    for doc in documents:
        text = doc.text
        text_splits = self.sentence_splitter(text)
        nodes = build_nodes_from_splits(
            text_splits,
            doc,
            id_func=self.id_func,
        )

        # add window to each node
        for i, node in enumerate(nodes):
            window_nodes = nodes[
                max(0, i - self.window_size) : min(
                    i + self.window_size + 1, len(nodes)
                )
            ]

            node.metadata[self.window_metadata_key] = " ".join(
                [n.text for n in window_nodes]
            )
            node.metadata[self.original_text_metadata_key] = node.text

            # exclude window metadata from embed and llm
            node.excluded_embed_metadata_keys.extend(
                [self.window_metadata_key, self.original_text_metadata_key]
            )
            node.excluded_llm_metadata_keys.extend(
                [self.window_metadata_key, self.original_text_metadata_key]
            )

        all_nodes.extend(nodes)

    return all_nodes