Semantic Splitter

    Node parsers.

    SemanticSplitterNodeParser #

    Bases: NodeParser

    Semantic node parser.

    Splits a document into nodes, where each node is a group of semantically related sentences.

    Parameters

        buffer_size (int): Number of sentences to group together when evaluating semantic similarity. Default: 1.

        embed_model (BaseEmbedding): Embedding model to use. Required.

        sentence_splitter (Optional[Callable]): Splits text into sentences. Default: split_by_sentence_tokenizer().

        include_metadata (bool): Whether to include metadata in nodes. Required.

        include_prev_next_rel (bool): Whether to include prev/next relationships. Required.

        breakpoint_percentile_threshold (int): The percentile of cosine dissimilarity that must be exceeded between one group of sentences and the next in order to form a node. The smaller this number, the more nodes are generated. Default: 95.
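
    A minimal usage sketch (not part of the original reference, hedged accordingly): it assumes llama-index-core and llama-index-embeddings-openai are installed, builds the parser with the defaults documented above, and splits a document into semantically coherent nodes via get_nodes_from_documents, which is inherited from NodeParser.

        from llama_index.core import Document
        from llama_index.core.node_parser import SemanticSplitterNodeParser
        from llama_index.embeddings.openai import OpenAIEmbedding

        # Group each sentence with a 1-sentence buffer on either side and break
        # chunks at the 95th percentile of cosine dissimilarity (the defaults above).
        splitter = SemanticSplitterNodeParser.from_defaults(
            embed_model=OpenAIEmbedding(),
            buffer_size=1,
            breakpoint_percentile_threshold=95,
        )

        doc = Document(
            text=(
                "LlamaIndex is a data framework for LLM applications. "
                "It connects language models to external data. "
                "Basketball, by contrast, is played with five players per side."
            )
        )

        # Each returned node is a run of semantically related sentences.
        nodes = splitter.get_nodes_from_documents([doc])
        for node in nodes:
            print(node.get_content())
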
    Source code in llama-index-core/llama_index/core/node_parser/text/semantic_splitter.py, lines 35-311:
    class SemanticSplitterNodeParser(NodeParser):
        """
        Semantic node parser.
    
        Splits a document into Nodes, with each node being a group of semantically related sentences.
    
        Args:
            buffer_size (int): number of sentences to group together when evaluating semantic similarity
            embed_model: (BaseEmbedding): embedding model to use
            sentence_splitter (Optional[Callable]): splits text into sentences
            include_metadata (bool): whether to include metadata in nodes
            include_prev_next_rel (bool): whether to include prev/next relationships
    
        """
    
        sentence_splitter: SentenceSplitterCallable = Field(
            default_factory=split_by_sentence_tokenizer,
            description="The text splitter to use when splitting documents.",
            exclude=True,
        )
    
        embed_model: SerializeAsAny[BaseEmbedding] = Field(
            description="The embedding model to use to for semantic comparison",
        )
    
        buffer_size: int = Field(
            default=1,
            description=(
                "The number of sentences to group together when evaluating semantic similarity. "
                "Set to 1 to consider each sentence individually. "
                "Set to >1 to group sentences together."
            ),
        )
    
        breakpoint_percentile_threshold: int = Field(
            default=95,
            description=(
                "The percentile of cosine dissimilarity that must be exceeded between a "
                "group of sentences and the next to form a node.  The smaller this "
                "number is, the more nodes will be generated"
            ),
        )
    
        @classmethod
        def class_name(cls) -> str:
            return "SemanticSplitterNodeParser"
    
        @classmethod
        def from_defaults(
            cls,
            embed_model: Optional[BaseEmbedding] = None,
            breakpoint_percentile_threshold: Optional[int] = 95,
            buffer_size: Optional[int] = 1,
            sentence_splitter: Optional[Callable[[str], List[str]]] = None,
            original_text_metadata_key: str = DEFAULT_OG_TEXT_METADATA_KEY,
            include_metadata: bool = True,
            include_prev_next_rel: bool = True,
            callback_manager: Optional[CallbackManager] = None,
            id_func: Optional[Callable[[int, Document], str]] = None,
        ) -> "SemanticSplitterNodeParser":
            callback_manager = callback_manager or CallbackManager([])
    
            sentence_splitter = sentence_splitter or split_by_sentence_tokenizer()
            if embed_model is None:
                try:
                    from llama_index.embeddings.openai import (
                        OpenAIEmbedding,
                    )  # pants: no-infer-dep
    
                    embed_model = embed_model or OpenAIEmbedding()
                except ImportError:
                    raise ImportError(
                        "`llama-index-embeddings-openai` package not found, "
                        "please run `pip install llama-index-embeddings-openai`"
                    )
    
            id_func = id_func or default_id_func
    
            return cls(
                embed_model=embed_model,
                breakpoint_percentile_threshold=breakpoint_percentile_threshold,
                buffer_size=buffer_size,
                sentence_splitter=sentence_splitter,
                original_text_metadata_key=original_text_metadata_key,
                include_metadata=include_metadata,
                include_prev_next_rel=include_prev_next_rel,
                callback_manager=callback_manager,
                id_func=id_func,
            )
    
        def _parse_nodes(
            self,
            nodes: Sequence[BaseNode],
            show_progress: bool = False,
            **kwargs: Any,
        ) -> List[BaseNode]:
            """Parse document into nodes."""
            all_nodes: List[BaseNode] = []
            nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")
    
            for node in nodes_with_progress:
                nodes = self.build_semantic_nodes_from_documents([node], show_progress)
                all_nodes.extend(nodes)
    
            return all_nodes
    
        async def _aparse_nodes(
            self,
            nodes: Sequence[BaseNode],
            show_progress: bool = False,
            **kwargs: Any,
        ) -> List[BaseNode]:
            """Asynchronously parse document into nodes."""
            all_nodes: List[BaseNode] = []
            nodes_with_progress = get_tqdm_iterable(nodes, show_progress, "Parsing nodes")
    
            for node in nodes_with_progress:
                nodes = await self.abuild_semantic_nodes_from_documents(
                    [node], show_progress
                )
                all_nodes.extend(nodes)
    
            return all_nodes
    
        def build_semantic_nodes_from_documents(
            self,
            documents: Sequence[Document],
            show_progress: bool = False,
        ) -> List[BaseNode]:
            """Build window nodes from documents."""
            all_nodes: List[BaseNode] = []
            for doc in documents:
                text = doc.text
                text_splits = self.sentence_splitter(text)
    
                sentences = self._build_sentence_groups(text_splits)
    
                combined_sentence_embeddings = self.embed_model.get_text_embedding_batch(
                    [s["combined_sentence"] for s in sentences],
                    show_progress=show_progress,
                )
    
                for i, embedding in enumerate(combined_sentence_embeddings):
                    sentences[i]["combined_sentence_embedding"] = embedding
    
                distances = self._calculate_distances_between_sentence_groups(sentences)
    
                chunks = self._build_node_chunks(sentences, distances)
    
                nodes = build_nodes_from_splits(
                    chunks,
                    doc,
                    id_func=self.id_func,
                )
    
                all_nodes.extend(nodes)
    
            return all_nodes
    
        async def abuild_semantic_nodes_from_documents(
            self,
            documents: Sequence[Document],
            show_progress: bool = False,
        ) -> List[BaseNode]:
            """Asynchronously build window nodes from documents."""
            all_nodes: List[BaseNode] = []
            for doc in documents:
                text = doc.text
                text_splits = self.sentence_splitter(text)
    
                sentences = self._build_sentence_groups(text_splits)
    
                combined_sentence_embeddings = (
                    await self.embed_model.aget_text_embedding_batch(
                        [s["combined_sentence"] for s in sentences],
                        show_progress=show_progress,
                    )
                )
    
                for i, embedding in enumerate(combined_sentence_embeddings):
                    sentences[i]["combined_sentence_embedding"] = embedding
    
                distances = self._calculate_distances_between_sentence_groups(sentences)
    
                chunks = self._build_node_chunks(sentences, distances)
    
                nodes = build_nodes_from_splits(
                    chunks,
                    doc,
                    id_func=self.id_func,
                )
    
                all_nodes.extend(nodes)
    
            return all_nodes
    
        def _build_sentence_groups(
            self, text_splits: List[str]
        ) -> List[SentenceCombination]:
            sentences: List[SentenceCombination] = [
                {
                    "sentence": x,
                    "index": i,
                    "combined_sentence": "",
                    "combined_sentence_embedding": [],
                }
                for i, x in enumerate(text_splits)
            ]
    
            # Group sentences and calculate embeddings for sentence groups
            for i in range(len(sentences)):
                combined_sentence = ""
    
                for j in range(i - self.buffer_size, i):
                    if j >= 0:
                        combined_sentence += sentences[j]["sentence"]
    
                combined_sentence += sentences[i]["sentence"]
    
                for j in range(i + 1, i + 1 + self.buffer_size):
                    if j < len(sentences):
                        combined_sentence += sentences[j]["sentence"]
    
                sentences[i]["combined_sentence"] = combined_sentence
    
            return sentences
    
        def _calculate_distances_between_sentence_groups(
            self, sentences: List[SentenceCombination]
        ) -> List[float]:
            distances = []
            for i in range(len(sentences) - 1):
                embedding_current = sentences[i]["combined_sentence_embedding"]
                embedding_next = sentences[i + 1]["combined_sentence_embedding"]
    
                similarity = self.embed_model.similarity(embedding_current, embedding_next)
    
                distance = 1 - similarity
    
                distances.append(distance)
    
            return distances
    
        def _build_node_chunks(
            self, sentences: List[SentenceCombination], distances: List[float]
        ) -> List[str]:
            chunks = []
            if len(distances) > 0:
                breakpoint_distance_threshold = np.percentile(
                    distances, self.breakpoint_percentile_threshold
                )
    
                indices_above_threshold = [
                    i for i, x in enumerate(distances) if x > breakpoint_distance_threshold
                ]
    
                # Chunk sentences into semantic groups based on percentile breakpoints
                start_index = 0
    
                for index in indices_above_threshold:
                    group = sentences[start_index : index + 1]
                    combined_text = "".join([d["sentence"] for d in group])
                    chunks.append(combined_text)
    
                    start_index = index + 1
    
                if start_index < len(sentences):
                    combined_text = "".join(
                        [d["sentence"] for d in sentences[start_index:]]
                    )
                    chunks.append(combined_text)
            else:
                # If, for some reason we didn't get any distances (i.e. very, very small documents) just
                # treat the whole document as a single node
                chunks = [" ".join([s["sentence"] for s in sentences])]
    
            return chunks
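
    To make the chunking step in _build_node_chunks concrete, here is a small self-contained sketch with made-up distance values (illustrative only, not taken from the library): consecutive sentence groups whose cosine dissimilarity exceeds the breakpoint_percentile_threshold percentile close a chunk, so lowering the threshold yields more breakpoints and therefore more, smaller nodes.

        import numpy as np

        # Hypothetical cosine dissimilarities between consecutive sentence groups.
        distances = [0.05, 0.08, 0.62, 0.07, 0.04, 0.55, 0.06]

        # Default behavior: only distances above the 95th percentile break a chunk.
        threshold_95 = np.percentile(distances, 95)
        print([i for i, d in enumerate(distances) if d > threshold_95])  # [2]

        # A lower threshold produces more breakpoints, hence more nodes.
        threshold_50 = np.percentile(distances, 50)
        print([i for i, d in enumerate(distances) if d > threshold_50])  # [1, 2, 5]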
    

    build_semantic_nodes_from_documents #

    build_semantic_nodes_from_documents(documents: Sequence[Document], show_progress: bool = False) -> List[BaseNode]
    

    Build window nodes from documents.

    Source code in llama-index-core/llama_index/core/node_parser/text/semantic_splitter.py, lines 159-192.
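
    A hedged usage sketch (an illustration, not from the original page): with a configured splitter at hand, this method can be called directly on Document objects; every combined sentence group is embedded in a single batch before distances and breakpoints are computed.

        from llama_index.core import Document
        from llama_index.core.node_parser import SemanticSplitterNodeParser
        from llama_index.embeddings.openai import OpenAIEmbedding

        splitter = SemanticSplitterNodeParser.from_defaults(embed_model=OpenAIEmbedding())

        docs = [
            Document(
                text=(
                    "The report covers quarterly revenue. Revenue grew by ten percent. "
                    "Separately, the office is moving to a new building."
                )
            )
        ]

        # show_progress surfaces a progress bar while the sentence-group
        # embeddings are fetched in one batch.
        nodes = splitter.build_semantic_nodes_from_documents(docs, show_progress=True)
        print([n.get_content() for n in nodes])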

    abuild_semantic_nodes_from_documents async #

    abuild_semantic_nodes_from_documents(documents: Sequence[Document], show_progress: bool = False) -> List[BaseNode]
    

    Asynchronously build window nodes from documents.

    Source code in llama-index-core/llama_index/core/node_parser/text/semantic_splitter.py, lines 194-229.
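
    A hedged async sketch (assumed usage, not part of the original page): the awaitable variant follows the same pipeline as the synchronous method but fetches embeddings with aget_text_embedding_batch, which helps when parsing many documents inside an existing event loop.

        import asyncio

        from llama_index.core import Document
        from llama_index.core.node_parser import SemanticSplitterNodeParser
        from llama_index.embeddings.openai import OpenAIEmbedding


        async def main() -> None:
            splitter = SemanticSplitterNodeParser.from_defaults(embed_model=OpenAIEmbedding())
            docs = [
                Document(
                    text=(
                        "Async parsing mirrors the synchronous path. "
                        "Only the embedding calls are awaited."
                    )
                )
            ]
            nodes = await splitter.abuild_semantic_nodes_from_documents(docs)
            print([n.get_content() for n in nodes])


        asyncio.run(main())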