跳到内容

Json

JSONReader #

继承自: BaseReader

JSON 读取器。

读取 JSON 文档,并提供选项以帮助我们处理节点之间的关系。

参数

名称 类型 描述 默认值
levels_back int

在 JSON 树中回溯的层数,如果需要所有层,则为 0。如果 levels_back 为 None,则我们只格式化 JSON,并将每一行作为一个嵌入。

None

collapse_length int

在输出中折叠 JSON 片段的最大字符数(levels_back 不能为 None)。例如:如果 collapse_length = 10,输入为 {a: [1, 2, 3], b: {"hello": "world", "foo": "bar"}},那么 a 将被折叠成一行,而 b 不会。建议从 100 左右开始调整。

None

is_jsonl Optional[bool]

如果为 True,表示文件为 JSONL 格式。

False
clean_json Optional[bool]

如果为 True,则移除只包含 JSON 结构体的行。

True
源代码位于 llama-index-integrations/readers/llama-index-readers-json/llama_index/readers/json/base.py
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class JSONReader(BaseReader):
    """
    JSON reader.

    Reads JSON documents with options to help infer relationships between nodes.

    Args:
        levels_back (int): the number of levels to go back in the JSON tree, 0
          if you want all levels. If levels_back is None, then we just format the
          JSON and make each line an embedding

        collapse_length (int): the maximum number of characters a JSON fragment
          would be collapsed in the output (levels_back needs to be not None)
          ex: if collapse_length = 10, and
          input is {a: [1, 2, 3], b: {"hello": "world", "foo": "bar"}}
          then a would be collapsed into one line, while b would not.
          Recommend starting around 100 and then adjusting from there.

        is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format.
        Defaults to False.

        clean_json (Optional[bool]): If True, lines containing only JSON structure are removed.
        This removes lines that are not as useful. If False, no lines are removed and the document maintains a valid JSON object structure.
        If levels_back is set the json is not cleaned and this option is ignored.
        Defaults to True.

    """

    # Matches lines consisting solely of JSON structural characters ({, }, [, ], ,)
    # — i.e. lines carrying no data, dropped when clean_json is True.
    # Compiled once at class level instead of re-matching an uncompiled
    # pattern for every line of every document.
    _STRUCTURE_ONLY_RE = re.compile(r"^[{}\[\],]*$")

    def __init__(
        self,
        levels_back: Optional[int] = None,
        collapse_length: Optional[int] = None,
        ensure_ascii: bool = False,
        is_jsonl: Optional[bool] = False,
        clean_json: Optional[bool] = True,
    ) -> None:
        """Initialize with arguments."""
        super().__init__()
        self.levels_back = levels_back
        self.collapse_length = collapse_length
        self.ensure_ascii = ensure_ascii
        self.is_jsonl = is_jsonl
        self.clean_json = clean_json

    def load_data(
        self, input_file: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """
        Load data from the input file.

        Args:
            input_file (str): path to a JSON (or JSONL) file.
            extra_info (Optional[Dict]): metadata attached to every produced
                Document. Defaults to an empty dict, created per call
                (previously a mutable ``{}`` default shared across calls).

        Returns:
            List[Document]: one Document per top-level JSON object
                (JSONL input yields one per non-blank line).
        """
        # Fresh dict per call — avoids the shared-mutable-default pitfall.
        if extra_info is None:
            extra_info = {}

        with open(input_file, encoding="utf-8") as f:
            if self.is_jsonl:
                # One JSON object per line; skip blank lines, which would
                # otherwise raise a JSONDecodeError.
                load_data = [json.loads(line) for line in f if line.strip()]
            else:
                load_data = [json.load(f)]

        documents = []
        for data in load_data:
            if self.levels_back is None:
                if self.clean_json is True:
                    # Pretty-print the JSON and drop structure-only lines,
                    # so each remaining line is a useful embedding unit.
                    json_output = json.dumps(
                        data, indent=0, ensure_ascii=self.ensure_ascii
                    )
                    useful_lines = [
                        line
                        for line in json_output.split("\n")
                        if not self._STRUCTURE_ONLY_RE.match(line)
                    ]
                    documents.append(
                        Document(text="\n".join(useful_lines), metadata=extra_info)
                    )
                elif self.clean_json is False:
                    # Keep the document as one compact, valid JSON string.
                    json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
                    documents.append(Document(text=json_output, metadata=extra_info))
                # NOTE: clean_json=None intentionally produces no document,
                # preserving the original strict `is True` / `is False` checks.
            else:
                # levels_back is set: each emitted line carries labels from
                # further up the JSON tree (see _depth_first_yield).
                lines = list(
                    _depth_first_yield(
                        data,
                        self.levels_back,
                        self.collapse_length,
                        [],
                        self.ensure_ascii,
                    )
                )
                documents.append(
                    Document(text="\n".join(lines), metadata=extra_info)
                )
        return documents

load_data #

load_data(input_file: str, extra_info: Optional[Dict] = {}) -> List[Document]

从输入文件加载数据。

源代码位于 llama-index-integrations/readers/llama-index-readers-json/llama_index/readers/json/base.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def load_data(
    self, input_file: str, extra_info: Optional[Dict] = None
) -> List[Document]:
    """
    Load data from the input file.

    Args:
        input_file (str): path to a JSON (or JSONL) file.
        extra_info (Optional[Dict]): metadata attached to every produced
            Document. Defaults to an empty dict, created per call
            (previously a mutable ``{}`` default shared across calls).

    Returns:
        List[Document]: one Document per top-level JSON object
            (JSONL input yields one per non-blank line).
    """
    # Fresh dict per call — avoids the shared-mutable-default pitfall.
    if extra_info is None:
        extra_info = {}

    with open(input_file, encoding="utf-8") as f:
        if self.is_jsonl:
            # One JSON object per line; skip blank lines, which would
            # otherwise raise a JSONDecodeError.
            load_data = [json.loads(line) for line in f if line.strip()]
        else:
            load_data = [json.load(f)]

    # Lines consisting solely of JSON structural characters ({, }, [, ], ,)
    # carry no data; compiled once instead of per line.
    structure_only = re.compile(r"^[{}\[\],]*$")

    documents = []
    for data in load_data:
        if self.levels_back is None:
            if self.clean_json is True:
                # Pretty-print the JSON and drop structure-only lines,
                # so each remaining line is a useful embedding unit.
                json_output = json.dumps(
                    data, indent=0, ensure_ascii=self.ensure_ascii
                )
                useful_lines = [
                    line
                    for line in json_output.split("\n")
                    if not structure_only.match(line)
                ]
                documents.append(
                    Document(text="\n".join(useful_lines), metadata=extra_info)
                )
            elif self.clean_json is False:
                # Keep the document as one compact, valid JSON string.
                json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
                documents.append(Document(text=json_output, metadata=extra_info))
            # NOTE: clean_json=None intentionally produces no document,
            # preserving the original strict `is True` / `is False` checks.
        else:
            # levels_back is set: each emitted line carries labels from
            # further up the JSON tree (see _depth_first_yield).
            lines = list(
                _depth_first_yield(
                    data,
                    self.levels_back,
                    self.collapse_length,
                    [],
                    self.ensure_ascii,
                )
            )
            documents.append(
                Document(text="\n".join(lines), metadata=extra_info)
            )
    return documents