跳到内容

Json

JSONReader #

继承自: BaseReader

JSON 读取器。

读取 JSON 文档,并提供选项以帮助我们处理节点之间的关系。

参数

名称 类型 描述 默认值
levels_back int

在 JSON 树中回溯的层数,如果需要所有层,则为 0。如果 levels_back 为 None,则我们只格式化 JSON,并将每一行作为一个嵌入。

None

collapse_length int

在输出中折叠 JSON 片段的最大字符数(levels_back 不能为 None)。例如:如果 collapse_length = 10,输入为 {a: [1, 2, 3], b: {"hello": "world", "foo": "bar"}},那么 a 将被折叠成一行,而 b 不会。建议从 100 左右开始调整。

None

is_jsonl Optional[bool]

如果为 True,表示文件为 JSONL 格式。

False
clean_json Optional[bool]

如果为 True,则移除只包含 JSON 结构体的行。

True
源代码位于 llama-index-integrations/readers/llama-index-readers-json/llama_index/readers/json/base.py
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class JSONReader(BaseReader):
    """
    JSON reader.

    Reads JSON documents with options to help infer relationships between nodes.

    Args:
        levels_back (int): the number of levels to go back in the JSON tree, 0
          if you want all levels. If levels_back is None, then we just format the
          JSON and make each line an embedding

        collapse_length (int): the maximum number of characters a JSON fragment
          would be collapsed in the output (levels_back needs to be not None)
          ex: if collapse_length = 10, and
          input is {a: [1, 2, 3], b: {"hello": "world", "foo": "bar"}}
          then a would be collapsed into one line, while b would not.
          Recommend starting around 100 and then adjusting from there.

        is_jsonl (Optional[bool]): If True, indicates that the file is in JSONL format.
        Defaults to False.

        clean_json (Optional[bool]): If True, lines containing only JSON structure are removed.
        This removes lines that are not as useful. If False, no lines are removed and the document maintains a valid JSON object structure.
        If levels_back is set the json is not cleaned and this option is ignored.
        Defaults to True.

    """

    # Matches lines consisting solely of JSON structural characters ({, }, [, ], ,)
    # — i.e. lines carrying no data, dropped when clean_json is True.
    # Compiled once at class level instead of re-matching an uncompiled
    # pattern for every line of every document.
    _STRUCTURE_ONLY_RE = re.compile(r"^[{}\[\],]*$")

    def __init__(
        self,
        levels_back: Optional[int] = None,
        collapse_length: Optional[int] = None,
        ensure_ascii: bool = False,
        is_jsonl: Optional[bool] = False,
        clean_json: Optional[bool] = True,
    ) -> None:
        """Initialize with arguments."""
        super().__init__()
        self.levels_back = levels_back
        self.collapse_length = collapse_length
        self.ensure_ascii = ensure_ascii
        self.is_jsonl = is_jsonl
        self.clean_json = clean_json

    def load_data(
        self, input_file: str, extra_info: Optional[Dict] = None
    ) -> List[Document]:
        """
        Load data from the input file.

        Args:
            input_file (str): path to a JSON (or JSONL) file.
            extra_info (Optional[Dict]): metadata attached to every produced
                Document. Defaults to an empty dict, created per call
                (previously a mutable ``{}`` default shared across calls).

        Returns:
            List[Document]: one Document per top-level JSON object
                (JSONL input yields one per non-blank line).
        """
        # Fresh dict per call — avoids the shared-mutable-default pitfall.
        if extra_info is None:
            extra_info = {}

        with open(input_file, encoding="utf-8") as f:
            if self.is_jsonl:
                # One JSON object per line; skip blank lines, which would
                # otherwise raise a JSONDecodeError.
                load_data = [json.loads(line) for line in f if line.strip()]
            else:
                load_data = [json.load(f)]

        documents = []
        for data in load_data:
            if self.levels_back is None:
                if self.clean_json is True:
                    # Pretty-print the JSON and drop structure-only lines,
                    # so each remaining line is a useful embedding unit.
                    json_output = json.dumps(
                        data, indent=0, ensure_ascii=self.ensure_ascii
                    )
                    useful_lines = [
                        line
                        for line in json_output.split("\n")
                        if not self._STRUCTURE_ONLY_RE.match(line)
                    ]
                    documents.append(
                        Document(text="\n".join(useful_lines), metadata=extra_info)
                    )
                elif self.clean_json is False:
                    # Keep the document as one compact, valid JSON string.
                    json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
                    documents.append(Document(text=json_output, metadata=extra_info))
                # NOTE: clean_json=None intentionally produces no document,
                # preserving the original strict `is True` / `is False` checks.
            else:
                # levels_back is set: each emitted line carries labels from
                # further up the JSON tree (see _depth_first_yield).
                lines = list(
                    _depth_first_yield(
                        data,
                        self.levels_back,
                        self.collapse_length,
                        [],
                        self.ensure_ascii,
                    )
                )
                documents.append(
                    Document(text="\n".join(lines), metadata=extra_info)
                )
        return documents

load_data #

load_data(input_file: str, extra_info: Optional[Dict] = {}) -> List[Document]

从输入文件加载数据。

源代码位于 llama-index-integrations/readers/llama-index-readers-json/llama_index/readers/json/base.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
def load_data(
    self, input_file: str, extra_info: Optional[Dict] = None
) -> List[Document]:
    """
    Load data from the input file.

    Args:
        input_file (str): path to a JSON (or JSONL) file.
        extra_info (Optional[Dict]): metadata attached to every produced
            Document. Defaults to an empty dict, created per call
            (previously a mutable ``{}`` default shared across calls).

    Returns:
        List[Document]: one Document per top-level JSON object
            (JSONL input yields one per non-blank line).
    """
    # Fresh dict per call — avoids the shared-mutable-default pitfall.
    if extra_info is None:
        extra_info = {}

    with open(input_file, encoding="utf-8") as f:
        if self.is_jsonl:
            # One JSON object per line; skip blank lines, which would
            # otherwise raise a JSONDecodeError.
            load_data = [json.loads(line) for line in f if line.strip()]
        else:
            load_data = [json.load(f)]

    # Lines consisting solely of JSON structural characters ({, }, [, ], ,)
    # carry no data; compiled once instead of per line.
    structure_only = re.compile(r"^[{}\[\],]*$")

    documents = []
    for data in load_data:
        if self.levels_back is None:
            if self.clean_json is True:
                # Pretty-print the JSON and drop structure-only lines,
                # so each remaining line is a useful embedding unit.
                json_output = json.dumps(
                    data, indent=0, ensure_ascii=self.ensure_ascii
                )
                useful_lines = [
                    line
                    for line in json_output.split("\n")
                    if not structure_only.match(line)
                ]
                documents.append(
                    Document(text="\n".join(useful_lines), metadata=extra_info)
                )
            elif self.clean_json is False:
                # Keep the document as one compact, valid JSON string.
                json_output = json.dumps(data, ensure_ascii=self.ensure_ascii)
                documents.append(Document(text=json_output, metadata=extra_info))
            # NOTE: clean_json=None intentionally produces no document,
            # preserving the original strict `is True` / `is False` checks.
        else:
            # levels_back is set: each emitted line carries labels from
            # further up the JSON tree (see _depth_first_yield).
            lines = list(
                _depth_first_yield(
                    data,
                    self.levels_back,
                    self.collapse_length,
                    [],
                    self.ensure_ascii,
                )
            )
            documents.append(
                Document(text="\n".join(lines), metadata=extra_info)
            )
    return documents