跳到内容

文档字符串遍历器

Init 文件。

DocstringWalker #

继承自: BaseReader

用于提取 docstring 并从中构建结构化文档的加载器。递归遍历目录,从每个 Python 模块中提取 docstring - 首先是模块本身,然后是类,最后是函数。构建提取的 docstring 之间的依赖关系图。

源代码位于 llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
class DocstringWalker(BaseReader):
    """
    A loader that extracts docstrings and builds structured documents from them.

    Recursively walks a directory and, for every Python module found, renders
    the module docstring followed by the docstrings of its classes and
    functions (nested definitions included) into a single Document.
    """

    def load_data(
        self,
        code_dir: str,
        skip_initpy: bool = True,
        fail_on_malformed_files: bool = False,
    ) -> List[Document]:
        """
        Load documents from the specified code directory.

        This is the reader entry point; it forwards its arguments unchanged
        to ``process_directory``.

        Parameters
        ----------
        code_dir : str
            The directory path to the code files.
        skip_initpy : bool
            Whether to skip the __init__.py files. Defaults to True.
        fail_on_malformed_files : bool
            Whether to fail on malformed files. Defaults to False - in this case,
            the malformed files are skipped and a warning is logged.

        Returns
        -------
        List[Document]
            A list of loaded documents, one per successfully parsed module.

        """
        return self.process_directory(code_dir, skip_initpy, fail_on_malformed_files)

    def process_directory(
        self,
        code_dir: str,
        skip_initpy: bool = True,
        fail_on_malformed_files: bool = False,
    ) -> List[Document]:
        """
        Process a directory and extract information from Python files.

        Parameters
        ----------
        code_dir : str
            The directory path to the code files.
        skip_initpy : bool
            Whether to skip the __init__.py files. Defaults to True.
        fail_on_malformed_files : bool
            Whether to fail on malformed files. Defaults to False - in this case,
            the malformed files are skipped and a warning is logged.

        Returns
        -------
        List[Document]
            A list of Document objects.

        Raises
        ------
        Exception
            Whatever ``parse_module`` raised, re-raised unchanged, but only
            when ``fail_on_malformed_files`` is True.

        """
        extension = ".py"
        llama_docs = []
        for root, _, files in os.walk(code_dir):
            for file in files:
                if not file.endswith(extension):
                    continue
                if skip_initpy and file == "__init__.py":
                    continue
                # Strip only the trailing extension. str.replace would also
                # drop ".py" appearing elsewhere in the name
                # (e.g. "my.pyutils.py" would become "myutils").
                module_name = file[: -len(extension)]
                module_path = os.path.join(root, file)
                try:
                    doc = self.parse_module(module_name, module_path)
                except Exception as e:
                    if fail_on_malformed_files:
                        # Bare raise keeps the original traceback intact.
                        raise
                    log.warning(
                        "Failed to parse file %s. Skipping. Error: %s",
                        module_path,
                        e,
                    )
                    continue
                llama_docs.append(doc)
        return llama_docs

    def read_module_text(self, path: str) -> str:
        """
        Read the text of a Python module. For tests this function can be mocked.

        Parameters
        ----------
        path : str
            Path to the module.

        Returns
        -------
        str
            The text of the module, decoded as UTF-8.

        """
        with open(path, encoding="utf-8") as f:
            return f.read()

    def parse_module(self, module_name: str, path: str) -> Document:
        """
        Parse a single Python module into a Document.

        Parameters
        ----------
        module_name : str
            A module name.
        path : str
            Path to the module.

        Returns
        -------
        Document
            A LLama Index Document object whose text holds the module name,
            the module docstring and the rendered text of every top-level
            element whose type is listed in ``TYPES_TO_PROCESS``.

        """
        source = self.read_module_text(path)
        module = ast.parse(source)
        module_docstring = ast.get_docstring(module)
        module_text = f"Module name: {module_name} \n Docstring: {module_docstring} \n"
        sub_texts = []
        for elem in module.body:
            if type(elem) in TYPES_TO_PROCESS:
                sub_text = self.process_elem(elem, module_name)
                sub_texts.append(sub_text)
        module_text += "\n".join(sub_texts)
        return Document(text=module_text)

    def process_class(self, class_node: ast.ClassDef, parent_node: str) -> str:
        """
        Build the text for a class node and everything defined in its body.

        Parameters
        ----------
        class_node : ast.ClassDef
            The class node to process. It represents a class definition
            in the abstract syntax tree (AST).
        parent_node : str
            The name of the enclosing element (module, class or function).

        Returns
        -------
        str
            The class name, parent name and docstring, followed by the text
            of each body element as produced by ``process_elem``.

        """
        cls_name = class_node.name
        cls_docstring = ast.get_docstring(class_node)

        text = f"\n Class name: {cls_name}, In: {parent_node} \n Docstring: {cls_docstring}"
        sub_texts = []
        for elem in class_node.body:
            sub_text = self.process_elem(elem, cls_name)
            sub_texts.append(sub_text)
        return text + "\n".join(sub_texts)

    def process_function(self, func_node: ast.FunctionDef, parent_node: str) -> str:
        """
        Build the text for a function node and everything defined in its body.

        Parameters
        ----------
        func_node : ast.FunctionDef
            The function node to process.
        parent_node : str
            The name of the enclosing element (module, class or function).

        Returns
        -------
        str
            The function name, parent name and docstring, followed by the
            text of each body element as produced by ``process_elem``.

        """
        func_name = func_node.name
        func_docstring = ast.get_docstring(func_node)

        text = f"\n Function name: {func_name}, In: {parent_node} \n Docstring: {func_docstring}"
        sub_texts = []
        for elem in func_node.body:
            sub_text = self.process_elem(elem, func_name)
            sub_texts.append(sub_text)
        return text + "\n".join(sub_texts)

    def process_elem(self, elem, parent_node: str) -> str:
        """
        Process an element in the abstract syntax tree (AST).

        This is a generic function that delegates the execution to more specific
        functions based on the type of the element.

        Parameters
        ----------
        elem : ast.AST
            The element to process.
        parent_node : str
            The name of the enclosing element.

        Returns
        -------
        str
            The text produced for a function or class definition, or an
            empty string for any other kind of node.

        """
        if isinstance(elem, ast.FunctionDef):
            return self.process_function(elem, parent_node)
        elif isinstance(elem, ast.ClassDef):
            return self.process_class(elem, parent_node)
        return ""

load_data #

load_data(code_dir: str, skip_initpy: bool = True, fail_on_malformed_files: bool = False) -> List[Document]

从指定的代码目录加载数据。此外,加载数据后,构建加载文档之间的依赖关系图。该图存储为类的属性。

参数#

code_dir : str 代码文件所在的目录路径。 skip_initpy : bool 是否跳过 `__init__.py` 文件。默认为 True。 fail_on_malformed_files : bool 是否在文件格式错误时失败。默认为 False - 在此情况下,格式错误的文件将被跳过并记录警告。

返回值#

List[Document] 加载的文档列表。

源代码位于 llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
def load_data(
    self,
    code_dir: str,
    skip_initpy: bool = True,
    fail_on_malformed_files: bool = False,
) -> List[Document]:
    """
    Load documents from a code directory.

    Entry point of the reader: the arguments are handed over, unchanged and
    in the same order, to ``process_directory`` and its result is returned.

    Parameters
    ----------
    code_dir : str
        Path of the directory that holds the code files.
    skip_initpy : bool
        If True (the default), __init__.py files are not processed.
    fail_on_malformed_files : bool
        If True, a file that cannot be parsed aborts the load. With the
        default of False such files are skipped and a warning is logged.

    Returns
    -------
    List[Document]
        The documents returned by ``process_directory``.

    """
    documents = self.process_directory(
        code_dir,
        skip_initpy,
        fail_on_malformed_files,
    )
    return documents

process_directory #

process_directory(code_dir: str, skip_initpy: bool = True, fail_on_malformed_files: bool = False) -> List[Document]

处理目录并从 Python 文件中提取信息。

参数#

code_dir : str 代码文件所在的目录路径。 skip_initpy : bool 是否跳过 `__init__.py` 文件。默认为 True。 fail_on_malformed_files : bool 是否在文件格式错误时失败。默认为 False - 在此情况下,格式错误的文件将被跳过并记录警告。

返回值#

List[Document] Document 对象列表。

源代码位于 llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def process_directory(
    self,
    code_dir: str,
    skip_initpy: bool = True,
    fail_on_malformed_files: bool = False,
) -> List[Document]:
    """
    Process a directory and extract information from Python files.

    Walks ``code_dir`` recursively and calls ``parse_module`` on every file
    whose name ends with ".py".

    Parameters
    ----------
    code_dir : str
        The directory path to the code files.
    skip_initpy : bool
        Whether to skip the __init__.py files. Defaults to True.
    fail_on_malformed_files : bool
        Whether to fail on malformed files. Defaults to False - in this case,
        the malformed files are skipped and a warning is logged.

    Returns
    -------
    List[Document]
        A list of Document objects, one per successfully parsed module.

    Raises
    ------
    Exception
        Whatever ``parse_module`` raised, re-raised unchanged, but only when
        ``fail_on_malformed_files`` is True.

    """
    extension = ".py"
    llama_docs = []
    for root, _, files in os.walk(code_dir):
        for file in files:
            if not file.endswith(extension):
                continue
            if skip_initpy and file == "__init__.py":
                continue
            # Strip only the trailing extension. str.replace would also drop
            # ".py" appearing elsewhere in the name
            # (e.g. "my.pyutils.py" would become "myutils").
            module_name = file[: -len(extension)]
            module_path = os.path.join(root, file)
            try:
                doc = self.parse_module(module_name, module_path)
            except Exception as e:
                if fail_on_malformed_files:
                    # Bare raise keeps the original traceback intact.
                    raise
                log.warning(
                    "Failed to parse file %s. Skipping. Error: %s",
                    module_path,
                    e,
                )
                continue
            llama_docs.append(doc)
    return llama_docs

read_module_text #

read_module_text(path: str) -> str

读取 Python 模块的文本。对于测试,此函数可以被模拟。

参数#

path : str 模块路径。

返回值#

str 模块的文本。

源代码位于 llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
def read_module_text(self, path: str) -> str:
    """
    Return the full source text of a Python module.

    Kept as a separate method so tests can replace it with a mock.

    Parameters
    ----------
    path : str
        Path to the module file.

    Returns
    -------
    str
        The content of the file, decoded as UTF-8.

    """
    with open(path, encoding="utf-8") as module_file:
        contents = module_file.read()
    return contents

parse_module #

parse_module(module_name: str, path: str) -> Document

用于解析单个 Python 模块的函数。

参数#

module_name : str 模块名称。 path : str 模块路径。

返回值#

Document 一个包含从模块中提取的信息的 LLama Index Document 对象。

源代码位于 llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
def parse_module(self, module_name: str, path: str) -> Document:
    """
    Parse one Python module and render it as a Document.

    Parameters
    ----------
    module_name : str
        Name of the module; it appears in the output text and is passed to
        ``process_elem`` as the parent name of the top-level elements.
    path : str
        Path to the module file.

    Returns
    -------
    Document
        A LLama Index Document whose text is the module header (name and
        docstring) followed by the rendered text of every top-level element
        whose type is listed in ``TYPES_TO_PROCESS``.

    """
    tree = ast.parse(self.read_module_text(path))
    header = f"Module name: {module_name} \n Docstring: {ast.get_docstring(tree)} \n"
    # Exact type match (not isinstance), as only the listed node types qualify.
    rendered = [
        self.process_elem(node, module_name)
        for node in tree.body
        if type(node) in TYPES_TO_PROCESS
    ]
    return Document(text=header + "\n".join(rendered))

process_class #

process_class(class_node: ClassDef, parent_node: str)

处理 AST 中的类节点,并将相关信息添加到图中。

参数#

class_node : ast.ClassDef 要处理的类节点。它表示抽象语法树 (AST) 中的一个类定义。 parent_node : str 父节点的名称。它指定了图中父节点的名称。

返回值#

str 处理后的类节点及其子元素的字符串表示。它提供了处理后的类节点及其子元素的文本表示。

源代码位于 llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
def process_class(self, class_node: ast.ClassDef, parent_node: str):
    """
    Render a class definition and the elements of its body as text.

    Parameters
    ----------
    class_node : ast.ClassDef
        The class definition node taken from the abstract syntax tree (AST).
    parent_node : str
        Name of the element that contains the class.

    Returns
    -------
    str
        A header with the class name, the parent name and the class
        docstring, followed by the newline-joined results of calling
        ``process_elem`` on every statement of the class body.

    """
    name = class_node.name
    header = f"\n Class name: {name}, In: {parent_node} \n Docstring: {ast.get_docstring(class_node)}"
    # The class becomes the parent of everything found in its body.
    members = [self.process_elem(child, name) for child in class_node.body]
    return header + "\n".join(members)

process_function #

process_function(func_node: FunctionDef, parent_node: str) -> str

处理 AST 中的函数节点并将其添加到图中。构建节点文本。

参数#

func_node : ast.FunctionDef 要处理的函数节点。 parent_node : str 父节点的名称。

返回值#

str 处理后的函数节点及其子元素的字符串表示。

源代码位于 llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def process_function(self, func_node: ast.FunctionDef, parent_node: str) -> str:
    """
    Render a function definition and the elements of its body as text.

    Parameters
    ----------
    func_node : ast.FunctionDef
        The function definition node taken from the AST.
    parent_node : str
        Name of the element that contains the function.

    Returns
    -------
    str
        A header with the function name, the parent name and the function
        docstring, followed by the newline-joined results of calling
        ``process_elem`` on every statement of the function body.

    """
    name = func_node.name
    docstring = ast.get_docstring(func_node)
    header = f"\n Function name: {name}, In: {parent_node} \n Docstring: {docstring}"
    # The function becomes the parent of everything found in its body.
    body_text = "\n".join(self.process_elem(stmt, name) for stmt in func_node.body)
    return header + body_text

process_elem #

process_elem(elem, parent_node: str) -> str

处理抽象语法树 (AST) 中的元素。

这是一个通用函数,根据元素的类型将执行委托给更具体的函数。

参数

名称 类型 描述 默认值
elem AST

要处理的元素。

必需的
parent_node str

图中的父节点。

必需的
graph Graph

要更新的图。

必需的

返回值

名称 类型 描述
str str

处理元素的结果。

源代码位于 llama-index-integrations/readers/llama-index-readers-docstring-walker/llama_index/readers/docstring_walker/base.py
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
def process_elem(self, elem, parent_node: str) -> str:
    """
    Dispatch an AST element to the handler that matches its type.

    Class definitions go to ``process_class`` and function definitions to
    ``process_function``; every other kind of node yields no text.

    Args:
        elem (ast.AST): The element to process.
        parent_node (str): Name of the element that contains ``elem``.

    Returns:
        str: The handler's result, or an empty string when ``elem`` is
            neither an ``ast.ClassDef`` nor an ``ast.FunctionDef``.

    """
    if isinstance(elem, ast.ClassDef):
        return self.process_class(elem, parent_node)
    if isinstance(elem, ast.FunctionDef):
        return self.process_function(elem, parent_node)
    return ""