跳到内容

MongoDB

SimpleMongoReader #

基类:BaseReader

简单的 Mongo 阅读器。

将每个 Mongo 文档中指定字段的内容拼接起来,生成 LlamaIndex 使用的 Document 对象。

参数

名称 类型 描述 默认值
host str

Mongo 主机。

None
port int

Mongo 端口。

None
源代码位于 llama-index-integrations/readers/llama-index-readers-mongodb/llama_index/readers/mongodb/base.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
class SimpleMongoReader(BaseReader):
    """
    Simple mongo reader.

    Concatenates selected fields of each Mongo document into a Document
    used by LlamaIndex.

    Supply either ``uri`` or both ``host`` and ``port``. When all three are
    given, ``uri`` takes precedence.

    Args:
        host (Optional[str]): Mongo host. Only used together with ``port``.
        port (Optional[int]): Mongo port. Only used together with ``host``.
        uri (Optional[str]): Mongo connection URI.

    Raises:
        ImportError: If the ``pymongo`` package is not installed.
        ValueError: If neither ``uri`` nor both ``host`` and ``port`` are
            provided.

    """

    def __init__(
        self,
        host: Optional[str] = None,
        port: Optional[int] = None,
        uri: Optional[str] = None,
    ) -> None:
        """Create the ``MongoClient`` from ``uri`` or from ``host``/``port``."""
        # Imported lazily so that pymongo is only required when the reader
        # is actually instantiated.
        try:
            from pymongo import MongoClient
        except ImportError as err:
            raise ImportError(
                "`pymongo` package not found, please run `pip install pymongo`"
            ) from err

        client: MongoClient
        if uri:
            client = MongoClient(uri)
        elif host and port:
            # Truthiness check: an empty host or a port of 0 counts as missing.
            client = MongoClient(host, port)
        else:
            raise ValueError("Either `host` and `port` or `uri` must be provided.")

        self.client = client

    def lazy_load_data(
        self,
        db_name: str,
        collection_name: str,
        field_names: List[str] = ["text"],
        separator: str = "",
        query_dict: Optional[Dict] = None,
        max_docs: int = 0,
        metadata_names: Optional[List[str]] = None,
        field_extractors: Optional[Dict[str, Callable[..., str]]] = None,
    ) -> Iterable[Document]:
        """
        Lazily load documents from a MongoDB collection.

        This is a generator: the query is issued and results are converted
        only as the caller iterates, so errors surface during iteration
        rather than at call time.

        Args:
            db_name (str): name of the database.
            collection_name (str): name of the collection.
            field_names (List[str]): names of the fields to be concatenated.
                Defaults to ["text"]
            separator (str): separator to be used between fields.
                Defaults to ""
            query_dict (Optional[Dict]): query to filter documents. Read more
                at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query).
                Defaults to None, which is sent as an empty filter.
            max_docs (int): maximum number of documents to load.
                Defaults to 0 (no limit)
            metadata_names (Optional[List[str]]): names of the fields to be added
                to the metadata attribute of the Document. A name missing from
                a Mongo document is stored as None. When given, a
                "collection" key holding ``collection_name`` is also added
                and overrides any metadata field of the same name.
                Defaults to None
            field_extractors (Optional[Dict[str, Callable[..., str]]]): dictionary
                containing field name and a function to extract text from the field.
                The default extractor function is `str`. Defaults to None.

        Yields:
            Document: one Document per Mongo document, whose text is the
            extracted fields joined by ``separator`` and whose id is the
            stringified ``_id``.

        Raises:
            ValueError: if a name in ``field_names`` is absent from a
                fetched Mongo document.

        """
        db = self.client[db_name]

        # Unpacking builds a fresh list and, unlike ``+``, also accepts
        # tuples or other iterables of names. The shared default list of
        # ``field_names`` is never mutated.
        projected_names = [*field_names, *(metadata_names or [])]
        cursor = db[collection_name].find(
            filter=query_dict or {},
            limit=max_docs,
            projection=dict.fromkeys(projected_names, 1),
        )

        field_extractors = field_extractors or {}

        for item in cursor:
            try:
                texts = [
                    field_extractors.get(name, str)(item[name]) for name in field_names
                ]
            except KeyError as err:
                raise ValueError(
                    f"{err.args[0]} field not found in Mongo document."
                ) from err

            text = separator.join(texts)

            if metadata_names is None:
                yield Document(text=text, id_=str(item["_id"]))
                continue

            # ``.get`` never raises KeyError, so no error handling is needed
            # here; missing metadata fields simply map to None.
            metadata = {name: item.get(name) for name in metadata_names}
            metadata["collection"] = collection_name
            yield Document(text=text, id_=str(item["_id"]), metadata=metadata)

lazy_load_data #

lazy_load_data(db_name: str, collection_name: str, field_names: List[str] = ['text'], separator: str = '', query_dict: Optional[Dict] = None, max_docs: int = 0, metadata_names: Optional[List[str]] = None, field_extractors: Optional[Dict[str, Callable[..., str]]] = None) -> Iterable[Document]

从 MongoDB 集合中惰性加载数据。

参数

名称 类型 描述 默认值
db_name str

数据库名称。

必填
collection_name str

集合名称。

必填
field_names List[str]

要连接的字段名称。默认为 ["text"]

['text']
separator str

字段间使用的分隔符。默认为 ""

''
query_dict 可选[Dict]

用于过滤文档的查询。了解更多信息,请参阅[官方文档](https://mongodb.ac.cn/docs/manual/reference/method/db.collection.find/#std-label-method-find-query)。默认为 None

None
max_docs int

要加载的最大文档数量。默认为 0(无限制)

0
metadata_names 可选[List[str]]

要添加到 Document 元数据属性的字段名称。默认为 None

None
field_extractors 可选[Dict[str, Callable[..., str]]]

包含字段名称和用于从字段中提取文本的函数的字典。默认的提取函数是 str。默认为 None。

None

返回值

类型 描述
Iterable[Document]

逐个惰性生成的 Document 对象(可迭代对象,而非列表)。

源代码位于 llama-index-integrations/readers/llama-index-readers-mongodb/llama_index/readers/mongodb/base.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def lazy_load_data(
    self,
    db_name: str,
    collection_name: str,
    field_names: List[str] = ["text"],
    separator: str = "",
    query_dict: Optional[Dict] = None,
    max_docs: int = 0,
    metadata_names: Optional[List[str]] = None,
    field_extractors: Optional[Dict[str, Callable[..., str]]] = None,
) -> Iterable[Document]:
    """
    Lazily load documents from a MongoDB collection.

    This is a generator: the query is issued and results are converted
    only as the caller iterates, so errors surface during iteration
    rather than at call time.

    Args:
        db_name (str): name of the database.
        collection_name (str): name of the collection.
        field_names (List[str]): names of the fields to be concatenated.
            Defaults to ["text"]
        separator (str): separator to be used between fields.
            Defaults to ""
        query_dict (Optional[Dict]): query to filter documents. Read more
            at [official docs](https://www.mongodb.com/docs/manual/reference/method/db.collection.find/#std-label-method-find-query).
            Defaults to None, which is sent as an empty filter.
        max_docs (int): maximum number of documents to load.
            Defaults to 0 (no limit)
        metadata_names (Optional[List[str]]): names of the fields to be added
            to the metadata attribute of the Document. A name missing from
            a Mongo document is stored as None. When given, a
            "collection" key holding ``collection_name`` is also added
            and overrides any metadata field of the same name.
            Defaults to None
        field_extractors (Optional[Dict[str, Callable[..., str]]]): dictionary
            containing field name and a function to extract text from the field.
            The default extractor function is `str`. Defaults to None.

    Yields:
        Document: one Document per Mongo document, whose text is the
        extracted fields joined by ``separator`` and whose id is the
        stringified ``_id``.

    Raises:
        ValueError: if a name in ``field_names`` is absent from a
            fetched Mongo document.

    """
    db = self.client[db_name]

    # Unpacking builds a fresh list and, unlike ``+``, also accepts
    # tuples or other iterables of names. The shared default list of
    # ``field_names`` is never mutated.
    projected_names = [*field_names, *(metadata_names or [])]
    cursor = db[collection_name].find(
        filter=query_dict or {},
        limit=max_docs,
        projection=dict.fromkeys(projected_names, 1),
    )

    field_extractors = field_extractors or {}

    for item in cursor:
        try:
            texts = [
                field_extractors.get(name, str)(item[name]) for name in field_names
            ]
        except KeyError as err:
            raise ValueError(
                f"{err.args[0]} field not found in Mongo document."
            ) from err

        text = separator.join(texts)

        if metadata_names is None:
            yield Document(text=text, id_=str(item["_id"]))
            continue

        # ``.get`` never raises KeyError, so no error handling is needed
        # here; missing metadata fields simply map to None.
        metadata = {name: item.get(name) for name in metadata_names}
        metadata["collection"] = collection_name
        yield Document(text=text, id_=str(item["_id"]), metadata=metadata)