Apify

ApifyActor #

基类: BaseReader

Apify Actor 读取器。调用 Apify 平台上的一个 Actor，等待其运行完成，并读取其结果数据集。

参数

名称	类型	描述	默认值
`apify_api_token`	`str`	Apify API token。	必需

源代码位于 llama-index-integrations/readers/llama-index-readers-apify/llama_index/readers/apify/actor/base.py

class ApifyActor(BaseReader):
    """
    Apify Actor reader.

    Calls an Actor on the Apify platform and reads its resulting dataset when it finishes.

    Args:
        apify_api_token (str): Apify API token.

    """

    def __init__(self, apify_api_token: str) -> None:
        """Initialize the Apify Actor reader."""
        from apify_client import ApifyClient

        # Kept so load_data can construct an ApifyDataset reader with the same token.
        self.apify_api_token = apify_api_token

        client = ApifyClient(apify_api_token)
        if hasattr(client.http_client, "httpx_client"):
            # Append an origin marker to the user-agent header of the
            # underlying httpx client, when the client exposes one.
            client.http_client.httpx_client.headers[
                "user-agent"
            ] += "; Origin/llama_index"
        self.apify_client = client

    def load_data(
        self,
        actor_id: str,
        run_input: Dict,
        dataset_mapping_function: Callable[[Dict], Document],
        *,
        build: Optional[str] = None,
        memory_mbytes: Optional[int] = None,
        timeout_secs: Optional[int] = None,
    ) -> List[Document]:
        """
        Call an Actor on the Apify platform, wait for it to finish, and return its resulting dataset.

        Args:
            actor_id (str): The ID or name of the Actor.
            run_input (Dict): The input object of the Actor that you're trying to run.
            dataset_mapping_function (Callable): A function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the Document class.
            build (str, optional): Optionally specifies the Actor build to run. It can be either a build tag or build number.
            memory_mbytes (int, optional): Optional memory limit for the run, in megabytes.
            timeout_secs (int, optional): Optional timeout for the run, in seconds.

        Returns:
            List[Document]: List of documents.

        Raises:
            RuntimeError: If the Apify client returns no run object for the call.

        """
        actor_call = self.apify_client.actor(actor_id).call(
            run_input=run_input,
            build=build,
            memory_mbytes=memory_mbytes,
            timeout_secs=timeout_secs,
        )

        # The client may hand back None instead of a run object; fail with a
        # clear message rather than an AttributeError on the .get() below.
        if actor_call is None:
            raise RuntimeError(
                f"Apify Actor {actor_id!r} call returned no run object; "
                "cannot determine the default dataset to read."
            )

        # The run's default dataset is read through a separate dataset reader
        # created with the same API token.
        reader = ApifyDataset(self.apify_api_token)
        return reader.load_data(
            dataset_id=actor_call.get("defaultDatasetId"),
            dataset_mapping_function=dataset_mapping_function,
        )

load_data #

load_data(actor_id: str, run_input: Dict, dataset_mapping_function: Callable[[Dict], Document], *, build: Optional[str] = None, memory_mbytes: Optional[int] = None, timeout_secs: Optional[int] = None) -> List[Document]

调用 Apify 平台上的 Actor，等待其完成，并返回其结果数据集。

参数

名称	类型	描述	默认值
`actor_id`	`str`	Actor 的 ID 或名称。	必需
`run_input`	`Dict`	您正在尝试运行的 Actor 的输入对象。	必需
`dataset_mapping_function`	`Callable`	一个函数，接受一个字典（一个 Apify 数据集项）并将其转换为 Document 类的一个实例。	必需
`build`	`str`	可选地指定要运行的 Actor 构建。它可以是构建标签或构建编号。	`None`
`memory_mbytes`	`int`	本次运行的可选内存限制，以兆字节为单位。	`None`
`timeout_secs`	`int`	本次运行的可选超时时间，以秒为单位。	`None`

返回

类型	描述
`List[Document]`	文档列表。

源代码位于 llama-index-integrations/readers/llama-index-readers-apify/llama_index/readers/apify/actor/base.py

def load_data(
    self,
    actor_id: str,
    run_input: Dict,
    dataset_mapping_function: Callable[[Dict], Document],
    *,
    build: Optional[str] = None,
    memory_mbytes: Optional[int] = None,
    timeout_secs: Optional[int] = None,
) -> List[Document]:
    """
    Call an Actor on the Apify platform, wait for it to finish, and return its resulting dataset.

    Args:
        actor_id (str): The ID or name of the Actor.
        run_input (Dict): The input object of the Actor that you're trying to run.
        dataset_mapping_function (Callable): A function that takes a single dictionary (an Apify dataset item) and converts it to an instance of the Document class.
        build (str, optional): Optionally specifies the Actor build to run. It can be either a build tag or build number.
        memory_mbytes (int, optional): Optional memory limit for the run, in megabytes.
        timeout_secs (int, optional): Optional timeout for the run, in seconds.

    Returns:
        List[Document]: List of documents.

    Raises:
        RuntimeError: If the Apify client returns no run object for the call.

    """
    actor_call = self.apify_client.actor(actor_id).call(
        run_input=run_input,
        build=build,
        memory_mbytes=memory_mbytes,
        timeout_secs=timeout_secs,
    )

    # The client may hand back None instead of a run object; fail with a
    # clear message rather than an AttributeError on the .get() below.
    if actor_call is None:
        raise RuntimeError(
            f"Apify Actor {actor_id!r} call returned no run object; "
            "cannot determine the default dataset to read."
        )

    # The run's default dataset is read through a separate dataset reader
    # created with the same API token.
    reader = ApifyDataset(self.apify_api_token)
    return reader.load_data(
        dataset_id=actor_call.get("defaultDatasetId"),
        dataset_mapping_function=dataset_mapping_function,
    )

ApifyDataset #

基类: BaseReader

Apify 数据集读取器。读取 Apify 平台上的数据集。

参数

名称	类型	描述	默认值
`apify_api_token`	`str`	Apify API token。	必需

源代码位于 llama-index-integrations/readers/llama-index-readers-apify/llama_index/readers/apify/dataset/base.py

class ApifyDataset(BaseReader):
    """
    Reader for datasets stored on the Apify platform.

    Fetches the items of a dataset and turns each one into a Document by
    means of a caller-supplied mapping function.

    Args:
        apify_api_token (str): Apify API token.

    """

    def __init__(self, apify_api_token: str) -> None:
        """Create the Apify client used by this reader."""
        from apify_client import ApifyClient

        apify_client = ApifyClient(apify_api_token)
        http_client = apify_client.http_client
        if hasattr(http_client, "httpx_client"):
            # Append an origin marker to the user-agent header of the
            # underlying httpx client, when the client exposes one.
            request_headers = http_client.httpx_client.headers
            request_headers["user-agent"] += "; Origin/llama_index"

        self.apify_client = apify_client

    def load_data(
        self, dataset_id: str, dataset_mapping_function: Callable[[Dict], Document]
    ) -> List[Document]:
        """
        Read an Apify dataset and map its items to documents.

        Args:
            dataset_id (str): Dataset ID.
            dataset_mapping_function (Callable[[Dict], Document]): Function applied to
                each dataset item to produce a Document.

        Returns:
            List[Document]: One document per dataset item, in the order the items were listed.

        Raises:
            ValueError: If the mapping function returns something that is not a Document.

        """
        dataset_client = self.apify_client.dataset(dataset_id)
        dataset_page = dataset_client.list_items(clean=True)

        collected = []
        for raw_item in dataset_page.items:
            candidate = dataset_mapping_function(raw_item)
            if isinstance(candidate, Document):
                collected.append(candidate)
            else:
                # Stop at the first item the mapping function fails to convert.
                raise ValueError("Dataset_mapping_function must return a Document")

        return collected

load_data #

load_data(dataset_id: str, dataset_mapping_function: Callable[[Dict], Document]) -> List[Document]

从 Apify 数据集加载数据。

参数

名称	类型	描述	默认值
`dataset_id`	`str`	数据集 ID。	必需
`dataset_mapping_function`	`Callable[[Dict], Document]`	将数据集项映射到 Document 的函数。	必需

返回

类型	描述
`List[Document]`	文档列表。

源代码位于 llama-index-integrations/readers/llama-index-readers-apify/llama_index/readers/apify/dataset/base.py

def load_data(
    self, dataset_id: str, dataset_mapping_function: Callable[[Dict], Document]
) -> List[Document]:
    """
    Read an Apify dataset and map its items to documents.

    Args:
        dataset_id (str): Dataset ID.
        dataset_mapping_function (Callable[[Dict], Document]): Function applied to
            each dataset item to produce a Document.

    Returns:
        List[Document]: One document per dataset item, in the order the items were listed.

    Raises:
        ValueError: If the mapping function returns something that is not a Document.

    """

    def _as_document(raw_item):
        # Map a single dataset item and reject any non-Document result.
        candidate = dataset_mapping_function(raw_item)
        if not isinstance(candidate, Document):
            raise ValueError("Dataset_mapping_function must return a Document")
        return candidate

    dataset_page = self.apify_client.dataset(dataset_id).list_items(clean=True)

    # Items are mapped and validated one at a time, in listing order, so the
    # first invalid result aborts the load before later items are mapped.
    return [_as_document(raw_item) for raw_item in dataset_page.items]