跳到内容

远程深度

RemoteDepthReader #

基类: BaseReader

源代码位于 llama-index-integrations/readers/llama-index-readers-remote-depth/llama_index/readers/remote_depth/base.py

class RemoteDepthReader(BaseReader):
    """Reader that follows hyperlinks outward from a start URL and loads each page.

    The links found on the start page form level 0; links found on level-0
    pages form level 1, and so on up to and including level ``depth``.
    The start URL and every collected link are then passed to a
    ``RemoteReader`` to produce documents.
    """

    def __init__(
        self,
        *args: Any,
        file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = None,
        depth: int = 1,
        domain_lock: bool = False,
        **kwargs: Any,
    ) -> None:
        """Init params.

        Args:
            *args: Forwarded to the base reader.
            file_extractor: Passed unchanged to the ``RemoteReader`` that
                loads each page.
            depth: Highest link level to collect (level 0 is the set of
                links on the start page).
            domain_lock: When true, only links that contain the start URL as
                a substring are followed.
            **kwargs: Forwarded to the base reader.
        """
        super().__init__(*args, **kwargs)
        self.file_extractor = file_extractor
        self.depth = depth
        self.domain_lock = domain_lock

    def load_data(self, url: str) -> List[Document]:
        """Parse whatever is at the URL and at the pages linked from it.

        Args:
            url: Page the crawl starts from.

        Returns:
            Documents loaded from the start URL and from every link collected
            up to ``self.depth``. Pages that fail to load are reported on
            stdout and skipped.
        """
        from tqdm.auto import tqdm

        remote_reader = RemoteReader(file_extractor=self.file_extractor)
        documents = []
        links = self.get_links(url)
        urls = {-1: [url]}  # -1 is the starting point
        # Seed with the start URL so a page linking back to it does not cause
        # it to be loaded a second time.
        links_visited = {url}
        for i in range(self.depth + 1):
            urls[i] = []
            new_links = []
            print(f"Reading links at depth {i}...")
            for link in tqdm(links):
                # Check whether the link belongs to the provided domain.
                if self.domain_lock and url not in link:
                    print("Link ignored: " + link)
                    continue
                if link in links_visited:
                    continue
                links_visited.add(link)
                if not link:
                    continue
                print("Loading link: " + link)
                urls[i].append(link)
                # Links found on the deepest level are never visited, so
                # fetching them would only cost an extra request per page.
                if i < self.depth:
                    new_links.extend(self.get_links(link))
            # De-duplicate while keeping discovery order, so the crawl order
            # is the same from run to run.
            links = list(dict.fromkeys(new_links))
        total_links = sum(len(level_urls) for level_urls in urls.values())
        print(f"Found {total_links} links at depth {self.depth}.")
        for depth_i, level_urls in urls.items():
            for page_url in level_urls:
                # Best effort: one unreadable page must not abort the crawl.
                try:
                    documents.extend(remote_reader.load_data(page_url))
                except Exception as e:
                    print(f"Error reading {page_url} at depth {depth_i}: {e}")

        return documents

    @staticmethod
    def is_url(href) -> bool:
        """Check if a link is a URL."""
        return href.startswith("http")

    def get_links(self, url) -> List[str]:
        """Get all links from a page.

        Args:
            url: Address of the page to fetch and scan for ``<a>`` tags.

        Returns:
            Unique absolute ``http``-prefixed URLs in document order, with
            params, query string and fragment removed.
        """
        from urllib.parse import urljoin, urlparse, urlunparse

        from bs4 import BeautifulSoup

        # NOTE(review): no timeout is passed, so a stalled server blocks the
        # whole crawl - consider adding one.
        page = requests.get(url)
        soup = BeautifulSoup(page.content, "html.parser")

        result = []
        seen = set()
        for link in soup.find_all("a"):
            href = link if isinstance(link, str) else link.get("href")
            if href is None:
                # Anchor without an href attribute: nothing to follow.
                continue
            if not self.is_url(href):
                # Resolve relative links against the page they came from.
                href = urljoin(url, href)

            url_parsed = urlparse(href)
            url_without_query_string = urlunparse(
                (url_parsed.scheme, url_parsed.netloc, url_parsed.path, "", "", "")
            )

            if (
                url_without_query_string in seen
                or not url_without_query_string.startswith("http")
            ):
                continue
            seen.add(url_without_query_string)
            result.append(url_without_query_string)
        return result

is_url `staticmethod` #

is_url(href) -> bool

检查链接是否是 URL。

源代码位于 llama-index-integrations/readers/llama-index-readers-remote-depth/llama_index/readers/remote_depth/base.py

@staticmethod
def is_url(href) -> bool:
    """Check if a link is a URL."""
    return href.startswith("http")