跳到内容

Arxiv

初始化文件.

ArxivReader #

基类: BaseReader

Arxiv 阅读器.

获取搜索查询,返回 Arxiv 上排名前列的对应科学论文的 Documents 列表。

源代码位于 llama-index-integrations/readers/llama-index-readers-papers/llama_index/readers/papers/arxiv/base.py
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
class ArxivReader(BaseReader):
    """
    Arxiv Reader.

    Gets a search query, return a list of Documents of the top corresponding scientific papers on Arxiv.
    """

    def __init__(
        self,
    ) -> None:
        """Initialize with parameters."""
        super().__init__()

    def _hacky_hash(self, some_string):
        return hashlib.md5(some_string.encode("utf-8")).hexdigest()

    def load_data(
        self,
        search_query: str,
        papers_dir: Optional[str] = ".papers",
        max_results: Optional[int] = 10,
    ) -> List[Document]:
        """
        Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.

        Args:
            search_query (str): A topic to search for (e.g. "Artificial Intelligence").
            papers_dir (Optional[str]): Locally directory to store the papers
            max_results (Optional[int]): Maximum number of papers to fetch.

        Returns:
            List[Document]: A list of Document objects.

        """
        import arxiv

        arxiv_search = arxiv.Search(
            query=search_query,
            id_list=[],
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        search_results = list(arxiv_search.results())
        logging.debug(f"> Successfully fetched {len(search_results)} paperes")

        if not os.path.exists(papers_dir):
            os.makedirs(papers_dir)

        paper_lookup = {}
        for paper in search_results:
            # Hash filename to avoid bad characters in file path
            hashed_name = self._hacky_hash(f"{paper.title}{paper.entry_id}")
            filename = f"{hashed_name}.pdf"
            paper_lookup[filename] = {
                "Title of this paper": paper.title,
                "Authors": (", ").join([a.name for a in paper.authors]),
                "Date published": paper.published.strftime("%m/%d/%Y"),
                "URL": paper.entry_id,
                # "summary": paper.summary
            }
            paper.download_pdf(dirpath=papers_dir, filename=filename)
            logging.debug(f"> Downloading {filename}...")

        def get_paper_metadata(filename):
            return paper_lookup[os.path.basename(filename)]

        arxiv_documents = SimpleDirectoryReader(
            papers_dir,
            file_metadata=get_paper_metadata,
            exclude_hidden=False,  # default directory is hidden ".papers"
        ).load_data()
        # Include extra documents containing the abstracts
        abstract_documents = []
        for paper in search_results:
            d = (
                f"The following is a summary of the paper: {paper.title}\n\nSummary:"
                f" {paper.summary}"
            )
            abstract_documents.append(Document(text=d))

        # Delete downloaded papers
        try:
            for f in os.listdir(papers_dir):
                os.remove(os.path.join(papers_dir, f))
                logging.debug(f"> Deleted file: {f}")
            os.rmdir(papers_dir)
            logging.debug(f"> Deleted directory: {papers_dir}")
        except OSError:
            print("Unable to delete files or directory")

        return arxiv_documents + abstract_documents

    def load_papers_and_abstracts(
        self,
        search_query: str,
        papers_dir: Optional[str] = ".papers",
        max_results: Optional[int] = 10,
    ) -> Tuple[List[Document], List[Document]]:
        """
        Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.

        Args:
            search_query (str): A topic to search for (e.g. "Artificial Intelligence").
            papers_dir (Optional[str]): Locally directory to store the papers
            max_results (Optional[int]): Maximum number of papers to fetch.

        Returns:
            List[Document]: A list of Document objects representing the papers themselves
            List[Document]: A list of Document objects representing abstracts only

        """
        import arxiv

        arxiv_search = arxiv.Search(
            query=search_query,
            id_list=[],
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance,
        )
        search_results = list(arxiv_search.results())
        logging.debug(f"> Successfully fetched {len(search_results)} paperes")

        if not os.path.exists(papers_dir):
            os.makedirs(papers_dir)

        paper_lookup = {}
        for paper in search_results:
            # Hash filename to avoid bad characters in file path
            hashed_name = self._hacky_hash(f"{paper.title}{paper.entry_id}")
            filename = f"{hashed_name}.pdf"
            paper_lookup[filename] = {
                "Title of this paper": paper.title,
                "Authors": (", ").join([a.name for a in paper.authors]),
                "Date published": paper.published.strftime("%m/%d/%Y"),
                "URL": paper.entry_id,
                # "summary": paper.summary
            }
            paper.download_pdf(dirpath=papers_dir, filename=filename)
            logging.debug(f"> Downloading {filename}...")

        def get_paper_metadata(filename):
            return paper_lookup[os.path.basename(filename)]

        arxiv_documents = SimpleDirectoryReader(
            papers_dir,
            file_metadata=get_paper_metadata,
            exclude_hidden=False,  # default directory is hidden ".papers"
        ).load_data()
        # Include extra documents containing the abstracts
        abstract_documents = []
        for paper in search_results:
            d = (
                f"The following is a summary of the paper: {paper.title}\n\nSummary:"
                f" {paper.summary}"
            )
            abstract_documents.append(Document(text=d))

        # Delete downloaded papers
        try:
            for f in os.listdir(papers_dir):
                os.remove(os.path.join(papers_dir, f))
                logging.debug(f"> Deleted file: {f}")
            os.rmdir(papers_dir)
            logging.debug(f"> Deleted directory: {papers_dir}")
        except OSError:
            print("Unable to delete files or directory")

        return arxiv_documents, abstract_documents

load_data #

load_data(search_query: str, papers_dir: Optional[str] = '.papers', max_results: Optional[int] = 10) -> List[Document]

在 Arxiv 上搜索主题,下载排名前列结果的 PDF 到本地,然后读取它们。

参数

名称 类型 描述 默认
search_query str

要搜索的主题(例如,“人工智能”)。

必需
papers_dir Optional[str]

本地存储论文的目录

'.papers'
max_results Optional[int]

要获取的最大论文数量。

10

返回值

类型 描述
List[Document]

List[Document]: Document 对象的列表。

源代码位于 llama-index-integrations/readers/llama-index-readers-papers/llama_index/readers/papers/arxiv/base.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def load_data(
    self,
    search_query: str,
    papers_dir: Optional[str] = ".papers",
    max_results: Optional[int] = 10,
) -> List[Document]:
    """
    Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.

    Args:
        search_query (str): A topic to search for (e.g. "Artificial Intelligence").
        papers_dir (Optional[str]): Locally directory to store the papers
        max_results (Optional[int]): Maximum number of papers to fetch.

    Returns:
        List[Document]: A list of Document objects.

    """
    import arxiv

    arxiv_search = arxiv.Search(
        query=search_query,
        id_list=[],
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    search_results = list(arxiv_search.results())
    logging.debug(f"> Successfully fetched {len(search_results)} paperes")

    if not os.path.exists(papers_dir):
        os.makedirs(papers_dir)

    paper_lookup = {}
    for paper in search_results:
        # Hash filename to avoid bad characters in file path
        hashed_name = self._hacky_hash(f"{paper.title}{paper.entry_id}")
        filename = f"{hashed_name}.pdf"
        paper_lookup[filename] = {
            "Title of this paper": paper.title,
            "Authors": (", ").join([a.name for a in paper.authors]),
            "Date published": paper.published.strftime("%m/%d/%Y"),
            "URL": paper.entry_id,
            # "summary": paper.summary
        }
        paper.download_pdf(dirpath=papers_dir, filename=filename)
        logging.debug(f"> Downloading {filename}...")

    def get_paper_metadata(filename):
        return paper_lookup[os.path.basename(filename)]

    arxiv_documents = SimpleDirectoryReader(
        papers_dir,
        file_metadata=get_paper_metadata,
        exclude_hidden=False,  # default directory is hidden ".papers"
    ).load_data()
    # Include extra documents containing the abstracts
    abstract_documents = []
    for paper in search_results:
        d = (
            f"The following is a summary of the paper: {paper.title}\n\nSummary:"
            f" {paper.summary}"
        )
        abstract_documents.append(Document(text=d))

    # Delete downloaded papers
    try:
        for f in os.listdir(papers_dir):
            os.remove(os.path.join(papers_dir, f))
            logging.debug(f"> Deleted file: {f}")
        os.rmdir(papers_dir)
        logging.debug(f"> Deleted directory: {papers_dir}")
    except OSError:
        print("Unable to delete files or directory")

    return arxiv_documents + abstract_documents

load_papers_and_abstracts #

load_papers_and_abstracts(search_query: str, papers_dir: Optional[str] = '.papers', max_results: Optional[int] = 10) -> Tuple[List[Document], List[Document]]

在 Arxiv 上搜索主题,下载排名前列结果的 PDF 到本地,然后读取它们。

参数

名称 类型 描述 默认
search_query str

要搜索的主题(例如,“人工智能”)。

必需
papers_dir Optional[str]

本地存储论文的目录

'.papers'
max_results Optional[int]

要获取的最大论文数量。

10

返回值

类型 描述
List[Document]

List[Document]: 表示论文本身的 Document 对象列表

List[Document]

List[Document]: 仅表示摘要的 Document 对象列表

源代码位于 llama-index-integrations/readers/llama-index-readers-papers/llama_index/readers/papers/arxiv/base.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
def load_papers_and_abstracts(
    self,
    search_query: str,
    papers_dir: Optional[str] = ".papers",
    max_results: Optional[int] = 10,
) -> Tuple[List[Document], List[Document]]:
    """
    Search for a topic on Arxiv, download the PDFs of the top results locally, then read them.

    Args:
        search_query (str): A topic to search for (e.g. "Artificial Intelligence").
        papers_dir (Optional[str]): Locally directory to store the papers
        max_results (Optional[int]): Maximum number of papers to fetch.

    Returns:
        List[Document]: A list of Document objects representing the papers themselves
        List[Document]: A list of Document objects representing abstracts only

    """
    import arxiv

    arxiv_search = arxiv.Search(
        query=search_query,
        id_list=[],
        max_results=max_results,
        sort_by=arxiv.SortCriterion.Relevance,
    )
    search_results = list(arxiv_search.results())
    logging.debug(f"> Successfully fetched {len(search_results)} paperes")

    if not os.path.exists(papers_dir):
        os.makedirs(papers_dir)

    paper_lookup = {}
    for paper in search_results:
        # Hash filename to avoid bad characters in file path
        hashed_name = self._hacky_hash(f"{paper.title}{paper.entry_id}")
        filename = f"{hashed_name}.pdf"
        paper_lookup[filename] = {
            "Title of this paper": paper.title,
            "Authors": (", ").join([a.name for a in paper.authors]),
            "Date published": paper.published.strftime("%m/%d/%Y"),
            "URL": paper.entry_id,
            # "summary": paper.summary
        }
        paper.download_pdf(dirpath=papers_dir, filename=filename)
        logging.debug(f"> Downloading {filename}...")

    def get_paper_metadata(filename):
        return paper_lookup[os.path.basename(filename)]

    arxiv_documents = SimpleDirectoryReader(
        papers_dir,
        file_metadata=get_paper_metadata,
        exclude_hidden=False,  # default directory is hidden ".papers"
    ).load_data()
    # Include extra documents containing the abstracts
    abstract_documents = []
    for paper in search_results:
        d = (
            f"The following is a summary of the paper: {paper.title}\n\nSummary:"
            f" {paper.summary}"
        )
        abstract_documents.append(Document(text=d))

    # Delete downloaded papers
    try:
        for f in os.listdir(papers_dir):
            os.remove(os.path.join(papers_dir, f))
            logging.debug(f"> Deleted file: {f}")
        os.rmdir(papers_dir)
        logging.debug(f"> Deleted directory: {papers_dir}")
    except OSError:
        print("Unable to delete files or directory")

    return arxiv_documents, abstract_documents

PubmedReader #

基类: BaseReader

Pubmed 阅读器。

获取一个搜索查询,返回 Pubmed 上最相关的顶级科学论文对应的 Document 对象列表。

源代码位于 llama-index-integrations/readers/llama-index-readers-papers/llama_index/readers/papers/pubmed/base.py
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
class PubmedReader(BaseReader):
    """
    Pubmed Reader.

    Gets a search query, return a list of Documents of the top corresponding scientific papers on Pubmed.
    """

    def load_data_bioc(
        self,
        search_query: str,
        max_results: Optional[int] = 10,
    ) -> List[Document]:
        """
        Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.
        Uses the BoiC API, which has been down a lot.

        Args:
            search_query (str): A topic to search for (e.g. "Alzheimers").
            max_results (Optional[int]): Maximum number of papers to fetch.

        Returns:
            List[Document]: A list of Document objects.

        """
        from datetime import datetime

        import requests
        from defusedxml import ElementTree as safe_xml

        pubmed_search = []
        parameters = {"tool": "tool", "email": "email", "db": "pmc"}
        parameters["term"] = search_query
        parameters["retmax"] = max_results
        resp = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
            params=parameters,
        )
        root = safe_xml.fromstring(resp.content)

        for elem in root.iter():
            if elem.tag == "Id":
                _id = elem.text
                try:
                    resp = requests.get(
                        f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/PMC{_id}/ascii"
                    )
                    info = resp.json()
                    title = "Pubmed Paper"
                    try:
                        title = next(
                            [
                                p["text"]
                                for p in info["documents"][0]["passages"]
                                if p["infons"]["section_type"] == "TITLE"
                            ]
                        )
                    except KeyError:
                        pass
                    pubmed_search.append(
                        {
                            "title": title,
                            "url": (
                                f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{_id}/"
                            ),
                            "date": info["date"],
                            "documents": info["documents"],
                        }
                    )
                except Exception:
                    print(f"Unable to parse PMC{_id} or it does not exist")

        # Then get documents from Pubmed text, which includes abstracts
        pubmed_documents = []
        for paper in pubmed_search:
            for d in paper["documents"]:
                text = "\n".join([p["text"] for p in d["passages"]])
                pubmed_documents.append(
                    Document(
                        text=text,
                        extra_info={
                            "Title of this paper": paper["title"],
                            "URL": paper["url"],
                            "Date published": datetime.strptime(
                                paper["date"], "%Y%m%d"
                            ).strftime("%m/%d/%Y"),
                        },
                    )
                )

        return pubmed_documents

    def load_data(
        self,
        search_query: str,
        max_results: Optional[int] = 10,
    ) -> List[Document]:
        """
        Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.

        Args:
            search_query (str): A topic to search for (e.g. "Alzheimers").
            max_results (Optional[int]): Maximum number of papers to fetch.


        Returns:
            List[Document]: A list of Document objects.

        """
        import time

        import requests

        pubmed_search = []
        parameters = {"tool": "tool", "email": "email", "db": "pmc"}
        parameters["term"] = search_query
        parameters["retmax"] = max_results
        resp = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
            params=parameters,
        )
        root = safe_xml.fromstring(resp.content)

        for elem in root.iter():
            if elem.tag == "Id":
                _id = elem.text
                url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?id={_id}&db=pmc"
                print(url)
                try:
                    resp = requests.get(url)
                    info = safe_xml.fromstring(resp.content)

                    raw_text = ""
                    title = ""
                    journal = ""
                    for element in info.iter():
                        if element.tag == "article-title":
                            title = element.text
                        elif element.tag == "journal-title":
                            journal = element.text

                        if element.text:
                            raw_text += element.text.strip() + " "

                    pubmed_search.append(
                        {
                            "title": title,
                            "journal": journal,
                            "url": (
                                f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{_id}/"
                            ),
                            "text": raw_text,
                        }
                    )
                    time.sleep(1)  # API rate limits
                except Exception as e:
                    print(f"Unable to parse PMC{_id} or it does not exist:", e)

        # Then get documents from Pubmed text, which includes abstracts
        pubmed_documents = []
        for paper in pubmed_search:
            pubmed_documents.append(
                Document(
                    text=paper["text"],
                    extra_info={
                        "Title of this paper": paper["title"],
                        "Journal it was published in:": paper["journal"],
                        "URL": paper["url"],
                    },
                )
            )

        return pubmed_documents

load_data_bioc #

load_data_bioc(search_query: str, max_results: Optional[int] = 10) -> List[Document]

在 Pubmed 上搜索主题,获取最相关的全文论文文本。使用 BoiC API,该 API 经常无法访问。

参数

名称 类型 描述 默认
search_query str

要搜索的主题(例如,“阿尔茨海默症”)。

必需
max_results Optional[int]

要获取的最大论文数量。

10

返回值

类型 描述
List[Document]

List[Document]: Document 对象的列表。

源代码位于 llama-index-integrations/readers/llama-index-readers-papers/llama_index/readers/papers/pubmed/base.py
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def load_data_bioc(
    self,
    search_query: str,
    max_results: Optional[int] = 10,
) -> List[Document]:
    """
    Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.
    Uses the BoiC API, which has been down a lot.

    Args:
        search_query (str): A topic to search for (e.g. "Alzheimers").
        max_results (Optional[int]): Maximum number of papers to fetch.

    Returns:
        List[Document]: A list of Document objects.

    """
    from datetime import datetime

    import requests
    from defusedxml import ElementTree as safe_xml

    pubmed_search = []
    parameters = {"tool": "tool", "email": "email", "db": "pmc"}
    parameters["term"] = search_query
    parameters["retmax"] = max_results
    resp = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params=parameters,
    )
    root = safe_xml.fromstring(resp.content)

    for elem in root.iter():
        if elem.tag == "Id":
            _id = elem.text
            try:
                resp = requests.get(
                    f"https://www.ncbi.nlm.nih.gov/research/bionlp/RESTful/pmcoa.cgi/BioC_json/PMC{_id}/ascii"
                )
                info = resp.json()
                title = "Pubmed Paper"
                try:
                    title = next(
                        [
                            p["text"]
                            for p in info["documents"][0]["passages"]
                            if p["infons"]["section_type"] == "TITLE"
                        ]
                    )
                except KeyError:
                    pass
                pubmed_search.append(
                    {
                        "title": title,
                        "url": (
                            f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{_id}/"
                        ),
                        "date": info["date"],
                        "documents": info["documents"],
                    }
                )
            except Exception:
                print(f"Unable to parse PMC{_id} or it does not exist")

    # Then get documents from Pubmed text, which includes abstracts
    pubmed_documents = []
    for paper in pubmed_search:
        for d in paper["documents"]:
            text = "\n".join([p["text"] for p in d["passages"]])
            pubmed_documents.append(
                Document(
                    text=text,
                    extra_info={
                        "Title of this paper": paper["title"],
                        "URL": paper["url"],
                        "Date published": datetime.strptime(
                            paper["date"], "%Y%m%d"
                        ).strftime("%m/%d/%Y"),
                    },
                )
            )

    return pubmed_documents

load_data #

load_data(search_query: str, max_results: Optional[int] = 10) -> List[Document]

在 Pubmed 上搜索主题,获取最相关的全文论文文本。

参数

名称 类型 描述 默认
search_query str

要搜索的主题(例如,“阿尔茨海默症”)。

必需
max_results Optional[int]

要获取的最大论文数量。

10

返回值

类型 描述
List[Document]

List[Document]: Document 对象的列表。

源代码位于 llama-index-integrations/readers/llama-index-readers-papers/llama_index/readers/papers/pubmed/base.py
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def load_data(
    self,
    search_query: str,
    max_results: Optional[int] = 10,
) -> List[Document]:
    """
    Search for a topic on Pubmed, fetch the text of the most relevant full-length papers.

    Args:
        search_query (str): A topic to search for (e.g. "Alzheimers").
        max_results (Optional[int]): Maximum number of papers to fetch.


    Returns:
        List[Document]: A list of Document objects.

    """
    import time

    import requests

    pubmed_search = []
    parameters = {"tool": "tool", "email": "email", "db": "pmc"}
    parameters["term"] = search_query
    parameters["retmax"] = max_results
    resp = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params=parameters,
    )
    root = safe_xml.fromstring(resp.content)

    for elem in root.iter():
        if elem.tag == "Id":
            _id = elem.text
            url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?id={_id}&db=pmc"
            print(url)
            try:
                resp = requests.get(url)
                info = safe_xml.fromstring(resp.content)

                raw_text = ""
                title = ""
                journal = ""
                for element in info.iter():
                    if element.tag == "article-title":
                        title = element.text
                    elif element.tag == "journal-title":
                        journal = element.text

                    if element.text:
                        raw_text += element.text.strip() + " "

                pubmed_search.append(
                    {
                        "title": title,
                        "journal": journal,
                        "url": (
                            f"https://www.ncbi.nlm.nih.gov/pmc/articles/PMC{_id}/"
                        ),
                        "text": raw_text,
                    }
                )
                time.sleep(1)  # API rate limits
            except Exception as e:
                print(f"Unable to parse PMC{_id} or it does not exist:", e)

    # Then get documents from Pubmed text, which includes abstracts
    pubmed_documents = []
    for paper in pubmed_search:
        pubmed_documents.append(
            Document(
                text=paper["text"],
                extra_info={
                    "Title of this paper": paper["title"],
                    "Journal it was published in:": paper["journal"],
                    "URL": paper["url"],
                },
            )
        )

    return pubmed_documents