|
|
|
|
@ -1,16 +1,187 @@
|
|
|
|
|
import json
|
|
|
|
|
import time
|
|
|
|
|
import urllib.error
|
|
|
|
|
import urllib.parse
|
|
|
|
|
import urllib.request
|
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
|
|
from langchain.tools import PubmedQueryRun
|
|
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
|
|
|
|
from core.tools.entities.tool_entities import ToolInvokeMessage
|
|
|
|
|
from core.tools.tool.builtin_tool import BuiltinTool
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PubMedAPIWrapper(BaseModel):
    """
    Wrapper around PubMed API.

    This wrapper will use the PubMed API to conduct searches and fetch
    document summaries. By default, it will return the document summaries
    of the top-k results of an input search.

    Parameters:
        top_k_results: number of the top-scored document used for the PubMed tool
        load_max_docs: a limit to the number of loaded documents
        load_all_available_meta:
            if True: the `metadata` of the loaded Documents gets all available meta info
                (see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch)
            if False: the `metadata` gets only the most informative fields.
    """

    # NCBI E-utilities endpoints; the query string is appended directly.
    base_url_esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
    base_url_efetch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
    # Retry policy for HTTP 429 (Too Many Requests): up to max_retry attempts,
    # sleeping sleep_time seconds and doubling it after each attempt.
    max_retry = 5
    sleep_time = 0.2

    # Default values for the parameters
    top_k_results: int = 3
    load_max_docs: int = 25
    ARXIV_MAX_QUERY_LENGTH = 300  # queries are truncated to this many characters
    doc_content_chars_max: int = 2000
    load_all_available_meta: bool = False
    email: str = "your_email@example.com"

    def run(self, query: str) -> str:
        """
        Run PubMed search and get the article meta information.
        See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
        It uses only the most informative fields of article meta information.
        """
        try:
            # Retrieve the top-k results for the (length-limited) query.
            docs = [
                f"Published: {result['pub_date']}\nTitle: {result['title']}\n"
                f"Summary: {result['summary']}"
                for result in self.load(query[: self.ARXIV_MAX_QUERY_LENGTH])
            ]

            # Join the results and limit the character count.
            return (
                "\n\n".join(docs)[: self.doc_content_chars_max]
                if docs
                else "No good PubMed Result was found"
            )
        except Exception as ex:
            # Best-effort tool: surface the failure as text instead of raising,
            # matching the tool-calling convention used by the callers.
            return f"PubMed exception: {ex}"

    def load(self, query: str) -> list[dict]:
        """
        Search PubMed for documents matching the query.
        Return a list of dictionaries containing the document metadata.
        """
        url = (
            self.base_url_esearch
            + "db=pubmed&term="
            # BUG FIX: the original built str({urllib.parse.quote(query)}),
            # i.e. the repr of a one-element Python *set* — the URL contained
            # literal braces and quotes around the query. Pass the URL-quoted
            # query string directly.
            + urllib.parse.quote(query)
            + f"&retmode=json&retmax={self.top_k_results}&usehistory=y"
        )
        # Context manager closes the HTTP response (the original leaked it).
        with urllib.request.urlopen(url) as response:
            json_text = json.loads(response.read().decode("utf-8"))

        # usehistory=y returns a WebEnv token that efetch reuses below.
        webenv = json_text["esearchresult"]["webenv"]
        articles = [
            self.retrieve_article(uid, webenv)
            for uid in json_text["esearchresult"]["idlist"]
        ]
        return articles

    def retrieve_article(self, uid: str, webenv: str) -> dict:
        """
        Fetch one article by uid via efetch and extract its title, abstract
        and publication date from the returned XML.

        Retries on HTTP 429 with exponential backoff (see max_retry/sleep_time).
        Raises urllib.error.HTTPError for other HTTP failures or once retries
        are exhausted.
        """
        url = (
            self.base_url_efetch
            + "db=pubmed&retmode=xml&id="
            + uid
            + "&webenv="
            + webenv
        )

        retry = 0
        while True:
            try:
                result = urllib.request.urlopen(url)
                break
            except urllib.error.HTTPError as e:
                if e.code == 429 and retry < self.max_retry:
                    # Too Many Requests error:
                    # wait for an exponentially increasing amount of time.
                    print(
                        f"Too Many Requests, "
                        f"waiting for {self.sleep_time:.2f} seconds..."
                    )
                    time.sleep(self.sleep_time)
                    self.sleep_time *= 2
                    retry += 1
                else:
                    raise e

        # Close the response once the body has been read (original leaked it).
        with result:
            xml_text = result.read().decode("utf-8")

        def _extract(text: str, tag: str) -> str:
            # Naive extraction of the first <tag>...</tag> span; returns ""
            # when the tag pair is absent (same fallback as the original).
            start_tag, end_tag = f"<{tag}>", f"</{tag}>"
            if start_tag in text and end_tag in text:
                return text[
                    text.index(start_tag) + len(start_tag) : text.index(end_tag)
                ]
            return ""

        # Return article as dictionary.
        return {
            "uid": uid,
            "title": _extract(xml_text, "ArticleTitle"),
            "summary": _extract(xml_text, "AbstractText"),
            "pub_date": _extract(xml_text, "PubDate"),
        }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PubmedQueryRun(BaseModel):
    """Tool that searches the PubMed API."""

    name = "PubMed"
    # BUG FIX: the original description was copy-pasted from the Arxiv tool
    # and claimed coverage of Physics, Economics, etc. PubMed is a biomedical
    # literature database, so describe it as such (the description guides the
    # LLM's tool selection).
    description = (
        "A wrapper around PubMed.org "
        "Useful for when you need to answer questions about medicine, health, "
        "biology, genetics, pharmacology, and other life-science topics "
        "from biomedical articles on PubMed.org. "
        "Input should be a search query."
    )
    # Shared API wrapper instance; default_factory builds one per tool instance.
    api_wrapper: PubMedAPIWrapper = Field(default_factory=PubMedAPIWrapper)

    def _run(
        self,
        query: str,
    ) -> str:
        """Use the PubMed tool."""
        return self.api_wrapper.run(query)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PubMedInput(BaseModel):
    """Argument schema for the PubMed search tool: a single free-text query."""

    query: str = Field(..., description="Search query.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class PubMedSearchTool(BuiltinTool):
|
|
|
|
|
"""
|
|
|
|
|
Tool for performing a search using PubMed search engine.
|
|
|
|
|
@ -34,7 +205,7 @@ class PubMedSearchTool(BuiltinTool):
|
|
|
|
|
|
|
|
|
|
tool = PubmedQueryRun(args_schema=PubMedInput)
|
|
|
|
|
|
|
|
|
|
result = tool.run(query)
|
|
|
|
|
result = tool._run(query)
|
|
|
|
|
|
|
|
|
|
return self.create_text_message(self.summary(user_id=user_id, content=result))
|
|
|
|
|
|