Feat(tools) add tavily extract tool and enhance tavily search implementation (#10786)

1 year ago · 6de1f8c770
parent 6d532bfc02
commit 6de1f8c770
5 changed files with 379 additions and 155 deletions
--- a/api/core/tools/provider/builtin/tavily/tavily.yaml
+++ b/api/core/tools/provider/builtin/tavily/tavily.yaml
@ -1,14 +1,12 @@
 identity:
-  author: Yash Parmar
+  author: Yash Parmar, Kalo Chin
  name: tavily
  label:
-    en_US: Tavily
-    zh_Hans: Tavily
-    pt_BR: Tavily
+    en_US: Tavily Search & Extract
+    zh_Hans: Tavily 搜索和提取
  description:
-    en_US: Tavily
-    zh_Hans: Tavily
-    pt_BR: Tavily
+    en_US: A powerful AI-native search engine and web content extraction tool that provides highly relevant search results and raw content extraction from web pages.
+    zh_Hans: 一个强大的原生AI搜索引擎和网页内容提取工具，提供高度相关的搜索结果和网页原始内容提取。
  icon: icon.png
  tags:
    - search
@ -19,13 +17,10 @@ credentials_for_provider:
    label:
      en_US: Tavily API key
      zh_Hans: Tavily API key
-      pt_BR: Tavily API key
    placeholder:
      en_US: Please input your Tavily API key
      zh_Hans: 请输入你的 Tavily API key
-      pt_BR: Please input your Tavily API key
    help:
      en_US: Get your Tavily API key from Tavily
      zh_Hans: 从 TavilyApi 获取您的 Tavily API key
-      pt_BR: Get your Tavily API key from Tavily
-    url: https://docs.tavily.com/docs/welcome
+    url: https://app.tavily.com/home
--- a/api/core/tools/provider/builtin/tavily/tools/tavily_extract.py
+++ b/api/core/tools/provider/builtin/tavily/tools/tavily_extract.py
@ -0,0 +1,145 @@
+from typing import Any
+
+import requests
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.tool.builtin_tool import BuiltinTool
+
+TAVILY_API_URL = "https://api.tavily.com"
+
+
+class TavilyExtract:
+    """
+    Thin client for the Tavily Extract API (``POST {TAVILY_API_URL}/extract``).
+
+    Args:
+        api_key (str): The API key sent with each extract request unless the
+            request parameters carry their own ``api_key``.
+
+    Methods:
+        extract_content: Retrieves extracted content from the Tavily Extract API.
+    """
+
+    def __init__(self, api_key: str) -> None:
+        self.api_key = api_key
+
+    def extract_content(self, params: dict[str, Any]) -> dict:
+        """
+        Retrieves extracted content from the Tavily Extract API.
+
+        The caller's ``params`` mapping is only read, never modified, so the API
+        key is not written back into the tool parameters.
+
+        Args:
+            params (dict[str, Any]): The extraction parameters. Must contain
+                ``urls``; may contain ``api_key`` to override the instance key.
+
+        Returns:
+            dict: The decoded JSON body of the API response.
+
+        Raises:
+            ValueError: If ``urls`` is missing from ``params``.
+            requests.HTTPError: If the API answers with an error status code.
+        """
+        # _process_params falls back to self.api_key, so nothing has to be injected here.
+        processed_params = self._process_params(params)
+
+        # NOTE(review): no timeout is passed, so this call can block for as long as the server takes.
+        response = requests.post(f"{TAVILY_API_URL}/extract", json=processed_params)
+        response.raise_for_status()
+        return response.json()
+
+    def _process_params(self, params: dict[str, Any]) -> dict:
+        """
+        Builds the request payload from the extraction parameters.
+
+        Only ``urls`` and ``api_key`` are forwarded; every other key is ignored.
+
+        Args:
+            params (dict[str, Any]): The extraction parameters.
+
+        Returns:
+            dict: The payload holding ``api_key`` and, when ``urls`` was a string
+                or a list, ``urls`` as a list. A ``urls`` value of any other type
+                is left out of the payload.
+
+        Raises:
+            ValueError: If ``params`` has no ``urls`` key.
+        """
+        if "urls" not in params:
+            raise ValueError("The 'urls' parameter is required.")
+
+        processed_params: dict[str, Any] = {}
+
+        urls = params["urls"]
+        if isinstance(urls, str):
+            # Accept comma- and/or whitespace-separated URLs; split() already trims each item.
+            processed_params["urls"] = urls.replace(",", " ").split()
+        elif isinstance(urls, list):
+            # Copy so the payload never aliases the caller's list.
+            processed_params["urls"] = list(urls)
+
+        # A key supplied by the caller wins over the one stored on the instance.
+        processed_params["api_key"] = params.get("api_key", self.api_key)
+
+        return processed_params
+
+
+class TavilyExtractTool(BuiltinTool):
+    """
+    A tool for extracting content from web pages using Tavily Extract.
+    """
+
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage | list[ToolInvokeMessage]:
+        """
+        Invokes the Tavily Extract tool with the given user ID and tool parameters.
+
+        Args:
+            user_id (str): The ID of the user invoking the tool.
+            tool_parameters (dict[str, Any]): The parameters for the Tavily Extract tool;
+                ``urls`` is the only key read here.
+
+        Returns:
+            ToolInvokeMessage | list[ToolInvokeMessage]: A single text message when the
+                API key or URLs are missing, the request fails, or nothing was extracted;
+                otherwise a JSON message with the full response followed by a markdown
+                text message.
+        """
+        urls = tool_parameters.get("urls", "")
+        api_key = self.runtime.credentials.get("tavily_api_key")
+        if not api_key:
+            return self.create_text_message(
+                "Tavily API key is missing. Please set the 'tavily_api_key' in credentials."
+            )
+        if not urls:
+            return self.create_text_message("Please input at least one URL to extract.")
+
+        tavily_extract = TavilyExtract(api_key)
+        try:
+            raw_results = tavily_extract.extract_content(tool_parameters)
+        except (requests.RequestException, ValueError) as e:
+            # RequestException is the base of HTTPError and also covers connection
+            # failures and timeouts; ValueError covers parameters rejected before the
+            # request is sent and JSON decoding failures.
+            return self.create_text_message(f"Error occurred while extracting content: {str(e)}")
+
+        if not raw_results.get("results"):
+            return self.create_text_message("No content could be extracted from the provided URLs.")
+
+        # Always return JSON message with all data
+        json_message = self.create_json_message(raw_results)
+
+        # Add a markdown rendering of the same data
+        text_message_content = self._format_results_as_text(raw_results)
+        text_message = self.create_text_message(text=text_message_content)
+
+        return [json_message, text_message]
+
+    def _format_results_as_text(self, raw_results: dict) -> str:
+        """
+        Formats the raw extraction results into a markdown text.
+
+        Each entry of ``results`` becomes a numbered section with its URL and raw
+        content; entries of ``failed_results``, if any, are listed at the end.
+
+        Args:
+            raw_results (dict): The raw extraction results.
+
+        Returns:
+            str: The formatted markdown text.
+        """
+        output_lines = []
+
+        for idx, result in enumerate(raw_results.get("results", []), 1):
+            url = result.get("url", "")
+            raw_content = result.get("raw_content", "")
+
+            output_lines.append(f"## Extracted Content {idx}: {url}\n")
+            output_lines.append(f"**Raw Content:**\n{raw_content}\n")
+            output_lines.append("---\n")
+
+        failed_results = raw_results.get("failed_results")
+        if failed_results:
+            output_lines.append("## Failed URLs:\n")
+            for failed in failed_results:
+                url = failed.get("url", "")
+                error = failed.get("error", "Unknown error")
+                output_lines.append(f"- {url}: {error}\n")
+
+        return "\n".join(output_lines)
--- a/api/core/tools/provider/builtin/tavily/tools/tavily_extract.yaml
+++ b/api/core/tools/provider/builtin/tavily/tools/tavily_extract.yaml
@ -0,0 +1,23 @@
+identity:
+  name: tavily_extract
+  author: Kalo Chin
+  label:
+    en_US: Tavily Extract
+    zh_Hans: Tavily Extract
+description:
+  human:
+    en_US: A web extraction tool built specifically for AI agents (LLMs), delivering raw content from web pages.
+    zh_Hans: 专为人工智能代理 (LLM) 构建的网页提取工具，提供网页的原始内容。
+  llm: A tool for extracting raw content from web pages, designed for AI agents (LLMs).
+parameters:
+  - name: urls
+    type: string
+    required: true
+    label:
+      en_US: URLs
+      zh_Hans: URLs
+    human_description:
+      en_US: A comma-separated list of URLs to extract content from.
+      zh_Hans: 要从中提取内容的 URL 的逗号分隔列表。
+    llm_description: A comma-separated list of URLs to extract content from.
+    form: llm
--- a/api/core/tools/provider/builtin/tavily/tools/tavily_search.py
+++ b/api/core/tools/provider/builtin/tavily/tools/tavily_search.py
@ -17,8 +17,6 @@ class TavilySearch:

    Methods:
        raw_results: Retrieves raw search results from the Tavily Search API.
-        results: Retrieves cleaned search results from the Tavily Search API.
-        clean_results: Cleans the raw search results.
    """

    def __init__(self, api_key: str) -> None:
@ -35,63 +33,62 @@ class TavilySearch:
            dict: The raw search results.

        """
+        # Ensure required parameters are set
        params["api_key"] = self.api_key
-        if (
-            "exclude_domains" in params
-            and isinstance(params["exclude_domains"], str)
-            and params["exclude_domains"] != "None"
-        ):
-            params["exclude_domains"] = params["exclude_domains"].split()
-        else:
-            params["exclude_domains"] = []
-        if (
-            "include_domains" in params
-            and isinstance(params["include_domains"], str)
-            and params["include_domains"] != "None"
-        ):
-            params["include_domains"] = params["include_domains"].split()
-        else:
-            params["include_domains"] = []

-        response = requests.post(f"{TAVILY_API_URL}/search", json=params)
+        # Process parameters to ensure correct types
+        processed_params = self._process_params(params)
+
+        response = requests.post(f"{TAVILY_API_URL}/search", json=processed_params)
        response.raise_for_status()
        return response.json()

-    def results(self, params: dict[str, Any]) -> list[dict]:
+    def _process_params(self, params: dict[str, Any]) -> dict:
        """
-        Retrieves cleaned search results from the Tavily Search API.
+        Processes and validates the search parameters.
+
+        Keys whose value is None or "None" and unrecognized keys are dropped.
+        Domain filters become lists, flags become booleans and numeric strings
+        become integers. Defaults are filled in for "search_depth", "topic",
+        "max_results" and, for news searches, "days".

        Args:
            params (Dict[str, Any]): The search parameters.

        Returns:
-            list: The cleaned search results.
-
+            dict: The processed parameters.
        """
-        raw_search_results = self.raw_results(params)
-        return self.clean_results(raw_search_results["results"])
-
-    def clean_results(self, results: list[dict]) -> list[dict]:
-        """
-        Cleans the raw search results.
-
-        Args:
-            results (list): The raw search results.
-
-        Returns:
-            list: The cleaned search results.
-
-        """
-        clean_results = []
-        for result in results:
-            clean_results.append(
-                {
-                    "url": result["url"],
-                    "content": result["content"],
-                }
-            )
-        # return clean results as a string
-        return "\n".join([f"{res['url']}\n{res['content']}" for res in clean_results])
+        processed_params = {}
+
+        for key, value in params.items():
+            if value is None or value == "None":
+                continue
+            if key in ["include_domains", "exclude_domains"]:
+                if isinstance(value, str):
+                    # Split the string by commas or spaces; split() already strips whitespace
+                    processed_params[key] = value.replace(",", " ").split()
+                elif isinstance(value, (list, tuple)):
+                    # Already a sequence of domains: forward it instead of silently dropping it
+                    processed_params[key] = list(value)
+            elif key in ["include_images", "include_image_descriptions", "include_answer", "include_raw_content"]:
+                # Ensure boolean type
+                if isinstance(value, str):
+                    processed_params[key] = value.lower() == "true"
+                else:
+                    processed_params[key] = bool(value)
+            elif key in ["max_results", "days"]:
+                if isinstance(value, str):
+                    processed_params[key] = int(value)
+                else:
+                    processed_params[key] = value
+            elif key in ["search_depth", "topic", "query", "api_key"]:
+                processed_params[key] = value
+            else:
+                # Unrecognized parameter
+                pass
+
+        # Set defaults if not present
+        processed_params.setdefault("search_depth", "basic")
+        processed_params.setdefault("topic", "general")
+        processed_params.setdefault("max_results", 5)
+
+        # If topic is 'news', ensure 'days' is set
+        if processed_params.get("topic") == "news":
+            processed_params.setdefault("days", 3)
+
+        return processed_params


 class TavilySearchTool(BuiltinTool):
@ -111,14 +108,88 @@ class TavilySearchTool(BuiltinTool):
            ToolInvokeMessage | list[ToolInvokeMessage]: The result of the Tavily search tool invocation.
        """
        query = tool_parameters.get("query", "")
-
-        api_key = self.runtime.credentials["tavily_api_key"]
+        api_key = self.runtime.credentials.get("tavily_api_key")
+        if not api_key:
+            return self.create_text_message(
+                "Tavily API key is missing. Please set the 'tavily_api_key' in credentials."
+            )
        if not query:
-            return self.create_text_message("Please input query")
+            return self.create_text_message("Please input a query.")
+
        tavily_search = TavilySearch(api_key)
-        results = tavily_search.results(tool_parameters)
-        print(results)
-        if not results:
-            return self.create_text_message(f"No results found for '{query}' in Tavily")
+        try:
+            raw_results = tavily_search.raw_results(tool_parameters)
+        except requests.HTTPError as e:
+            return self.create_text_message(f"Error occurred while searching: {str(e)}")
+
+        if not raw_results.get("results"):
+            return self.create_text_message(f"No results found for '{query}' in Tavily.")
        else:
-            return self.create_text_message(text=results)
+            # Always return JSON message with all data
+            json_message = self.create_json_message(raw_results)
+
+            # Create text message based on user-selected parameters
+            text_message_content = self._format_results_as_text(raw_results, tool_parameters)
+            text_message = self.create_text_message(text=text_message_content)
+
+            return [json_message, text_message]
+
+    def _format_results_as_text(self, raw_results: dict, tool_parameters: dict[str, Any]) -> str:
+        """
+        Formats the raw results into a markdown text based on user-selected parameters.
+
+        The answer, images and per-result raw content are only rendered when the
+        matching ``include_*`` parameter is set and the response carries the data.
+
+        Args:
+            raw_results (dict): The raw search results.
+            tool_parameters (dict): The tool parameters selected by the user.
+
+        Returns:
+            str: The formatted markdown text (empty when there is nothing to render).
+        """
+        output_lines = []
+
+        # Include answer if requested
+        if tool_parameters.get("include_answer", False) and raw_results.get("answer"):
+            output_lines.append(f"**Answer:** {raw_results['answer']}\n")
+
+        # Include images if requested
+        if tool_parameters.get("include_images", False) and raw_results.get("images"):
+            output_lines.append("**Images:**\n")
+            with_descriptions = tool_parameters.get("include_image_descriptions", False)
+            for image in raw_results["images"]:
+                # Per the Tavily Search API reference, images are plain URL strings unless
+                # descriptions were requested, in which case they are objects with "url"
+                # and "description". Support both shapes instead of indexing a string.
+                if isinstance(image, dict):
+                    image_url = image.get("url", "")
+                    alt_text = image["description"] if with_descriptions and "description" in image else ""
+                else:
+                    image_url = image
+                    alt_text = ""
+                output_lines.append(f"![{alt_text}]({image_url})\n")
+
+        # Process each result
+        for idx, result in enumerate(raw_results.get("results", []), 1):
+            title = result.get("title", "No Title")
+            url = result.get("url", "")
+            content = result.get("content", "")
+            published_date = result.get("published_date", "")
+            score = result.get("score", "")
+
+            output_lines.append(f"### Result {idx}: [{title}]({url})\n")
+
+            # Include published date if available and topic is 'news'
+            if tool_parameters.get("topic") == "news" and published_date:
+                output_lines.append(f"**Published Date:** {published_date}\n")
+
+            output_lines.append(f"**URL:** {url}\n")
+
+            # Include score (relevance); a score of 0 is a real value and must not be hidden
+            if score is not None and score != "":
+                output_lines.append(f"**Relevance Score:** {score}\n")
+
+            # Include content
+            if content:
+                output_lines.append(f"**Content:**\n{content}\n")
+
+            # Include raw content if requested
+            if tool_parameters.get("include_raw_content", False) and result.get("raw_content"):
+                output_lines.append(f"**Raw Content:**\n{result['raw_content']}\n")
+
+            # Add a separator
+            output_lines.append("---\n")
+
+        return "\n".join(output_lines)
--- a/api/core/tools/provider/builtin/tavily/tools/tavily_search.yaml
+++ b/api/core/tools/provider/builtin/tavily/tools/tavily_search.yaml
@ -2,28 +2,24 @@ identity:
  name: tavily_search
  author: Yash Parmar
  label:
-    en_US: TavilySearch
-    zh_Hans: TavilySearch
-    pt_BR: TavilySearch
+    en_US: Tavily Search
+    zh_Hans: Tavily Search
 description:
  human:
-    en_US: A tool for  search engine built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
+    en_US: A search engine tool built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
    zh_Hans: 专为人工智能代理 (LLM) 构建的搜索引擎工具，可快速提供实时、准确和真实的结果。
-    pt_BR: A tool for search engine built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
  llm: A tool for search engine built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
 parameters:
  - name: query
    type: string
    required: true
    label:
-      en_US: Query string
-      zh_Hans: 查询语句
-      pt_BR: Query string
+      en_US: Query
+      zh_Hans: 查询
    human_description:
-      en_US: used for searching
-      zh_Hans: 用于搜索网页内容
-      pt_BR: used for searching
-    llm_description: key words for searching
+      en_US: The search query you want to execute with Tavily.
+      zh_Hans: 您想用 Tavily 执行的搜索查询。
+    llm_description: The search query.
    form: llm
  - name: search_depth
    type: select
@ -31,122 +27,118 @@ parameters:
    label:
      en_US: Search Depth
      zh_Hans: 搜索深度
-      pt_BR: Search Depth
    human_description:
-      en_US: The depth of search results
-      zh_Hans: 搜索结果的深度
-      pt_BR: The depth of search results
+      en_US: The depth of the search.
+      zh_Hans: 搜索的深度。
    form: form
    options:
      - value: basic
        label:
          en_US: Basic
          zh_Hans: 基本
-          pt_BR: Basic
      - value: advanced
        label:
          en_US: Advanced
          zh_Hans: 高级
-          pt_BR: Advanced
    default: basic
+  - name: topic
+    type: select
+    required: false
+    label:
+      en_US: Topic
+      zh_Hans: 主题
+    human_description:
+      en_US: The category of the search.
+      zh_Hans: 搜索的类别。
+    form: form
+    options:
+      - value: general
+        label:
+          en_US: General
+          zh_Hans: 一般
+      - value: news
+        label:
+          en_US: News
+          zh_Hans: 新闻
+    default: general
+  - name: days
+    type: number
+    required: false
+    label:
+      en_US: Days
+      zh_Hans: 天数
+    human_description:
+      en_US: The number of days back from the current date to include in the search results (only applicable when "topic" is "news").
+      zh_Hans: 从当前日期起向前追溯的天数，以包含在搜索结果中（仅当“topic”为“news”时适用）。
+    form: form
+    min: 1
+    default: 3
+  - name: max_results
+    type: number
+    required: false
+    label:
+      en_US: Max Results
+      zh_Hans: 最大结果数
+    human_description:
+      en_US: The maximum number of search results to return.
+      zh_Hans: 要返回的最大搜索结果数。
+    form: form
+    min: 1
+    max: 20
+    default: 5
  - name: include_images
    type: boolean
    required: false
    label:
      en_US: Include Images
      zh_Hans: 包含图片
-      pt_BR: Include Images
    human_description:
-      en_US: Include images in the search results
-      zh_Hans: 在搜索结果中包含图片
-      pt_BR: Include images in the search results
+      en_US: Include a list of query-related images in the response.
+      zh_Hans: 在响应中包含与查询相关的图片列表。
    form: form
-    options:
-      - value: 'true'
-        label:
-          en_US: 'Yes'
-          zh_Hans: 是
-          pt_BR: 'Yes'
-      - value: 'false'
-        label:
-          en_US: 'No'
-          zh_Hans: 否
-          pt_BR: 'No'
-    default: 'false'
+    default: false
+  - name: include_image_descriptions
+    type: boolean
+    required: false
+    label:
+      en_US: Include Image Descriptions
+      zh_Hans: 包含图片描述
+    human_description:
+      en_US: When include_images is True, adds descriptive text for each image.
+      zh_Hans: 当 include_images 为 True 时，为每个图像添加描述文本。
+    form: form
+    default: false
  - name: include_answer
    type: boolean
    required: false
    label:
      en_US: Include Answer
      zh_Hans: 包含答案
-      pt_BR: Include Answer
    human_description:
-      en_US: Include answers in the search results
-      zh_Hans: 在搜索结果中包含答案
-      pt_BR: Include answers in the search results
+      en_US: Include a short answer to the original query in the response.
+      zh_Hans: 在响应中包含对原始查询的简短回答。
    form: form
-    options:
-      - value: 'true'
-        label:
-          en_US: 'Yes'
-          zh_Hans: 是
-          pt_BR: 'Yes'
-      - value: 'false'
-        label:
-          en_US: 'No'
-          zh_Hans: 否
-          pt_BR: 'No'
-    default: 'false'
+    default: false
  - name: include_raw_content
    type: boolean
    required: false
    label:
      en_US: Include Raw Content
      zh_Hans: 包含原始内容
-      pt_BR: Include Raw Content
-    human_description:
-      en_US: Include raw content in the search results
-      zh_Hans: 在搜索结果中包含原始内容
-      pt_BR: Include raw content in the search results
-    form: form
-    options:
-      - value: 'true'
-        label:
-          en_US: 'Yes'
-          zh_Hans: 是
-          pt_BR: 'Yes'
-      - value: 'false'
-        label:
-          en_US: 'No'
-          zh_Hans: 否
-          pt_BR: 'No'
-    default: 'false'
-  - name: max_results
-    type: number
-    required: false
-    label:
-      en_US: Max Results
-      zh_Hans: 最大结果
-      pt_BR: Max Results
    human_description:
-      en_US: The number of maximum search results to return
-      zh_Hans: 返回的最大搜索结果数
-      pt_BR: The number of maximum search results to return
+      en_US: Include the cleaned and parsed HTML content of each search result.
+      zh_Hans: 包含每个搜索结果的已清理和解析的HTML内容。
    form: form
-    min: 1
-    max: 20
-    default: 5
+    default: false
  - name: include_domains
    type: string
    required: false
    label:
      en_US: Include Domains
      zh_Hans: 包含域
-      pt_BR: Include Domains
    human_description:
-      en_US: A list of domains to specifically include in the search results
-      zh_Hans: 在搜索结果中特别包含的域名列表
-      pt_BR: A list of domains to specifically include in the search results
+      en_US: A comma-separated list of domains to specifically include in the search results.
+      zh_Hans: 要在搜索结果中特别包含的域的逗号分隔列表。
    form: form
  - name: exclude_domains
    type: string
@ -154,9 +146,7 @@ parameters:
    label:
      en_US: Exclude Domains
      zh_Hans: 排除域
-      pt_BR: Exclude Domains
    human_description:
-      en_US: A list of domains to specifically exclude from the search results
-      zh_Hans: 从搜索结果中特别排除的域名列表
-      pt_BR: A list of domains to specifically exclude from the search results
+      en_US: A comma-separated list of domains to specifically exclude from the search results.
+      zh_Hans: 要从搜索结果中特别排除的域的逗号分隔列表。
    form: form