Feat(tools) add tavily extract tool and enhance tavily search implementation (#10786)

2 years ago · 6de1f8c770
parent 6d532bfc02
commit 6de1f8c770
5 changed files with 379 additions and 155 deletions
--- a/api/core/tools/provider/builtin/tavily/tavily.yaml
+++ b/api/core/tools/provider/builtin/tavily/tavily.yaml
@ -1,14 +1,12 @@
 identity:
-  author: Yash Parmar
+  author: Yash Parmar, Kalo Chin
  name: tavily
  label:
-    en_US: Tavily
+    en_US: Tavily Search & Extract
-    zh_Hans: Tavily
+    zh_Hans: Tavily 搜索和提取
    pt_BR: Tavily
  description:
-    en_US: Tavily
+    en_US: A powerful AI-native search engine and web content extraction tool that provides highly relevant search results and raw content extraction from web pages.
-    zh_Hans: Tavily
+    zh_Hans: 一个强大的原生AI搜索引擎和网页内容提取工具，提供高度相关的搜索结果和网页原始内容提取。
    pt_BR: Tavily
  icon: icon.png
  tags:
    - search
@ -19,13 +17,10 @@ credentials_for_provider:
    label:
      en_US: Tavily API key
      zh_Hans: Tavily API key
      pt_BR: Tavily API key
    placeholder:
      en_US: Please input your Tavily API key
      zh_Hans: 请输入你的 Tavily API key
      pt_BR: Please input your Tavily API key
    help:
      en_US: Get your Tavily API key from Tavily
      zh_Hans: 从 TavilyApi 获取您的 Tavily API key
-      pt_BR: Get your Tavily API key from Tavily
+    url: https://app.tavily.com/home
    url: https://docs.tavily.com/docs/welcome
--- a/api/core/tools/provider/builtin/tavily/tools/tavily_extract.py
+++ b/api/core/tools/provider/builtin/tavily/tools/tavily_extract.py
@ -0,0 +1,145 @@
 from typing import Any
 import requests
 from core.tools.entities.tool_entities import ToolInvokeMessage
 from core.tools.tool.builtin_tool import BuiltinTool
 TAVILY_API_URL = "https://api.tavily.com"
 class TavilyExtract:
    """
    A small client for extracting content from web pages using the Tavily Extract API.

    Args:
        api_key (str): The API key sent with each request unless the request
            parameters carry their own ``api_key``.

    Methods:
        extract_content: Retrieves extracted content from the Tavily Extract API.
    """

    def __init__(self, api_key: str) -> None:
        self.api_key = api_key

    def extract_content(self, params: dict[str, Any]) -> dict:
        """
        Retrieves extracted content from the Tavily Extract API.

        Args:
            params (Dict[str, Any]): The extraction parameters. ``urls`` is required;
                ``api_key`` is optional and overrides the key given at construction.
                The dict is read only and is never modified.

        Returns:
            dict: The decoded JSON body of the API response.

        Raises:
            ValueError: If ``urls`` is missing or has an unsupported type.
            requests.HTTPError: If the API answers with an error status code.
        """
        # The payload is built as a fresh dict, so the API key is never written
        # back into the caller's parameters.
        processed_params = self._process_params(params)
        # NOTE(review): no timeout is passed here, so the call waits for as long as
        # the server keeps the connection open. Consider adding one once the
        # callers are prepared to handle requests.Timeout.
        response = requests.post(f"{TAVILY_API_URL}/extract", json=processed_params)
        response.raise_for_status()
        return response.json()

    def _process_params(self, params: dict[str, Any]) -> dict:
        """
        Processes and validates the extraction parameters.

        Args:
            params (Dict[str, Any]): The extraction parameters.

        Returns:
            dict: A new dict holding exactly two keys: ``urls`` (a list) and ``api_key``.
            Every other key in ``params`` is dropped.

        Raises:
            ValueError: If ``urls`` is missing, or is not a string, list or tuple.
        """
        if "urls" not in params:
            raise ValueError("The 'urls' parameter is required.")

        urls = params["urls"]
        if isinstance(urls, str):
            # Commas and any whitespace both separate URLs. split() without an
            # argument already trims each piece and discards empty ones.
            url_list = urls.replace(",", " ").split()
        elif isinstance(urls, (list, tuple)):
            # Copy so the payload does not alias a sequence owned by the caller.
            url_list = list(urls)
        else:
            # Refuse early instead of posting a request that has no 'urls' at all.
            raise ValueError("The 'urls' parameter must be a string or a list of URLs.")

        # Only 'urls' and 'api_key' are forwarded to the API.
        return {
            "urls": url_list,
            "api_key": params.get("api_key", self.api_key),
        }
 class TavilyExtractTool(BuiltinTool):
    """
    A tool for extracting content from web pages using Tavily Extract.
    """

    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage | list[ToolInvokeMessage]:
        """
        Invokes the Tavily Extract tool with the given user ID and tool parameters.

        Args:
            user_id (str): The ID of the user invoking the tool (not read by this method).
            tool_parameters (Dict[str, Any]): The parameters for the Tavily Extract tool.
                ``urls`` must be present and non-empty.

        Returns:
            ToolInvokeMessage | list[ToolInvokeMessage]: A single text message when the
            API key or the URLs are missing, when the request fails, or when nothing
            was extracted. Otherwise a list holding a JSON message with the full
            response followed by a markdown text message.
        """
        urls = tool_parameters.get("urls", "")
        api_key = self.runtime.credentials.get("tavily_api_key")
        if not api_key:
            return self.create_text_message(
                "Tavily API key is missing. Please set the 'tavily_api_key' in credentials."
            )
        if not urls:
            return self.create_text_message("Please input at least one URL to extract.")

        tavily_extract = TavilyExtract(api_key)
        try:
            # Work on a copy so that nothing done while building the request
            # (such as adding the API key) can show up in the caller's dict.
            raw_results = tavily_extract.extract_content(dict(tool_parameters))
        except (requests.RequestException, ValueError) as e:
            # RequestException is the base class of HTTPError and also covers
            # connection failures and timeouts. ValueError covers rejected
            # parameters and a response body that is not valid JSON.
            return self.create_text_message(f"Error occurred while extracting content: {str(e)}")

        if not raw_results.get("results"):
            return self.create_text_message("No content could be extracted from the provided URLs.")

        # Always return the JSON message with all data, followed by a readable rendering.
        json_message = self.create_json_message(raw_results)
        text_message = self.create_text_message(text=self._format_results_as_text(raw_results))
        return [json_message, text_message]

    def _format_results_as_text(self, raw_results: dict) -> str:
        """
        Formats the raw extraction results into markdown text.

        Args:
            raw_results (dict): The raw extraction results. The ``results`` and
                ``failed_results`` entries are read; both are optional.

        Returns:
            str: One section per extracted page, followed by a list of failed URLs
            when ``failed_results`` is non-empty.
        """
        output_lines = []
        for idx, result in enumerate(raw_results.get("results", []), 1):
            url = result.get("url", "")
            raw_content = result.get("raw_content", "")
            output_lines.extend(
                (
                    f"## Extracted Content {idx}: {url}\n",
                    f"**Raw Content:**\n{raw_content}\n",
                    "---\n",
                )
            )

        failed_results = raw_results.get("failed_results")
        if failed_results:
            output_lines.append("## Failed URLs:\n")
            for failed in failed_results:
                url = failed.get("url", "")
                error = failed.get("error", "Unknown error")
                output_lines.append(f"- {url}: {error}\n")

        # Each entry already ends in a newline, so joining with "\n" leaves a
        # blank line between markdown blocks.
        return "\n".join(output_lines)
--- a/api/core/tools/provider/builtin/tavily/tools/tavily_extract.yaml
+++ b/api/core/tools/provider/builtin/tavily/tools/tavily_extract.yaml
@ -0,0 +1,23 @@
 identity:
  name: tavily_extract
  author: Kalo Chin
  label:
    en_US: Tavily Extract
    zh_Hans: Tavily Extract
 description:
  human:
    en_US: A web extraction tool built specifically for AI agents (LLMs), delivering raw content from web pages.
    zh_Hans: 专为人工智能代理 (LLM) 构建的网页提取工具，提供网页的原始内容。
  llm: A tool for extracting raw content from web pages, designed for AI agents (LLMs).
 parameters:
  - name: urls
    type: string
    required: true
    label:
      en_US: URLs
      zh_Hans: URLs
    human_description:
      en_US: A comma-separated list of URLs to extract content from.
      zh_Hans: 要从中提取内容的 URL 的逗号分隔列表。
    llm_description: A comma-separated list of URLs to extract content from.
    form: llm
--- a/api/core/tools/provider/builtin/tavily/tools/tavily_search.py
+++ b/api/core/tools/provider/builtin/tavily/tools/tavily_search.py
@ -17,8 +17,6 @@ class TavilySearch:
    Methods:
        raw_results: Retrieves raw search results from the Tavily Search API.
        results: Retrieves cleaned search results from the Tavily Search API.
        clean_results: Cleans the raw search results.
    """
    def __init__(self, api_key: str) -> None:
@ -35,63 +33,62 @@ class TavilySearch:
            dict: The raw search results.
        """
        # Ensure required parameters are set
        params["api_key"] = self.api_key
        if (
            "exclude_domains" in params
            and isinstance(params["exclude_domains"], str)
            and params["exclude_domains"] != "None"
        ):
            params["exclude_domains"] = params["exclude_domains"].split()
        else:
            params["exclude_domains"] = []
        if (
            "include_domains" in params
            and isinstance(params["include_domains"], str)
            and params["include_domains"] != "None"
        ):
            params["include_domains"] = params["include_domains"].split()
        else:
            params["include_domains"] = []
-        response = requests.post(f"{TAVILY_API_URL}/search", json=params)
+        # Process parameters to ensure correct types
        processed_params = self._process_params(params)
        response = requests.post(f"{TAVILY_API_URL}/search", json=processed_params)
        response.raise_for_status()
        return response.json()
-    def results(self, params: dict[str, Any]) -> list[dict]:
+    def _process_params(self, params: dict[str, Any]) -> dict:
        """
-        Retrieves cleaned search results from the Tavily Search API.
+        Processes and validates the search parameters.
        Args:
            params (Dict[str, Any]): The search parameters.
        Returns:
-            list: The cleaned search results.
+            dict: The processed parameters.
        """
-        raw_search_results = self.raw_results(params)
+        processed_params = {}
-        return self.clean_results(raw_search_results["results"])
+
-
+        for key, value in params.items():
-    def clean_results(self, results: list[dict]) -> list[dict]:
+            if value is None or value == "None":
-        """
+                continue
-        Cleans the raw search results.
+            if key in ["include_domains", "exclude_domains"]:
                if isinstance(value, str):
                    # Split the string by commas or spaces and strip whitespace
                    processed_params[key] = [domain.strip() for domain in value.replace(",", " ").split()]
            elif key in ["include_images", "include_image_descriptions", "include_answer", "include_raw_content"]:
                # Ensure boolean type
                if isinstance(value, str):
                    processed_params[key] = value.lower() == "true"
                else:
                    processed_params[key] = bool(value)
            elif key in ["max_results", "days"]:
                if isinstance(value, str):
                    processed_params[key] = int(value)
                else:
                    processed_params[key] = value
            elif key in ["search_depth", "topic", "query", "api_key"]:
                processed_params[key] = value
            else:
                # Unrecognized parameter
                pass
-        Args:
+        # Set defaults if not present
-            results (list): The raw search results.
+        processed_params.setdefault("search_depth", "basic")
        processed_params.setdefault("topic", "general")
        processed_params.setdefault("max_results", 5)
-        Returns:
+        # If topic is 'news', ensure 'days' is set
-            list: The cleaned search results.
+        if processed_params.get("topic") == "news":
            processed_params.setdefault("days", 3)
-        """
+        return processed_params
        clean_results = []
        for result in results:
            clean_results.append(
                {
                    "url": result["url"],
                    "content": result["content"],
                }
            )
        # return clean results as a string
        return "\n".join([f"{res['url']}\n{res['content']}" for res in clean_results])
 class TavilySearchTool(BuiltinTool):
@ -111,14 +108,88 @@ class TavilySearchTool(BuiltinTool):
            ToolInvokeMessage | list[ToolInvokeMessage]: The result of the Tavily search tool invocation.
        """
        query = tool_parameters.get("query", "")
-
+        api_key = self.runtime.credentials.get("tavily_api_key")
-        api_key = self.runtime.credentials["tavily_api_key"]
+        if not api_key:
            return self.create_text_message(
                "Tavily API key is missing. Please set the 'tavily_api_key' in credentials."
            )
        if not query:
-            return self.create_text_message("Please input query")
+            return self.create_text_message("Please input a query.")
        tavily_search = TavilySearch(api_key)
-        results = tavily_search.results(tool_parameters)
+        try:
-        print(results)
+            raw_results = tavily_search.raw_results(tool_parameters)
-        if not results:
+        except requests.HTTPError as e:
-            return self.create_text_message(f"No results found for '{query}' in Tavily")
+            return self.create_text_message(f"Error occurred while searching: {str(e)}")
        if not raw_results.get("results"):
            return self.create_text_message(f"No results found for '{query}' in Tavily.")
        else:
            # Always return JSON message with all data
            json_message = self.create_json_message(raw_results)
            # Create text message based on user-selected parameters
            text_message_content = self._format_results_as_text(raw_results, tool_parameters)
            text_message = self.create_text_message(text=text_message_content)
            return [json_message, text_message]
    def _format_results_as_text(self, raw_results: dict, tool_parameters: dict[str, Any]) -> str:
        """
        Formats the raw results into a markdown text based on user-selected parameters.
        Args:
            raw_results (dict): The raw search results.
            tool_parameters (dict): The tool parameters selected by the user.
        Returns:
            str: The formatted markdown text.
        """
        output_lines = []
        # Include answer if requested
        if tool_parameters.get("include_answer", False) and raw_results.get("answer"):
            output_lines.append(f"**Answer:** {raw_results['answer']}\n")
        # Include images if requested
        if tool_parameters.get("include_images", False) and raw_results.get("images"):
            output_lines.append("**Images:**\n")
            for image in raw_results["images"]:
                if tool_parameters.get("include_image_descriptions", False) and "description" in image:
                    output_lines.append(f"![{image['description']}]({image['url']})\n")
                else:
-            return self.create_text_message(text=results)
+                    output_lines.append(f"![]({image['url']})\n")
        # Process each result
        if "results" in raw_results:
            for idx, result in enumerate(raw_results["results"], 1):
                title = result.get("title", "No Title")
                url = result.get("url", "")
                content = result.get("content", "")
                published_date = result.get("published_date", "")
                score = result.get("score", "")
                output_lines.append(f"### Result {idx}: [{title}]({url})\n")
                # Include published date if available and topic is 'news'
                if tool_parameters.get("topic") == "news" and published_date:
                    output_lines.append(f"**Published Date:** {published_date}\n")
                output_lines.append(f"**URL:** {url}\n")
                # Include score (relevance)
                if score:
                    output_lines.append(f"**Relevance Score:** {score}\n")
                # Include content
                if content:
                    output_lines.append(f"**Content:**\n{content}\n")
                # Include raw content if requested
                if tool_parameters.get("include_raw_content", False) and result.get("raw_content"):
                    output_lines.append(f"**Raw Content:**\n{result['raw_content']}\n")
                # Add a separator
                output_lines.append("---\n")
        return "\n".join(output_lines)
--- a/api/core/tools/provider/builtin/tavily/tools/tavily_search.yaml
+++ b/api/core/tools/provider/builtin/tavily/tools/tavily_search.yaml
@ -2,28 +2,24 @@ identity:
  name: tavily_search
  author: Yash Parmar
  label:
-    en_US: TavilySearch
+    en_US: Tavily Search
-    zh_Hans: TavilySearch
+    zh_Hans: Tavily Search
    pt_BR: TavilySearch
 description:
  human:
-    en_US: A tool for  search engine built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
+    en_US: A search engine tool built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
    zh_Hans: 专为人工智能代理 (LLM) 构建的搜索引擎工具，可快速提供实时、准确和真实的结果。
    pt_BR: A tool for search engine built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
  llm: A tool for search engine built specifically for AI agents (LLMs), delivering real-time, accurate, and factual results at speed.
 parameters:
  - name: query
    type: string
    required: true
    label:
-      en_US: Query string
+      en_US: Query
-      zh_Hans: 查询语句
+      zh_Hans: 查询
      pt_BR: Query string
    human_description:
-      en_US: used for searching
+      en_US: The search query you want to execute with Tavily.
-      zh_Hans: 用于搜索网页内容
+      zh_Hans: 您想用 Tavily 执行的搜索查询。
-      pt_BR: used for searching
+    llm_description: The search query.
    llm_description: key words for searching
    form: llm
  - name: search_depth
    type: select
@ -31,122 +27,118 @@ parameters:
    label:
      en_US: Search Depth
      zh_Hans: 搜索深度
      pt_BR: Search Depth
    human_description:
-      en_US: The depth of search results
+      en_US: The depth of the search.
-      zh_Hans: 搜索结果的深度
+      zh_Hans: 搜索的深度。
      pt_BR: The depth of search results
    form: form
    options:
      - value: basic
        label:
          en_US: Basic
          zh_Hans: 基本
          pt_BR: Basic
      - value: advanced
        label:
          en_US: Advanced
          zh_Hans: 高级
          pt_BR: Advanced
    default: basic
  - name: topic
    type: select
    required: false
    label:
      en_US: Topic
      zh_Hans: 主题
    human_description:
      en_US: The category of the search.
      zh_Hans: 搜索的类别。
    form: form
    options:
      - value: general
        label:
          en_US: General
          zh_Hans: 一般
      - value: news
        label:
          en_US: News
          zh_Hans: 新闻
    default: general
  - name: days
    type: number
    required: false
    label:
      en_US: Days
      zh_Hans: 天数
    human_description:
      en_US: The number of days back from the current date to include in the search results (only applicable when "topic" is "news").
      zh_Hans: 从当前日期起向前追溯的天数，以包含在搜索结果中（仅当“topic”为“news”时适用）。
    form: form
    min: 1
    default: 3
  - name: max_results
    type: number
    required: false
    label:
      en_US: Max Results
      zh_Hans: 最大结果数
    human_description:
      en_US: The maximum number of search results to return.
      zh_Hans: 要返回的最大搜索结果数。
    form: form
    min: 1
    max: 20
    default: 5
  - name: include_images
    type: boolean
    required: false
    label:
      en_US: Include Images
      zh_Hans: 包含图片
      pt_BR: Include Images
    human_description:
-      en_US: Include images in the search results
+      en_US: Include a list of query-related images in the response.
-      zh_Hans: 在搜索结果中包含图片
+      zh_Hans: 在响应中包含与查询相关的图片列表。
      pt_BR: Include images in the search results
    form: form
-    options:
+    default: false
-      - value: 'true'
+  - name: include_image_descriptions
-        label:
+    type: boolean
-          en_US: 'Yes'
+    required: false
          zh_Hans: 是
          pt_BR: 'Yes'
      - value: 'false'
    label:
-          en_US: 'No'
+      en_US: Include Image Descriptions
-          zh_Hans: 否
+      zh_Hans: 包含图片描述
-          pt_BR: 'No'
+    human_description:
-    default: 'false'
+      en_US: When include_images is True, adds descriptive text for each image.
      zh_Hans: 当 include_images 为 True 时，为每个图像添加描述文本。
    form: form
    default: false
  - name: include_answer
    type: boolean
    required: false
    label:
      en_US: Include Answer
      zh_Hans: 包含答案
      pt_BR: Include Answer
    human_description:
-      en_US: Include answers in the search results
+      en_US: Include a short answer to the original query in the response.
-      zh_Hans: 在搜索结果中包含答案
+      zh_Hans: 在响应中包含对原始查询的简短回答。
      pt_BR: Include answers in the search results
    form: form
-    options:
+    default: false
      - value: 'true'
        label:
          en_US: 'Yes'
          zh_Hans: 是
          pt_BR: 'Yes'
      - value: 'false'
        label:
          en_US: 'No'
          zh_Hans: 否
          pt_BR: 'No'
    default: 'false'
  - name: include_raw_content
    type: boolean
    required: false
    label:
      en_US: Include Raw Content
      zh_Hans: 包含原始内容
      pt_BR: Include Raw Content
    human_description:
-      en_US: Include raw content in the search results
+      en_US: Include the cleaned and parsed HTML content of each search result.
-      zh_Hans: 在搜索结果中包含原始内容
+      zh_Hans: 包含每个搜索结果的已清理和解析的HTML内容。
      pt_BR: Include raw content in the search results
    form: form
-    options:
+    default: false
      - value: 'true'
        label:
          en_US: 'Yes'
          zh_Hans: 是
          pt_BR: 'Yes'
      - value: 'false'
        label:
          en_US: 'No'
          zh_Hans: 否
          pt_BR: 'No'
    default: 'false'
  - name: max_results
    type: number
    required: false
    label:
      en_US: Max Results
      zh_Hans: 最大结果
      pt_BR: Max Results
    human_description:
      en_US: The number of maximum search results to return
      zh_Hans: 返回的最大搜索结果数
      pt_BR: The number of maximum search results to return
    form: form
    min: 1
    max: 20
    default: 5
  - name: include_domains
    type: string
    required: false
    label:
      en_US: Include Domains
      zh_Hans: 包含域
      pt_BR: Include Domains
    human_description:
-      en_US: A list of domains to specifically include in the search results
+      en_US: A comma-separated list of domains to specifically include in the search results.
-      zh_Hans: 在搜索结果中特别包含的域名列表
+      zh_Hans: 要在搜索结果中特别包含的域的逗号分隔列表。
      pt_BR: A list of domains to specifically include in the search results
    form: form
  - name: exclude_domains
    type: string
@ -154,9 +146,7 @@ parameters:
    label:
      en_US: Exclude Domains
      zh_Hans: 排除域
      pt_BR: Exclude Domains
    human_description:
-      en_US: A list of domains to specifically exclude from the search results
+      en_US: A comma-separated list of domains to specifically exclude from the search results.
-      zh_Hans: 从搜索结果中特别排除的域名列表
+      zh_Hans: 要从搜索结果中特别排除的域的逗号分隔列表。
      pt_BR: A list of domains to specifically exclude from the search results
    form: form