feat: enhance the firecrawl tool (#6705)
parent
082c46a903
commit
21f6caacd4
@ -1,36 +1,48 @@
|
|||||||
import json
|
from typing import Any
|
||||||
from typing import Any, Union
|
|
||||||
|
|
||||||
from core.tools.entities.tool_entities import ToolInvokeMessage
|
from core.tools.entities.tool_entities import ToolInvokeMessage
|
||||||
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
|
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp, get_array_params, get_json_params
|
||||||
from core.tools.tool.builtin_tool import BuiltinTool
|
from core.tools.tool.builtin_tool import BuiltinTool
|
||||||
|
|
||||||
|
|
||||||
class CrawlTool(BuiltinTool):
    """Builtin tool that starts a Firecrawl crawl over a URL and its linked pages."""

    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
        """Kick off a crawl of ``tool_parameters['url']`` and return the result as JSON.

        The ``crawlerOptions`` and ``pageOptions`` payloads follow the Firecrawl
        crawl endpoint documented here:
        https://docs.firecrawl.dev/api-reference/endpoint/crawl
        """
        client = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
                              base_url=self.runtime.credentials['base_url'])

        # When True (default) the client blocks until the crawl job finishes;
        # otherwise the job is submitted and its handle is returned immediately.
        wait_for_results = tool_parameters.get('wait_for_results', True)

        # Keys are camelCase because they are forwarded verbatim to the Firecrawl API.
        crawler_options = {
            'excludes': get_array_params(tool_parameters, 'excludes'),
            'includes': get_array_params(tool_parameters, 'includes'),
            'returnOnlyUrls': tool_parameters.get('returnOnlyUrls', False),
            'maxDepth': tool_parameters.get('maxDepth'),
            'mode': tool_parameters.get('mode'),
            'ignoreSitemap': tool_parameters.get('ignoreSitemap', False),
            'limit': tool_parameters.get('limit', 5),
            'allowBackwardCrawling': tool_parameters.get('allowBackwardCrawling', False),
            'allowExternalContentLinks': tool_parameters.get('allowExternalContentLinks', False),
        }

        page_options = {
            'headers': get_json_params(tool_parameters, 'headers'),
            'includeHtml': tool_parameters.get('includeHtml', False),
            'includeRawHtml': tool_parameters.get('includeRawHtml', False),
            'onlyIncludeTags': get_array_params(tool_parameters, 'onlyIncludeTags'),
            'removeTags': get_array_params(tool_parameters, 'removeTags'),
            'onlyMainContent': tool_parameters.get('onlyMainContent', False),
            'replaceAllPathsWithAbsolutePaths': tool_parameters.get('replaceAllPathsWithAbsolutePaths', False),
            'screenshot': tool_parameters.get('screenshot', False),
            'waitFor': tool_parameters.get('waitFor', 0),
        }

        crawl_result = client.crawl_url(
            url=tool_parameters['url'],
            wait=wait_for_results,
            crawlerOptions=crawler_options,
            pageOptions=page_options,
        )

        return self.create_json_message(crawl_result)
|
|||||||
@ -0,0 +1,20 @@
|
|||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from core.tools.entities.tool_entities import ToolInvokeMessage
|
||||||
|
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
|
||||||
|
from core.tools.tool.builtin_tool import BuiltinTool
|
||||||
|
|
||||||
|
|
||||||
|
class CrawlJobTool(BuiltinTool):
    """Builtin tool that checks on or cancels a previously submitted Firecrawl crawl job."""

    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
        """Fetch the status of, or cancel, a crawl job and return the API reply as JSON.

        tool_parameters:
            operation: 'get' (default) to fetch job status, 'cancel' to abort the job.
            job_id: the Firecrawl job identifier.

        Raises:
            ValueError: if ``operation`` is neither 'get' nor 'cancel'.
        """
        client = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
                              base_url=self.runtime.credentials['base_url'])

        operation = tool_parameters.get('operation', 'get')

        # Dispatch table instead of an if/elif chain; both handlers take job_id.
        handlers = {
            'get': client.check_crawl_status,
            'cancel': client.cancel_crawl_job,
        }
        handler = handlers.get(operation)
        if handler is None:
            raise ValueError(f'Invalid operation: {operation}')

        result = handler(job_id=tool_parameters['job_id'])
        return self.create_json_message(result)
||||||
@ -1,26 +1,39 @@
|
|||||||
import json
|
from typing import Any
|
||||||
from typing import Any, Union
|
|
||||||
|
|
||||||
from core.tools.entities.tool_entities import ToolInvokeMessage
|
from core.tools.entities.tool_entities import ToolInvokeMessage
|
||||||
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
|
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp, get_array_params, get_json_params
|
||||||
from core.tools.tool.builtin_tool import BuiltinTool
|
from core.tools.tool.builtin_tool import BuiltinTool
|
||||||
|
|
||||||
|
|
||||||
class ScrapeTool(BuiltinTool):
    """Builtin tool that scrapes a single URL through the Firecrawl API."""

    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
        """Scrape ``tool_parameters['url']`` and return the scrape result as JSON.

        The ``pageOptions`` and ``extractorOptions`` payloads follow the Firecrawl
        scrape endpoint documented here:
        https://docs.firecrawl.dev/api-reference/endpoint/scrape
        """
        client = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
                              base_url=self.runtime.credentials['base_url'])

        # Keys are camelCase because they are forwarded verbatim to the Firecrawl API.
        page_options = {
            'headers': get_json_params(tool_parameters, 'headers'),
            'includeHtml': tool_parameters.get('includeHtml', False),
            'includeRawHtml': tool_parameters.get('includeRawHtml', False),
            'onlyIncludeTags': get_array_params(tool_parameters, 'onlyIncludeTags'),
            'removeTags': get_array_params(tool_parameters, 'removeTags'),
            'onlyMainContent': tool_parameters.get('onlyMainContent', False),
            'replaceAllPathsWithAbsolutePaths': tool_parameters.get('replaceAllPathsWithAbsolutePaths', False),
            'screenshot': tool_parameters.get('screenshot', False),
            'waitFor': tool_parameters.get('waitFor', 0),
        }

        extractor_options = {
            'mode': tool_parameters.get('mode', ''),
            'extractionPrompt': tool_parameters.get('extractionPrompt', ''),
            'extractionSchema': get_json_params(tool_parameters, 'extractionSchema'),
        }

        crawl_result = client.scrape_url(url=tool_parameters['url'],
                                         pageOptions=page_options,
                                         extractorOptions=extractor_options)

        return self.create_json_message(crawl_result)
|||||||
Loading…
Reference in New Issue