feat: enhance the firecrawl tool (#6705)
parent 082c46a903
commit 21f6caacd4
@@ -1,36 +1,48 @@
-import json
-from typing import Any, Union
+from typing import Any
 
 from core.tools.entities.tool_entities import ToolInvokeMessage
-from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
+from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp, get_array_params, get_json_params
 from core.tools.tool.builtin_tool import BuiltinTool
 
 
 class CrawlTool(BuiltinTool):
-    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
-
-        options = {
-            'crawlerOptions': {
-                'excludes': tool_parameters.get('excludes', '').split(',') if tool_parameters.get('excludes') else [],
-                'includes': tool_parameters.get('includes', '').split(',') if tool_parameters.get('includes') else [],
-                'limit': tool_parameters.get('limit', 5)
-            },
-            'pageOptions': {
-                'onlyMainContent': tool_parameters.get('onlyMainContent', False)
-            }
-        }
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
+        """
+        the crawlerOptions and pageOptions comes from doc here:
+        https://docs.firecrawl.dev/api-reference/endpoint/crawl
+        """
+        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
+                           base_url=self.runtime.credentials['base_url'])
+        crawlerOptions = {}
+        pageOptions = {}
+
+        wait_for_results = tool_parameters.get('wait_for_results', True)
+
+        crawlerOptions['excludes'] = get_array_params(tool_parameters, 'excludes')
+        crawlerOptions['includes'] = get_array_params(tool_parameters, 'includes')
+        crawlerOptions['returnOnlyUrls'] = tool_parameters.get('returnOnlyUrls', False)
+        crawlerOptions['maxDepth'] = tool_parameters.get('maxDepth')
+        crawlerOptions['mode'] = tool_parameters.get('mode')
+        crawlerOptions['ignoreSitemap'] = tool_parameters.get('ignoreSitemap', False)
+        crawlerOptions['limit'] = tool_parameters.get('limit', 5)
+        crawlerOptions['allowBackwardCrawling'] = tool_parameters.get('allowBackwardCrawling', False)
+        crawlerOptions['allowExternalContentLinks'] = tool_parameters.get('allowExternalContentLinks', False)
+
+        pageOptions['headers'] = get_json_params(tool_parameters, 'headers')
+        pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
+        pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
+        pageOptions['onlyIncludeTags'] = get_array_params(tool_parameters, 'onlyIncludeTags')
+        pageOptions['removeTags'] = get_array_params(tool_parameters, 'removeTags')
+        pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
+        pageOptions['replaceAllPathsWithAbsolutePaths'] = tool_parameters.get('replaceAllPathsWithAbsolutePaths', False)
+        pageOptions['screenshot'] = tool_parameters.get('screenshot', False)
+        pageOptions['waitFor'] = tool_parameters.get('waitFor', 0)
 
         crawl_result = app.crawl_url(
-            url=tool_parameters['url'],
-            params=options,
-            wait=True
+            url=tool_parameters['url'],
+            wait=wait_for_results,
+            crawlerOptions=crawlerOptions,
+            pageOptions=pageOptions
         )
 
-        if not isinstance(crawl_result, str):
-            crawl_result = json.dumps(crawl_result, ensure_ascii=False, indent=4)
-
-        if not crawl_result:
-            return self.create_text_message("Crawl request failed.")
-
-        return self.create_text_message(crawl_result)
+        return self.create_json_message(crawl_result)
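
Note: the new CrawlTool code relies on two helpers, get_array_params and get_json_params, which this commit imports from firecrawl_appx but whose bodies are not part of the hunk above. A minimal sketch of what they might look like, assuming get_array_params turns a comma-separated parameter into a list and get_json_params parses a JSON-object string; the real implementations live in firecrawl_appx:

import json
from typing import Any


def get_array_params(tool_parameters: dict[str, Any], key: str) -> list[str]:
    # Assumed behavior: 'blog/*,news/*' -> ['blog/*', 'news/*'];
    # a missing or empty parameter becomes [].
    value = tool_parameters.get(key)
    if not value:
        return []
    return [item.strip() for item in str(value).split(',') if item.strip()]


def get_json_params(tool_parameters: dict[str, Any], key: str) -> dict[str, Any]:
    # Assumed behavior: '{"Authorization": "Bearer ..."}' -> dict;
    # a missing or empty parameter becomes {}.
    value = tool_parameters.get(key)
    if not value:
        return {}
    return json.loads(value)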
@@ -0,0 +1,20 @@
+from typing import Any
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class CrawlJobTool(BuiltinTool):
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
+        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
+                           base_url=self.runtime.credentials['base_url'])
+        operation = tool_parameters.get('operation', 'get')
+        if operation == 'get':
+            result = app.check_crawl_status(job_id=tool_parameters['job_id'])
+        elif operation == 'cancel':
+            result = app.cancel_crawl_job(job_id=tool_parameters['job_id'])
+        else:
+            raise ValueError(f'Invalid operation: {operation}')
+
+        return self.create_json_message(result)
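
Note: CrawlJobTool pairs with the new wait_for_results flag in CrawlTool: a crawl started without waiting returns a job id, which this tool can later poll or cancel. A sketch of the HTTP calls that check_crawl_status and cancel_crawl_job might wrap, assuming Firecrawl's v0 REST endpoints; the actual wrappers live in firecrawl_appx and are not shown in this diff:

import requests


def check_crawl_status(base_url: str, api_key: str, job_id: str) -> dict:
    # Assumed endpoint: GET /v0/crawl/status/{job_id} returns the job's
    # status and any data collected so far.
    response = requests.get(
        f'{base_url}/v0/crawl/status/{job_id}',
        headers={'Authorization': f'Bearer {api_key}'},
    )
    response.raise_for_status()
    return response.json()


def cancel_crawl_job(base_url: str, api_key: str, job_id: str) -> dict:
    # Assumed endpoint: DELETE /v0/crawl/cancel/{job_id} asks the server
    # to stop the running job.
    response = requests.delete(
        f'{base_url}/v0/crawl/cancel/{job_id}',
        headers={'Authorization': f'Bearer {api_key}'},
    )
    response.raise_for_status()
    return response.json()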
@@ -1,26 +1,39 @@
-import json
-from typing import Any, Union
+from typing import Any
 
 from core.tools.entities.tool_entities import ToolInvokeMessage
-from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
+from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp, get_array_params, get_json_params
 from core.tools.tool.builtin_tool import BuiltinTool
 
 
 class ScrapeTool(BuiltinTool):
-    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
-        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
-
-        crawl_result = app.scrape_url(
-            url=tool_parameters['url'],
-            wait=True
-        )
-
-        if isinstance(crawl_result, dict):
-            result_message = json.dumps(crawl_result, ensure_ascii=False, indent=4)
-        else:
-            result_message = str(crawl_result)
-
-        if not crawl_result:
-            return self.create_text_message("Scrape request failed.")
-
-        return self.create_text_message(result_message)
+    def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> ToolInvokeMessage:
+        """
+        the pageOptions and extractorOptions comes from doc here:
+        https://docs.firecrawl.dev/api-reference/endpoint/scrape
+        """
+        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'],
+                           base_url=self.runtime.credentials['base_url'])
+
+        pageOptions = {}
+        extractorOptions = {}
+
+        pageOptions['headers'] = get_json_params(tool_parameters, 'headers')
+        pageOptions['includeHtml'] = tool_parameters.get('includeHtml', False)
+        pageOptions['includeRawHtml'] = tool_parameters.get('includeRawHtml', False)
+        pageOptions['onlyIncludeTags'] = get_array_params(tool_parameters, 'onlyIncludeTags')
+        pageOptions['removeTags'] = get_array_params(tool_parameters, 'removeTags')
+        pageOptions['onlyMainContent'] = tool_parameters.get('onlyMainContent', False)
+        pageOptions['replaceAllPathsWithAbsolutePaths'] = tool_parameters.get('replaceAllPathsWithAbsolutePaths', False)
+        pageOptions['screenshot'] = tool_parameters.get('screenshot', False)
+        pageOptions['waitFor'] = tool_parameters.get('waitFor', 0)
+
+        extractorOptions['mode'] = tool_parameters.get('mode', '')
+        extractorOptions['extractionPrompt'] = tool_parameters.get('extractionPrompt', '')
+        extractorOptions['extractionSchema'] = get_json_params(tool_parameters, 'extractionSchema')
+
+        crawl_result = app.scrape_url(url=tool_parameters['url'],
+                                      pageOptions=pageOptions,
+                                      extractorOptions=extractorOptions)
+
+        return self.create_json_message(crawl_result)
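
Note: ScrapeTool now forwards pageOptions and extractorOptions to app.scrape_url as keyword arguments. A hypothetical sketch of the request this plausibly produces, assuming scrape_url posts a JSON body to a /v0/scrape endpoint with each options dict nested under its own key; both the path and the payload shape are assumptions based on the docs URL in the docstring:

import requests


def scrape_url_sketch(base_url: str, api_key: str, url: str,
                      pageOptions: dict, extractorOptions: dict) -> dict:
    # Assumed payload shape for POST /v0/scrape: the two option dicts sit
    # next to the target URL in the JSON body.
    payload = {
        'url': url,
        'pageOptions': pageOptions,
        'extractorOptions': extractorOptions,
    }
    response = requests.post(
        f'{base_url}/v0/scrape',
        headers={'Authorization': f'Bearer {api_key}'},
        json=payload,
    )
    response.raise_for_status()
    return response.json()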