|
|
|
@ -22,6 +22,7 @@ class FirecrawlApp:
|
|
|
|
"formats": ["markdown"],
|
|
|
|
"formats": ["markdown"],
|
|
|
|
"onlyMainContent": True,
|
|
|
|
"onlyMainContent": True,
|
|
|
|
"timeout": 30000,
|
|
|
|
"timeout": 30000,
|
|
|
|
|
|
|
|
"integration": "dify",
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if params:
|
|
|
|
if params:
|
|
|
|
json_data.update(params)
|
|
|
|
json_data.update(params)
|
|
|
|
@ -39,7 +40,7 @@ class FirecrawlApp:
|
|
|
|
def crawl_url(self, url, params=None) -> str:
|
|
|
|
def crawl_url(self, url, params=None) -> str:
|
|
|
|
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
|
|
|
|
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/crawl-post
|
|
|
|
headers = self._prepare_headers()
|
|
|
|
headers = self._prepare_headers()
|
|
|
|
json_data = {"url": url}
|
|
|
|
json_data = {"url": url, "integration": "dify"}
|
|
|
|
if params:
|
|
|
|
if params:
|
|
|
|
json_data.update(params)
|
|
|
|
json_data.update(params)
|
|
|
|
response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers)
|
|
|
|
response = self._post_request(f"{self.base_url}/v1/crawl", json_data, headers)
|
|
|
|
@ -49,7 +50,6 @@ class FirecrawlApp:
|
|
|
|
return cast(str, job_id)
|
|
|
|
return cast(str, job_id)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
self._handle_error(response, "start crawl job")
|
|
|
|
self._handle_error(response, "start crawl job")
|
|
|
|
# FIXME: unreachable code for mypy
|
|
|
|
|
|
|
|
return "" # unreachable
|
|
|
|
return "" # unreachable
|
|
|
|
|
|
|
|
|
|
|
|
def check_crawl_status(self, job_id) -> dict[str, Any]:
|
|
|
|
def check_crawl_status(self, job_id) -> dict[str, Any]:
|
|
|
|
@ -82,7 +82,6 @@ class FirecrawlApp:
|
|
|
|
)
|
|
|
|
)
|
|
|
|
else:
|
|
|
|
else:
|
|
|
|
self._handle_error(response, "check crawl status")
|
|
|
|
self._handle_error(response, "check crawl status")
|
|
|
|
# FIXME: unreachable code for mypy
|
|
|
|
|
|
|
|
return {} # unreachable
|
|
|
|
return {} # unreachable
|
|
|
|
|
|
|
|
|
|
|
|
def _format_crawl_status_response(
|
|
|
|
def _format_crawl_status_response(
|
|
|
|
@ -126,4 +125,31 @@ class FirecrawlApp:
|
|
|
|
|
|
|
|
|
|
|
|
def _handle_error(self, response, action) -> None:
|
|
|
|
def _handle_error(self, response, action) -> None:
|
|
|
|
error_message = response.json().get("error", "Unknown error occurred")
|
|
|
|
error_message = response.json().get("error", "Unknown error occurred")
|
|
|
|
raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}")
|
|
|
|
raise Exception(f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}") # type: ignore[return]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search(self, query: str, params: dict[str, Any] | None = None) -> dict[str, Any]:
|
|
|
|
|
|
|
|
# Documentation: https://docs.firecrawl.dev/api-reference/endpoint/search
|
|
|
|
|
|
|
|
headers = self._prepare_headers()
|
|
|
|
|
|
|
|
json_data = {
|
|
|
|
|
|
|
|
"query": query,
|
|
|
|
|
|
|
|
"limit": 5,
|
|
|
|
|
|
|
|
"lang": "en",
|
|
|
|
|
|
|
|
"country": "us",
|
|
|
|
|
|
|
|
"timeout": 60000,
|
|
|
|
|
|
|
|
"ignoreInvalidURLs": False,
|
|
|
|
|
|
|
|
"scrapeOptions": {},
|
|
|
|
|
|
|
|
"integration": "dify",
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if params:
|
|
|
|
|
|
|
|
json_data.update(params)
|
|
|
|
|
|
|
|
response = self._post_request(f"{self.base_url}/v1/search", json_data, headers)
|
|
|
|
|
|
|
|
if response.status_code == 200:
|
|
|
|
|
|
|
|
response_data = response.json()
|
|
|
|
|
|
|
|
if not response_data.get("success"):
|
|
|
|
|
|
|
|
raise Exception(f"Search failed. Error: {response_data.get('warning', 'Unknown error')}")
|
|
|
|
|
|
|
|
return cast(dict[str, Any], response_data)
|
|
|
|
|
|
|
|
elif response.status_code in {402, 409, 500, 429, 408}:
|
|
|
|
|
|
|
|
self._handle_error(response, "perform search")
|
|
|
|
|
|
|
|
return {} # Avoid additional exception after handling error
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
raise Exception(f"Failed to perform search. Status code: {response.status_code}")
|
|
|
|
|