diff --git a/api/core/workflow/nodes/http_request/executor.py b/api/core/workflow/nodes/http_request/executor.py index 8ac1ae8526..57553111ca 100644 --- a/api/core/workflow/nodes/http_request/executor.py +++ b/api/core/workflow/nodes/http_request/executor.py @@ -12,6 +12,7 @@ from json_repair import repair_json from configs import dify_config from core.file import file_manager +from core.file.enums import FileTransferMethod from core.helper import ssrf_proxy from core.variables.segments import ArrayFileSegment, FileSegment from core.workflow.entities.variable_pool import VariablePool @@ -227,28 +228,32 @@ class Executor: files: dict[str, list[tuple[str | None, bytes, str]]] = {} for key, files_in_segment in files_list: for file in files_in_segment: - if file.related_id is not None: + if file.related_id is not None or file.transfer_method == FileTransferMethod.REMOTE_URL: + downloaded_file = file_manager.download(file) file_tuple = ( file.filename, - file_manager.download(file), + downloaded_file, file.mime_type or "application/octet-stream", ) if key not in files: files[key] = [] files[key].append(file_tuple) - # convert files to list for httpx request + # convert files dictionary to list for httpx request + self.files = [] + for key, file_tuples in files.items(): + for file_tuple in file_tuples: + self.files.append((key, file_tuple)) + # If there are no actual files, we still need to force httpx to use `multipart/form-data`. # This is achieved by inserting a harmless placeholder file that will be ignored by the server. - if not files: + if not self.files: self.files = [("__multipart_placeholder__", ("", b"", "application/octet-stream"))] - if files: - self.files = [] - for key, file_tuples in files.items(): - for file_tuple in file_tuples: - self.files.append((key, file_tuple)) - self.data = form_data + # Only set self.data if there are text fields. If only files are present, self.data should be None, + # which tells httpx to build the request body from self.files. + if form_data: + self.data = form_data def _assembling_headers(self) -> dict[str, Any]: authorization = deepcopy(self.auth) @@ -392,7 +397,7 @@ class Executor: body_string += content.decode("utf-8") except UnicodeDecodeError: # fix: decode binary content - pass + body_string += f"[Binary data of {len(content)} bytes]" body_string += "\r\n" body_string += f"--{boundary}--\r\n" elif self.node_data.body: diff --git a/api/factories/file_factory.py b/api/factories/file_factory.py index 512a9cb608..b6ceb60178 100644 --- a/api/factories/file_factory.py +++ b/api/factories/file_factory.py @@ -1,9 +1,9 @@ import mimetypes +import re import uuid from collections.abc import Callable, Mapping, Sequence -from typing import Any, cast +from typing import Any -import httpx from sqlalchemy import select from sqlalchemy.orm import Session @@ -220,10 +220,16 @@ def _build_from_remote_url( mime_type, filename, file_size = _get_remote_file_info(url) extension = mimetypes.guess_extension(mime_type) or ("." + filename.split(".")[-1] if "." in filename else ".bin") - file_type = _standardize_file_type(extension=extension, mime_type=mime_type) - if file_type.value != mapping.get("type", "custom"): + specified_type = mapping.get("type") + detected_file_type = _standardize_file_type(extension=extension, mime_type=mime_type) + + if strict_type_validation and specified_type and detected_file_type.value != specified_type: raise ValueError("Detected file type does not match the specified type. Please verify the file.") + file_type = ( + FileType(specified_type) if specified_type and specified_type != FileType.CUSTOM.value else detected_file_type + ) + return File( id=mapping.get("id"), filename=filename, @@ -244,12 +250,29 @@ def _get_remote_file_info(url: str): mime_type = mimetypes.guess_type(filename)[0] or "" resp = ssrf_proxy.head(url, follow_redirects=True) - resp = cast(httpx.Response, resp) - if resp.status_code == httpx.codes.OK: - if content_disposition := resp.headers.get("Content-Disposition"): - filename = str(content_disposition.split("filename=")[-1].strip('"')) - file_size = int(resp.headers.get("Content-Length", file_size)) - mime_type = mime_type or str(resp.headers.get("Content-Type", "")) + if resp.status_code >= 400: + raise ValueError(f"Failed to fetch remote file info from {url}, status code: {resp.status_code}") + + content_disposition = resp.headers.get("Content-Disposition", "") + if content_disposition: + # Use regex to parse filename from content-disposition header + # RFC 2616, Section 19.5.1 + filename_match = re.search(r'filename="([^"]+)"', content_disposition) + if filename_match: + filename = filename_match.group(1) + + if not filename: + filename = url.split("/")[-1].split("?")[0] or "unknown_file" + + if not mime_type: + mime_type = mimetypes.guess_type(filename)[0] or "application/octet-stream" + else: + # strip charset or other parameters from mime type + mime_type = mime_type.split(";")[0].strip() + + content_length = resp.headers.get("Content-Length") + if content_length and content_length.isdigit(): + file_size = int(content_length) return mime_type, filename, file_size diff --git a/docker/docker/volumes/sandbox/conf/config.yaml b/docker/docker/volumes/sandbox/conf/config.yaml new file mode 100644 index 0000000000..e69de29bb2