From c34409cf5898e45226b1b746791d21b06b4eb06f Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Sat, 5 Jul 2025 16:45:58 -0400 Subject: [PATCH 1/3] fix(http): support remote_url for file in http_request node Fixes #21913 --- .../workflow/nodes/http_request/executor.py | 27 ++++++----- api/factories/file_factory.py | 46 ++++++++++++++----- .../factories/test_build_from_mapping.py | 4 +- 3 files changed, 53 insertions(+), 24 deletions(-) diff --git a/api/core/workflow/nodes/http_request/executor.py b/api/core/workflow/nodes/http_request/executor.py index 2c83b00d4a..e8d0e060bd 100644 --- a/api/core/workflow/nodes/http_request/executor.py +++ b/api/core/workflow/nodes/http_request/executor.py @@ -11,6 +11,7 @@ import httpx from configs import dify_config from core.file import file_manager +from core.file.enums import FileTransferMethod from core.helper import ssrf_proxy from core.variables.segments import ArrayFileSegment, FileSegment from core.workflow.entities.variable_pool import VariablePool @@ -225,28 +226,32 @@ class Executor: files: dict[str, list[tuple[str | None, bytes, str]]] = {} for key, files_in_segment in files_list: for file in files_in_segment: - if file.related_id is not None: + if file.related_id is not None or file.transfer_method == FileTransferMethod.REMOTE_URL: + downloaded_file = file_manager.download(file) file_tuple = ( file.filename, - file_manager.download(file), + downloaded_file, file.mime_type or "application/octet-stream", ) if key not in files: files[key] = [] files[key].append(file_tuple) - # convert files to list for httpx request + # convert files dictionary to list for httpx request + self.files = [] + for key, file_tuples in files.items(): + for file_tuple in file_tuples: + self.files.append((key, file_tuple)) + # If there are no actual files, we still need to force httpx to use `multipart/form-data`. # This is achieved by inserting a harmless placeholder file that will be ignored by the server. - if not files: + if not self.files: self.files = [("__multipart_placeholder__", ("", b"", "application/octet-stream"))] - if files: - self.files = [] - for key, file_tuples in files.items(): - for file_tuple in file_tuples: - self.files.append((key, file_tuple)) - self.data = form_data + # Only set self.data if there are text fields. If only files are present, self.data should be None, + # which tells httpx to build the request body from self.files. + if form_data: + self.data = form_data def _assembling_headers(self) -> dict[str, Any]: authorization = deepcopy(self.auth) @@ -390,7 +395,7 @@ class Executor: body_string += content.decode("utf-8") except UnicodeDecodeError: # fix: decode binary content - pass + body_string += f"[Binary data of {len(content)} bytes]" body_string += "\r\n" body_string += f"--{boundary}--\r\n" elif self.node_data.body: diff --git a/api/factories/file_factory.py b/api/factories/file_factory.py index 25d1390492..07b79a4d6f 100644 --- a/api/factories/file_factory.py +++ b/api/factories/file_factory.py @@ -1,9 +1,9 @@ import mimetypes +import re import uuid from collections.abc import Callable, Mapping, Sequence -from typing import Any, cast +from typing import Any -import httpx from sqlalchemy import select from sqlalchemy.orm import Session @@ -224,10 +224,16 @@ def _build_from_remote_url( mime_type, filename, file_size = _get_remote_file_info(url) extension = mimetypes.guess_extension(mime_type) or ("." + filename.split(".")[-1] if "." in filename else ".bin") - file_type = _standardize_file_type(extension=extension, mime_type=mime_type) - if file_type.value != mapping.get("type", "custom"): + specified_type = mapping.get("type") + detected_file_type = _standardize_file_type(extension=extension, mime_type=mime_type) + + if strict_type_validation and specified_type and detected_file_type.value != specified_type: raise ValueError("Detected file type does not match the specified type. Please verify the file.") + file_type = ( + FileType(specified_type) if specified_type and specified_type != FileType.CUSTOM.value else detected_file_type + ) + return File( id=mapping.get("id"), filename=filename, @@ -244,16 +250,32 @@ def _build_from_remote_url( def _get_remote_file_info(url: str): file_size = -1 - filename = url.split("/")[-1].split("?")[0] or "unknown_file" - mime_type = mimetypes.guess_type(filename)[0] or "" + filename = "" resp = ssrf_proxy.head(url, follow_redirects=True) - resp = cast(httpx.Response, resp) - if resp.status_code == httpx.codes.OK: - if content_disposition := resp.headers.get("Content-Disposition"): - filename = str(content_disposition.split("filename=")[-1].strip('"')) - file_size = int(resp.headers.get("Content-Length", file_size)) - mime_type = mime_type or str(resp.headers.get("Content-Type", "")) + resp.raise_for_status() + + content_disposition = resp.headers.get("Content-Disposition") + if content_disposition: + # Use regex to parse filename from content-disposition header + # RFC 2616, Section 19.5.1 + filename_match = re.search(r'filename="([^"]+)"', content_disposition) + if filename_match: + filename = filename_match.group(1) + + if not filename: + filename = url.split("/")[-1].split("?")[0] or "unknown_file" + + mime_type = resp.headers.get("Content-Type", "") + if not mime_type: + mime_type = mimetypes.guess_type(filename)[0] or "application/octet-stream" + else: + # strip charset or other parameters from mime type + mime_type = mime_type.split(";")[0].strip() + + content_length = resp.headers.get("Content-Length") + if content_length and content_length.isdigit(): + file_size = int(content_length) return mime_type, filename, file_size diff --git a/api/tests/unit_tests/factories/test_build_from_mapping.py b/api/tests/unit_tests/factories/test_build_from_mapping.py index 48463a369e..53c55f6acb 100644 --- a/api/tests/unit_tests/factories/test_build_from_mapping.py +++ b/api/tests/unit_tests/factories/test_build_from_mapping.py @@ -62,7 +62,7 @@ def mock_tool_file(): @pytest.fixture def mock_http_head(): def _mock_response(filename, size, content_type): - return Response( + response = Response( status_code=200, headers={ "Content-Disposition": f'attachment; filename="{filename}"', @@ -70,6 +70,8 @@ def mock_http_head(): "Content-Type": content_type, }, ) + response.request = MagicMock() + return response with patch("factories.file_factory.ssrf_proxy.head") as mock_head: mock_head.return_value = _mock_response("remote_test.jpg", 2048, "image/jpeg") From 3d2f46a99f91f1b0cc042ff24d093514c3e938a3 Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Mon, 7 Jul 2025 14:05:19 -0400 Subject: [PATCH 2/3] =?UTF-8?q?Revert(test):=20=E6=92=A4=E9=94=80=E5=AF=B9?= =?UTF-8?q?=20test=5Fbuild=5Ffrom=5Fmapping.py=20=E7=9A=84=E4=BF=AE?= =?UTF-8?q?=E6=94=B9?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/tests/unit_tests/factories/test_build_from_mapping.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/api/tests/unit_tests/factories/test_build_from_mapping.py b/api/tests/unit_tests/factories/test_build_from_mapping.py index 53c55f6acb..48463a369e 100644 --- a/api/tests/unit_tests/factories/test_build_from_mapping.py +++ b/api/tests/unit_tests/factories/test_build_from_mapping.py @@ -62,7 +62,7 @@ def mock_tool_file(): @pytest.fixture def mock_http_head(): def _mock_response(filename, size, content_type): - response = Response( + return Response( status_code=200, headers={ "Content-Disposition": f'attachment; filename="{filename}"', @@ -70,8 +70,6 @@ def mock_http_head(): "Content-Type": content_type, }, ) - response.request = MagicMock() - return response with patch("factories.file_factory.ssrf_proxy.head") as mock_head: mock_head.return_value = _mock_response("remote_test.jpg", 2048, "image/jpeg") From 568ef45d74144b562e6176f4950406276f7d0f5d Mon Sep 17 00:00:00 2001 From: baonudesifeizhai Date: Mon, 7 Jul 2025 16:02:31 -0400 Subject: [PATCH 3/3] fix(file): Handle remote file response and fix CI --- api/factories/file_factory.py | 9 +++++---- docker/docker/volumes/sandbox/conf/config.yaml | 0 2 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 docker/docker/volumes/sandbox/conf/config.yaml diff --git a/api/factories/file_factory.py b/api/factories/file_factory.py index 07b79a4d6f..5de1065214 100644 --- a/api/factories/file_factory.py +++ b/api/factories/file_factory.py @@ -250,12 +250,14 @@ def _build_from_remote_url( def _get_remote_file_info(url: str): file_size = -1 - filename = "" + filename = url.split("/")[-1].split("?")[0] or "unknown_file" + mime_type = mimetypes.guess_type(filename)[0] or "" resp = ssrf_proxy.head(url, follow_redirects=True) - resp.raise_for_status() + if resp.status_code >= 400: + raise ValueError(f"Failed to fetch remote file info from {url}, status code: {resp.status_code}") - content_disposition = resp.headers.get("Content-Disposition") + content_disposition = resp.headers.get("Content-Disposition", "") if content_disposition: # Use regex to parse filename from content-disposition header # RFC 2616, Section 19.5.1 @@ -266,7 +268,6 @@ def _get_remote_file_info(url: str): if not filename: filename = url.split("/")[-1].split("?")[0] or "unknown_file" - mime_type = resp.headers.get("Content-Type", "") if not mime_type: mime_type = mimetypes.guess_type(filename)[0] or "application/octet-stream" else: diff --git a/docker/docker/volumes/sandbox/conf/config.yaml b/docker/docker/volumes/sandbox/conf/config.yaml new file mode 100644 index 0000000000..e69de29bb2