Merge 568ef45d74 into bd43ca6275

7 months ago · f3dbfdf0dc
parent bd43ca6275 568ef45d74
commit f3dbfdf0dc
3 changed files with 49 additions and 21 deletions
--- a/api/core/workflow/nodes/http_request/executor.py
+++ b/api/core/workflow/nodes/http_request/executor.py
@@ -12,6 +12,7 @@ from json_repair import repair_json

 from configs import dify_config
 from core.file import file_manager
+from core.file.enums import FileTransferMethod
 from core.helper import ssrf_proxy
 from core.variables.segments import ArrayFileSegment, FileSegment
 from core.workflow.entities.variable_pool import VariablePool
@@ -227,28 +228,32 @@ class Executor:
                    files: dict[str, list[tuple[str | None, bytes, str]]] = {}
                    for key, files_in_segment in files_list:
                        for file in files_in_segment:
-                            if file.related_id is not None:
+                            if file.related_id is not None or file.transfer_method == FileTransferMethod.REMOTE_URL:
+                                downloaded_file = file_manager.download(file)
                                file_tuple = (
                                    file.filename,
-                                    file_manager.download(file),
+                                    downloaded_file,
                                    file.mime_type or "application/octet-stream",
                                )
                                if key not in files:
                                    files[key] = []
                                files[key].append(file_tuple)

-                    # convert files to list for httpx request
+                    # convert files dictionary to list for httpx request
+                    self.files = []
+                    for key, file_tuples in files.items():
+                        for file_tuple in file_tuples:
+                            self.files.append((key, file_tuple))
+
                    # If there are no actual files, we still need to force httpx to use `multipart/form-data`.
                    # This is achieved by inserting a harmless placeholder file that will be ignored by the server.
-                    if not files:
+                    if not self.files:
                        self.files = [("__multipart_placeholder__", ("", b"", "application/octet-stream"))]
-                    if files:
-                        self.files = []
-                        for key, file_tuples in files.items():
-                            for file_tuple in file_tuples:
-                                self.files.append((key, file_tuple))

-                    self.data = form_data
+                    # Only set self.data if there are text fields. If only files are present, self.data should be None,
+                    # which tells httpx to build the request body from self.files.
+                    if form_data:
+                        self.data = form_data

    def _assembling_headers(self) -> dict[str, Any]:
        authorization = deepcopy(self.auth)
@@ -392,7 +397,7 @@ class Executor:
                    body_string += content.decode("utf-8")
                except UnicodeDecodeError:
                    # fix: decode binary content
-                    pass
+                    body_string += f"[Binary data of {len(content)} bytes]"
                body_string += "\r\n"
            body_string += f"--{boundary}--\r\n"
        elif self.node_data.body:
--- a/api/factories/file_factory.py
+++ b/api/factories/file_factory.py
@@ -1,9 +1,9 @@
 import mimetypes
+import re
 import uuid
 from collections.abc import Callable, Mapping, Sequence
-from typing import Any, cast
+from typing import Any

-import httpx
 from sqlalchemy import select
 from sqlalchemy.orm import Session

@@ -220,10 +220,16 @@ def _build_from_remote_url(
    mime_type, filename, file_size = _get_remote_file_info(url)
    extension = mimetypes.guess_extension(mime_type) or ("." + filename.split(".")[-1] if "." in filename else ".bin")

-    file_type = _standardize_file_type(extension=extension, mime_type=mime_type)
-    if file_type.value != mapping.get("type", "custom"):
+    specified_type = mapping.get("type")
+    detected_file_type = _standardize_file_type(extension=extension, mime_type=mime_type)
+
+    if strict_type_validation and specified_type and detected_file_type.value != specified_type:
        raise ValueError("Detected file type does not match the specified type. Please verify the file.")

+    file_type = (
+        FileType(specified_type) if specified_type and specified_type != FileType.CUSTOM.value else detected_file_type
+    )
+
    return File(
        id=mapping.get("id"),
        filename=filename,
@@ -244,12 +250,29 @@ def _get_remote_file_info(url: str):
    mime_type = mimetypes.guess_type(filename)[0] or ""

    resp = ssrf_proxy.head(url, follow_redirects=True)
-    resp = cast(httpx.Response, resp)
-    if resp.status_code == httpx.codes.OK:
-        if content_disposition := resp.headers.get("Content-Disposition"):
-            filename = str(content_disposition.split("filename=")[-1].strip('"'))
-        file_size = int(resp.headers.get("Content-Length", file_size))
-        mime_type = mime_type or str(resp.headers.get("Content-Type", ""))
+    if resp.status_code >= 400:
+        raise ValueError(f"Failed to fetch remote file info from {url}, status code: {resp.status_code}")
+
+    content_disposition = resp.headers.get("Content-Disposition", "")
+    if content_disposition:
+        # Use regex to parse filename from content-disposition header
+        # RFC 2616, Section 19.5.1
+        filename_match = re.search(r'filename="([^"]+)"', content_disposition)
+        if filename_match:
+            filename = filename_match.group(1)
+
+    if not filename:
+        filename = url.split("/")[-1].split("?")[0] or "unknown_file"
+
+    if not mime_type:
+        mime_type = mimetypes.guess_type(filename)[0] or "application/octet-stream"
+    else:
+        # strip charset or other parameters from mime type
+        mime_type = mime_type.split(";")[0].strip()
+
+    content_length = resp.headers.get("Content-Length")
+    if content_length and content_length.isdigit():
+        file_size = int(content_length)

    return mime_type, filename, file_size

--- a/docker/docker/volumes/sandbox/conf/config.yaml
+++ b/docker/docker/volumes/sandbox/conf/config.yaml