pull/21960/merge
baonudesifeizhai 7 months ago committed by GitHub
commit f3dbfdf0dc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -12,6 +12,7 @@ from json_repair import repair_json
from configs import dify_config
from core.file import file_manager
from core.file.enums import FileTransferMethod
from core.helper import ssrf_proxy
from core.variables.segments import ArrayFileSegment, FileSegment
from core.workflow.entities.variable_pool import VariablePool
@ -227,28 +228,32 @@ class Executor:
files: dict[str, list[tuple[str | None, bytes, str]]] = {}
for key, files_in_segment in files_list:
for file in files_in_segment:
if file.related_id is not None:
if file.related_id is not None or file.transfer_method == FileTransferMethod.REMOTE_URL:
downloaded_file = file_manager.download(file)
file_tuple = (
file.filename,
file_manager.download(file),
downloaded_file,
file.mime_type or "application/octet-stream",
)
if key not in files:
files[key] = []
files[key].append(file_tuple)
# convert files to list for httpx request
# convert files dictionary to list for httpx request
self.files = []
for key, file_tuples in files.items():
for file_tuple in file_tuples:
self.files.append((key, file_tuple))
# If there are no actual files, we still need to force httpx to use `multipart/form-data`.
# This is achieved by inserting a harmless placeholder file that will be ignored by the server.
if not files:
if not self.files:
self.files = [("__multipart_placeholder__", ("", b"", "application/octet-stream"))]
if files:
self.files = []
for key, file_tuples in files.items():
for file_tuple in file_tuples:
self.files.append((key, file_tuple))
self.data = form_data
# Only set self.data if there are text fields. If only files are present, self.data should be None,
# which tells httpx to build the request body from self.files.
if form_data:
self.data = form_data
def _assembling_headers(self) -> dict[str, Any]:
authorization = deepcopy(self.auth)
@ -392,7 +397,7 @@ class Executor:
body_string += content.decode("utf-8")
except UnicodeDecodeError:
# Content is not valid UTF-8 (e.g. a binary payload), so it cannot be appended to the log as text.
pass
body_string += f"[Binary data of {len(content)} bytes]"
body_string += "\r\n"
body_string += f"--{boundary}--\r\n"
elif self.node_data.body:

@ -1,9 +1,9 @@
import mimetypes
import re
import uuid
from collections.abc import Callable, Mapping, Sequence
from typing import Any, cast
from typing import Any
import httpx
from sqlalchemy import select
from sqlalchemy.orm import Session
@ -220,10 +220,16 @@ def _build_from_remote_url(
mime_type, filename, file_size = _get_remote_file_info(url)
extension = mimetypes.guess_extension(mime_type) or ("." + filename.split(".")[-1] if "." in filename else ".bin")
file_type = _standardize_file_type(extension=extension, mime_type=mime_type)
if file_type.value != mapping.get("type", "custom"):
specified_type = mapping.get("type")
detected_file_type = _standardize_file_type(extension=extension, mime_type=mime_type)
if strict_type_validation and specified_type and detected_file_type.value != specified_type:
raise ValueError("Detected file type does not match the specified type. Please verify the file.")
file_type = (
FileType(specified_type) if specified_type and specified_type != FileType.CUSTOM.value else detected_file_type
)
return File(
id=mapping.get("id"),
filename=filename,
@ -244,12 +250,29 @@ def _get_remote_file_info(url: str):
mime_type = mimetypes.guess_type(filename)[0] or ""
resp = ssrf_proxy.head(url, follow_redirects=True)
resp = cast(httpx.Response, resp)
if resp.status_code == httpx.codes.OK:
if content_disposition := resp.headers.get("Content-Disposition"):
filename = str(content_disposition.split("filename=")[-1].strip('"'))
file_size = int(resp.headers.get("Content-Length", file_size))
mime_type = mime_type or str(resp.headers.get("Content-Type", ""))
if resp.status_code >= 400:
raise ValueError(f"Failed to fetch remote file info from {url}, status code: {resp.status_code}")
content_disposition = resp.headers.get("Content-Disposition", "")
if content_disposition:
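# Extract the filename from the Content-Disposition header
# (RFC 6266, which supersedes RFC 2616, Section 19.5.1).
# Only the quoted form filename="..." is matched; unquoted values and
# RFC 5987 extended `filename*=` parameters are not parsed.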
filename_match = re.search(r'filename="([^"]+)"', content_disposition)
if filename_match:
filename = filename_match.group(1)
if not filename:
filename = url.split("/")[-1].split("?")[0] or "unknown_file"
if not mime_type:
mime_type = mimetypes.guess_type(filename)[0] or "application/octet-stream"
else:
# strip charset or other parameters from mime type
mime_type = mime_type.split(";")[0].strip()
content_length = resp.headers.get("Content-Length")
if content_length and content_length.isdigit():
file_size = int(content_length)
return mime_type, filename, file_size

Loading…
Cancel
Save