fix: prevent timeout in file encoding detection for large files

- Add sample_size parameter to detect_file_encodings function
- Read only a sample of the file instead of the entire file for encoding detection
- This prevents timeout errors on large files while maintaining accurate encoding detection
- The default sample size is 1 MiB (1024 * 1024 bytes), which is sufficient for most encoding detection scenarios

Fixes #21327
pull/21453/head
baonudesifeizhai 11 months ago
parent 8ea27bc341
commit b2cb7b5f52

@@ -1,7 +1,7 @@
"""Document loader helpers.""" """Document loader helpers."""
import concurrent.futures import concurrent.futures
from pathlib import Path import os
from typing import NamedTuple, Optional, cast from typing import NamedTuple, Optional, cast
@@ -16,7 +16,7 @@ class FileEncoding(NamedTuple):
"""The language of the file.""" """The language of the file."""
def detect_file_encodings(file_path: str, timeout: int = 5) -> list[FileEncoding]: def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1024 * 1024) -> list[FileEncoding]:
"""Try to detect the file encoding. """Try to detect the file encoding.
Returns a list of `FileEncoding` tuples with the detected encodings ordered Returns a list of `FileEncoding` tuples with the detected encodings ordered
@@ -25,11 +25,16 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> list[FileEncoding
Args: Args:
file_path: The path to the file to detect the encoding for. file_path: The path to the file to detect the encoding for.
timeout: The timeout in seconds for the encoding detection. timeout: The timeout in seconds for the encoding detection.
sample_size: The number of bytes to read for encoding detection. Default is 1MB.
For large files, reading only a sample is sufficient and prevents timeout.
""" """
import chardet import chardet
def read_and_detect(file_path: str) -> list[dict]: def read_and_detect(file_path: str) -> list[dict]:
rawdata = Path(file_path).read_bytes() with open(file_path, "rb") as f:
# Read only a sample of the file for encoding detection
# This prevents timeout on large files while still providing accurate encoding detection
rawdata = f.read(sample_size)
return cast(list[dict], chardet.detect_all(rawdata)) return cast(list[dict], chardet.detect_all(rawdata))
with concurrent.futures.ThreadPoolExecutor() as executor: with concurrent.futures.ThreadPoolExecutor() as executor:

@@ -36,8 +36,16 @@ class TextExtractor(BaseExtractor):
break break
except UnicodeDecodeError: except UnicodeDecodeError:
continue continue
else:
raise RuntimeError(
f"Decode failed: {self._file_path}, all detected encodings failed. "
f"Original error: {e}"
)
else: else:
raise RuntimeError(f"Error loading {self._file_path}") from e raise RuntimeError(
f"Decode failed: {self._file_path}, specified encoding failed. "
f"Original error: {e}"
)
except Exception as e: except Exception as e:
raise RuntimeError(f"Error loading {self._file_path}") from e raise RuntimeError(f"Error loading {self._file_path}") from e

Loading…
Cancel
Save