fix: prevent timeout in file encoding detection for large files

- Add a `sample_size` parameter to the `detect_file_encodings` function.
- Read only a sample of the file, instead of the entire file, for encoding detection.
- This prevents timeout errors on large files while maintaining accurate encoding detection.
- The default sample size is 1 MiB (1024 * 1024 bytes), which is sufficient for most encoding detection scenarios.

Fixes #21327
11 months ago · b2cb7b5f52
parent 8ea27bc341
commit b2cb7b5f52
2 changed files with 17 additions and 4 deletions
--- a/api/core/rag/extractor/helpers.py
+++ b/api/core/rag/extractor/helpers.py
@ -1,7 +1,7 @@
 """Document loader helpers."""
 import concurrent.futures
-from pathlib import Path
+import os
 from typing import NamedTuple, Optional, cast
@ -16,7 +16,7 @@ class FileEncoding(NamedTuple):
    """The language of the file."""
-def detect_file_encodings(file_path: str, timeout: int = 5) -> list[FileEncoding]:
+def detect_file_encodings(file_path: str, timeout: int = 5, sample_size: int = 1024 * 1024) -> list[FileEncoding]:
    """Try to detect the file encoding.
    Returns a list of `FileEncoding` tuples with the detected encodings ordered
@ -25,11 +25,16 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> list[FileEncoding
    Args:
        file_path: The path to the file to detect the encoding for.
        timeout: The timeout in seconds for the encoding detection.
        sample_size: The number of bytes to read for encoding detection. Default is 1MB.
                    For large files, reading only a sample is sufficient and prevents timeout.
    """
    import chardet
    def read_and_detect(file_path: str) -> list[dict]:
-        rawdata = Path(file_path).read_bytes()
+        with open(file_path, "rb") as f:
            # Read only a sample of the file for encoding detection
            # This prevents timeout on large files while still providing accurate encoding detection
            rawdata = f.read(sample_size)
        return cast(list[dict], chardet.detect_all(rawdata))
    with concurrent.futures.ThreadPoolExecutor() as executor:
--- a/api/core/rag/extractor/text_extractor.py
+++ b/api/core/rag/extractor/text_extractor.py
@ -36,8 +36,16 @@ class TextExtractor(BaseExtractor):
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    raise RuntimeError(
                        f"Decode failed: {self._file_path}, all detected encodings failed. "
                        f"Original error: {e}"
                    )
            else:
-                raise RuntimeError(f"Error loading {self._file_path}") from e
+                raise RuntimeError(
                    f"Decode failed: {self._file_path}, specified encoding failed. "
                    f"Original error: {e}"
                )
        except Exception as e:
            raise RuntimeError(f"Error loading {self._file_path}") from e