@ -1,7 +1,7 @@
""" Document loader helpers. """
""" Document loader helpers. """
import concurrent . futures
import concurrent . futures
from pathlib import Path
import os
from typing import NamedTuple , Optional , cast
from typing import NamedTuple , Optional , cast
@ -16,7 +16,7 @@ class FileEncoding(NamedTuple):
""" The language of the file. """
""" The language of the file. """
def detect_file_encodings ( file_path : str , timeout : int = 5 ) - > list [ FileEncoding ] :
def detect_file_encodings ( file_path : str , timeout : int = 5 , sample_size : int = 1024 * 1024 ) - > list [ FileEncoding ] :
""" Try to detect the file encoding.
""" Try to detect the file encoding.
Returns a list of ` FileEncoding ` tuples with the detected encodings ordered
Returns a list of ` FileEncoding ` tuples with the detected encodings ordered
@ -25,11 +25,16 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> list[FileEncoding
Args :
Args :
file_path : The path to the file to detect the encoding for .
file_path : The path to the file to detect the encoding for .
timeout : The timeout in seconds for the encoding detection .
timeout : The timeout in seconds for the encoding detection .
sample_size : The maximum number of bytes to read from the start of the file for encoding detection . Defaults to 1 MiB ( 1024 * 1024 bytes ) .
For large files , reading only a leading sample is usually sufficient and helps avoid hitting the timeout .
"""
"""
import chardet
import chardet
# Worker that loads bytes from file_path and returns chardet's candidate encodings.
# NOTE(review): this span looks like patch text with old and new lines interleaved
# (no +/- markers), so the duplicated lines below are kept exactly as found.
def read_and_detect ( file_path : str ) - > list [ dict ] :
def read_and_detect ( file_path : str ) - > list [ dict ] :
# NOTE(review): presumably the pre-change line -- reads the entire file into memory.
rawdata = Path ( file_path ) . read_bytes ( )
# NOTE(review): presumably the replacement -- opens the file in binary mode instead.
with open ( file_path , " rb " ) as f :
# Read at most sample_size bytes from the start of the file, so the amount of
# data handed to the detector stays bounded even for very large files.
# sample_size is taken from the enclosing function's parameters.
rawdata = f . read ( sample_size )
# cast() only informs the type checker; the value is whatever chardet.detect_all returns.
return cast ( list [ dict ] , chardet . detect_all ( rawdata ) )
return cast ( list [ dict ] , chardet . detect_all ( rawdata ) )
with concurrent . futures . ThreadPoolExecutor ( ) as executor :
with concurrent . futures . ThreadPoolExecutor ( ) as executor :