@ -7,6 +7,7 @@ import tempfile
from collections . abc import Mapping , Sequence
from typing import Any , cast
import chardet
import docx
import pandas as pd
import pypandoc # type: ignore
@ -180,25 +181,63 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
def _extract_text_from_plain_text ( file_content : bytes ) - > str :
try :
return file_content . decode ( " utf-8 " , " ignore " )
except UnicodeDecodeError as e :
raise TextExtractionError ( " Failed to decode plain text file " ) from e
# Detect encoding using chardet
result = chardet . detect ( file_content )
encoding = result [ " encoding " ]
# Fallback to utf-8 if detection fails
if not encoding :
encoding = " utf-8 "
return file_content . decode ( encoding , errors = " ignore " )
except ( UnicodeDecodeError , LookupError ) as e :
# If decoding fails, try with utf-8 as last resort
try :
return file_content . decode ( " utf-8 " , errors = " ignore " )
except UnicodeDecodeError :
raise TextExtractionError ( f " Failed to decode plain text file: { e } " ) from e
def _extract_text_from_json ( file_content : bytes ) - > str :
try :
json_data = json . loads ( file_content . decode ( " utf-8 " , " ignore " ) )
# Detect encoding using chardet
result = chardet . detect ( file_content )
encoding = result [ " encoding " ]
# Fallback to utf-8 if detection fails
if not encoding :
encoding = " utf-8 "
json_data = json . loads ( file_content . decode ( encoding , errors = " ignore " ) )
return json . dumps ( json_data , indent = 2 , ensure_ascii = False )
except ( UnicodeDecodeError , LookupError , json . JSONDecodeError ) as e :
# If decoding fails, try with utf-8 as last resort
try :
json_data = json . loads ( file_content . decode ( " utf-8 " , errors = " ignore " ) )
return json . dumps ( json_data , indent = 2 , ensure_ascii = False )
except ( UnicodeDecodeError , json . JSONDecodeError ) as e :
except ( UnicodeDecodeError , json . JSONDecodeError ) :
raise TextExtractionError ( f " Failed to decode or parse JSON file: { e } " ) from e
def _extract_text_from_yaml(file_content: bytes) -> str:
    """Extract the content from a YAML file.

    The bytes are decoded as strict UTF-8 (BOM-aware) when possible. Only
    when they are not valid UTF-8 is ``chardet`` asked for an encoding; in
    that case the chardet-based decoding is tried first and a lenient UTF-8
    decoding (undecodable bytes dropped) is kept as a last resort. Each
    distinct decoding is parsed at most once.

    Args:
        file_content: Raw bytes of the YAML file (may hold several documents).

    Returns:
        All documents re-dumped with ``allow_unicode=True`` and the original
        key order preserved (``sort_keys=False``).

    Raises:
        TextExtractionError: If none of the candidate decodings is valid YAML.
            The message and ``__cause__`` carry the first parse error.
    """
    candidates = []
    try:
        # "utf-8-sig" is plain UTF-8 that additionally strips a leading BOM.
        candidates.append(file_content.decode("utf-8-sig"))
    except UnicodeDecodeError:
        # chardet reports None as the encoding when detection fails.
        detected_encoding = chardet.detect(file_content)["encoding"]
        if detected_encoding:
            try:
                candidates.append(file_content.decode(detected_encoding, errors="ignore"))
            except (UnicodeDecodeError, LookupError):
                # LookupError: chardet named a codec Python does not provide.
                pass
        lenient_utf8 = file_content.decode("utf-8", errors="ignore")
        if lenient_utf8 not in candidates:
            candidates.append(lenient_utf8)

    first_error = None
    for text in candidates:
        try:
            # safe_load_all is consumed by dump_all below, so both calls
            # stay inside the same try block.
            yaml_data = yaml.safe_load_all(text)
            return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
        except yaml.YAMLError as e:
            if first_error is None:
                first_error = e

    raise TextExtractionError(f"Failed to decode or parse YAML file: {first_error}") from first_error
@ -338,7 +377,20 @@ def _extract_text_from_file(file: File):
def _extract_text_from_csv ( file_content : bytes ) - > str :
try :
csv_file = io . StringIO ( file_content . decode ( " utf-8 " , " ignore " ) )
# Detect encoding using chardet
result = chardet . detect ( file_content )
encoding = result [ " encoding " ]
# Fallback to utf-8 if detection fails
if not encoding :
encoding = " utf-8 "
try :
csv_file = io . StringIO ( file_content . decode ( encoding , errors = " ignore " ) )
except ( UnicodeDecodeError , LookupError ) :
# If decoding fails, try with utf-8 as last resort
csv_file = io . StringIO ( file_content . decode ( " utf-8 " , errors = " ignore " ) )
csv_reader = csv . reader ( csv_file )
rows = list ( csv_reader )