feat: add automatic encoding detection to document extractor

- Add chardet library for automatic encoding detection
- Update text extraction functions to detect encoding before decoding
- Improve handling of files with non-UTF-8 encodings
- Add fallback to UTF-8 when encoding detection fails or returns an
  unsupported codec name

Affected functions:
- _extract_text_from_plain_text()
- _extract_text_from_json()
- _extract_text_from_yaml()
- _extract_text_from_csv()

This change makes the document extractor more robust for files that are
not UTF-8 encoded (e.g., Latin-1, Windows-1252, GB2312): the encoding is
now detected per file instead of being assumed to be UTF-8.

Signed-off-by: -LAN- <laipz8200@outlook.com>
pull/20269/head
-LAN- 1 year ago
parent 55503ce771
commit 20bd51beaa
No known key found for this signature in database
GPG Key ID: 6BA0D108DED011FF

@ -7,6 +7,7 @@ import tempfile
from collections.abc import Mapping, Sequence from collections.abc import Mapping, Sequence
from typing import Any, cast from typing import Any, cast
import chardet
import docx import docx
import pandas as pd import pandas as pd
import pypandoc # type: ignore import pypandoc # type: ignore
@ -180,26 +181,64 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
def _extract_text_from_plain_text(file_content: bytes) -> str: def _extract_text_from_plain_text(file_content: bytes) -> str:
try: try:
return file_content.decode("utf-8", "ignore") # Detect encoding using chardet
except UnicodeDecodeError as e: result = chardet.detect(file_content)
raise TextExtractionError("Failed to decode plain text file") from e encoding = result["encoding"]
# Fallback to utf-8 if detection fails
if not encoding:
encoding = "utf-8"
return file_content.decode(encoding, errors="ignore")
except (UnicodeDecodeError, LookupError) as e:
# If decoding fails, try with utf-8 as last resort
try:
return file_content.decode("utf-8", errors="ignore")
except UnicodeDecodeError:
raise TextExtractionError(f"Failed to decode plain text file: {e}") from e
def _extract_text_from_json(file_content: bytes) -> str: def _extract_text_from_json(file_content: bytes) -> str:
try: try:
json_data = json.loads(file_content.decode("utf-8", "ignore")) # Detect encoding using chardet
result = chardet.detect(file_content)
encoding = result["encoding"]
# Fallback to utf-8 if detection fails
if not encoding:
encoding = "utf-8"
json_data = json.loads(file_content.decode(encoding, errors="ignore"))
return json.dumps(json_data, indent=2, ensure_ascii=False) return json.dumps(json_data, indent=2, ensure_ascii=False)
except (UnicodeDecodeError, json.JSONDecodeError) as e: except (UnicodeDecodeError, LookupError, json.JSONDecodeError) as e:
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e # If decoding fails, try with utf-8 as last resort
try:
json_data = json.loads(file_content.decode("utf-8", errors="ignore"))
return json.dumps(json_data, indent=2, ensure_ascii=False)
except (UnicodeDecodeError, json.JSONDecodeError):
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
def _extract_text_from_yaml(file_content: bytes) -> str: def _extract_text_from_yaml(file_content: bytes) -> str:
"""Extract the content from yaml file""" """Extract the content from yaml file"""
try: try:
yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore")) # Detect encoding using chardet
result = chardet.detect(file_content)
encoding = result["encoding"]
# Fallback to utf-8 if detection fails
if not encoding:
encoding = "utf-8"
yaml_data = yaml.safe_load_all(file_content.decode(encoding, errors="ignore"))
return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)) return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
except (UnicodeDecodeError, yaml.YAMLError) as e: except (UnicodeDecodeError, LookupError, yaml.YAMLError) as e:
raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e # If decoding fails, try with utf-8 as last resort
try:
yaml_data = yaml.safe_load_all(file_content.decode("utf-8", errors="ignore"))
return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
except (UnicodeDecodeError, yaml.YAMLError):
raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
def _extract_text_from_pdf(file_content: bytes) -> str: def _extract_text_from_pdf(file_content: bytes) -> str:
@ -338,7 +377,20 @@ def _extract_text_from_file(file: File):
def _extract_text_from_csv(file_content: bytes) -> str: def _extract_text_from_csv(file_content: bytes) -> str:
try: try:
csv_file = io.StringIO(file_content.decode("utf-8", "ignore")) # Detect encoding using chardet
result = chardet.detect(file_content)
encoding = result["encoding"]
# Fallback to utf-8 if detection fails
if not encoding:
encoding = "utf-8"
try:
csv_file = io.StringIO(file_content.decode(encoding, errors="ignore"))
except (UnicodeDecodeError, LookupError):
# If decoding fails, try with utf-8 as last resort
csv_file = io.StringIO(file_content.decode("utf-8", errors="ignore"))
csv_reader = csv.reader(csv_file) csv_reader = csv.reader(csv_file)
rows = list(csv_reader) rows = list(csv_reader)

Loading…
Cancel
Save