feat: add automatic encoding detection to document extractor

- Add chardet library for automatic encoding detection
- Update text extraction functions to detect encoding before decoding
- Improve handling of files with non-UTF-8 encodings
- Add fallback to UTF-8 when encoding detection fails

Affected functions:
- _extract_text_from_plain_text()
- _extract_text_from_json()
- _extract_text_from_yaml()
- _extract_text_from_csv()

This change makes the document extractor more robust when handling files with various character encodings (e.g., Latin-1, Windows-1252, GB2312) instead of assuming UTF-8 for all files.

Signed-off-by: -LAN- <laipz8200@outlook.com>
1 year ago · 20bd51beaa
parent 55503ce771
commit 20bd51beaa
1 changed files with 62 additions and 10 deletions
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -7,6 +7,7 @@ import tempfile
 from collections.abc import Mapping, Sequence
 from typing import Any, cast

+import chardet
 import docx
 import pandas as pd
 import pypandoc  # type: ignore
@@ -180,26 +181,64 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)

 def _extract_text_from_plain_text(file_content: bytes) -> str:
    try:
-        return file_content.decode("utf-8", "ignore")
-    except UnicodeDecodeError as e:
-        raise TextExtractionError("Failed to decode plain text file") from e
+        # Detect encoding using chardet
+        result = chardet.detect(file_content)
+        encoding = result["encoding"]
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        return file_content.decode(encoding, errors="ignore")
+    except (UnicodeDecodeError, LookupError) as e:
+        # If decoding fails, try with utf-8 as last resort
+        try:
+            return file_content.decode("utf-8", errors="ignore")
+        except UnicodeDecodeError:
+            raise TextExtractionError(f"Failed to decode plain text file: {e}") from e


 def _extract_text_from_json(file_content: bytes) -> str:
    try:
-        json_data = json.loads(file_content.decode("utf-8", "ignore"))
+        # Detect encoding using chardet
+        result = chardet.detect(file_content)
+        encoding = result["encoding"]
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        json_data = json.loads(file_content.decode(encoding, errors="ignore"))
        return json.dumps(json_data, indent=2, ensure_ascii=False)
-    except (UnicodeDecodeError, json.JSONDecodeError) as e:
-        raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
+    except (UnicodeDecodeError, LookupError, json.JSONDecodeError) as e:
+        # If decoding fails, try with utf-8 as last resort
+        try:
+            json_data = json.loads(file_content.decode("utf-8", errors="ignore"))
+            return json.dumps(json_data, indent=2, ensure_ascii=False)
+        except (UnicodeDecodeError, json.JSONDecodeError):
+            raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e


 def _extract_text_from_yaml(file_content: bytes) -> str:
    """Extract the content from yaml file"""
    try:
-        yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore"))
+        # Detect encoding using chardet
+        result = chardet.detect(file_content)
+        encoding = result["encoding"]
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        yaml_data = yaml.safe_load_all(file_content.decode(encoding, errors="ignore"))
        return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
-    except (UnicodeDecodeError, yaml.YAMLError) as e:
-        raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
+    except (UnicodeDecodeError, LookupError, yaml.YAMLError) as e:
+        # If decoding fails, try with utf-8 as last resort
+        try:
+            yaml_data = yaml.safe_load_all(file_content.decode("utf-8", errors="ignore"))
+            return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
+        except (UnicodeDecodeError, yaml.YAMLError):
+            raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e


 def _extract_text_from_pdf(file_content: bytes) -> str:
@@ -338,7 +377,20 @@ def _extract_text_from_file(file: File):

 def _extract_text_from_csv(file_content: bytes) -> str:
    try:
-        csv_file = io.StringIO(file_content.decode("utf-8", "ignore"))
+        # Detect encoding using chardet
+        result = chardet.detect(file_content)
+        encoding = result["encoding"]
+
+        # Fallback to utf-8 if detection fails
+        if not encoding:
+            encoding = "utf-8"
+
+        try:
+            csv_file = io.StringIO(file_content.decode(encoding, errors="ignore"))
+        except (UnicodeDecodeError, LookupError):
+            # If decoding fails, try with utf-8 as last resort
+            csv_file = io.StringIO(file_content.decode("utf-8", errors="ignore"))
+
        csv_reader = csv.reader(csv_file)
        rows = list(csv_reader)