From 8999519136939368b986948c9bdf1b67971d8629 Mon Sep 17 00:00:00 2001
From: uid7860
Date: Mon, 26 May 2025 11:01:13 +0900
Subject: [PATCH] fix: Doc Extractor node to handle SJIS(Japanese Symbols)

Plain text, JSON, YAML and CSV inputs were always decoded as UTF-8 with
errors ignored, so Shift_JIS encoded Japanese files lost most of their
content.

Decode through a single helper, _decode_text(), which tries strict
UTF-8 first, then strict cp932 (the Windows superset of Shift_JIS that
also covers vendor-specific symbols), and finally falls back to the
previous lossy UTF-8 decoding.

Language detection is deliberately not used to pick the encoding:
UTF-8 Japanese text decoded as Shift_JIS can still be classified as
Japanese and would then be returned garbled, langdetect raises on empty
or feature-less input, and its result is not deterministic unless
seeded.
---
 .../workflow/nodes/document_extractor/node.py | 24 ++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
index 8fb1baec89..3cb3f09859 100644
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -180,14 +180,30 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
 
+def _decode_text(file_content: bytes) -> str:
+    """Decode raw text bytes, preferring UTF-8 and falling back to Shift_JIS.
+
+    Valid UTF-8 is always decoded as UTF-8, so UTF-8 Japanese text is never
+    misread as Shift_JIS. Only bytes that are not valid UTF-8 are tried as
+    cp932, the Windows superset of Shift_JIS that also covers vendor symbols.
+    If that fails too, fall back to the previous lossy UTF-8 decoding.
+    """
+    for encoding in ("utf-8", "cp932"):
+        try:
+            return file_content.decode(encoding)
+        except UnicodeDecodeError:
+            continue
+    return file_content.decode("utf-8", "ignore")
+
+
 def _extract_text_from_plain_text(file_content: bytes) -> str:
     try:
-        return file_content.decode("utf-8", "ignore")
+        return _decode_text(file_content)
     except UnicodeDecodeError as e:
         raise TextExtractionError("Failed to decode plain text file") from e
 
 
 def _extract_text_from_json(file_content: bytes) -> str:
     try:
-        json_data = json.loads(file_content.decode("utf-8", "ignore"))
+        json_data = json.loads(_decode_text(file_content))
         return json.dumps(json_data, indent=2, ensure_ascii=False)
     except (UnicodeDecodeError, json.JSONDecodeError) as e:
         raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
@@ -196,7 +212,7 @@ def _extract_text_from_json(file_content: bytes) -> str:
 def _extract_text_from_yaml(file_content: bytes) -> str:
     """Extract the content from yaml file"""
     try:
-        yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore"))
+        yaml_data = yaml.safe_load_all(_decode_text(file_content))
         return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
     except (UnicodeDecodeError, yaml.YAMLError) as e:
         raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
@@ -338,7 +354,7 @@ def _extract_text_from_file(file: File):
 
 def _extract_text_from_csv(file_content: bytes) -> str:
     try:
-        csv_file = io.StringIO(file_content.decode("utf-8", "ignore"))
+        csv_file = io.StringIO(_decode_text(file_content))
         csv_reader = csv.reader(csv_file)
         rows = list(csv_reader)
 