fix: Doc Extractor node to handle Shift_JIS (SJIS, a Japanese text encoding)

pull/20188/head
uid7860 1 year ago
parent cbfc32b11f
commit 8999519136

@ -8,6 +8,7 @@ from collections.abc import Mapping, Sequence
from typing import Any, cast from typing import Any, cast
import docx import docx
import langdetect # type: ignore
import pandas as pd import pandas as pd
import pypandoc # type: ignore import pypandoc # type: ignore
import pypdfium2 # type: ignore import pypdfium2 # type: ignore
@ -180,6 +181,11 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
def _extract_text_from_plain_text(file_content: bytes) -> str: def _extract_text_from_plain_text(file_content: bytes) -> str:
try: try:
content = langdetect.detect(file_content.decode("shift_jis", "ignore"))
# When received japanese content, decode in a different way
if content == "ja":
return file_content.decode("shift_jis", "ignore")
else:
return file_content.decode("utf-8", "ignore") return file_content.decode("utf-8", "ignore")
except UnicodeDecodeError as e: except UnicodeDecodeError as e:
raise TextExtractionError("Failed to decode plain text file") from e raise TextExtractionError("Failed to decode plain text file") from e
@ -187,6 +193,11 @@ def _extract_text_from_plain_text(file_content: bytes) -> str:
def _extract_text_from_json(file_content: bytes) -> str: def _extract_text_from_json(file_content: bytes) -> str:
try: try:
content = langdetect.detect(file_content.decode("shift_jis", "ignore"))
# When the content is detected as Japanese, decode it as Shift_JIS instead of UTF-8
if content == "ja":
json_data = json.loads(file_content.decode("shift_jis", "ignore"))
else:
json_data = json.loads(file_content.decode("utf-8", "ignore")) json_data = json.loads(file_content.decode("utf-8", "ignore"))
return json.dumps(json_data, indent=2, ensure_ascii=False) return json.dumps(json_data, indent=2, ensure_ascii=False)
except (UnicodeDecodeError, json.JSONDecodeError) as e: except (UnicodeDecodeError, json.JSONDecodeError) as e:
@ -196,6 +207,11 @@ def _extract_text_from_json(file_content: bytes) -> str:
def _extract_text_from_yaml(file_content: bytes) -> str: def _extract_text_from_yaml(file_content: bytes) -> str:
"""Extract the content from yaml file""" """Extract the content from yaml file"""
try: try:
content = langdetect.detect(file_content.decode("shift_jis", "ignore"))
# When the content is detected as Japanese, decode it as Shift_JIS instead of UTF-8
if content == "ja":
yaml_data = yaml.safe_load_all(file_content.decode("shift_jis", "ignore"))
else:
yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore")) yaml_data = yaml.safe_load_all(file_content.decode("utf-8", "ignore"))
return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False)) return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
except (UnicodeDecodeError, yaml.YAMLError) as e: except (UnicodeDecodeError, yaml.YAMLError) as e:
@ -338,6 +354,11 @@ def _extract_text_from_file(file: File):
def _extract_text_from_csv(file_content: bytes) -> str: def _extract_text_from_csv(file_content: bytes) -> str:
try: try:
content = langdetect.detect(file_content.decode("shift_jis", "ignore"))
# When the content is detected as Japanese, decode it as Shift_JIS instead of UTF-8
if content == "ja":
csv_file = io.StringIO(file_content.decode("shift_jis", "ignore"))
else:
csv_file = io.StringIO(file_content.decode("utf-8", "ignore")) csv_file = io.StringIO(file_content.decode("utf-8", "ignore"))
csv_reader = csv.reader(csv_file) csv_reader = csv.reader(csv_file)
rows = list(csv_reader) rows = list(csv_reader)

Loading…
Cancel
Save