|
|
|
@ -8,6 +8,7 @@ from collections.abc import Mapping, Sequence
|
|
|
|
from typing import Any, cast
|
|
|
|
from typing import Any, cast
|
|
|
|
|
|
|
|
|
|
|
|
import docx
|
|
|
|
import docx
|
|
|
|
|
|
|
|
import langdetect # type: ignore
|
|
|
|
import pandas as pd
|
|
|
|
import pandas as pd
|
|
|
|
import pypandoc # type: ignore
|
|
|
|
import pypandoc # type: ignore
|
|
|
|
import pypdfium2 # type: ignore
|
|
|
|
import pypdfium2 # type: ignore
|
|
|
|
@ -180,14 +181,24 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_text_from_plain_text(file_content: bytes) -> str:
|
|
|
|
def _extract_text_from_plain_text(file_content: bytes) -> str:
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
return file_content.decode("utf-8", "ignore")
|
|
|
|
content = langdetect.detect(file_content.decode("shift_jis", "ignore"))
|
|
|
|
|
|
|
|
# When received japanese content, decode in a different way
|
|
|
|
|
|
|
|
if content == "ja":
|
|
|
|
|
|
|
|
return file_content.decode("shift_jis", "ignore")
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
return file_content.decode("utf-8", "ignore")
|
|
|
|
except UnicodeDecodeError as e:
|
|
|
|
except UnicodeDecodeError as e:
|
|
|
|
raise TextExtractionError("Failed to decode plain text file") from e
|
|
|
|
raise TextExtractionError("Failed to decode plain text file") from e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_text_from_json(file_content: bytes) -> str:
|
|
|
|
def _extract_text_from_json(file_content: bytes) -> str:
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
json_data = json.loads(file_content.decode("utf-8", "ignore"))
|
|
|
|
content = langdetect.detect(file_content.decode("shift_jis", "ignore"))
|
|
|
|
|
|
|
|
# When received japanese content, decode in a different way
|
|
|
|
|
|
|
|
if content == "ja":
|
|
|
|
|
|
|
|
json_data = json.loads(file_content.decode("shift_jis", "ignore"))
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
json_data = json.loads(file_content.decode("utf-8", "ignore"))
|
|
|
|
return json.dumps(json_data, indent=2, ensure_ascii=False)
|
|
|
|
return json.dumps(json_data, indent=2, ensure_ascii=False)
|
|
|
|
except (UnicodeDecodeError, json.JSONDecodeError) as e:
|
|
|
|
except (UnicodeDecodeError, json.JSONDecodeError) as e:
|
|
|
|
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
|
|
|
|
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
|
|
|
|
@ -196,7 +207,12 @@ def _extract_text_from_json(file_content: bytes) -> str:
|
|
|
|
def _extract_text_from_yaml(file_content: bytes) -> str:
    """Extract the content from yaml file

    The bytes are decoded as strict UTF-8 when possible. Only when that fails
    are they tried as Shift_JIS, and that decoding is kept only if language
    detection reports Japanese; otherwise a lossy UTF-8 decode (undecodable
    bytes dropped) is used. Every document in the stream is loaded with the
    safe loader and dumped back with key order and non-ASCII text preserved.

    Args:
        file_content: Raw bytes of the YAML file.

    Returns:
        All YAML documents in the file, re-serialized as a single string.

    Raises:
        TextExtractionError: If the decoded text is not valid YAML.
    """
    try:
        # Strict UTF-8 first: reading UTF-8 bytes as Shift_JIS produces
        # mojibake that can still be classified as Japanese, which would
        # corrupt valid UTF-8 documents.
        text = file_content.decode("utf-8")
    except UnicodeDecodeError:
        # Function-scope import keeps the common UTF-8 path free of this dependency.
        from langdetect.lang_detect_exception import LangDetectException  # type: ignore

        shift_jis_text = file_content.decode("shift_jis", "ignore")
        try:
            is_japanese = langdetect.detect(shift_jis_text) == "ja"
        except LangDetectException:
            # detect() raises when the text has no usable features.
            is_japanese = False
        text = shift_jis_text if is_japanese else file_content.decode("utf-8", "ignore")

    try:
        # safe_load_all is lazy, so parsing errors surface while dump_all
        # consumes the generator; both calls therefore stay inside the try.
        yaml_data = yaml.safe_load_all(text)
        return cast(str, yaml.dump_all(yaml_data, allow_unicode=True, sort_keys=False))
    except yaml.YAMLError as e:
        raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
|
|
|
|
@ -338,7 +354,12 @@ def _extract_text_from_file(file: File):
|
|
|
|
|
|
|
|
|
|
|
|
def _extract_text_from_csv(file_content: bytes) -> str:
|
|
|
|
def _extract_text_from_csv(file_content: bytes) -> str:
|
|
|
|
try:
|
|
|
|
try:
|
|
|
|
csv_file = io.StringIO(file_content.decode("utf-8", "ignore"))
|
|
|
|
content = langdetect.detect(file_content.decode("shift_jis", "ignore"))
|
|
|
|
|
|
|
|
# When received japanese content, decode in a different way
|
|
|
|
|
|
|
|
if content == "ja":
|
|
|
|
|
|
|
|
csv_file = io.StringIO(file_content.decode("shift_jis", "ignore"))
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
csv_file = io.StringIO(file_content.decode("utf-8", "ignore"))
|
|
|
|
csv_reader = csv.reader(csv_file)
|
|
|
|
csv_reader = csv.reader(csv_file)
|
|
|
|
rows = list(csv_reader)
|
|
|
|
rows = list(csv_reader)
|
|
|
|
|
|
|
|
|
|
|
|
|