|
|
|
@ -10,9 +10,9 @@ from core.workflow.entities.node_entities import NodeRunResult
|
|
|
|
from core.workflow.nodes.document_extractor import DocumentExtractorNode, DocumentExtractorNodeData
|
|
|
|
from core.workflow.nodes.document_extractor import DocumentExtractorNode, DocumentExtractorNodeData
|
|
|
|
from core.workflow.nodes.document_extractor.node import (
|
|
|
|
from core.workflow.nodes.document_extractor.node import (
|
|
|
|
_extract_text_from_docx,
|
|
|
|
_extract_text_from_docx,
|
|
|
|
|
|
|
|
_extract_text_from_excel,
|
|
|
|
_extract_text_from_pdf,
|
|
|
|
_extract_text_from_pdf,
|
|
|
|
_extract_text_from_plain_text,
|
|
|
|
_extract_text_from_plain_text,
|
|
|
|
_extract_text_from_excel
|
|
|
|
|
|
|
|
)
|
|
|
|
)
|
|
|
|
from core.workflow.nodes.enums import NodeType
|
|
|
|
from core.workflow.nodes.enums import NodeType
|
|
|
|
from models.workflow import WorkflowNodeExecutionStatus
|
|
|
|
from models.workflow import WorkflowNodeExecutionStatus
|
|
|
|
@ -184,6 +184,7 @@ def test_extract_text_from_docx(mock_document):
|
|
|
|
def test_node_type(document_extractor_node):
|
|
|
|
def test_node_type(document_extractor_node):
|
|
|
|
assert document_extractor_node._node_type == NodeType.DOCUMENT_EXTRACTOR
|
|
|
|
assert document_extractor_node._node_type == NodeType.DOCUMENT_EXTRACTOR
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@patch("pandas.ExcelFile")
|
|
|
|
@patch("pandas.ExcelFile")
|
|
|
|
def test_extract_text_from_excel_single_sheet(mock_excel_file):
|
|
|
|
def test_extract_text_from_excel_single_sheet(mock_excel_file):
|
|
|
|
"""Test extracting text from Excel file with single sheet."""
|
|
|
|
"""Test extracting text from Excel file with single sheet."""
|
|
|
|
@ -205,7 +206,7 @@ def test_extract_text_from_excel_single_sheet(mock_excel_file):
|
|
|
|
assert result == expected
|
|
|
|
assert result == expected
|
|
|
|
mock_excel_file.assert_called_once()
|
|
|
|
mock_excel_file.assert_called_once()
|
|
|
|
mock_df.dropna.assert_called_once_with(how="all", inplace=True)
|
|
|
|
mock_df.dropna.assert_called_once_with(how="all", inplace=True)
|
|
|
|
mock_df.to_markdown.assert_called_once_with(index=False, floatfmt='')
|
|
|
|
mock_df.to_markdown.assert_called_once_with(index=False, floatfmt="")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@patch("pandas.ExcelFile")
|
|
|
|
@patch("pandas.ExcelFile")
|
|
|
|
@ -229,8 +230,10 @@ def test_extract_text_from_excel_multiple_sheets(mock_excel_file):
|
|
|
|
file_content = b"fake_excel_content_multiple_sheets"
|
|
|
|
file_content = b"fake_excel_content_multiple_sheets"
|
|
|
|
result = _extract_text_from_excel(file_content)
|
|
|
|
result = _extract_text_from_excel(file_content)
|
|
|
|
|
|
|
|
|
|
|
|
expected = ("| Product | Price |\n|---------|-------|\n| Apple | 1.50 |\n\n"
|
|
|
|
expected = (
|
|
|
|
"| City | Population |\n|------|------------|\n| NYC | 8000000 |\n\n")
|
|
|
|
"| Product | Price |\n|---------|-------|\n| Apple | 1.50 |\n\n"
|
|
|
|
|
|
|
|
"| City | Population |\n|------|------------|\n| NYC | 8000000 |\n\n"
|
|
|
|
|
|
|
|
)
|
|
|
|
assert result == expected
|
|
|
|
assert result == expected
|
|
|
|
assert mock_excel_instance.parse.call_count == 2
|
|
|
|
assert mock_excel_instance.parse.call_count == 2
|
|
|
|
|
|
|
|
|
|
|
|
@ -354,7 +357,7 @@ def test_extract_text_from_excel_markdown_formatting(mock_excel_file):
|
|
|
|
result = _extract_text_from_excel(file_content)
|
|
|
|
result = _extract_text_from_excel(file_content)
|
|
|
|
|
|
|
|
|
|
|
|
# Verify to_markdown was called with correct parameters
|
|
|
|
# Verify to_markdown was called with correct parameters
|
|
|
|
mock_df.to_markdown.assert_called_once_with(index=False, floatfmt='')
|
|
|
|
mock_df.to_markdown.assert_called_once_with(index=False, floatfmt="")
|
|
|
|
|
|
|
|
|
|
|
|
expected = "| Float | Int |\n|-------|-----|\n| 123456.78 | 42 |\n\n"
|
|
|
|
expected = "| Float | Int |\n|-------|-----|\n| 123456.78 | 42 |\n\n"
|
|
|
|
assert result == expected
|
|
|
|
assert result == expected
|
|
|
|
|