From f2a45af1bde5b97a92118724bb103aca986929df Mon Sep 17 00:00:00 2001 From: haiyangpengai Date: Thu, 5 Jun 2025 11:57:42 +0800 Subject: [PATCH] add testing scripts for extract_text_from_excel. --- .../nodes/test_document_extractor_node.py | 101 ++++++++---------- 1 file changed, 46 insertions(+), 55 deletions(-) diff --git a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py index e67a23c80e..16539fff8c 100644 --- a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py +++ b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py @@ -1,5 +1,7 @@ +import io from unittest.mock import Mock, patch +import pandas as pd import pytest from docx.oxml.text.paragraph import CT_P @@ -16,8 +18,7 @@ from core.workflow.nodes.document_extractor.node import ( _extract_text_from_plain_text, ) from core.workflow.nodes.enums import NodeType -import pandas as pd -import io + @pytest.fixture def document_extractor_node(): @@ -191,150 +192,140 @@ def test_extract_text_from_excel_single_sheet(mock_excel_file): """Test extracting text from Excel file with single sheet and multiline content.""" # Test multi-line cell - data = { - 'Name\nwith\nnewline': ['John\nDoe', 'Jane\nSmith'], - 'Age': [25, 30] - } - + data = {"Name\nwith\nnewline": ["John\nDoe", "Jane\nSmith"], "Age": [25, 30]} + df = pd.DataFrame(data) - + # Mock ExcelFile mock_excel_instance = Mock() mock_excel_instance.sheet_names = ["Sheet1"] mock_excel_instance.parse.return_value = df mock_excel_file.return_value = mock_excel_instance - + file_content = b"fake_excel_content" result = _extract_text_from_excel(file_content) expected_manual = "| Name with newline | Age |\n| ----------------- | --- |\n\ | John Doe | 25 |\n| Jane Smith | 30 |\n\n" - + assert expected_manual == result mock_excel_instance.parse.assert_called_once_with(sheet_name="Sheet1") - + + @patch("pandas.ExcelFile") def test_extract_text_from_excel_multiple_sheets(mock_excel_file): """Test extracting text from Excel file with multiple sheets and multiline content.""" - + # Test multi-line cell - data1 = { - 'Product\nName': ['Apple\nRed', 'Banana\nYellow'], - 'Price': [1.50, 0.99] - } + data1 = {"Product\nName": ["Apple\nRed", "Banana\nYellow"], "Price": [1.50, 0.99]} df1 = pd.DataFrame(data1) - - data2 = { - 'City\nName': ['New\nYork', 'Los\nAngeles'], - 'Population': [8000000, 3900000] - } + + data2 = {"City\nName": ["New\nYork", "Los\nAngeles"], "Population": [8000000, 3900000]} df2 = pd.DataFrame(data2) - + # Mock ExcelFile mock_excel_instance = Mock() mock_excel_instance.sheet_names = ["Products", "Cities"] mock_excel_instance.parse.side_effect = [df1, df2] mock_excel_file.return_value = mock_excel_instance - + file_content = b"fake_excel_content_multiple_sheets" result = _extract_text_from_excel(file_content) - + expected_manual1 = "| Product Name | Price |\n| ------------ | ----- |\n\ | Apple Red | 1.5 |\n| Banana Yellow | 0.99 |\n\n" expected_manual2 = "| City Name | Population |\n| --------- | ---------- |\n\ | New York | 8000000 |\n| Los Angeles | 3900000 |\n\n" - + assert expected_manual1 in result assert expected_manual2 in result - + assert mock_excel_instance.parse.call_count == 2 + @patch("pandas.ExcelFile") def test_extract_text_from_excel_empty_sheets(mock_excel_file): """Test extracting text from Excel file with empty sheets.""" - + # Empty excel df = pd.DataFrame() - + # Mock ExcelFile mock_excel_instance = Mock() mock_excel_instance.sheet_names = ["EmptySheet"] mock_excel_instance.parse.return_value = df mock_excel_file.return_value = mock_excel_instance - + file_content = b"fake_excel_empty_content" result = _extract_text_from_excel(file_content) - + expected = "| |\n| |\n\n" assert result == expected - + mock_excel_instance.parse.assert_called_once_with(sheet_name="EmptySheet") - + + @patch("pandas.ExcelFile") def test_extract_text_from_excel_sheet_parse_error(mock_excel_file): """Test handling of sheet parsing errors - should continue with other sheets.""" - + # Test error - data = { - 'Data': ['Test'], - 'Value': [123] - } + data = {"Data": ["Test"], "Value": [123]} df = pd.DataFrame(data) - + # Mock ExcelFile mock_excel_instance = Mock() mock_excel_instance.sheet_names = ["GoodSheet", "BadSheet"] mock_excel_instance.parse.side_effect = [df, Exception("Parse error")] mock_excel_file.return_value = mock_excel_instance - + file_content = b"fake_excel_mixed_content" result = _extract_text_from_excel(file_content) - + expected_manual = "| Data | Value |\n| ---- | ----- |\n| Test | 123 |\n\n" - + assert expected_manual == result assert mock_excel_instance.parse.call_count == 2 + @patch("pandas.ExcelFile") def test_extract_text_from_excel_io_bytesio_usage(mock_excel_file): """Test that BytesIO is properly used with the file content.""" - + # Test bytesio - data = { - 'Test': [1], - 'Data': ['A'] - } + data = {"Test": [1], "Data": ["A"]} df = pd.DataFrame(data) - + # Mock ExcelFile mock_excel_instance = Mock() mock_excel_instance.sheet_names = ["TestSheet"] mock_excel_instance.parse.return_value = df mock_excel_file.return_value = mock_excel_instance - + file_content = b"test_excel_bytes" result = _extract_text_from_excel(file_content) - + mock_excel_file.assert_called_once() call_arg = mock_excel_file.call_args[0][0] assert isinstance(call_arg, io.BytesIO) - + expected_manual = "| Test | Data |\n| ---- | ---- |\n| 1 | A |\n\n" assert expected_manual == result - + + @patch("pandas.ExcelFile") def test_extract_text_from_excel_all_sheets_fail(mock_excel_file): """Test when all sheets fail to parse - should return empty string.""" - + # Mock ExcelFile mock_excel_instance = Mock() mock_excel_instance.sheet_names = ["BadSheet1", "BadSheet2"] mock_excel_instance.parse.side_effect = [Exception("Error 1"), Exception("Error 2")] mock_excel_file.return_value = mock_excel_instance - + file_content = b"fake_excel_all_bad_sheets" result = _extract_text_from_excel(file_content) - + assert result == "" - + assert mock_excel_instance.parse.call_count == 2