feat: refactor: add unit test for original save_document_with_dataset_id

pull/21528/head
neatguycoding 11 months ago
parent 4e1b17332c
commit a955616a21

@ -0,0 +1,824 @@
from unittest.mock import Mock, patch
import pytest
from models.account import Account
from models.dataset import Dataset, Document
from services.dataset_service import DocumentService
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig
class TestDocumentServiceSaveDocumentWithDatasetId:
"""
Full branch unit tests for DocumentService.save_document_with_dataset_id.
This suite covers all main branches, including:
- Billing and quota checks
- Data source types: upload_file, notion_import, website_crawl
- Duplicate document handling
- Process rule creation and error cases
- Exception and edge cases
"""
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.time")
@patch("services.dataset_service.secrets.randbelow", return_value=123456)
@patch("services.dataset_service.DocumentService.build_document")
@patch("services.dataset_service.document_indexing_task.delay")
@patch("services.dataset_service.duplicate_document_indexing_task.delay")
@patch("services.dataset_service.current_user")
@patch("services.dataset_service.ModelManager")
@patch("services.dataset_service.DatasetCollectionBindingService.get_dataset_collection_binding")
@patch("services.dataset_service.DocumentService.get_documents_position", return_value=0)
def test_upload_file_success(
self,
mock_get_position,
mock_collection_binding,
mock_model_manager,
mock_current_user,
mock_dup_task,
mock_doc_task,
mock_build_doc,
mock_rand,
mock_time,
mock_redis,
mock_db,
mock_features,
):
"""
Test successful upload_file document creation, including duplicate and non-duplicate cases.
"""
# Setup mocks and input
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = None
dataset.indexing_technique = None
dataset.retrieval_model = None
dataset.embedding_model = None
dataset.embedding_model_provider = None
dataset.collection_binding_id = None
dataset.latest_process_rule = None
account = Mock(spec=Account)
account.id = "user1"
account.name = "User One"
# Mock current_user
mock_current_user.current_tenant_id = "tenant1"
# Mock features
features = Mock()
features.billing.enabled = True
features.billing.subscription.plan = "pro"
features.documents_upload_quota.limit = 100
features.documents_upload_quota.size = 0
mock_features.return_value = features
# Mock knowledge_config for upload_file with proper nested structure
knowledge_config = Mock(spec=KnowledgeConfig)
knowledge_config.original_document_id = None
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "upload_file"
knowledge_config.data_source.info_list.file_info_list = Mock()
knowledge_config.data_source.info_list.file_info_list.file_ids = ["file1", "file2"]
knowledge_config.indexing_technique = "high_quality"
knowledge_config.embedding_model = "embed-model"
knowledge_config.embedding_model_provider = "openai"
knowledge_config.retrieval_model = None
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "custom"
knowledge_config.process_rule.rules = Mock()
knowledge_config.doc_form = "pdf"
knowledge_config.doc_language = "en"
knowledge_config.duplicate = False
# Mock ModelManager
mock_model_manager_instance = Mock()
mock_embedding_model = Mock()
mock_embedding_model.model = "embed-model"
mock_embedding_model.provider = "openai"
mock_model_manager_instance.get_default_model_instance.return_value = mock_embedding_model
mock_model_manager.return_value = mock_model_manager_instance
# Mock collection binding
mock_collection_binding_instance = Mock()
mock_collection_binding_instance.id = "binding-123"
mock_collection_binding.return_value = mock_collection_binding_instance
# Mock build_document
mock_doc1 = Mock(spec=Document, id="doc1")
mock_doc2 = Mock(spec=Document, id="doc2")
mock_build_doc.side_effect = [mock_doc1, mock_doc2]
# Mock db.session.query(UploadFile)
upload_file1 = Mock()
upload_file1.id = "file1"
upload_file1.name = "file1.pdf"
upload_file2 = Mock()
upload_file2.id = "file2"
upload_file2.name = "file2.pdf"
mock_db.query.return_value.filter.return_value.first.side_effect = [upload_file1, upload_file2]
# Mock redis lock
mock_lock = Mock()
mock_redis.lock.return_value.__enter__ = Mock(return_value=None)
mock_redis.lock.return_value.__exit__ = Mock(return_value=None)
# Mock time.strftime
mock_time.strftime.return_value = "20231201120000"
# Run
docs, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
# Assert
assert len(docs) == 2
mock_doc_task.assert_called_once()
mock_dup_task.assert_not_called()
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_billing_batch_limit(self, mock_current_user, mock_features):
"""
Test batch upload limit exceeded raises ValueError.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = True
features.billing.subscription.plan = "sandbox"
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "upload_file"
knowledge_config.data_source.info_list.file_info_list = Mock()
knowledge_config.data_source.info_list.file_info_list.file_ids = ["file1", "file2"]
with pytest.raises(ValueError, match="Your current plan does not support batch upload"):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_billing_quota_limit(self, mock_current_user, mock_features):
"""
Test document upload quota exceeded raises ValueError.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = True
features.billing.subscription.plan = "pro"
features.documents_upload_quota.limit = 1
features.documents_upload_quota.size = 1
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "upload_file"
knowledge_config.data_source.info_list.file_info_list = Mock()
knowledge_config.data_source.info_list.file_info_list.file_ids = ["file1", "file2"]
with pytest.raises(ValueError, match="You have reached the limit of your subscription"):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_invalid_indexing_technique(self, mock_current_user, mock_features):
"""
Test invalid indexing technique raises ValueError.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = None
dataset.indexing_technique = None
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "upload_file"
knowledge_config.indexing_technique = "invalid"
with pytest.raises(ValueError, match="Indexing technique is invalid"):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_no_process_rule_found(self, mock_current_user, mock_features):
"""
Test no process rule found raises ValueError.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.latest_process_rule = None
dataset.data_source_type = "upload_file"
dataset.indexing_technique = "high_quality"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "upload_file"
knowledge_config.indexing_technique = "high_quality"
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "custom"
knowledge_config.process_rule.rules = None
with pytest.raises(ValueError, match="No process rule found"):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_invalid_process_rule_mode(self, mock_current_user, mock_features, mock_db):
"""
Test invalid process rule mode returns None (no document created).
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.latest_process_rule = None
dataset.data_source_type = "upload_file"
dataset.indexing_technique = "high_quality"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "upload_file"
knowledge_config.indexing_technique = "high_quality"
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "invalid"
with patch("logging.warning") as mock_log:
result = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
assert result is None
mock_log.assert_called()
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_notion_import_no_info(self, mock_current_user, mock_features, mock_redis, mock_db):
"""
Test notion_import with no notion_info_list raises ValueError.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = "notion_import"
dataset.indexing_technique = "high_quality"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "automatic"
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "notion_import"
knowledge_config.data_source.info_list.notion_info_list = None
with pytest.raises(ValueError, match="No notion info list found"):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_website_crawl_no_info(self, mock_current_user, mock_features, mock_redis, mock_db):
"""
Test website_crawl with no website_info raises ValueError.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = "website_crawl"
dataset.indexing_technique = "high_quality"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "automatic"
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "website_crawl"
knowledge_config.data_source.info_list.website_info_list = None
with pytest.raises(ValueError, match="No website info list found"):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
@patch("services.dataset_service.DocumentService.update_document_with_dataset_id")
def test_update_document_branch(self, mock_update_doc):
"""
Test the branch where original_document_id is provided (update flow).
"""
dataset = Mock(spec=Dataset)
account = Mock(spec=Account)
knowledge_config = Mock()
knowledge_config.original_document_id = "docid"
mock_update_doc.return_value = Mock(batch="batch1")
# Mock current_user
mock_current_user = Mock()
mock_current_user.current_tenant_id = "tenant-123"
# Patch current_user to return the mock
with patch("services.dataset_service.current_user", mock_current_user):
docs, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
assert len(docs) == 1
assert batch == "batch1"
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_upload_file_file_not_found(self, mock_current_user, mock_features, mock_redis, mock_db):
"""
Test upload_file: should raise FileNotExistsError if file not found in db.
"""
from services.dataset_service import FileNotExistsError
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "automatic"
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "upload_file"
knowledge_config.data_source.info_list.file_info_list = Mock()
knowledge_config.data_source.info_list.file_info_list.file_ids = ["file1"]
mock_db.query.return_value.filter.return_value.first.return_value = None
with pytest.raises(FileNotExistsError):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.time")
@patch("services.dataset_service.secrets.randbelow", return_value=123456)
@patch("services.dataset_service.DocumentService.build_document")
@patch("services.dataset_service.document_indexing_task.delay")
@patch("services.dataset_service.duplicate_document_indexing_task.delay")
@patch("services.dataset_service.current_user")
@patch("services.dataset_service.ModelManager")
@patch("services.dataset_service.DatasetCollectionBindingService.get_dataset_collection_binding")
@patch("services.dataset_service.DocumentService.get_documents_position", return_value=0)
def test_upload_file_duplicate(
self,
mock_get_position,
mock_collection_binding,
mock_model_manager,
mock_current_user,
mock_dup_task,
mock_doc_task,
mock_build_doc,
mock_rand,
mock_time,
mock_redis,
mock_db,
mock_features,
):
"""
Test upload_file: duplicate=True and document already exists, should update and append to documents.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = None
dataset.indexing_technique = None
dataset.retrieval_model = None
dataset.embedding_model = None
dataset.embedding_model_provider = None
dataset.collection_binding_id = None
dataset.latest_process_rule = None
account = Mock(spec=Account)
account.id = "user1"
account.name = "User One"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = True
features.billing.subscription.plan = "pro"
features.documents_upload_quota.limit = 100
features.documents_upload_quota.size = 0
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "upload_file"
knowledge_config.data_source.info_list.file_info_list = Mock()
knowledge_config.data_source.info_list.file_info_list.file_ids = ["file1"]
knowledge_config.indexing_technique = "high_quality"
knowledge_config.embedding_model = "embed-model"
knowledge_config.embedding_model_provider = "openai"
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "custom"
knowledge_config.process_rule.rules = Mock()
knowledge_config.doc_form = "pdf"
knowledge_config.doc_language = "en"
knowledge_config.duplicate = True
mock_model_manager_instance = Mock()
mock_embedding_model = Mock()
mock_embedding_model.model = "embed-model"
mock_embedding_model.provider = "openai"
mock_model_manager_instance.get_default_model_instance.return_value = mock_embedding_model
mock_model_manager.return_value = mock_model_manager_instance
mock_collection_binding_instance = Mock()
mock_collection_binding_instance.id = "binding-123"
mock_collection_binding.return_value = mock_collection_binding_instance
upload_file = Mock()
upload_file.id = "file1"
upload_file.name = "file1.pdf"
mock_db.query.return_value.filter.return_value.first.side_effect = [
upload_file,
Mock(id="docid", name="file1.pdf"),
] # file, then document
mock_redis.lock.return_value.__enter__ = Mock(return_value=None)
mock_redis.lock.return_value.__exit__ = Mock(return_value=None)
mock_time.strftime.return_value = "20231201120000"
docs, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
assert len(docs) == 1
mock_dup_task.assert_called_once()
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_notion_import_data_source_binding_not_found(self, mock_current_user, mock_features, mock_redis, mock_db):
"""
Test notion_import: should raise ValueError if data source binding not found.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = "notion_import"
dataset.indexing_technique = "high_quality"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "automatic"
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "notion_import"
notion_info = Mock()
notion_info.workspace_id = "ws1"
notion_info.pages = []
knowledge_config.data_source.info_list.notion_info_list = [notion_info]
mock_db.query.return_value.filter.return_value.first.return_value = None
with pytest.raises(ValueError, match="Data source binding not found."):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
@patch("services.dataset_service.document_indexing_task.delay")
def test_website_crawl_url_too_long(
self, mock_document_indexing_task, mock_current_user, mock_features, mock_redis, mock_db
):
"""
Test website_crawl: url longer than 255 chars should be truncated in document name.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = "website_crawl"
dataset.indexing_technique = "high_quality"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "automatic"
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "website_crawl"
website_info = Mock()
website_info.urls = ["http://" + "a" * 300]
website_info.provider = "test"
website_info.job_id = "job1"
website_info.only_main_content = True
knowledge_config.data_source.info_list.website_info_list = website_info
mock_db.query.return_value.filter.return_value.first.return_value = True
# Patch build_document to check name truncation
with patch("services.dataset_service.DocumentService.build_document") as mock_build_doc:
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
args, kwargs = mock_build_doc.call_args
assert args[9].startswith("http://")
assert len(args[9]) < 256
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
@patch("services.dataset_service.DocumentService.build_document")
@patch("services.dataset_service.document_indexing_task.delay")
@patch("services.dataset_service.clean_notion_document_task.delay")
def test_notion_import_success(
self, mock_clean_task, mock_doc_task, mock_build_doc, mock_current_user, mock_features, mock_redis, mock_db
):
"""
Test notion_import: successful document creation for new pages.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = "notion_import"
dataset.indexing_technique = "high_quality"
account = Mock(spec=Account)
account.id = "user1"
account.name = "User One"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "automatic"
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "notion_import"
notion_info = Mock()
notion_info.workspace_id = "ws1"
page = Mock()
page.page_id = "page1"
page.page_name = "Test Page"
page.page_icon = None
page.type = "page"
notion_info.pages = [page]
knowledge_config.data_source.info_list.notion_info_list = [notion_info]
# Mock existing documents query (empty)
mock_db.query.return_value.filter_by.return_value.all.return_value = []
# Mock data source binding
binding = Mock()
binding.id = "binding1"
mock_db.query.return_value.filter.return_value.first.return_value = binding
# Mock build_document
mock_doc = Mock(spec=Document, id="doc1")
mock_build_doc.return_value = mock_doc
docs, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
assert len(docs) == 1
mock_doc_task.assert_called_once()
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
@patch("services.dataset_service.DocumentService.build_document")
@patch("services.dataset_service.clean_notion_document_task.delay")
@patch("services.dataset_service.document_indexing_task.delay")
def test_notion_import_page_exists(
self, mock_doc_task, mock_clean_task, mock_build_doc, mock_current_user, mock_features, mock_redis, mock_db
):
"""
Test notion_import: page already exists, should skip creation and clean old documents.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = "notion_import"
dataset.indexing_technique = "high_quality"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "automatic"
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "notion_import"
notion_info = Mock()
notion_info.workspace_id = "ws1"
page = Mock()
page.page_id = "page1"
page.page_name = "Test Page"
notion_info.pages = [page]
knowledge_config.data_source.info_list.notion_info_list = [notion_info]
# Mock existing document with same page_id
existing_doc = Mock()
existing_doc.data_source_info = '{"notion_page_id": "page1"}'
existing_doc.id = "doc1"
mock_db.query.return_value.filter_by.return_value.all.return_value = [existing_doc]
# Mock data source binding
binding = Mock()
binding.id = "binding1"
mock_db.query.return_value.filter.return_value.first.return_value = binding
docs, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
assert len(docs) == 0
mock_clean_task.assert_not_called()
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
@patch("services.dataset_service.DocumentService.build_document")
@patch("services.dataset_service.document_indexing_task.delay")
def test_website_crawl_success(
self, mock_doc_task, mock_build_doc, mock_current_user, mock_features, mock_redis, mock_db
):
"""
Test website_crawl: successful document creation for multiple URLs.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = "website_crawl"
dataset.indexing_technique = "high_quality"
account = Mock(spec=Account)
account.id = "user1"
account.name = "User One"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "automatic"
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "website_crawl"
website_info = Mock()
website_info.urls = ["http://example1.com", "http://example2.com"]
website_info.provider = "test"
website_info.job_id = "job1"
website_info.only_main_content = True
knowledge_config.data_source.info_list.website_info_list = website_info
# Mock build_document
mock_doc1 = Mock(spec=Document, id="doc1")
mock_doc2 = Mock(spec=Document, id="doc2")
mock_build_doc.side_effect = [mock_doc1, mock_doc2]
docs, batch = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
assert len(docs) == 2
assert mock_build_doc.call_count == 2
mock_doc_task.assert_called_once()
@patch("services.dataset_service.db.session")
@patch("services.dataset_service.redis_client")
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_unknown_data_source_type(self, mock_current_user, mock_features, mock_redis, mock_db):
"""
Test unknown data_source_type: should not raise error but return None when no matching branch.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
dataset.data_source_type = "unknown_type"
dataset.indexing_technique = "high_quality"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = False
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.process_rule = Mock()
knowledge_config.process_rule.mode = "automatic"
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "unknown_type"
# This should not raise an error but return None due to no matching data source type
result = DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
# The method should handle unknown data source types gracefully
assert result is None or len(result[0]) == 0
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_upload_file_batch_limit_exceeded(self, mock_current_user, mock_features):
"""
Test upload_file: batch upload limit exceeded raises ValueError.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = True
features.billing.subscription.plan = "pro"
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "upload_file"
knowledge_config.data_source.info_list.file_info_list = Mock()
# Create a list with more than BATCH_UPLOAD_LIMIT files
knowledge_config.data_source.info_list.file_info_list.file_ids = ["file" + str(i) for i in range(100)]
with patch("services.dataset_service.dify_config.BATCH_UPLOAD_LIMIT", 50):
with pytest.raises(ValueError, match="You have reached the batch upload limit"):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_notion_import_batch_limit_exceeded(self, mock_current_user, mock_features):
"""
Test notion_import: batch upload limit exceeded raises ValueError.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = True
features.billing.subscription.plan = "pro"
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "notion_import"
notion_info = Mock()
notion_info.pages = [Mock() for _ in range(100)] # 100 pages
knowledge_config.data_source.info_list.notion_info_list = [notion_info]
with patch("services.dataset_service.dify_config.BATCH_UPLOAD_LIMIT", 50):
with pytest.raises(ValueError, match="You have reached the batch upload limit"):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
@patch("services.dataset_service.FeatureService.get_features")
@patch("services.dataset_service.current_user")
def test_website_crawl_batch_limit_exceeded(self, mock_current_user, mock_features):
"""
Test website_crawl: batch upload limit exceeded raises ValueError.
"""
dataset = Mock(spec=Dataset)
dataset.id = "ds1"
dataset.tenant_id = "tenant1"
account = Mock(spec=Account)
account.id = "user1"
mock_current_user.current_tenant_id = "tenant1"
features = Mock()
features.billing.enabled = True
features.billing.subscription.plan = "pro"
mock_features.return_value = features
knowledge_config = Mock()
knowledge_config.original_document_id = None
knowledge_config.data_source = Mock()
knowledge_config.data_source.info_list = Mock()
knowledge_config.data_source.info_list.data_source_type = "website_crawl"
website_info = Mock()
website_info.urls = ["http://example" + str(i) + ".com" for i in range(100)] # 100 URLs
knowledge_config.data_source.info_list.website_info_list = website_info
with patch("services.dataset_service.dify_config.BATCH_UPLOAD_LIMIT", 50):
with pytest.raises(ValueError, match="You have reached the batch upload limit"):
DocumentService.save_document_with_dataset_id(dataset, knowledge_config, account)
Loading…
Cancel
Save