feat: refactor: save_document_with_dataset_id for readability

11 months ago · 2036d951fc
parent bad76a6f72
commit 2036d951fc
1 changed files with 648 additions and 264 deletions
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@ -1053,290 +1053,674 @@ class DocumentService:
        dataset_process_rule: Optional[DatasetProcessRule] = None,
        created_from: str = "web",
    ):
-        # check document limit
+        """
        Save documents to a dataset with comprehensive validation and processing.
        This method handles document creation and updates for various data sources including
        file uploads, Notion imports, and website crawling. It performs billing checks,
        dataset configuration, and triggers indexing tasks.
        Args:
            dataset: The target dataset for document storage
            knowledge_config: Configuration containing document metadata and processing rules
            account: User account performing the operation
            dataset_process_rule: Optional pre-existing process rule for the dataset
            created_from: Source identifier for document creation (default: "web")
        Returns:
            tuple: (list of created/updated documents, batch identifier)
        Raises:
            ValueError: When billing limits are exceeded or configuration is invalid
            FileNotExistsError: When referenced files are not found
        """
        # Validate billing and upload limits for new documents
        features = FeatureService.get_features(current_user.current_tenant_id)
-        if features.billing.enabled:
+        if features.billing.enabled and not knowledge_config.original_document_id:
-            if not knowledge_config.original_document_id:
+            document_count = DocumentService._calculate_document_count(knowledge_config)
-                count = 0
+            if document_count > 0:
-                if knowledge_config.data_source:
+                DocumentService._validate_upload_limits(document_count, features)
-                    if knowledge_config.data_source.info_list.data_source_type == "upload_file":
+
-                        upload_file_list = knowledge_config.data_source.info_list.file_info_list.file_ids  # type: ignore
+        # Initialize dataset configuration if not already set
-                        count = len(upload_file_list)
+        DocumentService._initialize_dataset_configuration(dataset, knowledge_config)
-                    elif knowledge_config.data_source.info_list.data_source_type == "notion_import":
+
-                        notion_info_list = knowledge_config.data_source.info_list.notion_info_list
+        documents = []
-                        for notion_info in notion_info_list:  # type: ignore
+
-                            count = count + len(notion_info.pages)
+        # Handle document update scenario
-                    elif knowledge_config.data_source.info_list.data_source_type == "website_crawl":
+        if knowledge_config.original_document_id:
-                        website_info = knowledge_config.data_source.info_list.website_info_list
+            document = DocumentService.update_document_with_dataset_id(dataset, knowledge_config, account)
-                        count = len(website_info.urls)  # type: ignore
+            documents.append(document)
-                    batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT)
+            batch = document.batch
-
+        else:
-                    if features.billing.subscription.plan == "sandbox" and count > 1:
+            # Handle new document creation scenario
-                        raise ValueError("Your current plan does not support batch upload, please upgrade your plan.")
+            batch = DocumentService._generate_batch_identifier()
-                    if count > batch_upload_limit:
+
-                        raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
+            # Create or retrieve process rule for document processing
-
+            dataset_process_rule = DocumentService._prepare_process_rule(
-                    DocumentService.check_documents_upload_quota(count, features)
+                dataset, knowledge_config, account, dataset_process_rule
-
+            )
-        # if dataset is empty, update dataset data_source_type
+            if not dataset_process_rule:
                # keep the original process rule if no valid rule found, but it seems not completed
                return
            # Process documents with distributed lock to prevent race conditions
            documents, batch = DocumentService._process_new_documents(
                dataset, knowledge_config, account, dataset_process_rule, created_from, batch
            )
        return documents, batch
    @staticmethod
    def _calculate_document_count(knowledge_config: KnowledgeConfig) -> int:
        """
        Calculate the total number of documents to be processed based on data source type.
        Args:
            knowledge_config: Configuration containing data source information
        Returns:
            int: Total count of documents to be processed
        """
        count = 0
        if knowledge_config.data_source:
            data_source_info = knowledge_config.data_source.info_list
            if data_source_info.data_source_type == "upload_file":
                upload_file_list = data_source_info.file_info_list.file_ids  # type: ignore
                count = len(upload_file_list)
            elif data_source_info.data_source_type == "notion_import":
                notion_info_list = data_source_info.notion_info_list
                for notion_info in notion_info_list:  # type: ignore
                    count += len(notion_info.pages)
            elif data_source_info.data_source_type == "website_crawl":
                website_info = data_source_info.website_info_list
                count = len(website_info.urls)  # type: ignore
        return count
    @staticmethod
    def _validate_upload_limits(document_count: int, features: FeatureModel) -> None:
        """
        Validate that the document upload operation complies with billing and subscription limits.
        Args:
            document_count: Number of documents to be uploaded
            features: Feature model containing billing and subscription information
        Raises:
            ValueError: When upload limits are exceeded
        """
        batch_upload_limit = int(dify_config.BATCH_UPLOAD_LIMIT)
        # Check sandbox plan restrictions
        if features.billing.subscription.plan == "sandbox" and document_count > 1:
            raise ValueError("Your current plan does not support batch upload, please upgrade your plan.")
        # Check batch upload limit
        if document_count > batch_upload_limit:
            raise ValueError(f"You have reached the batch upload limit of {batch_upload_limit}.")
        # Check document quota
        DocumentService.check_documents_upload_quota(document_count, features)
    @staticmethod
    def _initialize_dataset_configuration(dataset: Dataset, knowledge_config: KnowledgeConfig) -> None:
        """
        Initialize dataset configuration settings if not already configured.
        Args:
            dataset: Dataset to be configured
            knowledge_config: Configuration containing dataset settings
        Raises:
            ValueError: When indexing technique is invalid
        """
        # Set data source type if not already configured
        if not dataset.data_source_type:
            dataset.data_source_type = knowledge_config.data_source.info_list.data_source_type  # type: ignore
        # Configure indexing technique and related settings
        if not dataset.indexing_technique:
            if knowledge_config.indexing_technique not in Dataset.INDEXING_TECHNIQUE_LIST:
                raise ValueError("Indexing technique is invalid")
            dataset.indexing_technique = knowledge_config.indexing_technique
            # Configure high-quality indexing settings
            if knowledge_config.indexing_technique == "high_quality":
-                model_manager = ModelManager()
+                DocumentService._configure_high_quality_indexing(dataset, knowledge_config)
-                if knowledge_config.embedding_model and knowledge_config.embedding_model_provider:
+
-                    dataset_embedding_model = knowledge_config.embedding_model
+    @staticmethod
-                    dataset_embedding_model_provider = knowledge_config.embedding_model_provider
+    def _configure_high_quality_indexing(dataset: Dataset, knowledge_config: KnowledgeConfig) -> None:
-                else:
+        """
-                    embedding_model = model_manager.get_default_model_instance(
+        Configure embedding model and retrieval settings for high-quality indexing.
-                        tenant_id=current_user.current_tenant_id, model_type=ModelType.TEXT_EMBEDDING
+
-                    )
+        Args:
-                    dataset_embedding_model = embedding_model.model
+            dataset: Dataset to be configured
-                    dataset_embedding_model_provider = embedding_model.provider
+            knowledge_config: Configuration containing model settings
-                dataset.embedding_model = dataset_embedding_model
+        """
-                dataset.embedding_model_provider = dataset_embedding_model_provider
+        model_manager = ModelManager()
-                dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
+
-                    dataset_embedding_model_provider, dataset_embedding_model
+        # Set embedding model configuration
        if knowledge_config.embedding_model and knowledge_config.embedding_model_provider:
            dataset_embedding_model = knowledge_config.embedding_model
            dataset_embedding_model_provider = knowledge_config.embedding_model_provider
        else:
            embedding_model = model_manager.get_default_model_instance(
                tenant_id=current_user.current_tenant_id, model_type=ModelType.TEXT_EMBEDDING
            )
            dataset_embedding_model = embedding_model.model
            dataset_embedding_model_provider = embedding_model.provider
        dataset.embedding_model = dataset_embedding_model
        dataset.embedding_model_provider = dataset_embedding_model_provider
        # Configure collection binding
        dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
            dataset_embedding_model_provider, dataset_embedding_model
        )
        dataset.collection_binding_id = dataset_collection_binding.id
        # Configure retrieval model if not set
        if not dataset.retrieval_model:
            default_retrieval_model = {
                "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
                "reranking_enable": False,
                "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
                "top_k": 2,
                "score_threshold_enabled": False,
            }
            dataset.retrieval_model = (
                knowledge_config.retrieval_model.model_dump()
                if knowledge_config.retrieval_model
                else default_retrieval_model
            )  # type: ignore
    @staticmethod
    def _generate_batch_identifier() -> str:
        """
        Generate a unique batch identifier for document grouping.
        Returns:
            str: Unique batch identifier combining timestamp and random number
        """
        return time.strftime("%Y%m%d%H%M%S") + str(100000 + secrets.randbelow(exclusive_upper_bound=900000))
    @staticmethod
    def _prepare_process_rule(
        dataset: Dataset,
        knowledge_config: KnowledgeConfig,
        account: Account,
        dataset_process_rule: Optional[DatasetProcessRule],
    ) -> Optional[DatasetProcessRule]:
        """
        Prepare or create dataset process rule for document processing.
        Args:
            dataset: Target dataset
            knowledge_config: Configuration containing process rules
            account: User account
            dataset_process_rule: Optional existing process rule
        Returns:
            DatasetProcessRule: Prepared process rule for document processing
        Raises:
            ValueError: When no valid process rule can be found
        """
        if dataset_process_rule:
            return dataset_process_rule
        process_rule = knowledge_config.process_rule
        if not process_rule:
            return None
        if process_rule.mode in ("custom", "hierarchical"):
            if process_rule.rules:
                dataset_process_rule = DatasetProcessRule(
                    dataset_id=dataset.id,
                    mode=process_rule.mode,
                    rules=process_rule.rules.model_dump_json() if process_rule.rules else None,
                    created_by=account.id,
                )
-                dataset.collection_binding_id = dataset_collection_binding.id
+            else:
-                if not dataset.retrieval_model:
+                dataset_process_rule = dataset.latest_process_rule
-                    default_retrieval_model = {
+                if not dataset_process_rule:
-                        "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
+                    raise ValueError("No process rule found.")
-                        "reranking_enable": False,
+        elif process_rule.mode == "automatic":
-                        "reranking_model": {"reranking_provider_name": "", "reranking_model_name": ""},
+            dataset_process_rule = DatasetProcessRule(
-                        "top_k": 2,
+                dataset_id=dataset.id,
-                        "score_threshold_enabled": False,
+                mode=process_rule.mode,
-                    }
+                rules=json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
                created_by=account.id,
            )
        else:
            logging.warning(f"Invalid process rule mode: {process_rule.mode}, can not find dataset process rule")
            return None
-                    dataset.retrieval_model = (
+        db.session.add(dataset_process_rule)
-                        knowledge_config.retrieval_model.model_dump()
+        db.session.commit()
-                        if knowledge_config.retrieval_model
+        return dataset_process_rule
                        else default_retrieval_model
                    )  # type: ignore
-        documents = []
+    @staticmethod
-        if knowledge_config.original_document_id:
+    def _process_new_documents(
-            document = DocumentService.update_document_with_dataset_id(dataset, knowledge_config, account)
+        dataset: Dataset,
        knowledge_config: KnowledgeConfig,
        account: Account,
        dataset_process_rule: Optional[DatasetProcessRule],
        created_from: str,
        batch: str,
    ) -> tuple[list, str]:
        """
        Process new documents with distributed locking to prevent race conditions.
        Args:
            dataset: Target dataset
            knowledge_config: Document configuration
            account: User account
            dataset_process_rule: Process rule for document processing
            created_from: Source identifier
            batch: Batch identifier
        Returns:
            tuple: (list of created documents, batch identifier)
        """
        lock_name = f"add_document_lock_dataset_id_{dataset.id}"
        with redis_client.lock(lock_name, timeout=600):
            position = DocumentService.get_documents_position(dataset.id)
            document_ids: list[str] = []
            duplicate_document_ids: list[str] = []
            documents: list[Document] = []
            data_source_type = knowledge_config.data_source.info_list.data_source_type  # type: ignore
            # Process documents based on data source type
            if data_source_type == "upload_file":
                documents, document_ids, duplicate_document_ids = DocumentService._process_upload_file_documents(
                    dataset, knowledge_config, account, dataset_process_rule, created_from, batch, position
                )
            elif data_source_type == "notion_import":
                documents, document_ids = DocumentService._process_notion_documents(
                    dataset, knowledge_config, account, dataset_process_rule, created_from, batch, position
                )
            elif data_source_type == "website_crawl":
                documents, document_ids = DocumentService._process_website_documents(
                    dataset, knowledge_config, account, dataset_process_rule, created_from, batch, position
                )
            db.session.commit()
            # Trigger asynchronous indexing tasks
            if document_ids:
                document_indexing_task.delay(dataset.id, document_ids)
            if duplicate_document_ids:
                duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)
        return documents, batch
    @staticmethod
    def _process_upload_file_documents(
        dataset: Dataset,
        knowledge_config: KnowledgeConfig,
        account: Account,
        dataset_process_rule: Optional[DatasetProcessRule],
        created_from: str,
        batch: str,
        position: int,
    ) -> tuple[list[Document], list[str], list[str]]:
        """
        Process uploaded file documents with duplicate checking and validation.
        Args:
            dataset: Target dataset
            knowledge_config: Document configuration
            account: User account
            dataset_process_rule: Process rule
            created_from: Source identifier
            batch: Batch identifier
            position: Starting position for document ordering
        Returns:
            tuple: (list of documents, list of document IDs, list of duplicate document IDs)
        Raises:
            FileNotExistsError: When referenced files are not found
        """
        documents: list[Document] = []
        document_ids: list[str] = []
        duplicate_document_ids: list[str] = []
        current_position = position
        upload_file_list = knowledge_config.data_source.info_list.file_info_list.file_ids  # type: ignore
        for file_id in upload_file_list:
            # Validate file existence
            file = (
                db.session.query(UploadFile)
                .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                .first()
            )
            if not file:
                raise FileNotExistsError()
            file_name = file.name
            data_source_info = {"upload_file_id": file_id}
            # Handle duplicate document processing
            if knowledge_config.duplicate:
                document = DocumentService._find_duplicate_document(dataset, file_name)
                if document:
                    document = DocumentService._update_duplicate_document(
                        document, dataset_process_rule, knowledge_config, created_from, data_source_info, batch
                    )
                    documents.append(document)
                    duplicate_document_ids.append(document.id)
                    continue
            # Create new document
            document = DocumentService.build_document(
                dataset,
                dataset_process_rule.id,  # type: ignore
                knowledge_config.data_source.info_list.data_source_type,  # type: ignore
                knowledge_config.doc_form,
                knowledge_config.doc_language,
                data_source_info,
                created_from,
                current_position,
                account,
                file_name,
                batch,
            )
            db.session.add(document)
            db.session.flush()
            document_ids.append(document.id)
            documents.append(document)
-            batch = document.batch
+            current_position += 1
        else:
            batch = time.strftime("%Y%m%d%H%M%S") + str(100000 + secrets.randbelow(exclusive_upper_bound=900000))
            # save process rule
            if not dataset_process_rule:
                process_rule = knowledge_config.process_rule
                if process_rule:
                    if process_rule.mode in ("custom", "hierarchical"):
                        if process_rule.rules:
                            dataset_process_rule = DatasetProcessRule(
                                dataset_id=dataset.id,
                                mode=process_rule.mode,
                                rules=process_rule.rules.model_dump_json() if process_rule.rules else None,
                                created_by=account.id,
                            )
                        else:
                            dataset_process_rule = dataset.latest_process_rule
                            if not dataset_process_rule:
                                raise ValueError("No process rule found.")
                    elif process_rule.mode == "automatic":
                        dataset_process_rule = DatasetProcessRule(
                            dataset_id=dataset.id,
                            mode=process_rule.mode,
                            rules=json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
                            created_by=account.id,
                        )
                    else:
                        logging.warning(
                            f"Invalid process rule mode: {process_rule.mode}, can not find dataset process rule"
                        )
                        return
                    db.session.add(dataset_process_rule)
                    db.session.commit()
            lock_name = "add_document_lock_dataset_id_{}".format(dataset.id)
            with redis_client.lock(lock_name, timeout=600):
                position = DocumentService.get_documents_position(dataset.id)
                document_ids = []
                duplicate_document_ids = []
                if knowledge_config.data_source.info_list.data_source_type == "upload_file":  # type: ignore
                    upload_file_list = knowledge_config.data_source.info_list.file_info_list.file_ids  # type: ignore
                    for file_id in upload_file_list:
                        file = (
                            db.session.query(UploadFile)
                            .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                            .first()
                        )
-                        # raise error if file not found
+        return documents, document_ids, duplicate_document_ids
                        if not file:
                            raise FileNotExistsError()
-                        file_name = file.name
+    @staticmethod
-                        data_source_info = {
+    def _find_duplicate_document(dataset: Dataset, file_name: str) -> Optional[Document]:
-                            "upload_file_id": file_id,
+        """
-                        }
+        Find existing document with the same name in the dataset.
-                        # check duplicate
+
-                        if knowledge_config.duplicate:
+        Args:
-                            document = (
+            dataset: Target dataset
-                                db.session.query(Document)
+            file_name: Name of the file to check for duplicates
-                                .filter_by(
+
-                                    dataset_id=dataset.id,
+        Returns:
-                                    tenant_id=current_user.current_tenant_id,
+            Document: Existing document if found, None otherwise
-                                    data_source_type="upload_file",
+        """
-                                    enabled=True,
+        return (
-                                    name=file_name,
+            db.session.query(Document)
-                                )
+            .filter_by(
-                                .first()
+                dataset_id=dataset.id,
-                            )
+                tenant_id=current_user.current_tenant_id,
-                            if document:
+                data_source_type="upload_file",
-                                document.dataset_process_rule_id = dataset_process_rule.id  # type: ignore
+                enabled=True,
-                                document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
+                name=file_name,
-                                document.created_from = created_from
+            )
-                                document.doc_form = knowledge_config.doc_form
+            .first()
-                                document.doc_language = knowledge_config.doc_language
+        )
-                                document.data_source_info = json.dumps(data_source_info)
+
-                                document.batch = batch
+    @staticmethod
-                                document.indexing_status = "waiting"
+    def _update_duplicate_document(
-                                db.session.add(document)
+        document: Document,
-                                documents.append(document)
+        dataset_process_rule: Optional[DatasetProcessRule],
-                                duplicate_document_ids.append(document.id)
+        knowledge_config: KnowledgeConfig,
-                                continue
+        created_from: str,
-                        document = DocumentService.build_document(
+        data_source_info: dict,
-                            dataset,
+        batch: str,
-                            dataset_process_rule.id,  # type: ignore
+    ) -> Document:
-                            knowledge_config.data_source.info_list.data_source_type,  # type: ignore
+        """
-                            knowledge_config.doc_form,
+        Update existing document with new configuration and mark for re-indexing.
-                            knowledge_config.doc_language,
+
-                            data_source_info,
+        Args:
-                            created_from,
+            document: Document to be updated
-                            position,
+            dataset_process_rule: Process rule to apply
-                            account,
+            knowledge_config: New configuration
-                            file_name,
+            created_from: Source identifier
-                            batch,
+            data_source_info: Data source information
-                        )
+            batch: Batch identifier
-                        db.session.add(document)
+
-                        db.session.flush()
+        Returns:
-                        document_ids.append(document.id)
+            Document: Updated document
-                        documents.append(document)
+        """
-                        position += 1
+        document.dataset_process_rule_id = dataset_process_rule.id  # type: ignore
-                elif knowledge_config.data_source.info_list.data_source_type == "notion_import":  # type: ignore
+        document.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)
-                    notion_info_list = knowledge_config.data_source.info_list.notion_info_list  # type: ignore
+        document.created_from = created_from
-                    if not notion_info_list:
+        document.doc_form = knowledge_config.doc_form
-                        raise ValueError("No notion info list found.")
+        document.doc_language = knowledge_config.doc_language
-                    exist_page_ids = []
+        document.data_source_info = json.dumps(data_source_info)
-                    exist_document = {}
+        document.batch = batch
-                    documents_from_db = (
+        document.indexing_status = "waiting"
-                        db.session.query(Document)
+        db.session.add(document)
-                        .filter_by(
+        return document
-                            dataset_id=dataset.id,
+
-                            tenant_id=current_user.current_tenant_id,
+    @staticmethod
-                            data_source_type="notion_import",
+    def _process_notion_documents(
-                            enabled=True,
+        dataset: Dataset,
-                        )
+        knowledge_config: KnowledgeConfig,
-                        .all()
+        account: Account,
        dataset_process_rule: Optional[DatasetProcessRule],
        created_from: str,
        batch: str,
        position: int,
    ) -> tuple[list[Document], list[str]]:
        """
        Process Notion import documents with workspace validation and page deduplication.
        Args:
            dataset: Target dataset
            knowledge_config: Document configuration
            account: User account
            dataset_process_rule: Process rule
            created_from: Source identifier
            batch: Batch identifier
            position: Starting position for document ordering
        Returns:
            tuple: (list of documents, list of document IDs)
        Raises:
            ValueError: When no notion info list is found or data source binding is missing
        """
        documents: list[Document] = []
        document_ids: list[str] = []
        current_position = position
        notion_info_list = knowledge_config.data_source.info_list.notion_info_list  # type: ignore
        if not notion_info_list:
            raise ValueError("No notion info list found.")
        # Get existing Notion documents for deduplication
        exist_page_ids, exist_document = DocumentService._get_existing_notion_documents(dataset)
        for notion_info in notion_info_list:
            workspace_id = notion_info.workspace_id
            # Validate data source binding
            data_source_binding = DocumentService._validate_notion_binding(workspace_id)
            for page in notion_info.pages:
                if page.page_id not in exist_page_ids:
                    # Create new Notion document
                    data_source_info = {
                        "notion_workspace_id": workspace_id,
                        "notion_page_id": page.page_id,
                        "notion_page_icon": page.page_icon.model_dump() if page.page_icon else None,
                        "type": page.type,
                    }
                    # Truncate page name to prevent DB field length errors
                    truncated_page_name = page.page_name[:255] if page.page_name else "nopagename"
                    document = DocumentService.build_document(
                        dataset,
                        dataset_process_rule.id,  # type: ignore
                        knowledge_config.data_source.info_list.data_source_type,  # type: ignore
                        knowledge_config.doc_form,
                        knowledge_config.doc_language,
                        data_source_info,
                        created_from,
                        current_position,
                        account,
                        truncated_page_name,
                        batch,
                    )
-                    if documents_from_db:
+                    db.session.add(document)
-                        for document in documents_from_db:
+                    db.session.flush()
-                            data_source_info = json.loads(document.data_source_info)
+                    document_ids.append(document.id)
-                            exist_page_ids.append(data_source_info["notion_page_id"])
+                    documents.append(document)
-                            exist_document[data_source_info["notion_page_id"]] = document.id
+                    current_position += 1
-                    for notion_info in notion_info_list:
+                else:
-                        workspace_id = notion_info.workspace_id
+                    exist_document.pop(page.page_id)
                        data_source_binding = (
                            db.session.query(DataSourceOauthBinding)
                            .filter(
                                db.and_(
                                    DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
                                    DataSourceOauthBinding.provider == "notion",
                                    DataSourceOauthBinding.disabled == False,
                                    DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
                                )
                            )
                            .first()
                        )
                        if not data_source_binding:
                            raise ValueError("Data source binding not found.")
                        for page in notion_info.pages:
                            if page.page_id not in exist_page_ids:
                                data_source_info = {
                                    "notion_workspace_id": workspace_id,
                                    "notion_page_id": page.page_id,
                                    "notion_page_icon": page.page_icon.model_dump() if page.page_icon else None,
                                    "type": page.type,
                                }
                                # Truncate page name to 255 characters to prevent DB field length errors
                                truncated_page_name = page.page_name[:255] if page.page_name else "nopagename"
                                document = DocumentService.build_document(
                                    dataset,
                                    dataset_process_rule.id,  # type: ignore
                                    knowledge_config.data_source.info_list.data_source_type,  # type: ignore
                                    knowledge_config.doc_form,
                                    knowledge_config.doc_language,
                                    data_source_info,
                                    created_from,
                                    position,
                                    account,
                                    truncated_page_name,
                                    batch,
                                )
                                db.session.add(document)
                                db.session.flush()
                                document_ids.append(document.id)
                                documents.append(document)
                                position += 1
                            else:
                                exist_document.pop(page.page_id)
                    # delete not selected documents
                    if len(exist_document) > 0:
                        clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
                elif knowledge_config.data_source.info_list.data_source_type == "website_crawl":  # type: ignore
                    website_info = knowledge_config.data_source.info_list.website_info_list  # type: ignore
                    if not website_info:
                        raise ValueError("No website info list found.")
                    urls = website_info.urls
                    for url in urls:
                        data_source_info = {
                            "url": url,
                            "provider": website_info.provider,
                            "job_id": website_info.job_id,
                            "only_main_content": website_info.only_main_content,
                            "mode": "crawl",
                        }
                        if len(url) > 255:
                            document_name = url[:200] + "..."
                        else:
                            document_name = url
                        document = DocumentService.build_document(
                            dataset,
                            dataset_process_rule.id,  # type: ignore
                            knowledge_config.data_source.info_list.data_source_type,  # type: ignore
                            knowledge_config.doc_form,
                            knowledge_config.doc_language,
                            data_source_info,
                            created_from,
                            position,
                            account,
                            document_name,
                            batch,
                        )
                        db.session.add(document)
                        db.session.flush()
                        document_ids.append(document.id)
                        documents.append(document)
                        position += 1
                db.session.commit()
-                # trigger async task
+        # Clean up unselected documents
-                if document_ids:
+        if len(exist_document) > 0:
-                    document_indexing_task.delay(dataset.id, document_ids)
+            clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
                if duplicate_document_ids:
                    duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)
-        return documents, batch
+        return documents, document_ids
    @staticmethod
    def _get_existing_notion_documents(dataset: Dataset) -> tuple[list[str], dict[str, str]]:
        """
        Retrieve existing Notion documents for deduplication purposes.
        Args:
            dataset: Target dataset
        Returns:
            tuple: (list of existing page IDs, dict mapping page IDs to document IDs)
        """
        exist_page_ids: list[str] = []
        exist_document: dict[str, str] = {}
        documents = (
            db.session.query(Document)
            .filter_by(
                dataset_id=dataset.id,
                tenant_id=current_user.current_tenant_id,
                data_source_type="notion_import",
                enabled=True,
            )
            .all()
        )
        if documents:
            for document in documents:
                data_source_info = json.loads(document.data_source_info)
                exist_page_ids.append(data_source_info["notion_page_id"])
                exist_document[data_source_info["notion_page_id"]] = document.id
        return exist_page_ids, exist_document
    @staticmethod
    def _validate_notion_binding(workspace_id: str) -> DataSourceOauthBinding:
        """
        Validate Notion data source binding for the specified workspace.
        Args:
            workspace_id: Notion workspace identifier
        Returns:
            DataSourceOauthBinding: Valid binding for the workspace
        Raises:
            ValueError: When data source binding is not found
        """
        data_source_binding = (
            db.session.query(DataSourceOauthBinding)
            .filter(
                db.and_(
                    DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
                    DataSourceOauthBinding.provider == "notion",
                    DataSourceOauthBinding.disabled == False,
                    DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
                )
            )
            .first()
        )
        if not data_source_binding:
            raise ValueError("Data source binding not found.")
        return data_source_binding
    @staticmethod
    def _process_website_documents(
        dataset: Dataset,
        knowledge_config: KnowledgeConfig,
        account: Account,
        dataset_process_rule: Optional[DatasetProcessRule],
        created_from: str,
        batch: str,
        position: int,
    ) -> tuple[list[Document], list[str]]:
        """
        Process website crawl documents with URL validation and naming.
        Args:
            dataset: Target dataset
            knowledge_config: Document configuration
            account: User account
            dataset_process_rule: Process rule
            created_from: Source identifier
            batch: Batch identifier
            position: Starting position for document ordering
        Returns:
            tuple: (list of documents, list of document IDs)
        Raises:
            ValueError: When no website info list is found
        """
        documents: list[Document] = []
        document_ids: list[str] = []
        current_position = position
        website_info = knowledge_config.data_source.info_list.website_info_list  # type: ignore
        if not website_info:
            raise ValueError("No website info list found.")
        urls = website_info.urls
        for url in urls:
            data_source_info = {
                "url": url,
                "provider": website_info.provider,
                "job_id": website_info.job_id,
                "only_main_content": website_info.only_main_content,
                "mode": "crawl",
            }
            # Truncate URL for document naming if too long
            document_name = url[:200] + "..." if len(url) > 255 else url
            document = DocumentService.build_document(
                dataset,
                dataset_process_rule.id,  # type: ignore
                knowledge_config.data_source.info_list.data_source_type,  # type: ignore
                knowledge_config.doc_form,
                knowledge_config.doc_language,
                data_source_info,
                created_from,
                current_position,
                account,
                document_name,
                batch,
            )
            db.session.add(document)
            db.session.flush()
            document_ids.append(document.id)
            documents.append(document)
            current_position += 1
        return documents, document_ids
    @staticmethod
    def check_documents_upload_quota(count: int, features: FeatureModel):