From dc3ba65d7abe3a661ab3cac58d30e583e1ff89ee Mon Sep 17 00:00:00 2001 From: "liuchangsheng@wisdomidata.com" Date: Wed, 11 Jun 2025 23:07:29 +0800 Subject: [PATCH] =?UTF-8?q?=E3=80=90Dify=E3=80=91=20=E4=B8=8A=E4=BC=A0?= =?UTF-8?q?=E6=96=87=E6=A1=A3=E5=8A=9F=E8=83=BD=EF=BC=8C=E9=BB=98=E8=AE=A4?= =?UTF-8?q?=E4=BD=BF=E7=94=A8=E5=B7=B2=E7=BB=8F=E8=AE=BE=E7=BD=AE=E7=9A=84?= =?UTF-8?q?=E9=85=8D=E7=BD=AE=EF=BC=8C=E5=90=A6=E5=88=99=E5=8F=96=E5=AF=B9?= =?UTF-8?q?=E5=BA=94=E9=BB=98=E8=AE=A4=E7=9A=84=E9=85=8D=E7=BD=AE?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/configs/ext/full_text_dataset_config.yml | 42 +++++++++++++++++++ api/configs/ext_config.py | 5 ++- .../service_api/dataset/document.py | 12 +++--- api/services/ext/dataset_ext_service.py | 25 ++++++++++- 4 files changed, 76 insertions(+), 8 deletions(-) create mode 100644 api/configs/ext/full_text_dataset_config.yml diff --git a/api/configs/ext/full_text_dataset_config.yml b/api/configs/ext/full_text_dataset_config.yml new file mode 100644 index 0000000000..6598dd700c --- /dev/null +++ b/api/configs/ext/full_text_dataset_config.yml @@ -0,0 +1,42 @@ +#data_source: +# type: upload_file +# info_list: +# data_source_type: upload_file +# file_info_list: +# file_ids: +# - 17d74bd8-2a34-4065-a56d-e2b30497cda3 +indexing_technique: high_quality +process_rule: + rules: + pre_processing_rules: + - id: remove_extra_spaces + enabled: true + - id: remove_urls_emails + enabled: false + segmentation: + separator: "&&&&&" + max_tokens: 500 + chunk_overlap: 50 + mode: custom +doc_form: text_model +doc_language: Chinese Simplified +retrieval_model: + search_method: hybrid_search + reranking_enable: true + reranking_mode: weighted_score + reranking_model: + reranking_provider_name: langgenius/tongyi/tongyi + reranking_model_name: gte-rerank-v2 + weights: + weight_type: customized + keyword_setting: + keyword_weight: 0.7 + vector_setting: + vector_weight: 0.3 + embedding_model_name: text-embedding-v3 + embedding_provider_name: langgenius/tongyi/tongyi + top_k: 10 + score_threshold_enabled: false + score_threshold: 0 +embedding_model: text-embedding-v3 +embedding_model_provider: langgenius/tongyi/tongyi diff --git a/api/configs/ext_config.py b/api/configs/ext_config.py index e803d8bbeb..f4d9ba1e78 100644 --- a/api/configs/ext_config.py +++ b/api/configs/ext_config.py @@ -4,6 +4,9 @@ from pathlib import Path def get_init_knowledge_config(config:dict) -> dict : return get_ext_config(file_name="dataset_config.yml", config=config) +def get_init_full_text_knowledge_config(config:dict) -> dict : + return get_ext_config(file_name="full_text_dataset_config.yml", config=config) + def get_ext_config(file_name:str, config:dict = None,params : dict = None) -> dict : # 获取当前脚本所在的目录 current_dir = Path(__file__).resolve().parent @@ -35,4 +38,4 @@ def replace_placeholders(data, params:dict = None) -> dict: return data # 其他类型直接返回 - return data \ No newline at end of file + return data diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py index 97e43354f2..1a2d46140e 100644 --- a/api/controllers/service_api/dataset/document.py +++ b/api/controllers/service_api/dataset/document.py @@ -4,7 +4,7 @@ from flask import request from flask_restful import marshal, reqparse from sqlalchemy import desc, select from werkzeug.exceptions import NotFound - +from services.dataset_service import DatasetPermissionService, DatasetService, DocumentService import services from controllers.common.errors import FilenameNotExistsError from controllers.service_api import api @@ -27,6 +27,7 @@ from libs.login import current_user from models.dataset import Dataset, Document, DocumentSegment from services.dataset_service import DocumentService from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig +from services.ext.dataset_ext_service import DatasetExtService from services.file_service import FileService from configs.ext_config import get_init_knowledge_config @@ -180,8 +181,8 @@ class DocumentAddByFileApi(DatasetApiResource): raise ValueError("Dataset does not exist.") indexing_technique = args.get("indexing_technique") or dataset.indexing_technique - if not indexing_technique: - raise ValueError("indexing_technique is required.") + # if not indexing_technique: + # raise ValueError("indexing_technique is required.") args["indexing_technique"] = indexing_technique # save file info @@ -210,10 +211,9 @@ class DocumentAddByFileApi(DatasetApiResource): } args["data_source"] = data_source - # 取默认的值 - args = get_init_knowledge_config(args) + config_args = DatasetExtService.get_datasets_config(dataset_id=dataset_id,tenant_id=tenant_id,default_config=args) # validate args - knowledge_config = KnowledgeConfig(**args) + knowledge_config = KnowledgeConfig(**config_args) DocumentService.document_create_args_validate(knowledge_config) dataset_process_rule = dataset.latest_process_rule if "process_rule" not in args else None diff --git a/api/services/ext/dataset_ext_service.py b/api/services/ext/dataset_ext_service.py index 19b6ee56d8..a9ed37651c 100644 --- a/api/services/ext/dataset_ext_service.py +++ b/api/services/ext/dataset_ext_service.py @@ -16,7 +16,7 @@ from controllers.console.app.error import ( ) from extensions.ext_database import db from services.dataset_service import DatasetService, DocumentService -from configs.ext_config import get_init_knowledge_config +from configs.ext_config import get_init_knowledge_config,get_init_full_text_knowledge_config from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig class DatasetExtService: @@ -30,6 +30,29 @@ class DatasetExtService: .all()) return datasets + @staticmethod + def get_datasets_config(dataset_id=None,tenant_id=None, default_config:dict=None) -> dict: + # 取默认的值 + dataset_ids = [dataset_id] + datasets, total = DatasetService.get_datasets_by_ids(ids=dataset_ids, tenant_id=tenant_id) + args = {} + if total > 0: + dataset_dict=datasets[0].__dict__ + if 'FULL_TEXT' in dataset_dict["name"]: + args = get_init_full_text_knowledge_config({}) + else: + args = get_init_knowledge_config({}) + keys_to_override = ['indexing_technique', 'process_rule', 'doc_form' + ,'doc_language','retrieval_model','embedding_model','embedding_model_provider'] + + args.update({k: dataset_dict[k] for k in keys_to_override if k in dataset_dict and dataset_dict[k] is not None}) + + if default_config is not None: + args={**args,**default_config} + + # validate args + return args + @staticmethod def init_dataset(tenant:Tenant=None, target_tenant_id:str=None,target_tenant_name:str=None, account:Account=None) -> list[Dataset]: