【Dify】 上传文档功能,默认使用已经设置的配置,否则取对应默认的配置

pull/22121/head
liuchangsheng@wisdomidata.com 12 months ago
parent fdbc1d89ed
commit dc3ba65d7a

@ -0,0 +1,42 @@
# Default knowledge (dataset) configuration template.
# NOTE(review): presumably this is the file loaded by name through
# get_ext_config (e.g. "full_text_dataset_config.yml") - confirm the path.
# NOTE(review): the nesting below was restored from a flattened listing to
# match the document-creation payload shape; verify against the real file.

# Example of the data_source section (kept disabled):
#data_source:
#  type: upload_file
#  info_list:
#    data_source_type: upload_file
#    file_info_list:
#      file_ids:
#        - 17d74bd8-2a34-4065-a56d-e2b30497cda3
indexing_technique: high_quality
process_rule:
  rules:
    pre_processing_rules:
      - id: remove_extra_spaces
        enabled: true
      - id: remove_urls_emails
        enabled: false
    segmentation:
      separator: "&&&&&"
      max_tokens: 500
      chunk_overlap: 50
  mode: custom
doc_form: text_model
doc_language: Chinese Simplified
retrieval_model:
  search_method: hybrid_search
  reranking_enable: true
  reranking_mode: weighted_score
  reranking_model:
    reranking_provider_name: langgenius/tongyi/tongyi
    reranking_model_name: gte-rerank-v2
  weights:
    weight_type: customized
    keyword_setting:
      keyword_weight: 0.7
    vector_setting:
      vector_weight: 0.3
      embedding_model_name: text-embedding-v3
      embedding_provider_name: langgenius/tongyi/tongyi
  top_k: 10
  score_threshold_enabled: false
  score_threshold: 0
embedding_model: text-embedding-v3
embedding_model_provider: langgenius/tongyi/tongyi

@ -4,6 +4,9 @@ from pathlib import Path
def get_init_knowledge_config(config: dict) -> dict:
    """Return the knowledge configuration based on ``dataset_config.yml``.

    Thin wrapper: forwards ``config`` unchanged to ``get_ext_config`` with
    the file name fixed to ``dataset_config.yml`` and returns its result.
    """
    knowledge_config = get_ext_config(file_name="dataset_config.yml", config=config)
    return knowledge_config
def get_init_full_text_knowledge_config(config: dict) -> dict:
    """Return the knowledge configuration based on ``full_text_dataset_config.yml``.

    Thin wrapper: forwards ``config`` unchanged to ``get_ext_config`` with
    the file name fixed to ``full_text_dataset_config.yml`` and returns its
    result.
    """
    full_text_config = get_ext_config(file_name="full_text_dataset_config.yml", config=config)
    return full_text_config
def get_ext_config(file_name:str, config:dict = None,params : dict = None) -> dict :
# 获取当前脚本所在的目录
current_dir = Path(__file__).resolve().parent

@ -4,7 +4,7 @@ from flask import request
from flask_restful import marshal, reqparse
from sqlalchemy import desc, select
from werkzeug.exceptions import NotFound
from services.dataset_service import DatasetPermissionService, DatasetService, DocumentService
import services
from controllers.common.errors import FilenameNotExistsError
from controllers.service_api import api
@ -27,6 +27,7 @@ from libs.login import current_user
from models.dataset import Dataset, Document, DocumentSegment
from services.dataset_service import DocumentService
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig
from services.ext.dataset_ext_service import DatasetExtService
from services.file_service import FileService
from configs.ext_config import get_init_knowledge_config
@ -180,8 +181,8 @@ class DocumentAddByFileApi(DatasetApiResource):
raise ValueError("Dataset does not exist.")
indexing_technique = args.get("indexing_technique") or dataset.indexing_technique
if not indexing_technique:
raise ValueError("indexing_technique is required.")
# if not indexing_technique:
# raise ValueError("indexing_technique is required.")
args["indexing_technique"] = indexing_technique
# save file info
@ -210,10 +211,9 @@ class DocumentAddByFileApi(DatasetApiResource):
}
args["data_source"] = data_source
# 取默认的值
args = get_init_knowledge_config(args)
config_args = DatasetExtService.get_datasets_config(dataset_id=dataset_id,tenant_id=tenant_id,default_config=args)
# validate args
knowledge_config = KnowledgeConfig(**args)
knowledge_config = KnowledgeConfig(**config_args)
DocumentService.document_create_args_validate(knowledge_config)
dataset_process_rule = dataset.latest_process_rule if "process_rule" not in args else None

@ -16,7 +16,7 @@ from controllers.console.app.error import (
)
from extensions.ext_database import db
from services.dataset_service import DatasetService, DocumentService
from configs.ext_config import get_init_knowledge_config
from configs.ext_config import get_init_knowledge_config,get_init_full_text_knowledge_config
from services.entities.knowledge_entities.knowledge_entities import KnowledgeConfig
class DatasetExtService:
@ -30,6 +30,29 @@ class DatasetExtService:
.all())
return datasets
@staticmethod
def get_datasets_config(dataset_id=None, tenant_id=None, default_config: dict = None) -> dict:
    """Build the knowledge configuration for adding a document to a dataset.

    The result is layered, from lowest to highest precedence:

    1. The template from ``get_init_full_text_knowledge_config`` when the
       dataset name contains ``FULL_TEXT``, otherwise the template from
       ``get_init_knowledge_config``.
    2. Every non-None setting found on the dataset itself for the keys
       listed in ``keys_to_override``.
    3. The entries of ``default_config``. Despite the parameter name, these
       are merged last and therefore win over the two layers above.

    When the dataset lookup returns nothing, only ``default_config`` (or an
    empty dict) is returned.

    Args:
        dataset_id: Id of the dataset to look up.
        tenant_id: Tenant the dataset must belong to.
        default_config: Caller-supplied settings merged on top; may be None.

    Returns:
        The merged configuration dict.
    """
    datasets, total = DatasetService.get_datasets_by_ids(ids=[dataset_id], tenant_id=tenant_id)
    args = {}
    if total > 0:
        dataset = datasets[0]
        # Use normal attribute access instead of the instance's raw
        # ``__dict__``: the raw dict omits attributes that are not currently
        # loaded as well as property-backed ones, and indexing it with
        # "name" raises KeyError when that attribute is missing.
        dataset_name = getattr(dataset, "name", None) or ""
        if 'FULL_TEXT' in dataset_name:
            args = get_init_full_text_knowledge_config({})
        else:
            args = get_init_knowledge_config({})
        keys_to_override = (
            'indexing_technique',
            'process_rule',
            'doc_form',
            'doc_language',
            'retrieval_model',
            'embedding_model',
            'embedding_model_provider',
        )
        for key in keys_to_override:
            # Keys the dataset does not expose resolve to None and are skipped,
            # so the template value stays in place.
            value = getattr(dataset, key, None)
            if value is not None:
                args[key] = value
    if default_config is not None:
        # Caller-provided values take precedence over template and dataset.
        args = {**args, **default_config}
    return args
@staticmethod
def init_dataset(tenant:Tenant=None, target_tenant_id:str=None,target_tenant_name:str=None, account:Account=None) -> list[Dataset]:

Loading…
Cancel
Save