Compare commits
14 Commits
main...feat/exter
| Author | SHA1 | Date |
|---|---|---|
| | 37f7d5732a | 1 year ago |
| | dcb033d221 | 1 year ago |
| | 9f894bb3b3 | 1 year ago |
| | 89e81873c4 | 1 year ago |
| | 9ca0e56a8a | 1 year ago |
| | e7c77d961b | 1 year ago |
| | a63e15081f | 1 year ago |
| | 0724640bbb | 1 year ago |
| | cb70e12827 | 1 year ago |
| | 067b956b2c | 1 year ago |
| | e7762b731c | 1 year ago |
| | f6c8390b0b | 1 year ago |
| | 4fd57929df | 1 year ago |
| | 517cdb2ca4 | 1 year ago |
@@ -0,0 +1,254 @@
from flask import request
from flask_login import current_user
from flask_restful import Resource, marshal, reqparse
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import ProviderNotInitializeError
from controllers.console.datasets.error import DatasetNameDuplicateError
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required
from fields.dataset_fields import dataset_detail_fields
from libs.login import login_required
from services.external_knowledge_service import ExternalDatasetService


def _validate_name(name):
    if not name or len(name) < 1 or len(name) > 100:
        raise ValueError("Name must be between 1 and 100 characters.")
    return name


def _validate_description_length(description):
    if len(description) > 400:
        raise ValueError("Description cannot exceed 400 characters.")
    return description


class ExternalApiTemplateListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        page = request.args.get("page", default=1, type=int)
        limit = request.args.get("limit", default=20, type=int)
        search = request.args.get("keyword", default=None, type=str)

        api_templates, total = ExternalDatasetService.get_external_api_templates(
            page, limit, current_user.current_tenant_id, search
        )
        response = {
            "data": [item.to_dict() for item in api_templates],
            "has_more": len(api_templates) == limit,
            "limit": limit,
            "total": total,
            "page": page,
        }
        return response, 200

    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="Name is required. Name must be between 1 and 100 characters.",
            type=_validate_name,
        )
        parser.add_argument(
            "description",
            nullable=False,
            required=True,
            help="Description is required. Description must be between 1 and 400 characters.",
            type=_validate_description_length,
        )
        parser.add_argument(
            "settings",
            type=dict,
            location="json",
            nullable=False,
            required=True,
        )
        args = parser.parse_args()

        ExternalDatasetService.validate_api_list(args["settings"])

        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            api_template = ExternalDatasetService.create_api_template(
                tenant_id=current_user.current_tenant_id, user_id=current_user.id, args=args
            )
        except services.errors.dataset.DatasetNameDuplicateError:
            raise DatasetNameDuplicateError()

        return api_template.to_dict(), 201


class ExternalApiTemplateApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, api_template_id):
        api_template_id = str(api_template_id)
        api_template = ExternalDatasetService.get_api_template(api_template_id)
        if api_template is None:
            raise NotFound("API template not found.")

        return api_template.to_dict(), 200

    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, api_template_id):
        api_template_id = str(api_template_id)

        parser = reqparse.RequestParser()
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="Name is required. Name must be between 1 and 100 characters.",
            type=_validate_name,
        )
        parser.add_argument(
            "description",
            nullable=False,
            required=True,
            help="Description is required. Description must be between 1 and 400 characters.",
            type=_validate_description_length,
        )
        parser.add_argument(
            "settings",
            type=dict,
            location="json",
            nullable=False,
            required=True,
        )
        args = parser.parse_args()
        ExternalDatasetService.validate_api_list(args["settings"])

        api_template = ExternalDatasetService.update_api_template(
            tenant_id=current_user.current_tenant_id,
            user_id=current_user.id,
            api_template_id=api_template_id,
            args=args,
        )

        return api_template.to_dict(), 200

    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, api_template_id):
        api_template_id = str(api_template_id)

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor or current_user.is_dataset_operator:
            raise Forbidden()

        ExternalDatasetService.delete_api_template(current_user.current_tenant_id, api_template_id)
        return {"result": "success"}, 204


class ExternalApiUseCheckApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, api_template_id):
        api_template_id = str(api_template_id)

        external_api_template_is_using = ExternalDatasetService.external_api_template_use_check(api_template_id)
        return {"is_using": external_api_template_is_using}, 200


class ExternalDatasetInitApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument("api_template_id", type=str, required=True, nullable=True, location="json")
        # parser.add_argument('name', nullable=False, required=True,
        #                     help='Name is required. Name must be between 1 and 100 characters.',
        #                     type=_validate_name)
        # parser.add_argument('description', type=str, required=True, nullable=True, location='json')
        parser.add_argument("data_source", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("process_parameter", type=dict, required=True, nullable=True, location="json")

        args = parser.parse_args()

        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        # validate args
        ExternalDatasetService.document_create_args_validate(
            current_user.current_tenant_id, args["api_template_id"], args["process_parameter"]
        )

        try:
            dataset, documents, batch = ExternalDatasetService.init_external_dataset(
                tenant_id=current_user.current_tenant_id,
                user_id=current_user.id,
                args=args,
            )
        except Exception as ex:
            # a plain Exception has no .description attribute; pass its message instead
            raise ProviderNotInitializeError(str(ex))
        response = {"dataset": dataset, "documents": documents, "batch": batch}

        return response


class ExternalDatasetCreateApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument("external_api_template_id", type=str, required=True, nullable=False, location="json")
        parser.add_argument("external_knowledge_id", type=str, required=True, nullable=False, location="json")
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="Name is required. Name must be between 1 and 100 characters.",
            type=_validate_name,
        )
        parser.add_argument("description", type=str, required=True, nullable=True, location="json")

        args = parser.parse_args()

        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            dataset = ExternalDatasetService.create_external_dataset(
                tenant_id=current_user.current_tenant_id,
                user_id=current_user.id,
                args=args,
            )
        except services.errors.dataset.DatasetNameDuplicateError:
            raise DatasetNameDuplicateError()

        return marshal(dataset, dataset_detail_fields), 201


api.add_resource(ExternalApiTemplateListApi, "/datasets/external-api-template")
api.add_resource(ExternalApiTemplateApi, "/datasets/external-api-template/<uuid:api_template_id>")
api.add_resource(ExternalApiUseCheckApi, "/datasets/external-api-template/<uuid:api_template_id>/use-check")
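For reference, a minimal sketch of the JSON body the create/update endpoints above expect. The values are illustrative; only the keys follow from the argument parsers and `validate_api_list`:

```python
# Hypothetical payload for POST /datasets/external-api-template.
payload = {
    "name": "my-external-kb",                  # 1-100 characters
    "description": "External knowledge base",  # at most 400 characters
    "settings": {
        "endpoint": "https://kb.example.com",  # checked by validate_api_list
        "api_key": "your-api-key",             # checked by validate_api_list
    },
}
```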
@@ -0,0 +1,49 @@
from flask_restful import Resource, reqparse

from controllers.console import api
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required
from libs.login import login_required
from services.external_knowledge_service import ExternalDatasetService


class TestExternalApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument(
            "top_k",
            nullable=False,
            required=True,
            type=int,
        )
        parser.add_argument(
            "score_threshold",
            nullable=False,
            required=True,
            type=float,
        )
        # assumption: the external endpoint settings have to come from the request,
        # since the original code referenced an undefined `settings` in the service layer
        parser.add_argument(
            "settings",
            type=dict,
            location="json",
            nullable=False,
            required=True,
        )
        args = parser.parse_args()
        result = ExternalDatasetService.test_external_knowledge_retrival(
            args["settings"], args["top_k"], args["score_threshold"]
        )
        # the original response block referenced undefined names (api_templates, limit,
        # total, page); return the retrieval result directly instead
        return result, 200


api.add_resource(TestExternalApi, "/dify/external-knowledge/retrival-documents")
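A hedged sketch of exercising this endpoint with httpx. The host, URL prefix, authentication, and the `settings` argument added above are assumptions; only the path and the `top_k`/`score_threshold` keys come from the resource itself:

```python
import httpx

# Assumes a locally running console API and an already-authenticated session
# (the resource is guarded by @login_required).
resp = httpx.post(
    "http://localhost:5001/console/api/dify/external-knowledge/retrival-documents",
    json={
        "top_k": 5,
        "score_threshold": 0.5,
        "settings": {"endpoint": "https://kb.example.com", "api_key": "your-api-key"},
    },
)
print(resp.json())
```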
@@ -0,0 +1,11 @@
from flask_restful import fields

from libs.helper import TimestampField

api_template_query_detail_fields = {
    "id": fields.String,
    "name": fields.String,
    "setting": fields.String,
    "created_by": fields.String,
    "created_at": TimestampField,
}
@@ -0,0 +1,74 @@
"""external_knowledge

Revision ID: ec3df697ebbb
Revises: 675b5321501b
Create Date: 2024-09-18 06:59:54.048478

"""
from alembic import op
import models as models
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = 'ec3df697ebbb'
down_revision = '675b5321501b'
branch_labels = None
depends_on = None


def upgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('external_api_templates',
        sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
        sa.Column('name', sa.String(length=255), nullable=False),
        sa.Column('description', sa.String(length=255), nullable=False),
        sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
        sa.Column('settings', sa.Text(), nullable=True),
        sa.Column('created_by', models.types.StringUUID(), nullable=False),
        sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
        sa.Column('updated_by', models.types.StringUUID(), nullable=True),
        sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
        sa.PrimaryKeyConstraint('id', name='external_api_template_pkey')
    )
    with op.batch_alter_table('external_api_templates', schema=None) as batch_op:
        batch_op.create_index('external_api_templates_name_idx', ['name'], unique=False)
        batch_op.create_index('external_api_templates_tenant_idx', ['tenant_id'], unique=False)

    op.create_table('external_knowledge_bindings',
        sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
        sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
        sa.Column('external_api_template_id', models.types.StringUUID(), nullable=False),
        sa.Column('dataset_id', models.types.StringUUID(), nullable=False),
        sa.Column('external_knowledge_id', sa.Text(), nullable=False),
        sa.Column('created_by', models.types.StringUUID(), nullable=False),
        sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
        sa.Column('updated_by', models.types.StringUUID(), nullable=True),
        sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
        sa.PrimaryKeyConstraint('id', name='external_knowledge_bindings_pkey')
    )
    with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
        batch_op.create_index('external_knowledge_bindings_dataset_idx', ['dataset_id'], unique=False)
        batch_op.create_index('external_knowledge_bindings_external_api_template_idx', ['external_api_template_id'], unique=False)
        batch_op.create_index('external_knowledge_bindings_external_knowledge_idx', ['external_knowledge_id'], unique=False)
        batch_op.create_index('external_knowledge_bindings_tenant_idx', ['tenant_id'], unique=False)

    # ### end Alembic commands ###


def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
        batch_op.drop_index('external_knowledge_bindings_tenant_idx')
        batch_op.drop_index('external_knowledge_bindings_external_knowledge_idx')
        batch_op.drop_index('external_knowledge_bindings_external_api_template_idx')
        batch_op.drop_index('external_knowledge_bindings_dataset_idx')

    op.drop_table('external_knowledge_bindings')
    with op.batch_alter_table('external_api_templates', schema=None) as batch_op:
        batch_op.drop_index('external_api_templates_tenant_idx')
        batch_op.drop_index('external_api_templates_name_idx')

    op.drop_table('external_api_templates')
    # ### end Alembic commands ###
@@ -0,0 +1,92 @@
import datetime
import time

import click
from sqlalchemy import func
from werkzeug.exceptions import NotFound

import app
from configs import dify_config
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import Dataset, DatasetQuery, Document


@app.celery.task(queue="dataset")
def clean_unused_message_task():
    # despite the task name, this cleans datasets with no recent queries
    click.echo(click.style("Start clean unused datasets.", fg="green"))
    clean_days = int(dify_config.CLEAN_DAY_SETTING)
    start_at = time.perf_counter()
    thirty_days_ago = datetime.datetime.now() - datetime.timedelta(days=clean_days)
    page = 1
    while True:
        try:
            # Subquery for counting new documents
            document_subquery_new = (
                db.session.query(Document.dataset_id, func.count(Document.id).label("document_count"))
                .filter(
                    Document.indexing_status == "completed",
                    Document.enabled == True,
                    Document.archived == False,
                    Document.updated_at > thirty_days_ago,
                )
                .group_by(Document.dataset_id)
                .subquery()
            )

            # Subquery for counting old documents
            document_subquery_old = (
                db.session.query(Document.dataset_id, func.count(Document.id).label("document_count"))
                .filter(
                    Document.indexing_status == "completed",
                    Document.enabled == True,
                    Document.archived == False,
                    Document.updated_at < thirty_days_ago,
                )
                .group_by(Document.dataset_id)
                .subquery()
            )

            # Main query with join and filter
            datasets = (
                db.session.query(Dataset)
                .outerjoin(document_subquery_new, Dataset.id == document_subquery_new.c.dataset_id)
                .outerjoin(document_subquery_old, Dataset.id == document_subquery_old.c.dataset_id)
                .filter(
                    Dataset.created_at < thirty_days_ago,
                    func.coalesce(document_subquery_new.c.document_count, 0) == 0,
                    func.coalesce(document_subquery_old.c.document_count, 0) > 0,
                )
                .order_by(Dataset.created_at.desc())
                .paginate(page=page, per_page=50)
            )
        except NotFound:
            break
        if datasets.items is None or len(datasets.items) == 0:
            break
        page += 1
        for dataset in datasets:
            dataset_query = (
                db.session.query(DatasetQuery)
                .filter(DatasetQuery.created_at > thirty_days_ago, DatasetQuery.dataset_id == dataset.id)
                .all()
            )
            if not dataset_query or len(dataset_query) == 0:
                try:
                    # remove index
                    index_processor = IndexProcessorFactory(dataset.doc_form).init_index_processor()
                    index_processor.clean(dataset, None)

                    # update document
                    update_params = {Document.enabled: False}

                    Document.query.filter_by(dataset_id=dataset.id).update(update_params)
                    db.session.commit()
                    click.echo(click.style("Cleaned unused dataset {} from db success!".format(dataset.id), fg="green"))
                except Exception as e:
                    click.echo(
                        click.style("clean dataset index error: {} {}".format(e.__class__.__name__, str(e)), fg="red")
                    )
    end_at = time.perf_counter()
    click.echo(click.style("Cleaned unused dataset from db success latency: {}".format(end_at - start_at), fg="green"))
@@ -0,0 +1,28 @@
from typing import Literal, Optional, Union

from pydantic import BaseModel


class AuthorizationConfig(BaseModel):
    type: Literal[None, "basic", "bearer", "custom"]
    api_key: Union[None, str] = None
    header: Union[None, str] = None


class Authorization(BaseModel):
    type: Literal["no-auth", "api-key"]
    config: Optional[AuthorizationConfig] = None


class ProcessStatusSetting(BaseModel):
    request_method: str
    url: str


class ApiTemplateSetting(BaseModel):
    # method and api_token are optional here: ExternalDatasetService builds this model
    # from partial dicts (url / request_method / headers / params only)
    method: Optional[str] = None
    url: str
    request_method: str
    api_token: Optional[str] = None
    headers: Optional[dict] = None
    params: Optional[dict] = None
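A short sketch of how these models fit together; the field values are illustrative:

```python
# Illustrative only: an api-key authorization rendered as a bearer token, and a
# retrieval call description built from partial settings (method and api_token
# are optional on ApiTemplateSetting, see above).
auth = Authorization(
    type="api-key",
    config=AuthorizationConfig(type="bearer", api_key="your-api-key"),
)

setting = ApiTemplateSetting(
    url="https://kb.example.com/dify/external-knowledge/retrival-documents",
    request_method="post",
    params={"top_k": 5, "score_threshold": 0.5},
)
```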
@@ -0,0 +1,308 @@
import json
import random
import time
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Optional, Union

import httpx

from core.helper import ssrf_proxy
from extensions.ext_database import db
from models.dataset import (
    Dataset,
    Document,
    ExternalApiTemplates,
    ExternalKnowledgeBindings,
)
from models.model import UploadFile
from services.entities.external_knowledge_entities.external_knowledge_entities import ApiTemplateSetting, Authorization
from services.errors.dataset import DatasetNameDuplicateError

# from tasks.external_document_indexing_task import external_document_indexing_task


class ExternalDatasetService:
    @staticmethod
    def get_external_api_templates(page, per_page, tenant_id, search=None) -> tuple[list[ExternalApiTemplates], int]:
        query = ExternalApiTemplates.query.filter(ExternalApiTemplates.tenant_id == tenant_id).order_by(
            ExternalApiTemplates.created_at.desc()
        )
        if search:
            query = query.filter(ExternalApiTemplates.name.ilike(f"%{search}%"))

        api_templates = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)

        return api_templates.items, api_templates.total

    @classmethod
    def validate_api_list(cls, api_settings: dict):
        if not api_settings:
            raise ValueError("api list is empty")
        # `or`, not `and`: reject settings where the key is missing or the value is falsy
        if "endpoint" not in api_settings or not api_settings["endpoint"]:
            raise ValueError("endpoint is required")
        if "api_key" not in api_settings or not api_settings["api_key"]:
            raise ValueError("api_key is required")

    @staticmethod
    def create_api_template(tenant_id: str, user_id: str, args: dict) -> ExternalApiTemplates:
        api_template = ExternalApiTemplates(
            tenant_id=tenant_id,
            created_by=user_id,
            updated_by=user_id,
            name=args.get("name"),
            description=args.get("description", ""),
            settings=json.dumps(args.get("settings"), ensure_ascii=False),
        )

        db.session.add(api_template)
        db.session.commit()
        return api_template

    @staticmethod
    def get_api_template(api_template_id: str) -> ExternalApiTemplates:
        return ExternalApiTemplates.query.filter_by(id=api_template_id).first()

    @staticmethod
    def update_api_template(tenant_id, user_id, api_template_id, args) -> ExternalApiTemplates:
        api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
        if api_template is None:
            raise ValueError("api template not found")

        api_template.name = args.get("name")
        api_template.description = args.get("description", "")
        api_template.settings = json.dumps(args.get("settings"), ensure_ascii=False)
        api_template.updated_by = user_id
        api_template.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()

        return api_template

    @staticmethod
    def delete_api_template(tenant_id: str, api_template_id: str):
        api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
        if api_template is None:
            raise ValueError("api template not found")

        db.session.delete(api_template)
        db.session.commit()

    @staticmethod
    def external_api_template_use_check(api_template_id: str) -> bool:
        count = ExternalKnowledgeBindings.query.filter_by(external_api_template_id=api_template_id).count()
        return count > 0

    @staticmethod
    def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
        external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
            dataset_id=dataset_id, tenant_id=tenant_id
        ).first()
        if not external_knowledge_binding:
            raise ValueError("external knowledge binding not found")
        return external_knowledge_binding

    @staticmethod
    def document_create_args_validate(tenant_id: str, api_template_id: str, process_parameter: dict):
        api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
        if api_template is None:
            raise ValueError("api template not found")
        settings = json.loads(api_template.settings)
        for setting in settings:  # do not shadow the settings list while iterating
            custom_parameters = setting.get("document_process_setting")
            if custom_parameters:
                for parameter in custom_parameters:
                    if parameter.get("required", False) and not process_parameter.get(parameter.get("name")):
                        raise ValueError(f'{parameter.get("name")} is required')

    @staticmethod
    def init_external_dataset(tenant_id: str, user_id: str, args: dict, created_from: str = "web"):
        api_template_id = args.get("api_template_id")

        data_source = args.get("data_source")
        if data_source is None:
            raise ValueError("data source is required")

        process_parameter = args.get("process_parameter")
        api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
        if api_template is None:
            raise ValueError("api template not found")

        dataset = Dataset(
            tenant_id=tenant_id,
            name=args.get("name"),
            description=args.get("description", ""),
            provider="external",
            created_by=user_id,
        )

        db.session.add(dataset)
        db.session.flush()

        document = Document.query.filter_by(dataset_id=dataset.id).order_by(Document.position.desc()).first()

        position = document.position + 1 if document else 1

        batch = time.strftime("%Y%m%d%H%M%S") + str(random.randint(100000, 999999))
        document_ids = []
        if data_source["type"] == "upload_file":
            upload_file_list = data_source["info_list"]["file_info_list"]["file_ids"]
            for file_id in upload_file_list:
                file = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                    .first()
                )
                if file:
                    data_source_info = {
                        "upload_file_id": file_id,
                    }
                    document = Document(
                        tenant_id=dataset.tenant_id,
                        dataset_id=dataset.id,
                        position=position,
                        data_source_type=data_source["type"],
                        data_source_info=json.dumps(data_source_info),
                        batch=batch,
                        name=file.name,
                        created_from=created_from,
                        created_by=user_id,
                    )
                    position += 1
                    db.session.add(document)
                    db.session.flush()
                    document_ids.append(document.id)
        db.session.commit()
        # external_document_indexing_task.delay(dataset.id, api_template_id, data_source, process_parameter)

        # the caller unpacks (dataset, documents, batch), so return all three
        return dataset, document_ids, batch

    @staticmethod
    def process_external_api(settings: ApiTemplateSetting, files: Union[None, dict[str, Any]]) -> httpx.Response:
        """
        do http request depending on api bundle
        """
        kwargs = {
            "url": settings.url,
            "headers": settings.headers,
            "follow_redirects": True,
        }

        response = getattr(ssrf_proxy, settings.request_method)(data=settings.params, files=files, **kwargs)

        return response

    @staticmethod
    def assembling_headers(authorization: Authorization, headers: Optional[dict] = None) -> dict[str, Any]:
        authorization = deepcopy(authorization)
        if headers:
            headers = deepcopy(headers)
        else:
            headers = {}
        if authorization.type == "api-key":
            if authorization.config is None:
                raise ValueError("authorization config is required")

            if authorization.config.api_key is None:
                raise ValueError("api_key is required")

            if not authorization.config.header:
                authorization.config.header = "Authorization"

            if authorization.config.type == "bearer":
                headers[authorization.config.header] = f"Bearer {authorization.config.api_key}"
            elif authorization.config.type == "basic":
                headers[authorization.config.header] = f"Basic {authorization.config.api_key}"
            elif authorization.config.type == "custom":
                headers[authorization.config.header] = authorization.config.api_key

        return headers

    @staticmethod
    def get_api_template_settings(settings: dict) -> ApiTemplateSetting:
        return ApiTemplateSetting.parse_obj(settings)

    @staticmethod
    def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
        # check if dataset name already exists
        if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first():
            raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.")
        api_template = ExternalApiTemplates.query.filter_by(
            id=args.get("external_api_template_id"), tenant_id=tenant_id
        ).first()

        if api_template is None:
            raise ValueError("api template not found")

        dataset = Dataset(
            tenant_id=tenant_id,
            name=args.get("name"),
            description=args.get("description", ""),
            provider="external",
            created_by=user_id,
        )

        db.session.add(dataset)
        db.session.flush()

        external_knowledge_binding = ExternalKnowledgeBindings(
            tenant_id=tenant_id,
            dataset_id=dataset.id,
            external_api_template_id=args.get("external_api_template_id"),
            external_knowledge_id=args.get("external_knowledge_id"),
            created_by=user_id,
        )
        db.session.add(external_knowledge_binding)

        db.session.commit()

        return dataset

    @staticmethod
    def fetch_external_knowledge_retrival(
        tenant_id: str, dataset_id: str, query: str, external_retrival_parameters: dict
    ):
        external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
            dataset_id=dataset_id, tenant_id=tenant_id
        ).first()
        if not external_knowledge_binding:
            raise ValueError("external knowledge binding not found")

        external_api_template = ExternalApiTemplates.query.filter_by(
            id=external_knowledge_binding.external_api_template_id
        ).first()
        if not external_api_template:
            raise ValueError("external api template not found")

        settings = json.loads(external_api_template.settings)
        # merge the template headers with the bearer token; the original built the
        # Authorization header and then discarded it
        headers = settings.get("headers") or {}
        if settings.get("api_token"):
            headers["Authorization"] = f"Bearer {settings.get('api_token')}"

        external_retrival_parameters["query"] = query

        api_template_setting = {
            "url": f"{settings.get('endpoint')}/dify/external-knowledge/retrival-documents",
            "request_method": "post",
            "headers": headers,
            "params": external_retrival_parameters,
        }
        response = ExternalDatasetService.process_external_api(ApiTemplateSetting(**api_template_setting), None)
        # the original dropped the response; hand back the parsed body
        return response.json()

    @staticmethod
    def test_external_knowledge_retrival(settings: dict, top_k: int, score_threshold: float):
        # assumption: `settings` (endpoint, headers, ...) is supplied by the caller;
        # the original referenced an undefined local `settings`
        api_template_setting = {
            "url": f"{settings.get('endpoint')}/dify/external-knowledge/retrival-documents",
            "request_method": "post",
            "headers": settings.get("headers"),
            "params": {
                "top_k": top_k,
                "score_threshold": score_threshold,
            },
        }
        response = ExternalDatasetService.process_external_api(ApiTemplateSetting(**api_template_setting), None)
        return response.json()
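For reference, a minimal sketch of what `assembling_headers` produces for a bearer-type api-key authorization (imports and values are illustrative):

```python
from services.entities.external_knowledge_entities.external_knowledge_entities import (
    Authorization,
    AuthorizationConfig,
)
from services.external_knowledge_service import ExternalDatasetService

auth = Authorization(
    type="api-key",
    config=AuthorizationConfig(type="bearer", api_key="your-api-key"),
)
headers = ExternalDatasetService.assembling_headers(auth, {"X-Request-Id": "demo"})
# headers == {"X-Request-Id": "demo", "Authorization": "Bearer your-api-key"}
```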
@@ -0,0 +1,85 @@
import json
import logging
import time

import click
from celery import shared_task

from core.indexing_runner import DocumentIsPausedException
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import Dataset, ExternalApiTemplates
from models.model import UploadFile
from services.external_knowledge_service import ExternalDatasetService


@shared_task(queue="dataset")
def external_document_indexing_task(dataset_id: str, api_template_id: str, data_source: dict, process_parameter: dict):
    """
    Async process document
    :param dataset_id:
    :param api_template_id:
    :param data_source:
    :param process_parameter:

    Usage: external_document_indexing_task.delay(dataset_id, api_template_id, data_source, process_parameter)
    """
    start_at = time.perf_counter()

    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    if not dataset:
        logging.info(
            click.style("Processed external dataset: {} failed, dataset does not exist.".format(dataset_id), fg="red")
        )
        return

    # get external api template
    api_template = (
        db.session.query(ExternalApiTemplates)
        .filter(ExternalApiTemplates.id == api_template_id, ExternalApiTemplates.tenant_id == dataset.tenant_id)
        .first()
    )

    if not api_template:
        logging.info(
            click.style(
                "Processed external dataset: {} failed, api template: {} does not exist.".format(
                    dataset_id, api_template_id
                ),
                fg="red",
            )
        )
        return
    files = {}
    if data_source["type"] == "upload_file":
        upload_file_list = data_source["info_list"]["file_info_list"]["file_ids"]
        for file_id in upload_file_list:
            file = (
                db.session.query(UploadFile)
                .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                .first()
            )
            if file:
                files[file.id] = (file.name, storage.load_once(file.key), file.mime_type)
    try:
        settings = ExternalDatasetService.get_api_template_settings(json.loads(api_template.settings))

        # assemble headers; ApiTemplateSetting carries no Authorization model, so as an
        # assumption the template's api_token is sent as a bearer token
        headers = settings.headers or {}
        if settings.api_token:
            headers["Authorization"] = f"Bearer {settings.api_token}"
        settings.headers = headers
        # fold the document process parameters into the request params
        settings.params = {**(settings.params or {}), **(process_parameter or {})}

        # do http request (process_external_api takes only the settings and files)
        response = ExternalDatasetService.process_external_api(settings, files)
        job_id = response.json().get("job_id")
        if job_id:
            # save job_id to dataset
            dataset.job_id = job_id
            db.session.commit()

        end_at = time.perf_counter()
        logging.info(
            click.style(
                "Processed external dataset: {} successful, latency: {}".format(dataset.id, end_at - start_at),
                fg="green",
            )
        )
    except DocumentIsPausedException as ex:
        logging.info(click.style(str(ex), fg="yellow"))
    except Exception:
        # don't swallow failures silently; record the traceback
        logging.exception("Processed external dataset: {} failed".format(dataset_id))
@@ -0,0 +1,24 @@

  # unstructured
  # (if used, you need to set ETL_TYPE to Unstructured in the api & worker service)
  unstructured:
    image: downloads.unstructured.io/unstructured-io/unstructured-api:latest
    profiles:
      - unstructured
    restart: always
    volumes:
      - ./volumes/unstructured:/app/data

networks:
  # create a network between sandbox, api and ssrf_proxy, which cannot be reached from outside
  ssrf_proxy_network:
    driver: bridge
    internal: true
  milvus:
    driver: bridge
  opensearch-net:
    driver: bridge
    internal: true

volumes:
  oradata: