dataset metadata update
parent 5f995fac32
commit 67f2c766bc
@@ -0,0 +1,143 @@
from flask_login import current_user  # type: ignore
from flask_restful import Resource, marshal_with, reqparse  # type: ignore
from werkzeug.exceptions import NotFound

from controllers.console import api
from controllers.console.wraps import account_initialization_required, enterprise_license_required, setup_required
from fields.dataset_fields import dataset_metadata_fields
from libs.login import login_required
from services.dataset_service import DatasetService
from services.entities.knowledge_entities.knowledge_entities import (
    MetadataArgs,
    MetadataOperationData,
)
from services.metadata_service import MetadataService


def _validate_name(name):
    if not name or len(name) < 1 or len(name) > 40:
        raise ValueError("Name must be between 1 and 40 characters.")
    return name


def _validate_description_length(description):
    if len(description) > 400:
        raise ValueError("Description cannot exceed 400 characters.")
    return description


class DatasetListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @enterprise_license_required
    @marshal_with(dataset_metadata_fields)
    def post(self, dataset_id):
        parser = reqparse.RequestParser()
        parser.add_argument("type", type=str, required=True, nullable=True, location="json")
        parser.add_argument("name", type=str, required=True, nullable=True, location="json")
        args = parser.parse_args()
        metadata_args = MetadataArgs(**args)

        dataset_id_str = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id_str)
        if dataset is None:
            raise NotFound("Dataset not found.")
        DatasetService.check_dataset_permission(dataset, current_user)

        metadata = MetadataService.create_metadata(dataset_id_str, metadata_args)
        return metadata, 201
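
# Illustrative request for the endpoint above (field values assumed):
#   POST /datasets/<dataset_id>/metadata
#   {"type": "string", "name": "author"}
# On success the created record is returned with HTTP 201, marshaled via
# dataset_metadata_fields.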


class DatasetMetadataApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @enterprise_license_required
    @marshal_with(dataset_metadata_fields)
    def patch(self, dataset_id, metadata_id):
        parser = reqparse.RequestParser()
        parser.add_argument("name", type=str, required=True, nullable=True, location="json")
        args = parser.parse_args()

        dataset_id_str = str(dataset_id)
        metadata_id_str = str(metadata_id)
        dataset = DatasetService.get_dataset(dataset_id_str)
        if dataset is None:
            raise NotFound("Dataset not found.")
        DatasetService.check_dataset_permission(dataset, current_user)

        metadata = MetadataService.update_metadata_name(dataset_id_str, metadata_id_str, args.get("name"))
        return metadata, 200

    @setup_required
    @login_required
    @account_initialization_required
    @enterprise_license_required
    def delete(self, dataset_id, metadata_id):
        dataset_id_str = str(dataset_id)
        metadata_id_str = str(metadata_id)
        dataset = DatasetService.get_dataset(dataset_id_str)
        if dataset is None:
            raise NotFound("Dataset not found.")
        DatasetService.check_dataset_permission(dataset, current_user)

        MetadataService.delete_metadata(dataset_id_str, metadata_id_str)
        return 200


class DatasetMetadataBuiltInFieldApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @enterprise_license_required
    def get(self):
        built_in_fields = MetadataService.get_built_in_fields()
        return built_in_fields, 200


class DatasetMetadataBuiltInFieldActionApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @enterprise_license_required
    def post(self, dataset_id, action):
        dataset_id_str = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id_str)
        if dataset is None:
            raise NotFound("Dataset not found.")
        DatasetService.check_dataset_permission(dataset, current_user)

        if action == "enable":
            MetadataService.enable_built_in_field(dataset)
        elif action == "disable":
            MetadataService.disable_built_in_field(dataset)
        return 200
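
# Note: actions other than "enable" and "disable" fall through both branches
# above and still return 200 without modifying the dataset.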


class DocumentMetadataApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @enterprise_license_required
    def post(self, dataset_id):
        dataset_id_str = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id_str)
        if dataset is None:
            raise NotFound("Dataset not found.")
        DatasetService.check_dataset_permission(dataset, current_user)

        parser = reqparse.RequestParser()
        parser.add_argument("operation_data", type=list, required=True, nullable=True, location="json")
        args = parser.parse_args()
        metadata_args = MetadataOperationData(**args)

        MetadataService.update_documents_metadata(dataset, metadata_args)

        return 200


api.add_resource(DatasetListApi, "/datasets/<uuid:dataset_id>/metadata")
api.add_resource(DatasetMetadataApi, "/datasets/<uuid:dataset_id>/metadata/<uuid:metadata_id>")
api.add_resource(DatasetMetadataBuiltInFieldApi, "/datasets/metadata/built-in")
api.add_resource(DatasetMetadataBuiltInFieldActionApi, "/datasets/metadata/built-in/<string:action>")
api.add_resource(DocumentMetadataApi, "/datasets/<uuid:dataset_id>/documents/metadata")
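
# Illustrative calls against the routes above (paths from the registrations;
# payload values assumed):
#   POST   /datasets/<dataset_id>/metadata                    {"type": "string", "name": "author"}
#   PATCH  /datasets/<dataset_id>/metadata/<metadata_id>      {"name": "writer"}
#   DELETE /datasets/<dataset_id>/metadata/<metadata_id>
#   GET    /datasets/metadata/built-in
#   POST   /datasets/metadata/built-in/enable
#   POST   /datasets/<dataset_id>/documents/metadata          {"operation_data": [...]}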

@@ -0,0 +1,9 @@
from enum import Enum


class BuiltInField(str, Enum):
    document_name = "document_name"
    uploader = "uploader"
    upload_date = "upload_date"
    last_update_date = "last_update_date"
    source = "source"
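
# Because BuiltInField subclasses str, members can be used directly as dict keys
# and compare equal to their values, e.g. BuiltInField.uploader == "uploader" is
# True; the service and task below rely on this when writing doc_metadata.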

@@ -0,0 +1,182 @@
import datetime
from typing import Optional

from flask_login import current_user  # type: ignore

from core.rag.index_processor.constant.built_in_field import BuiltInField
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Dataset, DatasetMetadata, DatasetMetadataBinding
from services.dataset_service import DocumentService
from services.entities.knowledge_entities.knowledge_entities import (
    MetadataArgs,
    MetadataOperationData,
)
from tasks.update_documents_metadata_task import update_documents_metadata_task


class MetadataService:
    @staticmethod
    def create_metadata(dataset_id: str, metadata_args: MetadataArgs) -> DatasetMetadata:
        metadata = DatasetMetadata(
            dataset_id=dataset_id,
            type=metadata_args.type,
            name=metadata_args.name,
            created_by=current_user.id,
        )
        db.session.add(metadata)
        db.session.commit()
        return metadata
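
    # Illustrative use (argument values assumed):
    #   MetadataService.create_metadata(dataset.id, MetadataArgs(type="string", name="author"))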

    @staticmethod
    def update_metadata_name(dataset_id: str, metadata_id: str, name: str) -> DatasetMetadata:
        lock_key = f"dataset_metadata_lock_{dataset_id}"
        MetadataService.knowledge_base_metadata_lock_check(dataset_id, None)
        metadata = DatasetMetadata.query.filter_by(id=metadata_id).first()
        if metadata is None:
            raise ValueError("Metadata not found.")
        old_name = metadata.name
        metadata.name = name
        metadata.updated_by = current_user.id
        metadata.updated_at = datetime.datetime.now(datetime.UTC).replace(tzinfo=None)

        # rename the key in every document bound to this metadata
        document_ids = []
        dataset_metadata_bindings = DatasetMetadataBinding.query.filter_by(metadata_id=metadata_id).all()
        if dataset_metadata_bindings:
            document_ids = [binding.document_id for binding in dataset_metadata_bindings]
            documents = DocumentService.get_document_by_ids(document_ids)
            for document in documents:
                document.doc_metadata[name] = document.doc_metadata.pop(old_name)
                db.session.add(document)
        db.session.commit()
        if document_ids:
            update_documents_metadata_task.delay(dataset_id, document_ids, lock_key)
        return metadata
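
    # The rename is committed once here; re-indexing of the affected documents
    # runs asynchronously in update_documents_metadata_task, which releases
    # lock_key when it finishes.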

    @staticmethod
    def delete_metadata(dataset_id: str, metadata_id: str):
        lock_key = f"dataset_metadata_lock_{dataset_id}"
        MetadataService.knowledge_base_metadata_lock_check(dataset_id, None)
        metadata = DatasetMetadata.query.filter_by(id=metadata_id).first()
        if metadata is None:
            raise ValueError("Metadata not found.")
        db.session.delete(metadata)

        # remove the metadata key from every document bound to this metadata
        document_ids = []
        dataset_metadata_bindings = DatasetMetadataBinding.query.filter_by(metadata_id=metadata_id).all()
        if dataset_metadata_bindings:
            document_ids = [binding.document_id for binding in dataset_metadata_bindings]
            documents = DocumentService.get_document_by_ids(document_ids)
            for document in documents:
                document.doc_metadata.pop(metadata.name)
                db.session.add(document)
        db.session.commit()
        if document_ids:
            update_documents_metadata_task.delay(dataset_id, document_ids, lock_key)

    @staticmethod
    def get_built_in_fields():
        return [
            {"name": BuiltInField.document_name, "type": "string"},
            {"name": BuiltInField.uploader, "type": "string"},
            {"name": BuiltInField.upload_date, "type": "date"},
            {"name": BuiltInField.last_update_date, "type": "date"},
            {"name": BuiltInField.source, "type": "string"},
        ]
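
    # Serialized response, given the str-valued enum members:
    #   [{"name": "document_name", "type": "string"}, {"name": "uploader", "type": "string"},
    #    {"name": "upload_date", "type": "date"}, {"name": "last_update_date", "type": "date"},
    #    {"name": "source", "type": "string"}]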

    @staticmethod
    def enable_built_in_field(dataset: Dataset):
        if dataset.built_in_field_enabled:
            return
        lock_key = f"dataset_metadata_lock_{dataset.id}"
        MetadataService.knowledge_base_metadata_lock_check(dataset.id, None)
        dataset.built_in_field_enabled = True
        db.session.add(dataset)
        documents = DocumentService.get_working_documents_by_dataset_id(dataset.id)
        document_ids = []
        if documents:
            for document in documents:
                document.doc_metadata[BuiltInField.document_name] = document.name
                document.doc_metadata[BuiltInField.uploader] = document.uploader
                document.doc_metadata[BuiltInField.upload_date] = document.upload_date.strftime("%Y-%m-%d %H:%M:%S")
                document.doc_metadata[BuiltInField.last_update_date] = document.last_update_date.strftime(
                    "%Y-%m-%d %H:%M:%S"
                )
                document.doc_metadata[BuiltInField.source] = document.data_source_type
                db.session.add(document)
                document_ids.append(document.id)
        db.session.commit()
        if document_ids:
            update_documents_metadata_task.delay(dataset.id, document_ids, lock_key)

    @staticmethod
    def disable_built_in_field(dataset: Dataset):
        if not dataset.built_in_field_enabled:
            return
        lock_key = f"dataset_metadata_lock_{dataset.id}"
        MetadataService.knowledge_base_metadata_lock_check(dataset.id, None)
        dataset.built_in_field_enabled = False
        db.session.add(dataset)
        documents = DocumentService.get_working_documents_by_dataset_id(dataset.id)
        document_ids = []
        if documents:
            for document in documents:
                document.doc_metadata.pop(BuiltInField.document_name)
                document.doc_metadata.pop(BuiltInField.uploader)
                document.doc_metadata.pop(BuiltInField.upload_date)
                document.doc_metadata.pop(BuiltInField.last_update_date)
                document.doc_metadata.pop(BuiltInField.source)
                db.session.add(document)
                document_ids.append(document.id)
        db.session.commit()
        if document_ids:
            update_documents_metadata_task.delay(dataset.id, document_ids, lock_key)

    @staticmethod
    def update_documents_metadata(dataset: Dataset, metadata_args: MetadataOperationData):
        for operation in metadata_args.operation_data:
            lock_key = f"document_metadata_lock_{operation.document_id}"
            MetadataService.knowledge_base_metadata_lock_check(None, operation.document_id)
            document = DocumentService.get_document(operation.document_id)
            if document is None:
                raise ValueError("Document not found.")
            document.doc_metadata = {}
            for metadata_value in operation.metadata_list:
                document.doc_metadata[metadata_value.name] = metadata_value.value
            if dataset.built_in_field_enabled:
                document.doc_metadata[BuiltInField.document_name] = document.name
                document.doc_metadata[BuiltInField.uploader] = document.uploader
                document.doc_metadata[BuiltInField.upload_date] = document.upload_date.strftime("%Y-%m-%d %H:%M:%S")
                document.doc_metadata[BuiltInField.last_update_date] = document.last_update_date.strftime(
                    "%Y-%m-%d %H:%M:%S"
                )
                document.doc_metadata[BuiltInField.source] = document.data_source_type
            # rebuild the metadata bindings for this document
            DatasetMetadataBinding.query.filter_by(document_id=operation.document_id).delete()
            for metadata_value in operation.metadata_list:
                dataset_metadata_binding = DatasetMetadataBinding(
                    tenant_id=current_user.tenant_id,
                    dataset_id=dataset.id,
                    document_id=operation.document_id,
                    metadata_id=metadata_value.id,
                    created_by=current_user.id,
                )
                db.session.add(dataset_metadata_binding)
            db.session.add(document)
            db.session.commit()

            update_documents_metadata_task.delay(dataset.id, [document.id], lock_key)
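
    # Each iteration takes a per-document lock and enqueues one re-index task;
    # the task deletes lock_key in its finally block, releasing the lock.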

    @staticmethod
    def knowledge_base_metadata_lock_check(dataset_id: Optional[str], document_id: Optional[str]):
        if dataset_id:
            lock_key = f"dataset_metadata_lock_{dataset_id}"
            if redis_client.get(lock_key):
                raise ValueError("Another knowledge base metadata operation is running, please wait a moment.")
            redis_client.set(lock_key, 1, ex=3600)
        if document_id:
            lock_key = f"document_metadata_lock_{document_id}"
            if redis_client.get(lock_key):
                raise ValueError("Another document metadata operation is running, please wait a moment.")
            redis_client.set(lock_key, 1, ex=3600)
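
    # Note: the GET-then-SET pairs above are not atomic; two concurrent callers
    # could both pass the check before either sets the key. A sketch of an
    # atomic variant using redis-py's SET with nx (same key names assumed):
    #
    #   if not redis_client.set(lock_key, 1, nx=True, ex=3600):
    #       raise ValueError("Another metadata operation is running, please wait a moment.")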

@@ -0,0 +1,121 @@
import logging
import time
from typing import Optional

import click
from celery import shared_task  # type: ignore

from core.rag.index_processor.constant.built_in_field import BuiltInField
from core.rag.index_processor.constant.index_type import IndexType
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from core.rag.models.document import ChildDocument, Document
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from models.dataset import Document as DatasetDocument
from models.dataset import DocumentSegment
from services.dataset_service import DatasetService


@shared_task(queue="dataset")
def update_documents_metadata_task(
    dataset_id: str,
    document_ids: list[str],
    lock_key: Optional[str] = None,
):
    """
    Update documents metadata.
    :param dataset_id: dataset id
    :param document_ids: document ids
    :param lock_key: optional redis lock key, deleted when the task finishes

    Usage: update_documents_metadata_task.delay(dataset_id, document_ids)
    """
    logging.info(click.style("Start update documents metadata: {}".format(dataset_id), fg="green"))
    start_at = time.perf_counter()

    try:
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise ValueError("Dataset not found.")
        dataset_documents = (
            db.session.query(DatasetDocument)
            .filter(
                DatasetDocument.dataset_id == dataset_id,
                DatasetDocument.id.in_(document_ids),
                DatasetDocument.enabled == True,
                DatasetDocument.indexing_status == "completed",
                DatasetDocument.archived == False,
            )
            .all()
        )
        if not dataset_documents:
            raise ValueError("Documents not found.")
        for dataset_document in dataset_documents:
            index_processor = IndexProcessorFactory(dataset_document.doc_form).init_index_processor()

            segments = (
                db.session.query(DocumentSegment)
                .filter(
                    DocumentSegment.dataset_id == dataset_id,
                    DocumentSegment.document_id == dataset_document.id,
                    DocumentSegment.enabled == True,
                )
                .all()
            )
            if not segments:
                continue
            # delete the existing segment nodes from the vector index
            index_node_ids = [segment.index_node_id for segment in segments]
            index_processor.clean(dataset, index_node_ids, with_keywords=False, delete_child_chunks=True)
            # rebuild the segment documents with the updated metadata
            documents = []
            for segment in segments:
                document = Document(
                    page_content=segment.content,
                    metadata={
                        "doc_id": segment.index_node_id,
                        "doc_hash": segment.index_node_hash,
                        "document_id": dataset_document.id,
                        "dataset_id": dataset_id,
                    },
                )

                if dataset_document.doc_form == IndexType.PARENT_CHILD_INDEX:
                    child_chunks = segment.child_chunks
                    if child_chunks:
                        child_documents = []
                        for child_chunk in child_chunks:
                            child_document = ChildDocument(
                                page_content=child_chunk.content,
                                metadata={
                                    "doc_id": child_chunk.index_node_id,
                                    "doc_hash": child_chunk.index_node_hash,
                                    "document_id": dataset_document.id,
                                    "dataset_id": dataset_id,
                                },
                            )
                            if dataset.built_in_field_enabled:
                                child_document.metadata[BuiltInField.uploader] = dataset_document.created_by
                                child_document.metadata[BuiltInField.upload_date] = dataset_document.created_at
                                child_document.metadata[BuiltInField.last_update_date] = dataset_document.updated_at
                                child_document.metadata[BuiltInField.source] = dataset_document.data_source_type
                                child_document.metadata[BuiltInField.document_name] = dataset_document.name
                            if dataset_document.doc_metadata:
                                child_document.metadata.update(dataset_document.doc_metadata)
                            child_documents.append(child_document)
                        document.children = child_documents
                documents.append(document)
            # save vector index
            index_processor.load(dataset, documents)
        end_at = time.perf_counter()
        logging.info(
            click.style("Updated documents metadata: {} latency: {}".format(dataset_id, end_at - start_at), fg="green")
        )
    except Exception:
        logging.exception("Updated documents metadata failed")
    finally:
        if lock_key:
            redis_client.delete(lock_key)
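
# Illustrative invocation (see MetadataService; lock_key is the redis key taken
# by the caller, released in the finally block above):
#   update_documents_metadata_task.delay(dataset_id, document_ids, lock_key)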