add re_segment document param check

pull/114/head
Jyong 3 years ago
parent a41495703e
commit ffa8e4ccd1

@@ -207,8 +207,8 @@ class DatasetDocumentListApi(Resource):
         parser = reqparse.RequestParser()
         parser.add_argument('indexing_technique', type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False,
                             location='json')
-        parser.add_argument('data_source', type=dict, required=True, nullable=True, location='json')
-        parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
+        parser.add_argument('data_source', type=dict, required=False, location='json')
+        parser.add_argument('process_rule', type=dict, required=False, location='json')
         parser.add_argument('duplicate', type=bool, nullable=False, location='json')
         parser.add_argument('original_document_id', type=str, required=False, location='json')
         args = parser.parse_args()
@@ -245,8 +245,8 @@ class DatasetInitApi(Resource):
         parser = reqparse.RequestParser()
         parser.add_argument('indexing_technique', type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, required=True,
                             nullable=False, location='json')
-        parser.add_argument('data_source', type=dict, required=False, location='json')
-        parser.add_argument('process_rule', type=dict, required=False, location='json')
+        parser.add_argument('data_source', type=dict, required=True, nullable=True, location='json')
+        parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
         args = parser.parse_args()

         # validate args
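
Note: these two hunks swap which endpoint treats data_source and process_rule as mandatory. The document create/update endpoint now accepts requests that omit them (the re-segment path identifies its target via original_document_id instead), while dataset init still requires both. A minimal sketch of the Flask-RESTful reqparse semantics in play (standalone parser for illustration; only the argument names are taken from the diff):

    from flask_restful import reqparse

    parser = reqparse.RequestParser()
    # required=False: the key may be omitted entirely; args['data_source'] then parses as None
    parser.add_argument('data_source', type=dict, required=False, location='json')
    # required=True, nullable=True: the key must appear in the JSON body,
    # but an explicit null value is accepted
    parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')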

@@ -18,8 +18,8 @@ from services.errors.account import NoPermissionError
 from services.errors.dataset import DatasetNameDuplicateError
 from services.errors.document import DocumentIndexingError
 from services.errors.file import FileNotExistsError
-from tasks import document_indexing_update_task
 from tasks.document_indexing_task import document_indexing_task
+from tasks.document_indexing_update_task import document_indexing_update_task


 class DatasetService:
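
Note: the import fix matters at runtime. Unless tasks/__init__.py re-exports the name, "from tasks import document_indexing_update_task" binds the submodule tasks/document_indexing_update_task.py rather than the @shared_task function inside it, so a later .delay(...) call would raise AttributeError. A quick illustration:

    # Before: likely binds the *module* tasks/document_indexing_update_task.py
    # (unless tasks/__init__.py re-exports the function), so .delay(...) fails
    from tasks import document_indexing_update_task

    # After: binds the Celery task function itself, which does expose .delay(...)
    from tasks.document_indexing_update_task import document_indexing_update_task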
@@ -358,68 +358,68 @@ class DocumentService:
         if dataset.indexing_technique == 'high_quality':
             IndexBuilder.get_default_service_context(dataset.tenant_id)

-        if document_data["original_document_id"]:
-            DocumentService.update_document_with_dataset_id(dataset, document_data, account)
-        # save process rule
-        if not dataset_process_rule:
-            process_rule = document_data["process_rule"]
-            if process_rule["mode"] == "custom":
-                dataset_process_rule = DatasetProcessRule(
-                    dataset_id=dataset.id,
-                    mode=process_rule["mode"],
-                    rules=json.dumps(process_rule["rules"]),
-                    created_by=account.id
-                )
-            elif process_rule["mode"] == "automatic":
-                dataset_process_rule = DatasetProcessRule(
-                    dataset_id=dataset.id,
-                    mode=process_rule["mode"],
-                    rules=json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
-                    created_by=account.id
-                )
-            db.session.add(dataset_process_rule)
-            db.session.commit()
-
-        file_name = ''
-        data_source_info = {}
-        if document_data["data_source"]["type"] == "upload_file":
-            file_id = document_data["data_source"]["info"]
-            file = db.session.query(UploadFile).filter(
-                UploadFile.tenant_id == dataset.tenant_id,
-                UploadFile.id == file_id
-            ).first()
-
-            # raise error if file not found
-            if not file:
-                raise FileNotExistsError()
-
-            file_name = file.name
-            data_source_info = {
-                "upload_file_id": file_id,
-            }
-
-        # save document
-        position = DocumentService.get_documents_position(dataset.id)
-        document = Document(
-            tenant_id=dataset.tenant_id,
-            dataset_id=dataset.id,
-            position=position,
-            data_source_type=document_data["data_source"]["type"],
-            data_source_info=json.dumps(data_source_info),
-            dataset_process_rule_id=dataset_process_rule.id,
-            batch=time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999)),
-            name=file_name,
-            created_from=created_from,
-            created_by=account.id,
-            # created_api_request_id = db.Column(UUID, nullable=True)
-        )
-
-        db.session.add(document)
-        db.session.commit()
-
-        # trigger async task
-        document_indexing_task.delay(document.dataset_id, document.id)
+        if 'original_document_id' in document_data and document_data["original_document_id"]:
+            document = DocumentService.update_document_with_dataset_id(dataset, document_data, account)
+        else:
+            # save process rule
+            if not dataset_process_rule:
+                process_rule = document_data["process_rule"]
+                if process_rule["mode"] == "custom":
+                    dataset_process_rule = DatasetProcessRule(
+                        dataset_id=dataset.id,
+                        mode=process_rule["mode"],
+                        rules=json.dumps(process_rule["rules"]),
+                        created_by=account.id
+                    )
+                elif process_rule["mode"] == "automatic":
+                    dataset_process_rule = DatasetProcessRule(
+                        dataset_id=dataset.id,
+                        mode=process_rule["mode"],
+                        rules=json.dumps(DatasetProcessRule.AUTOMATIC_RULES),
+                        created_by=account.id
+                    )
+                db.session.add(dataset_process_rule)
+                db.session.commit()
+
+            file_name = ''
+            data_source_info = {}
+            if document_data["data_source"]["type"] == "upload_file":
+                file_id = document_data["data_source"]["info"]
+                file = db.session.query(UploadFile).filter(
+                    UploadFile.tenant_id == dataset.tenant_id,
+                    UploadFile.id == file_id
+                ).first()
+
+                # raise error if file not found
+                if not file:
+                    raise FileNotExistsError()
+
+                file_name = file.name
+                data_source_info = {
+                    "upload_file_id": file_id,
+                }
+
+            # save document
+            position = DocumentService.get_documents_position(dataset.id)
+            document = Document(
+                tenant_id=dataset.tenant_id,
+                dataset_id=dataset.id,
+                position=position,
+                data_source_type=document_data["data_source"]["type"],
+                data_source_info=json.dumps(data_source_info),
+                dataset_process_rule_id=dataset_process_rule.id,
+                batch=time.strftime('%Y%m%d%H%M%S') + str(random.randint(100000, 999999)),
+                name=file_name,
+                created_from=created_from,
+                created_by=account.id,
+                # created_api_request_id = db.Column(UUID, nullable=True)
+            )
+
+            db.session.add(document)
+            db.session.commit()
+
+            # trigger async task
+            document_indexing_task.delay(document.dataset_id, document.id)

         return document

     @staticmethod
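
Note: the net effect of this hunk is a create-vs-update dispatch. When original_document_id is supplied, the request routes to the update (re-segment) path; the entire create path (process-rule persistence, file lookup, Document row creation, indexing task) now runs only in the else branch instead of always executing. Illustrative request payloads for the two branches (field values are placeholders, not taken from the commit):

    # Create: no original_document_id, so data_source and process_rule are needed
    create_payload = {
        "indexing_technique": "high_quality",
        "data_source": {"type": "upload_file", "info": "<upload_file_id>"},
        "process_rule": {"mode": "automatic"},
    }

    # Re-segment: original_document_id routes to update_document_with_dataset_id;
    # data_source and process_rule become optional
    resegment_payload = {
        "original_document_id": "<existing_document_id>",
        "process_rule": {"mode": "automatic"},
    }
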
@@ -430,7 +430,7 @@ class DocumentService:
         if document.display_status != 'available':
             raise ValueError("Document is not available")
         # save process rule
-        if 'process_rule' in document_data or document_data['process_rule']:
+        if 'process_rule' in document_data and document_data['process_rule']:
             process_rule = document_data["process_rule"]
             if process_rule["mode"] == "custom":
                 dataset_process_rule = DatasetProcessRule(
@ -450,7 +450,7 @@ class DocumentService:
db.session.commit() db.session.commit()
document.dataset_process_rule_id = dataset_process_rule.id document.dataset_process_rule_id = dataset_process_rule.id
# update document data source # update document data source
if 'data_source' in document_data or document_data['data_source']: if 'data_source' in document_data and document_data['data_source']:
file_name = '' file_name = ''
data_source_info = {} data_source_info = {}
if document_data["data_source"]["type"] == "upload_file": if document_data["data_source"]["type"] == "upload_file":
@@ -513,17 +513,17 @@ class DocumentService:
     @classmethod
     def document_create_args_validate(cls, args: dict):
-        if 'original_document_id ' not in args or not args['original_document_id']:
+        if 'original_document_id' not in args or not args['original_document_id']:
             DocumentService.data_source_args_validate(args)
             DocumentService.process_rule_args_validate(args)
         else:
-            if ('data_source' not in args or not args['data_source']) and (
-                    'process_rule' not in args or not args['process_rule']):
+            if ('data_source' not in args and not args['data_source'])\
+                    and ('process_rule' not in args and not args['process_rule']):
                 raise ValueError("Data source or Process rule is required")
             else:
-                if 'data_source' in args or args['data_source']:
+                if 'data_source' in args and args['data_source']:
                     DocumentService.data_source_args_validate(args)
-                elif 'process_rule' in args or args['process_rule']:
+                if 'process_rule' in args and args['process_rule']:
                     DocumentService.process_rule_args_validate(args)

     @classmethod
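
Note: the or-to-and changes here and in the -430/-450 hunks fix a real bug. With or, a key that is present but null passes the check, and a key that is absent crashes on the dict lookup. A quick sketch with a plain dict (the parsed args behave the same way for membership and truthiness):

    args = {'process_rule': None}
    'process_rule' in args or args['process_rule']   # True even though the value is None
    'process_rule' in args and args['process_rule']  # None is falsy, so correctly skipped

    args = {}
    # 'process_rule' in args or args['process_rule'] # would raise KeyError after the short-circuit
    'process_rule' in args and args['process_rule']  # False, no KeyError

The elif-to-if change is also deliberate: when both data_source and process_rule are supplied, both validators now run instead of only the first.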

@@ -17,7 +17,7 @@ from models.dataset import Document, Dataset, DocumentSegment
 @shared_task
 def document_indexing_update_task(dataset_id: str, document_id: str):
     """
-    Async process document
+    Async update document
     :param dataset_id:
     :param document_id:
@@ -65,7 +65,6 @@ def document_indexing_update_task(dataset_id: str, document_id: str):
             click.style('Cleaned document when document update data source or process rule: {} latency: {}'.format(document_id, end_at - start_at), fg='green'))
     except Exception:
         logging.exception("Cleaned document when document update data source or process rule failed")
-    # start document re_segment
     try:
         indexing_runner = IndexingRunner()
         indexing_runner.run(document)
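
Note: with the corrected import in the service layer, the update path can enqueue this task the same way the create path enqueues document_indexing_task. The call site is not shown in this diff, but presumably it looks something like:

    # Assumed call site inside update_document_with_dataset_id (not part of this diff):
    # enqueue the Celery task that cleans old segments and re-runs indexing
    document_indexing_update_task.delay(document.dataset_id, document.id)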
