add document lock for multi-thread (#9873)

pull/9934/head
Jyong 2 years ago committed by GitHub
parent 9633c5dab6
commit af68084895
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -760,166 +760,168 @@ class DocumentService:
) )
db.session.add(dataset_process_rule) db.session.add(dataset_process_rule)
db.session.commit() db.session.commit()
position = DocumentService.get_documents_position(dataset.id) lock_name = "add_document_lock_dataset_id_{}".format(dataset.id)
document_ids = [] with redis_client.lock(lock_name, timeout=600):
duplicate_document_ids = [] position = DocumentService.get_documents_position(dataset.id)
if document_data["data_source"]["type"] == "upload_file": document_ids = []
upload_file_list = document_data["data_source"]["info_list"]["file_info_list"]["file_ids"] duplicate_document_ids = []
for file_id in upload_file_list: if document_data["data_source"]["type"] == "upload_file":
file = ( upload_file_list = document_data["data_source"]["info_list"]["file_info_list"]["file_ids"]
db.session.query(UploadFile) for file_id in upload_file_list:
.filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id) file = (
.first() db.session.query(UploadFile)
) .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
.first()
)
# raise error if file not found # raise error if file not found
if not file: if not file:
raise FileNotExistsError() raise FileNotExistsError()
file_name = file.name file_name = file.name
data_source_info = { data_source_info = {
"upload_file_id": file_id, "upload_file_id": file_id,
} }
# check duplicate # check duplicate
if document_data.get("duplicate", False): if document_data.get("duplicate", False):
document = Document.query.filter_by( document = Document.query.filter_by(
dataset_id=dataset.id, dataset_id=dataset.id,
tenant_id=current_user.current_tenant_id, tenant_id=current_user.current_tenant_id,
data_source_type="upload_file", data_source_type="upload_file",
enabled=True, enabled=True,
name=file_name, name=file_name,
).first() ).first()
if document: if document:
document.dataset_process_rule_id = dataset_process_rule.id document.dataset_process_rule_id = dataset_process_rule.id
document.updated_at = datetime.datetime.utcnow() document.updated_at = datetime.datetime.utcnow()
document.created_from = created_from document.created_from = created_from
document.doc_form = document_data["doc_form"] document.doc_form = document_data["doc_form"]
document.doc_language = document_data["doc_language"] document.doc_language = document_data["doc_language"]
document.data_source_info = json.dumps(data_source_info) document.data_source_info = json.dumps(data_source_info)
document.batch = batch document.batch = batch
document.indexing_status = "waiting" document.indexing_status = "waiting"
db.session.add(document) db.session.add(document)
documents.append(document) documents.append(document)
duplicate_document_ids.append(document.id) duplicate_document_ids.append(document.id)
continue continue
document = DocumentService.build_document( document = DocumentService.build_document(
dataset, dataset,
dataset_process_rule.id, dataset_process_rule.id,
document_data["data_source"]["type"], document_data["data_source"]["type"],
document_data["doc_form"], document_data["doc_form"],
document_data["doc_language"], document_data["doc_language"],
data_source_info, data_source_info,
created_from, created_from,
position, position,
account, account,
file_name, file_name,
batch, batch,
)
db.session.add(document)
db.session.flush()
document_ids.append(document.id)
documents.append(document)
position += 1
elif document_data["data_source"]["type"] == "notion_import":
notion_info_list = document_data["data_source"]["info_list"]["notion_info_list"]
exist_page_ids = []
exist_document = {}
documents = Document.query.filter_by(
dataset_id=dataset.id,
tenant_id=current_user.current_tenant_id,
data_source_type="notion_import",
enabled=True,
).all()
if documents:
for document in documents:
data_source_info = json.loads(document.data_source_info)
exist_page_ids.append(data_source_info["notion_page_id"])
exist_document[data_source_info["notion_page_id"]] = document.id
for notion_info in notion_info_list:
workspace_id = notion_info["workspace_id"]
data_source_binding = DataSourceOauthBinding.query.filter(
db.and_(
DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
DataSourceOauthBinding.provider == "notion",
DataSourceOauthBinding.disabled == False,
DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
) )
).first() db.session.add(document)
if not data_source_binding: db.session.flush()
raise ValueError("Data source binding not found.") document_ids.append(document.id)
for page in notion_info["pages"]: documents.append(document)
if page["page_id"] not in exist_page_ids: position += 1
data_source_info = { elif document_data["data_source"]["type"] == "notion_import":
"notion_workspace_id": workspace_id, notion_info_list = document_data["data_source"]["info_list"]["notion_info_list"]
"notion_page_id": page["page_id"], exist_page_ids = []
"notion_page_icon": page["page_icon"], exist_document = {}
"type": page["type"], documents = Document.query.filter_by(
} dataset_id=dataset.id,
document = DocumentService.build_document( tenant_id=current_user.current_tenant_id,
dataset, data_source_type="notion_import",
dataset_process_rule.id, enabled=True,
document_data["data_source"]["type"], ).all()
document_data["doc_form"], if documents:
document_data["doc_language"], for document in documents:
data_source_info, data_source_info = json.loads(document.data_source_info)
created_from, exist_page_ids.append(data_source_info["notion_page_id"])
position, exist_document[data_source_info["notion_page_id"]] = document.id
account, for notion_info in notion_info_list:
page["page_name"], workspace_id = notion_info["workspace_id"]
batch, data_source_binding = DataSourceOauthBinding.query.filter(
db.and_(
DataSourceOauthBinding.tenant_id == current_user.current_tenant_id,
DataSourceOauthBinding.provider == "notion",
DataSourceOauthBinding.disabled == False,
DataSourceOauthBinding.source_info["workspace_id"] == f'"{workspace_id}"',
) )
db.session.add(document) ).first()
db.session.flush() if not data_source_binding:
document_ids.append(document.id) raise ValueError("Data source binding not found.")
documents.append(document) for page in notion_info["pages"]:
position += 1 if page["page_id"] not in exist_page_ids:
data_source_info = {
"notion_workspace_id": workspace_id,
"notion_page_id": page["page_id"],
"notion_page_icon": page["page_icon"],
"type": page["type"],
}
document = DocumentService.build_document(
dataset,
dataset_process_rule.id,
document_data["data_source"]["type"],
document_data["doc_form"],
document_data["doc_language"],
data_source_info,
created_from,
position,
account,
page["page_name"],
batch,
)
db.session.add(document)
db.session.flush()
document_ids.append(document.id)
documents.append(document)
position += 1
else:
exist_document.pop(page["page_id"])
# delete not selected documents
if len(exist_document) > 0:
clean_notion_document_task.delay(list(exist_document.values()), dataset.id)
elif document_data["data_source"]["type"] == "website_crawl":
website_info = document_data["data_source"]["info_list"]["website_info_list"]
urls = website_info["urls"]
for url in urls:
data_source_info = {
"url": url,
"provider": website_info["provider"],
"job_id": website_info["job_id"],
"only_main_content": website_info.get("only_main_content", False),
"mode": "crawl",
}
if len(url) > 255:
document_name = url[:200] + "..."
else: else:
exist_document.pop(page["page_id"]) document_name = url
# delete not selected documents document = DocumentService.build_document(
if len(exist_document) > 0: dataset,
clean_notion_document_task.delay(list(exist_document.values()), dataset.id) dataset_process_rule.id,
elif document_data["data_source"]["type"] == "website_crawl": document_data["data_source"]["type"],
website_info = document_data["data_source"]["info_list"]["website_info_list"] document_data["doc_form"],
urls = website_info["urls"] document_data["doc_language"],
for url in urls: data_source_info,
data_source_info = { created_from,
"url": url, position,
"provider": website_info["provider"], account,
"job_id": website_info["job_id"], document_name,
"only_main_content": website_info.get("only_main_content", False), batch,
"mode": "crawl", )
} db.session.add(document)
if len(url) > 255: db.session.flush()
document_name = url[:200] + "..." document_ids.append(document.id)
else: documents.append(document)
document_name = url position += 1
document = DocumentService.build_document( db.session.commit()
dataset,
dataset_process_rule.id,
document_data["data_source"]["type"],
document_data["doc_form"],
document_data["doc_language"],
data_source_info,
created_from,
position,
account,
document_name,
batch,
)
db.session.add(document)
db.session.flush()
document_ids.append(document.id)
documents.append(document)
position += 1
db.session.commit()
# trigger async task # trigger async task
if document_ids: if document_ids:
document_indexing_task.delay(dataset.id, document_ids) document_indexing_task.delay(dataset.id, document_ids)
if duplicate_document_ids: if duplicate_document_ids:
duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids) duplicate_document_indexing_task.delay(dataset.id, duplicate_document_ids)
return documents, batch return documents, batch
@staticmethod @staticmethod
def check_documents_upload_quota(count: int, features: FeatureModel): def check_documents_upload_quota(count: int, features: FeatureModel):

Loading…
Cancel
Save