|
|
|
|
@ -1,3 +1,4 @@
|
|
|
|
|
import logging
|
|
|
|
|
from typing import Optional
|
|
|
|
|
|
|
|
|
|
from core.model_manager import ModelInstance, ModelManager
|
|
|
|
|
@ -12,6 +13,8 @@ from models.dataset import ChildChunk, Dataset, DatasetProcessRule, DocumentSegm
|
|
|
|
|
from models.dataset import Document as DatasetDocument
|
|
|
|
|
from services.entities.knowledge_entities.knowledge_entities import ParentMode
|
|
|
|
|
|
|
|
|
|
_logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class VectorService:
|
|
|
|
|
@classmethod
|
|
|
|
|
@ -22,7 +25,14 @@ class VectorService:
|
|
|
|
|
|
|
|
|
|
for segment in segments:
|
|
|
|
|
if doc_form == IndexType.PARENT_CHILD_INDEX:
|
|
|
|
|
document = DatasetDocument.query.filter_by(id=segment.document_id).first()
|
|
|
|
|
document = db.session.query(DatasetDocument).filter_by(id=segment.document_id).first()
|
|
|
|
|
if not document:
|
|
|
|
|
_logger.warning(
|
|
|
|
|
"Expected DatasetDocument record to exist, but none was found, document_id=%s, segment_id=%s",
|
|
|
|
|
segment.document_id,
|
|
|
|
|
segment.id,
|
|
|
|
|
)
|
|
|
|
|
continue
|
|
|
|
|
# get the process rule
|
|
|
|
|
processing_rule = (
|
|
|
|
|
db.session.query(DatasetProcessRule)
|
|
|
|
|
@ -52,7 +62,7 @@ class VectorService:
|
|
|
|
|
raise ValueError("The knowledge base index technique is not high quality!")
|
|
|
|
|
cls.generate_child_chunks(segment, document, dataset, embedding_model_instance, processing_rule, False)
|
|
|
|
|
else:
|
|
|
|
|
document = Document(
|
|
|
|
|
document = Document( # type: ignore
|
|
|
|
|
page_content=segment.content,
|
|
|
|
|
metadata={
|
|
|
|
|
"doc_id": segment.index_node_id,
|
|
|
|
|
@ -64,7 +74,7 @@ class VectorService:
|
|
|
|
|
documents.append(document)
|
|
|
|
|
if len(documents) > 0:
|
|
|
|
|
index_processor = IndexProcessorFactory(doc_form).init_index_processor()
|
|
|
|
|
index_processor.load(dataset, documents, with_keywords=True, keywords_list=keywords_list)
|
|
|
|
|
index_processor.load(dataset, documents, with_keywords=True, keywords_list=keywords_list) # type: ignore
|
|
|
|
|
|
|
|
|
|
@classmethod
|
|
|
|
|
def update_segment_vector(cls, keywords: Optional[list[str]], segment: DocumentSegment, dataset: Dataset):
|
|
|
|
|
|