From fbe94b0f0c0d2c08df41355cfc2dcd3a5c40add1 Mon Sep 17 00:00:00 2001 From: cpwan Date: Fri, 14 Feb 2025 08:59:34 +0000 Subject: [PATCH] feat: Add page_number attribute to document segments and update related retrieval logic --- api/core/rag/docstore/dataset_docstore.py | 1 + api/core/rag/retrieval/dataset_retrieval.py | 1 + .../dataset_multi_retriever_tool.py | 1 + .../dataset_retriever_tool.py | 1 + .../knowledge_retrieval_node.py | 1 + ...8c83edec42e8_add_page_column_to_segment.py | 33 +++++++++++++++++++ api/models/dataset.py | 1 + 7 files changed, 39 insertions(+) create mode 100644 api/migrations/versions/2025_05_15_0308-8c83edec42e8_add_page_column_to_segment.py diff --git a/api/core/rag/docstore/dataset_docstore.py b/api/core/rag/docstore/dataset_docstore.py index 398b0daad9..475a100bab 100644 --- a/api/core/rag/docstore/dataset_docstore.py +++ b/api/core/rag/docstore/dataset_docstore.py @@ -115,6 +115,7 @@ class DatasetDocumentStore: tokens=tokens, enabled=False, created_by=self._user_id, + page_number=doc.metadata.get("page", 0), ) if doc.metadata.get("answer"): segment_document.answer = doc.metadata.pop("answer", "") diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py index d3605da146..09575daa2f 100644 --- a/api/core/rag/retrieval/dataset_retrieval.py +++ b/api/core/rag/retrieval/dataset_retrieval.py @@ -265,6 +265,7 @@ class DatasetRetrieval: source["word_count"] = segment.word_count source["segment_position"] = segment.position source["index_node_hash"] = segment.index_node_hash + source["page_number"] = segment.page_number if segment.answer: source["content"] = f"question:{segment.content} \nanswer:{segment.answer}" else: diff --git a/api/core/tools/utils/dataset_retriever/dataset_multi_retriever_tool.py b/api/core/tools/utils/dataset_retriever/dataset_multi_retriever_tool.py index 04437ea6d8..5596f777b3 100644 --- a/api/core/tools/utils/dataset_retriever/dataset_multi_retriever_tool.py +++ 
b/api/core/tools/utils/dataset_retriever/dataset_multi_retriever_tool.py @@ -139,6 +139,7 @@ class DatasetMultiRetrieverTool(DatasetRetrieverBaseTool): source["word_count"] = segment.word_count source["segment_position"] = segment.position source["index_node_hash"] = segment.index_node_hash + source["page_number"] = segment.page_number if segment.answer: source["content"] = f"question:{segment.content} \nanswer:{segment.answer}" else: diff --git a/api/core/tools/utils/dataset_retriever/dataset_retriever_tool.py b/api/core/tools/utils/dataset_retriever/dataset_retriever_tool.py index fff261e0bd..b389e32e70 100644 --- a/api/core/tools/utils/dataset_retriever/dataset_retriever_tool.py +++ b/api/core/tools/utils/dataset_retriever/dataset_retriever_tool.py @@ -213,6 +213,7 @@ class DatasetRetrieverTool(DatasetRetrieverBaseTool): source["word_count"] = segment.word_count source["segment_position"] = segment.position source["index_node_hash"] = segment.index_node_hash + source["page_number"] = segment.page_number if segment.answer: source["content"] = f"question:{segment.content} \nanswer:{segment.answer}" else: diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py index 5955022e5f..2ae574faa8 100644 --- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py +++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py @@ -302,6 +302,7 @@ class KnowledgeRetrievalNode(LLMNode): "segment_word_count": segment.word_count, "segment_position": segment.position, "segment_index_node_hash": segment.index_node_hash, + "segment_page_number": segment.page_number, "doc_metadata": document.doc_metadata, }, "title": document.name, diff --git a/api/migrations/versions/2025_05_15_0308-8c83edec42e8_add_page_column_to_segment.py b/api/migrations/versions/2025_05_15_0308-8c83edec42e8_add_page_column_to_segment.py new file mode 100644 index 
0000000000..d80694550a
--- /dev/null
+++ b/api/migrations/versions/2025_05_15_0308-8c83edec42e8_add_page_column_to_segment.py
@@ -0,0 +1,33 @@
+"""add page column to segment
+
+Revision ID: 8c83edec42e8
+Revises: d28f2004b072
+Create Date: 2025-05-15 03:08:23.196234
+Adds nullable Integer column page_number to document_segments; downgrade drops it.
+"""
+from alembic import op
+import models as models
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = '8c83edec42e8'
+down_revision = 'd28f2004b072'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('document_segments', schema=None) as batch_op:
+        batch_op.add_column(sa.Column('page_number', sa.Integer(), nullable=True))
+
+    # ### end Alembic commands ###
+
+
+def downgrade():
+    # ### commands auto generated by Alembic - please adjust! ###
+    with op.batch_alter_table('document_segments', schema=None) as batch_op:
+        batch_op.drop_column('page_number')
+
+    # ### end Alembic commands ###
diff --git a/api/models/dataset.py b/api/models/dataset.py
index ad43d6f371..a6524ab4fd 100644
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@@ -657,6 +657,7 @@ class DocumentSegment(Base):
     dataset_id = db.Column(StringUUID, nullable=False)
     document_id = db.Column(StringUUID, nullable=False)
     position: Mapped[int]
+    page_number = db.Column(db.Integer, nullable=True)
     content = db.Column(db.Text, nullable=False)
     answer = db.Column(db.Text, nullable=True)
     word_count = db.Column(db.Integer, nullable=False)