Feat: Support re-segmentation (#114)
Co-authored-by: John Wang <takatost@gmail.com> Co-authored-by: Jyong <718720800@qq.com> Co-authored-by: 金伟强 <iamjoel007@gmail.com>pull/281/head
parent
f65a3ad1cc
commit
c67f626b66
@ -0,0 +1,85 @@
|
||||
import datetime
|
||||
import logging
|
||||
import time
|
||||
|
||||
import click
|
||||
from celery import shared_task
|
||||
from werkzeug.exceptions import NotFound
|
||||
|
||||
from core.index.keyword_table_index import KeywordTableIndex
|
||||
from core.index.vector_index import VectorIndex
|
||||
from core.indexing_runner import IndexingRunner, DocumentIsPausedException
|
||||
from core.llm.error import ProviderTokenNotInitError
|
||||
from extensions.ext_database import db
|
||||
from models.dataset import Document, Dataset, DocumentSegment
|
||||
|
||||
|
||||
@shared_task
|
||||
def document_indexing_update_task(dataset_id: str, document_id: str):
|
||||
"""
|
||||
Async update document
|
||||
:param dataset_id:
|
||||
:param document_id:
|
||||
|
||||
Usage: document_indexing_update_task.delay(dataset_id, document_id)
|
||||
"""
|
||||
logging.info(click.style('Start update document: {}'.format(document_id), fg='green'))
|
||||
start_at = time.perf_counter()
|
||||
|
||||
document = db.session.query(Document).filter(
|
||||
Document.id == document_id,
|
||||
Document.dataset_id == dataset_id
|
||||
).first()
|
||||
|
||||
if not document:
|
||||
raise NotFound('Document not found')
|
||||
|
||||
document.indexing_status = 'parsing'
|
||||
document.processing_started_at = datetime.datetime.utcnow()
|
||||
db.session.commit()
|
||||
|
||||
# delete all document segment and index
|
||||
try:
|
||||
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
|
||||
if not dataset:
|
||||
raise Exception('Dataset not found')
|
||||
|
||||
vector_index = VectorIndex(dataset=dataset)
|
||||
keyword_table_index = KeywordTableIndex(dataset=dataset)
|
||||
|
||||
segments = db.session.query(DocumentSegment).filter(DocumentSegment.document_id == document_id).all()
|
||||
index_node_ids = [segment.index_node_id for segment in segments]
|
||||
|
||||
# delete from vector index
|
||||
vector_index.del_nodes(index_node_ids)
|
||||
|
||||
# delete from keyword index
|
||||
if index_node_ids:
|
||||
keyword_table_index.del_nodes(index_node_ids)
|
||||
|
||||
for segment in segments:
|
||||
db.session.delete(segment)
|
||||
|
||||
end_at = time.perf_counter()
|
||||
logging.info(
|
||||
click.style('Cleaned document when document update data source or process rule: {} latency: {}'.format(document_id, end_at - start_at), fg='green'))
|
||||
except Exception:
|
||||
logging.exception("Cleaned document when document update data source or process rule failed")
|
||||
try:
|
||||
indexing_runner = IndexingRunner()
|
||||
indexing_runner.run(document)
|
||||
end_at = time.perf_counter()
|
||||
logging.info(click.style('update document: {} latency: {}'.format(document.id, end_at - start_at), fg='green'))
|
||||
except DocumentIsPausedException:
|
||||
logging.info(click.style('Document update paused, document id: {}'.format(document.id), fg='yellow'))
|
||||
except ProviderTokenNotInitError as e:
|
||||
document.indexing_status = 'error'
|
||||
document.error = str(e.description)
|
||||
document.stopped_at = datetime.datetime.utcnow()
|
||||
db.session.commit()
|
||||
except Exception as e:
|
||||
logging.exception("consume update document failed")
|
||||
document.indexing_status = 'error'
|
||||
document.error = str(e)
|
||||
document.stopped_at = datetime.datetime.utcnow()
|
||||
db.session.commit()
|
||||
@ -0,0 +1,16 @@
|
||||
import React from 'react'
|
||||
import Settings from '@/app/components/datasets/documents/detail/settings'
|
||||
|
||||
export type IProps = {
|
||||
params: { datasetId: string; documentId: string }
|
||||
}
|
||||
|
||||
const DocumentSettings = async ({
|
||||
params: { datasetId, documentId },
|
||||
}: IProps) => {
|
||||
return (
|
||||
<Settings datasetId={datasetId} documentId={documentId} />
|
||||
)
|
||||
}
|
||||
|
||||
export default DocumentSettings
|
||||
@ -0,0 +1,90 @@
|
||||
'use client'
|
||||
import React, { useState, useCallback, useEffect } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { useBoolean } from 'ahooks'
|
||||
import { useContext } from 'use-context-selector'
|
||||
import { useRouter } from 'next/navigation'
|
||||
import DatasetDetailContext from '@/context/dataset-detail'
|
||||
import type { FullDocumentDetail } from '@/models/datasets'
|
||||
import { fetchTenantInfo } from '@/service/common'
|
||||
import { fetchDocumentDetail, MetadataType } from '@/service/datasets'
|
||||
|
||||
import Loading from '@/app/components/base/loading'
|
||||
import StepTwo from '@/app/components/datasets/create/step-two'
|
||||
import AccountSetting from '@/app/components/header/account-setting'
|
||||
import AppUnavailable from '@/app/components/base/app-unavailable'
|
||||
|
||||
type DocumentSettingsProps = {
|
||||
datasetId: string;
|
||||
documentId: string;
|
||||
}
|
||||
|
||||
const DocumentSettings = ({ datasetId, documentId }: DocumentSettingsProps) => {
|
||||
const { t } = useTranslation()
|
||||
const router = useRouter()
|
||||
const [hasSetAPIKEY, setHasSetAPIKEY] = useState(true)
|
||||
const [isShowSetAPIKey, { setTrue: showSetAPIKey, setFalse: hideSetAPIkey }] = useBoolean()
|
||||
const [hasError, setHasError] = useState(false)
|
||||
const { indexingTechnique, dataset } = useContext(DatasetDetailContext)
|
||||
|
||||
const saveHandler = () => router.push(`/datasets/${datasetId}/documents/${documentId}`)
|
||||
|
||||
const cancelHandler = () => router.back()
|
||||
|
||||
const checkAPIKey = async () => {
|
||||
const data = await fetchTenantInfo({ url: '/info' })
|
||||
const hasSetKey = data.providers.some(({ is_valid }) => is_valid)
|
||||
setHasSetAPIKEY(hasSetKey)
|
||||
}
|
||||
|
||||
useEffect(() => {
|
||||
checkAPIKey()
|
||||
}, [])
|
||||
|
||||
const [documentDetail, setDocumentDetail] = useState<FullDocumentDetail | null>(null)
|
||||
useEffect(() => {
|
||||
(async () => {
|
||||
try {
|
||||
const detail = await fetchDocumentDetail({
|
||||
datasetId,
|
||||
documentId,
|
||||
params: { metadata: 'without' as MetadataType }
|
||||
})
|
||||
setDocumentDetail(detail)
|
||||
} catch (e) {
|
||||
setHasError(true)
|
||||
}
|
||||
})()
|
||||
}, [datasetId, documentId])
|
||||
|
||||
if (hasError) {
|
||||
return <AppUnavailable code={500} unknownReason={t('datasetCreation.error.unavailable') as string} />
|
||||
}
|
||||
|
||||
return (
|
||||
<div className='flex' style={{ height: 'calc(100vh - 56px)' }}>
|
||||
<div className="grow bg-white">
|
||||
{!documentDetail && <Loading type='app' />}
|
||||
{dataset && documentDetail && (
|
||||
<StepTwo
|
||||
hasSetAPIKEY={hasSetAPIKEY}
|
||||
onSetting={showSetAPIKey}
|
||||
datasetId={datasetId}
|
||||
indexingType={indexingTechnique || ''}
|
||||
isSetting
|
||||
documentDetail={documentDetail}
|
||||
file={documentDetail.data_source_info.upload_file}
|
||||
onSave={saveHandler}
|
||||
onCancel={cancelHandler}
|
||||
/>
|
||||
)}
|
||||
</div>
|
||||
{isShowSetAPIKey && <AccountSetting activeTab="provider" onCancel={async () => {
|
||||
await checkAPIKey()
|
||||
hideSetAPIkey()
|
||||
}} />}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
export default DocumentSettings
|
||||
@ -1,5 +1,6 @@
|
||||
import { createContext } from 'use-context-selector'
|
||||
import type { DataSet } from '@/models/datasets'
|
||||
|
||||
const DatasetDetailContext = createContext<{ indexingTechnique?: string; }>({})
|
||||
const DatasetDetailContext = createContext<{ indexingTechnique?: string; dataset?: DataSet }>({})
|
||||
|
||||
export default DatasetDetailContext
|
||||
|
||||
@ -1,30 +1,30 @@
|
||||
import { AppMode } from "./app";
|
||||
import type { AppMode } from './app'
|
||||
|
||||
export type AppBasicInfo = {
|
||||
id: string;
|
||||
name: string;
|
||||
mode: AppMode;
|
||||
icon: string;
|
||||
icon_background: string;
|
||||
id: string
|
||||
name: string
|
||||
mode: AppMode
|
||||
icon: string
|
||||
icon_background: string
|
||||
}
|
||||
|
||||
export type App = {
|
||||
app: AppBasicInfo;
|
||||
app_id: string;
|
||||
description: string;
|
||||
copyright: string;
|
||||
privacy_policy: string;
|
||||
category: string;
|
||||
position: number;
|
||||
is_listed: boolean;
|
||||
install_count: number;
|
||||
installed: boolean;
|
||||
editable: boolean;
|
||||
app: AppBasicInfo
|
||||
app_id: string
|
||||
description: string
|
||||
copyright: string
|
||||
privacy_policy: string
|
||||
category: string
|
||||
position: number
|
||||
is_listed: boolean
|
||||
install_count: number
|
||||
installed: boolean
|
||||
editable: boolean
|
||||
}
|
||||
|
||||
export type InstalledApp = {
|
||||
app: AppBasicInfo;
|
||||
id: string;
|
||||
app: AppBasicInfo
|
||||
id: string
|
||||
uninstallable: boolean
|
||||
is_pinned: boolean
|
||||
}
|
||||
Loading…
Reference in New Issue