feat: add chunking mode

pull/12097/head
Joel 1 year ago
parent 78fff31e61
commit 1578dc50ef

@ -34,9 +34,10 @@ import { formatNumber } from '@/utils/format'
import { archiveDocument, deleteDocument, disableDocument, enableDocument, syncDocument, syncWebsite, unArchiveDocument } from '@/service/datasets' import { archiveDocument, deleteDocument, disableDocument, enableDocument, syncDocument, syncWebsite, unArchiveDocument } from '@/service/datasets'
import NotionIcon from '@/app/components/base/notion-icon' import NotionIcon from '@/app/components/base/notion-icon'
import ProgressBar from '@/app/components/base/progress-bar' import ProgressBar from '@/app/components/base/progress-bar'
import { DataSourceType, type DocumentDisplayStatus, type SimpleDocumentDetail } from '@/models/datasets' import { ChuckingMode, DataSourceType, type DocumentDisplayStatus, type SimpleDocumentDetail } from '@/models/datasets'
import type { CommonResponse } from '@/models/common' import type { CommonResponse } from '@/models/common'
import useTimestamp from '@/hooks/use-timestamp' import useTimestamp from '@/hooks/use-timestamp'
import { useDatasetDetailContextWithSelector as useDatasetDetailContext } from '@/context/dataset-detail'
export const useIndexStatus = () => { export const useIndexStatus = () => {
const { t } = useTranslation() const { t } = useTranslation()
@ -389,6 +390,10 @@ const DocumentList: FC<IDocumentListProps> = ({ embeddingAvailable, documents =
const { t } = useTranslation() const { t } = useTranslation()
const { formatTime } = useTimestamp() const { formatTime } = useTimestamp()
const router = useRouter() const router = useRouter()
const [datasetConfig] = useDatasetDetailContext(s => [s.dataset])
const chunkingMode = datasetConfig?.doc_form
const isGeneralMode = chunkingMode !== ChuckingMode.parentChild
const isQAMode = chunkingMode === ChuckingMode.qa
const [localDocs, setLocalDocs] = useState<LocalDoc[]>(documents) const [localDocs, setLocalDocs] = useState<LocalDoc[]>(documents)
const [enableSort, setEnableSort] = useState(false) const [enableSort, setEnableSort] = useState(false)
@ -431,6 +436,7 @@ const DocumentList: FC<IDocumentListProps> = ({ embeddingAvailable, documents =
{t('datasetDocuments.list.table.header.fileName')} {t('datasetDocuments.list.table.header.fileName')}
</div> </div>
</td> </td>
<td className='w-[120px]'>{t('datasetDocuments.list.table.header.chunkingMode')}</td>
<td className='w-24'>{t('datasetDocuments.list.table.header.words')}</td> <td className='w-24'>{t('datasetDocuments.list.table.header.words')}</td>
<td className='w-44'>{t('datasetDocuments.list.table.header.hitCount')}</td> <td className='w-44'>{t('datasetDocuments.list.table.header.hitCount')}</td>
<td className='w-44'> <td className='w-44'>
@ -453,7 +459,7 @@ const DocumentList: FC<IDocumentListProps> = ({ embeddingAvailable, documents =
onClick={() => { onClick={() => {
router.push(`/datasets/${datasetId}/documents/${doc.id}`) router.push(`/datasets/${datasetId}/documents/${doc.id}`)
}}> }}>
<td className='text-left align-middle text-gray-500 text-xs'>{doc.position}</td> <td className='text-left align-middle text-text-tertiary text-xs'>{doc.position}</td>
<td> <td>
<div className='group flex items-center justify-between'> <div className='group flex items-center justify-between'>
<span className={s.tdValue}> <span className={s.tdValue}>
@ -482,11 +488,11 @@ const DocumentList: FC<IDocumentListProps> = ({ embeddingAvailable, documents =
</Tooltip> </Tooltip>
</div> </div>
</div> </div>
</td> </td>
<td>{isGeneralMode ? `general ${isQAMode ? '. QA' : ''}` : 'ParentChilde'}</td>
<td>{renderCount(doc.word_count)}</td> <td>{renderCount(doc.word_count)}</td>
<td>{renderCount(doc.hit_count)}</td> <td>{renderCount(doc.hit_count)}</td>
<td className='text-gray-500 text-[13px]'> <td className='text-text-secondary text-[13px]'>
{formatTime(doc.created_at, t('datasetHitTesting.dateTimeFormat') as string)} {formatTime(doc.created_at, t('datasetHitTesting.dateTimeFormat') as string)}
</td> </td>
<td> <td>

@ -1,8 +1,15 @@
import { createContext, useContext } from 'use-context-selector' import { createContext, useContext, useContextSelector } from 'use-context-selector'
import type { DataSet } from '@/models/datasets' import type { DataSet } from '@/models/datasets'
type DatasetDetailContextValue = {
const DatasetDetailContext = createContext<{ indexingTechnique?: string; dataset?: DataSet; mutateDatasetRes?: () => void }>({}) indexingTechnique?: string
dataset?: DataSet
mutateDatasetRes?: () => void
}
const DatasetDetailContext = createContext<DatasetDetailContextValue>({})
export const useDatasetDetailContext = () => useContext(DatasetDetailContext) export const useDatasetDetailContext = () => useContext(DatasetDetailContext)
/**
 * Reads a derived slice of the dataset-detail context via `useContextSelector`.
 *
 * The hook is generic over the selector's return type, so callers get the
 * selected value back fully typed instead of `any`.
 *
 * NOTE(review): per the use-context-selector docs, a re-render is triggered
 * when the selected value changes referentially. A selector that builds a new
 * array/object on every call (e.g. `s => [s.dataset]`) will therefore not
 * benefit from the selective subscription — prefer `s => s.dataset`.
 *
 * @param selector - Maps the full context value to the part the caller needs.
 * @returns Whatever `selector` returns for the current context value.
 */
export const useDatasetDetailContextWithSelector = <T,>(selector: (value: DatasetDetailContextValue) => T): T => {
  return useContextSelector(DatasetDetailContext, selector)
}
export default DatasetDetailContext export default DatasetDetailContext

@ -8,7 +8,8 @@ const translation = {
addUrl: 'Add URL', addUrl: 'Add URL',
table: { table: {
header: { header: {
fileName: 'FILE NAME', fileName: 'NAME',
chunkingMode: 'CHUNKING MODE',
words: 'WORDS', words: 'WORDS',
hitCount: 'RETRIEVAL COUNT', hitCount: 'RETRIEVAL COUNT',
uploadTime: 'UPLOAD TIME', uploadTime: 'UPLOAD TIME',

@ -7,7 +7,8 @@ const translation = {
addUrl: '添加 URL', addUrl: '添加 URL',
table: { table: {
header: { header: {
fileName: '文件名', fileName: '名称',
chunkingMode: '分段模式',
words: '字符数', words: '字符数',
hitCount: '召回次数', hitCount: '召回次数',
uploadTime: '上传时间', uploadTime: '上传时间',

@ -10,6 +10,12 @@ export enum DataSourceType {
export type DatasetPermission = 'only_me' | 'all_team_members' | 'partial_members' export type DatasetPermission = 'only_me' | 'all_team_members' | 'partial_members'
/**
 * Chunking mode of a dataset, as carried by `DataSet.doc_form`.
 *
 * The identifier spelling ("Chucking") is kept as-is because other modules
 * import the enum under this name.
 */
export enum ChuckingMode {
  /** General text */
  text = 'text_model',
  /** General QA */
  qa = 'qa_model',
  /** Parent-Child */
  parentChild = 'hierarchical_model',
}
export type DataSet = { export type DataSet = {
id: string id: string
name: string name: string
@ -23,6 +29,7 @@ export type DataSet = {
updated_by: string updated_by: string
updated_at: number updated_at: number
app_count: number app_count: number
doc_form: ChuckingMode
document_count: number document_count: number
word_count: number word_count: number
provider: string provider: string
@ -170,7 +177,10 @@ export type IndexingStatusBatchResponse = {
data: IndexingStatusResponse[] data: IndexingStatusResponse[]
} }
export type ProcessMode = 'custom' | 'hierarchical' export enum ProcessMode {
general = 'custom',
parentChild = 'hierarchical',
}
export type ParentMode = 'full-doc' | 'paragraph' export type ParentMode = 'full-doc' | 'paragraph'
@ -269,6 +279,7 @@ export type InitialDocumentDetail = {
export type SimpleDocumentDetail = InitialDocumentDetail & { export type SimpleDocumentDetail = InitialDocumentDetail & {
enabled: boolean enabled: boolean
word_count: number word_count: number
is_qa: boolean // TODO waiting for backend to add this field
error?: string | null error?: string | null
archived: boolean archived: boolean
updated_at: number updated_at: number

Loading…
Cancel
Save