@ -20,6 +20,8 @@ import { PreviewContainer } from '../../preview/container'
import { ChunkContainer , QAPreview } from '../../chunk'
import { PreviewHeader } from '../../preview/header'
import DocumentPicker from '../../common/document-picker'
import { FormattedText } from '../../formatted-text/formatted'
import { PreviewSlice } from '../../formatted-text/flavours/preview-slice'
import s from './index.module.css'
import unescape from './unescape'
import escape from './escape'
@ -27,7 +29,7 @@ import { OptionCard } from './option-card'
import LanguageSelect from './language-select'
import { DelimiterInput , MaxLengthInput , OverlapInput } from './inputs'
import cn from '@/utils/classnames'
import type { CrawlOptions , CrawlResultItem , CreateDocumentReq , CustomFile , FullDocumentDetail , P reProcessingRule, ProcessRule , Rules , createDocumentResponse } from '@/models/datasets'
import type { CrawlOptions , CrawlResultItem , CreateDocumentReq , CustomFile , FullDocumentDetail , P arentMode, P reProcessingRule, ProcessRule , Rules , createDocumentResponse } from '@/models/datasets'
import Button from '@/app/components/base/button'
import FloatRightContainer from '@/app/components/base/float-right-container'
@ -38,7 +40,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen
import Toast from '@/app/components/base/toast'
import type { NotionPage } from '@/models/common'
import { DataSourceProvider } from '@/models/common'
import { DataSourceType, DocForm } from '@/models/datasets'
import { ChuckingMode, DataSourceType } from '@/models/datasets'
import { useDatasetDetailContext } from '@/context/dataset-detail'
import I18n from '@/context/i18n'
import { RETRIEVE_METHOD } from '@/types/app'
@ -96,7 +98,7 @@ export enum IndexingType {
// Default segment delimiter, stored in its escaped textual form (backslash-n twice).
// It seeds the segment-identifier state and is the fallback when the identifier is set to an empty value.
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
type ParentChildConfig = {
chunkForContext : 'paragraph' | 'full_doc'
chunkForContext : ParentMode
parent : {
delimiter : string
maxLength : number
@ -111,11 +113,11 @@ const defaultParentChildConfig: ParentChildConfig = {
chunkForContext : 'paragraph' ,
parent : {
delimiter : '\\n\\n' ,
maxLength : 40 00,
maxLength : 5 00,
} ,
child : {
delimiter : '\\n\\n' ,
maxLength : 40 00,
maxLength : 2 00,
} ,
}
@ -148,7 +150,7 @@ const StepTwo = ({
const { dataset : currentDataset , mutateDatasetRes } = useDatasetDetailContext ( )
const isInCreatePage = ! datasetId || ( datasetId && ! currentDataset ? . data_source_type )
const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
const [ segmentationType , setSegmentationType ] = useState < SegmentType > ( SegmentType . AUTO )
const [ segmentationType , setSegmentationType ] = useState < SegmentType > ( SegmentType . CUSTOM )
const [ segmentIdentifier , doSetSegmentIdentifier ] = useState ( DEFAULT_SEGMENT_IDENTIFIER )
const setSegmentIdentifier = useCallback ( ( value : string ) = > {
doSetSegmentIdentifier ( value ? escape ( value ) : DEFAULT_SEGMENT_IDENTIFIER )
@ -168,9 +170,14 @@ const StepTwo = ({
// QA Related
const [ isLanguageSelectDisabled , setIsLanguageSelectDisabled ] = useState ( false )
const [ docForm , setDocForm ] = useState < DocForm | string > (
( datasetId && documentDetail ) ? documentDetail .doc_form : DocForm.TEXT ,
const [ docForm , setDocForm ] = useState < ChuckingMode > (
( datasetId && documentDetail ) ? documentDetail . doc_form as ChuckingMode : ChuckingMode.text ,
)
// Switch the chunking mode and reset the estimate mutation in the same step
// (presumably so results produced for the previous mode are discarded).
const handleChangeDocform = (value: ChuckingMode) => {
  setDocForm(value)
  // The lint rule is silenced because `currentEstimateMutation` is referenced before its declaration.
  // eslint-disable-next-line @typescript-eslint/no-use-before-define
  currentEstimateMutation.reset()
}
const [ docLanguage , setDocLanguage ] = useState < string > (
( datasetId && documentDetail ) ? documentDetail . doc_language : ( locale !== LanguagesSupported [ 1 ] ? 'English' : 'Chinese' ) ,
@ -180,28 +187,42 @@ const StepTwo = ({
// Resolve the indexing technique: `indexingType` wins when truthy, otherwise fall back to `indexType`.
const getIndexing_technique = () => {
  return indexingType || indexType
}
/**
 * Builds the process-rule payload from the current chunking settings.
 *
 * - Parent-child mode: sends the parent segmentation, the parent mode and the
 *   child (sub-chunk) segmentation with mode `'hierarchical'`.
 * - Any other mode: sends the single-level segmentation with the currently
 *   selected segmentation type as the mode.
 *
 * Delimiters are kept in escaped form in component state, so every separator
 * is passed through `unescape` before it is put into the payload.
 *
 * @returns the process rule to send with estimate / create requests
 */
const getProcessRule = (): ProcessRule => {
  if (docForm === ChuckingMode.parentChild) {
    const { parent, child, chunkForContext } = parentChildConfig
    return {
      rules: {
        pre_processing_rules: rules,
        segmentation: {
          separator: unescape(parent.delimiter),
          max_tokens: parent.maxLength,
          chunk_overlap: overlap,
        },
        parent_mode: chunkForContext,
        subchunk_segmentation: {
          // The child delimiter is stored escaped exactly like the parent one,
          // so it needs the same unescape step before being sent.
          separator: unescape(child.delimiter),
          max_tokens: child.maxLength,
        },
      }, // api will check this. It will be removed after api refactored.
      mode: 'hierarchical',
    } as ProcessRule
  }

  return {
    rules: {
      pre_processing_rules: rules,
      segmentation: {
        separator: unescape(segmentIdentifier),
        max_tokens: maxChunkLength,
        chunk_overlap: overlap,
      },
    }, // api will check this. It will be removed after api refactored.
    mode: segmentationType,
  } as ProcessRule
}
const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile ( {
docForm : docForm as DocForm ,
docForm ,
docLanguage ,
dataSourceType : DataSourceType.FILE ,
files ,
@ -210,7 +231,7 @@ const StepTwo = ({
dataset_id : datasetId ! ,
} )
const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion ( {
docForm : docForm as DocForm ,
docForm ,
docLanguage ,
dataSourceType : DataSourceType.NOTION ,
notionPages ,
@ -220,7 +241,7 @@ const StepTwo = ({
} )
const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb ( {
docForm : docForm as DocForm ,
docForm ,
docLanguage ,
dataSourceType : DataSourceType.WEB ,
websitePages ,
@ -481,29 +502,11 @@ const StepTwo = ({
isSetting && onSave && onSave ( )
}
// Map the QA toggle onto the document form: QA when enabled, plain text otherwise.
const handleDocformSwitch = (isQAMode: boolean) => {
  setDocForm(isQAMode ? DocForm.QA : DocForm.TEXT)
}
// Disable the language selector, then request a fresh estimate.
const previewSwitch = () => {
  setIsLanguageSelectDisabled(true)
  fetchEstimate()
}
// Store the newly selected document language; in QA mode the preview is re-run as well.
const handleSelect = (language: string) => {
  setDocLanguage(language)
  // Only QA mode needs to re-cut after a language switch.
  if (docForm !== DocForm.QA)
    return
  previewSwitch()
}
// Select the economical index type, unless the index type has already been set.
// When the current chunking mode is QA it is switched back to text through
// handleChangeDocform, so the estimate mutation is reset together with the mode.
const changeToEconomicalType = () => {
  if (hasSetIndexType)
    return
  setIndexType(IndexingType.ECONOMICAL)
  if (docForm === ChuckingMode.qa)
    handleChangeDocform(ChuckingMode.text)
}
@ -519,11 +522,6 @@ const StepTwo = ({
// eslint-disable-next-line react-hooks/exhaustive-deps
} , [ ] )
useEffect ( ( ) = > {
if ( indexingType === IndexingType . ECONOMICAL && docForm === DocForm . QA )
setDocForm ( DocForm . TEXT )
} , [ indexingType , docForm ] )
useEffect ( ( ) = > {
// get indexing type by props
if ( indexingType )
@ -557,8 +555,8 @@ const StepTwo = ({
icon = { < Image src = { SettingCog } alt = { t ( 'datasetCreation.stepTwo.general' ) } / > }
activeHeaderClassName = 'bg-gradient-to-r from-[#EFF0F9] to-[#F9FAFB]'
description = { t ( 'datasetCreation.stepTwo.generalTip' ) }
isActive = { SegmentType. AUTO === segmentationType }
on Click= { ( ) = > setSegmentationType ( SegmentType . AUTO ) }
isActive = { docForm === ChuckingMode . qa || docForm === ChuckingMode . text }
on Select= { ( ) = > handleChangeDocform ( ChuckingMode . text ) }
actions = {
< >
< Button variant = { 'secondary-accent' } onClick = { ( ) = > updatePreview ( ) } >
@ -607,12 +605,12 @@ const StepTwo = ({
{ IS_CE_EDITION && < >
< div className = 'flex items-center' >
< Checkbox
checked = { docForm === DocForm. QA }
checked = { docForm === ChuckingMode. qa }
onCheck = { ( ) = > {
if ( docForm === DocForm. QA )
setDocForm( DocForm . TEXT )
if ( docForm === ChuckingMode. qa )
handleChangeDocform( ChuckingMode . text )
else
setDocForm( DocForm . QA )
handleChangeDocform( ChuckingMode . qa )
} }
className = 'mr-2'
/ >
@ -630,7 +628,7 @@ const StepTwo = ({
< Tooltip popupContent = { t ( 'datasetCreation.stepTwo.QATip' ) } / >
< / div >
< / div >
{ docForm === DocForm. QA && (
{ docForm === ChuckingMode. qa && (
< div
style = { {
background : 'linear-gradient(92deg, rgba(247, 144, 9, 0.1) 0%, rgba(255, 255, 255, 0.00) 100%)' ,
@ -652,8 +650,8 @@ const StepTwo = ({
effectImg = { OrangeEffect . src }
activeHeaderClassName = 'bg-gradient-to-r from-[#F9F1EE] to-[#F9FAFB]'
description = { t ( 'datasetCreation.stepTwo.parentChildTip' ) }
isActive = { SegmentType. CUSTOM === segmentationType }
on Click= { ( ) = > setSegmentationType ( SegmentType . CUSTOM ) }
isActive = { docForm === ChuckingMode . parentChild }
on Selected= { ( ) = > handleChangeDocform ( ChuckingMode . parentChild ) }
actions = {
< >
< Button variant = { 'secondary-accent' } onClick = { ( ) = > updatePreview ( ) } >
@ -714,10 +712,10 @@ const StepTwo = ({
onChosen = { ( ) = > setParentChildConfig (
{
. . . parentChildConfig ,
chunkForContext : 'full _ doc',
chunkForContext : 'full - doc',
} ,
) }
isChosen = { parentChildConfig . chunkForContext === 'full _ doc'}
isChosen = { parentChildConfig . chunkForContext === 'full - doc'}
/ >
< / div >
@ -924,24 +922,51 @@ const StepTwo = ({
< Badge text = '276 Estimated chunks' / >
< / div >
< / PreviewHeader > }
className = { cn ( s . previewWrap , isMobile && s . isMobile , 'relative h-full overflow-y-scroll space-y-4' ) }
className = { cn ( s . previewWrap , isMobile && s . isMobile , 'relative h-full overflow-y-scroll' ) }
mainClassName = 'space-y-6'
>
{ docForm === DocForm. QA && estimate ? . qa_preview && (
{ docForm === ChuckingMode. qa && estimate ? . qa_preview && (
estimate ? . qa_preview . map ( item = > (
< QAPreview key = { item . question } qa = { item } / >
) )
) }
{ docForm === DocForm. TEXT && estimate ? . preview && (
{ docForm === ChuckingMode. text && estimate ? . preview && (
estimate ? . preview . map ( ( item , index ) = > (
< ChunkContainer
key = { item }
key = { item .content }
label = { ` Chunk- ${ index + 1 } ` }
characterCount = { item . length}
characterCount = { item . content. length}
>
{ item }
{ item .content }
< / ChunkContainer >
) )
) }
{/* Parent-child preview: one container per parent chunk, with each child chunk rendered as a labelled slice. */}
{docForm === ChuckingMode.parentChild && currentEstimateMutation.data?.preview && (
  estimate?.preview?.map((item, index) => {
    const indexForLabel = index + 1
    return (
      <ChunkContainer
        // Position is part of the key because two chunks may carry identical text,
        // and React requires keys to be unique among siblings.
        key={`${indexForLabel}-${item.content}`}
        label={`Chunk-${indexForLabel}`}
        characterCount={item.content.length}
      >
        <FormattedText>
          {item.child_chunks.map((child, childIndex) => {
            // Distinct names avoid shadowing the parent chunk's index and label.
            const childIndexForLabel = childIndex + 1
            return (
              <PreviewSlice
                key={`${childIndexForLabel}-${child}`}
                label={`C-${childIndexForLabel}`}
                text={child}
                tooltip={`Child-chunk-${childIndexForLabel} · ${child.length} Characters`}
              />
            )
          })}
        </FormattedText>
      </ChunkContainer>
    )
  })
)}
{ currentEstimateMutation . isIdle && (
< div className = 'h-full w-full flex items-center justify-center' >
< div className = 'flex flex-col items-center justify-center gap-3' >