@ -31,17 +31,17 @@ import LanguageSelect from './language-select'
import { DelimiterInput , MaxLengthInput , OverlapInput } from './inputs'
import cn from '@/utils/classnames'
import type { CrawlOptions , CrawlResultItem , CreateDocumentReq , CustomFile , DocumentItem , FullDocumentDetail , ParentMode , PreProcessingRule , ProcessRule , Rules , createDocumentResponse } from '@/models/datasets'
import { ChunkingMode , DataSourceType , ProcessMode } from '@/models/datasets'
import Button from '@/app/components/base/button'
import FloatRightContainer from '@/app/components/base/float-right-container'
import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
import { type RetrievalConfig } from '@/types/app'
import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
import Toast from '@/app/components/base/toast'
import type { NotionPage } from '@/models/common'
import { DataSourceProvider } from '@/models/common'
import { ChunkingMode , DataSourceType , RerankingModeEnum } from '@/models/datasets'
import { useDatasetDetailContext } from '@/context/dataset-detail'
import I18n from '@/context/i18n'
import { RETRIEVE_METHOD } from '@/types/app'
@ -90,17 +90,13 @@ type StepTwoProps = {
onCancel ? : ( ) = > void
}
/**
 * Segmentation type values that the `segmentationType` state is compared against
 * in this file (e.g. the `SegmentType.CUSTOM` checks guarding the chunk-length
 * and overlap validation).
 */
export enum SegmentType {
// String value 'automatic'.
AUTO = 'automatic' ,
// String value 'custom'; used as an initial `segmentationType` state value in this file.
CUSTOM = 'custom' ,
}
/**
 * Indexing technique values. In this file the result of `getIndexing_technique()`
 * is compared with `QUALIFIED` to choose between `RetrievalMethodConfig` and
 * `EconomicalRetrievalMethodConfig`, and `setIndexType` receives one of these values.
 */
export enum IndexingType {
// String value 'high_quality'.
QUALIFIED = 'high_quality' ,
// String value 'economy'.
ECONOMICAL = 'economy' ,
}
// Initial value of the `segmentIdentifier` state and the fallback used by
// `setSegmentIdentifier` when it receives an empty value without `canEmpty`.
// The literal holds backslash + 'n' twice (an escaped form), not two newline characters.
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
// Initial value passed to `useState` for `maxChunkLength`.
// NOTE(review): the next two declarations differ only in how the identifier is
// spelled, and neither spelling is a single valid identifier as written. This looks
// like the before/after pair of a rename — confirm which one should remain.
const DEFAULT_MAX M IMUM_CHUNK_LENGTH = 500
const DEFAULT_MAX IMUM_CHUNK_LENGTH = 500
// Initial value passed to `useState` for `overlap`.
const DEFAULT_OVERLAP = 50
type ParentChildConfig = {
@ -131,7 +127,6 @@ const StepTwo = ({
isSetting ,
documentDetail ,
isAPIKeySet ,
onSetting ,
datasetId ,
indexingType ,
dataSourceType : inCreatePageDataSourceType ,
@ -162,12 +157,12 @@ const StepTwo = ({
const isInCreatePage = ! datasetId || ( datasetId && ! currentDataset ? . data_source_type )
const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
const [ segmentationType , setSegmentationType ] = useState < SegmentType> ( SegmentType . CUSTOM )
const [ segmentationType , setSegmentationType ] = useState < ProcessMode> ( ProcessMode . general )
const [ segmentIdentifier , doSetSegmentIdentifier ] = useState ( DEFAULT_SEGMENT_IDENTIFIER )
// Stores a new segment identifier.
// - A non-empty `value` is passed through `escape` before being stored.
// - An empty `value` is stored as '' when `canEmpty` is truthy; otherwise the
//   default identifier is stored instead.
const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
  let nextIdentifier: string
  if (value)
    nextIdentifier = escape(value)
  else if (canEmpty)
    nextIdentifier = ''
  else
    nextIdentifier = DEFAULT_SEGMENT_IDENTIFIER
  doSetSegmentIdentifier(nextIdentifier)
}, [])
const [ maxChunkLength , setMaxChunkLength ] = useState ( DEFAULT_MAX M IMUM_CHUNK_LENGTH) // default chunk length
const [ maxChunkLength , setMaxChunkLength ] = useState ( DEFAULT_MAX IMUM_CHUNK_LENGTH) // default chunk length
const [ limitMaxChunkLength , setLimitMaxChunkLength ] = useState ( 4000 )
const [ overlap , setOverlap ] = useState ( DEFAULT_OVERLAP )
const [ rules , setRules ] = useState < PreProcessingRule [ ] > ( [ ] )
@ -198,7 +193,6 @@ const StepTwo = ({
)
// QA Related
const [ isLanguageSelectDisabled , _setIsLanguageSelectDisabled ] = useState ( false )
const [ isQAConfirmDialogOpen , setIsQAConfirmDialogOpen ] = useState ( false )
const [ docForm , setDocForm ] = useState < ChunkingMode > (
( datasetId && documentDetail ) ? documentDetail . doc_form as ChunkingMode : ChunkingMode.text ,
@ -348,7 +342,7 @@ const StepTwo = ({
}
const updatePreview = ( ) = > {
if ( segmentationType === SegmentType. CUSTOM && maxChunkLength > 4000 ) {
if ( segmentationType === ProcessMode. general && maxChunkLength > 4000 ) {
Toast . notify ( { type : 'error' , message : t ( 'datasetCreation.stepTwo.maxLengthCheck' ) } )
return
}
@ -373,13 +367,42 @@ const StepTwo = ({
model : defaultEmbeddingModel?.model || '' ,
} ,
)
// Retrieval settings held in component state. The initial value is the current
// dataset's `retrieval_model_dict` when it is truthy; otherwise a default using
// semantic search, reranking disabled, empty reranking model names, top_k 3 and
// a disabled score threshold of 0.5.
const [ retrievalConfig , setRetrievalConfig ] = useState ( currentDataset ? . retrieval_model_dict || {
search_method : RETRIEVE_METHOD.semantic ,
reranking_enable : false ,
reranking_model : {
reranking_provider_name : '' ,
reranking_model_name : '' ,
} ,
top_k : 3 ,
score_threshold_enabled : false ,
score_threshold : 0.5 ,
} as RetrievalConfig )
// Re-applies the default retrieval config whenever `rerankDefaultModel` or
// `isRerankDefaultModelValid` changes, so the default rerank model can be filled in
// once it is known.
useEffect ( ( ) = > {
// A dataset that already carries its own retrieval settings is left untouched.
if ( currentDataset ? . retrieval_model_dict )
return
// NOTE(review): this replaces the whole config object, so any value previously set
// through `setRetrievalConfig` is overwritten when the dependencies change — confirm
// that is intended.
setRetrievalConfig ( {
search_method : RETRIEVE_METHOD.semantic ,
// Reranking is enabled only when the default rerank model is reported as valid.
reranking_enable : ! ! isRerankDefaultModelValid ,
reranking_model : {
// Provider/model names fall back to '' when the default model is invalid or missing.
reranking_provider_name : isRerankDefaultModelValid ? rerankDefaultModel ? . provider . provider ? ? '' : '' ,
reranking_model_name : isRerankDefaultModelValid ? rerankDefaultModel ? . model ? ? '' : '' ,
} ,
top_k : 3 ,
score_threshold_enabled : false ,
score_threshold : 0.5 ,
} )
// `currentDataset` is read above but deliberately left out of the dependency list,
// hence the lint suppression below.
// eslint-disable-next-line react-hooks/exhaustive-deps
} , [ rerankDefaultModel , isRerankDefaultModelValid ] )
const getCreationParams = ( ) = > {
let params
if ( segmentationType === SegmentType . CUSTOM && overlap > maxChunkLength ) {
if ( segmentationType === ProcessMode. general && overlap > maxChunkLength ) {
Toast . notify ( { type : 'error' , message : t ( 'datasetCreation.stepTwo.overlapCheck' ) } )
return
}
if ( segmentationType === SegmentType . CUSTOM && maxChunkLength > limitMaxChunkLength ) {
if ( segmentationType === ProcessMode. general && maxChunkLength > limitMaxChunkLength ) {
Toast . notify ( { type : 'error' , message : t ( 'datasetCreation.stepTwo.maxLengthCheck' , { limit : limitMaxChunkLength } ) } )
return
}
@ -389,7 +412,6 @@ const StepTwo = ({
doc_form : currentDocForm ,
doc_language : docLanguage ,
process_rule : getProcessRule ( ) ,
// eslint-disable-next-line @typescript-eslint/no-use-before-define
retrieval_model : retrievalConfig , // Readonly. If want to changed, just go to settings page.
embedding_model : embeddingModel.model , // Readonly
embedding_model_provider : embeddingModel.provider , // Readonly
@ -400,10 +422,7 @@ const StepTwo = ({
const indexMethod = getIndexing_technique ( )
if (
! isReRankModelSelected ( {
rerankDefaultModel ,
isRerankDefaultModelValid : ! ! isRerankDefaultModelValid ,
rerankModelList ,
// eslint-disable-next-line @typescript-eslint/no-use-before-define
retrievalConfig ,
indexMethod : indexMethod as string ,
} )
@ -411,16 +430,6 @@ const StepTwo = ({
Toast . notify ( { type : 'error' , message : t ( 'appDebug.datasetConfig.rerankModelRequired' ) } )
return
}
const postRetrievalConfig = ensureRerankModelSelected ( {
rerankDefaultModel : rerankDefaultModel ! ,
retrievalConfig : {
// eslint-disable-next-line @typescript-eslint/no-use-before-define
. . . retrievalConfig ,
// eslint-disable-next-line @typescript-eslint/no-use-before-define
reranking_enable : retrievalConfig.reranking_mode === RerankingModeEnum . RerankingModel ,
} ,
indexMethod : indexMethod as string ,
} )
params = {
data_source : {
type : dataSourceType ,
@ -432,8 +441,7 @@ const StepTwo = ({
process_rule : getProcessRule ( ) ,
doc_form : currentDocForm ,
doc_language : docLanguage ,
retrieval_model : postRetrievalConfig ,
retrieval_model : retrievalConfig ,
embedding_model : embeddingModel.model ,
embedding_model_provider : embeddingModel.provider ,
} as CreateDocumentReq
@ -490,7 +498,6 @@ const StepTwo = ({
// When a document detail is available, copies its stored process-rule mode into
// the `segmentationType` state; does nothing otherwise.
const getDefaultMode = () => {
  if (!documentDetail)
    return
  // @ts-expect-error fix after api refactored
  setSegmentationType(documentDetail.dataset_process_rule.mode)
}
@ -525,7 +532,6 @@ const StepTwo = ({
onSuccess ( data ) {
updateIndexingTypeCache && updateIndexingTypeCache ( indexType as string )
updateResultCache && updateResultCache ( data )
// eslint-disable-next-line @typescript-eslint/no-use-before-define
updateRetrievalMethodCache && updateRetrievalMethodCache ( retrievalConfig . search_method as string )
} ,
} ,
@ -545,14 +551,6 @@ const StepTwo = ({
isSetting && onSave && onSave ( )
}
// Switches the index type to ECONOMICAL, but only while the chunking mode is
// `text` and `hasSetIndexType` is falsy; in every other case it is a no-op.
const changeToEconomicalType = () => {
  const isTextMode = docForm === ChunkingMode.text
  if (isTextMode && !hasSetIndexType)
    setIndexType(IndexingType.ECONOMICAL)
}
useEffect ( ( ) = > {
// fetch rules
if ( ! isSetting ) {
@ -574,18 +572,6 @@ const StepTwo = ({
setIndexType ( isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL )
} , [ isAPIKeySet , indexingType , datasetId ] )
// Retrieval settings held in component state, initialised from the current dataset's
// `retrieval_model_dict` when it is truthy, otherwise from a semantic-search default.
// NOTE(review): `retrievalConfig` / `setRetrievalConfig` are also declared earlier in
// this file; this looks like the pre-change copy of that declaration — confirm which
// one should remain.
const [ retrievalConfig , setRetrievalConfig ] = useState ( currentDataset ? . retrieval_model_dict || {
search_method : RETRIEVE_METHOD.semantic ,
reranking_enable : false ,
reranking_model : {
// Optional chaining: both names evaluate to `undefined` while `rerankDefaultModel`
// is not set, and `useState` only uses this initial value once.
reranking_provider_name : rerankDefaultModel?.provider.provider ,
reranking_model_name : rerankDefaultModel?.model ,
} ,
top_k : 3 ,
score_threshold_enabled : false ,
score_threshold : 0.5 ,
} as RetrievalConfig )
const economyDomRef = useRef < HTMLDivElement > ( null )
const isHoveringEconomy = useHover ( economyDomRef )
@ -984,12 +970,14 @@ const StepTwo = ({
getIndexing_technique ( ) === IndexingType . QUALIFIED
? (
< RetrievalMethodConfig
disabled = { ! ! datasetId }
value = { retrievalConfig }
onChange = { setRetrievalConfig }
/ >
)
: (
< EconomicalRetrievalMethodConfig
disabled = { ! ! datasetId }
value = { retrievalConfig }
onChange = { setRetrievalConfig }
/ >
@ -1010,7 +998,7 @@ const StepTwo = ({
)
: (
< div className = 'flex items-center mt-8 py-2' >
<Button loading = { isCreating } variant = 'primary' onClick = { createHandle } > { t ( 'datasetCreation.stepTwo.save' ) } < / Button >
{! datasetId && <Button loading = { isCreating } variant = 'primary' onClick = { createHandle } > { t ( 'datasetCreation.stepTwo.save' ) } < / Button > }
< Button className = 'ml-2' onClick = { onCancel } > { t ( 'datasetCreation.stepTwo.cancel' ) } < / Button >
< / div >
) }
@ -1081,11 +1069,11 @@ const StepTwo = ({
}
{
currentDocForm !== ChunkingMode . qa
&& < Badge text = { t (
'datasetCreation.stepTwo.previewChunkCount' , {
count : estimate?.total_segments || 0 ,
} ) as string }
/ >
&& < Badge text = { t (
'datasetCreation.stepTwo.previewChunkCount' , {
count : estimate?.total_segments || 0 ,
} ) as string }
/ >
}
< / div >
< / PreviewHeader > }