@ -191,7 +191,8 @@ class ClickzettaVector(BaseVector):
id STRING NOT NULL COMMENT ' Unique document identifier ' ,
{ Field . CONTENT_KEY . value } STRING NOT NULL COMMENT ' Document text content for search and retrieval ' ,
{ Field . METADATA_KEY . value } JSON COMMENT ' Document metadata including source, type, and other attributes ' ,
{ Field . VECTOR . value } VECTOR ( FLOAT , { dimension } ) NOT NULL COMMENT ' High-dimensional embedding vector for semantic similarity search ' ,
{ Field . VECTOR . value } VECTOR ( FLOAT , { dimension } ) NOT NULL COMMENT
' High-dimensional embedding vector for semantic similarity search ' ,
PRIMARY KEY ( id )
) COMMENT ' Dify RAG knowledge base vector storage table for document embeddings and content '
"""
@ -363,13 +364,18 @@ class ClickzettaVector(BaseVector):
# Use parameterized INSERT with executemany for better performance and security
# Cast JSON and VECTOR in SQL, pass raw data as parameters
columns = f " id, { Field . CONTENT_KEY . value } , { Field . METADATA_KEY . value } , { Field . VECTOR . value } "
insert_sql = f " INSERT INTO { self . _config . schema_name } . { self . _table_name } ( { columns } ) VALUES (?, ?, CAST(? AS JSON), CAST(? AS VECTOR( { vector_dimension } ))) "
insert_sql = (
f " INSERT INTO { self . _config . schema_name } . { self . _table_name } ( { columns } ) "
f " VALUES (?, ?, CAST(? AS JSON), CAST(? AS VECTOR( { vector_dimension } ))) "
)
with self . _connection . cursor ( ) as cursor :
try :
cursor . executemany ( insert_sql , data_rows )
logger . info ( f " Inserted batch { batch_index / / batch_size + 1 } / { total_batches } "
f " ( { len ( data_rows ) } valid docs using parameterized query with VECTOR( { vector_dimension } ) cast) " )
logger . info (
f " Inserted batch { batch_index / / batch_size + 1 } / { total_batches } "
f " ( { len ( data_rows ) } valid docs using parameterized query with VECTOR( { vector_dimension } ) cast) "
)
except Exception as e :
logger . exception ( f " Parameterized SQL execution failed for { len ( data_rows ) } documents: { e } " )
logger . exception ( f " SQL template: { insert_sql } " )
@ -445,7 +451,9 @@ class ClickzettaVector(BaseVector):
safe_doc_ids = [ str ( id ) . replace ( " ' " , " ' ' " ) for id in document_ids_filter ]
doc_ids_str = " , " . join ( f " ' { id } ' " for id in safe_doc_ids )
# Use json_extract_string function for ClickZetta compatibility
filter_clauses . append ( f " json_extract_string( { Field . METADATA_KEY . value } , ' $.document_id ' ) IN ( { doc_ids_str } ) " )
filter_clauses . append (
f " json_extract_string( { Field . METADATA_KEY . value } , ' $.document_id ' ) IN ( { doc_ids_str } ) "
)
# No need for dataset_id filter since each dataset has its own table
@ -541,7 +549,9 @@ class ClickzettaVector(BaseVector):
safe_doc_ids = [ str ( id ) . replace ( " ' " , " ' ' " ) for id in document_ids_filter ]
doc_ids_str = " , " . join ( f " ' { id } ' " for id in safe_doc_ids )
# Use json_extract_string function for ClickZetta compatibility
filter_clauses . append ( f " json_extract_string( { Field . METADATA_KEY . value } , ' $.document_id ' ) IN ( { doc_ids_str } ) " )
filter_clauses . append (
f " json_extract_string( { Field . METADATA_KEY . value } , ' $.document_id ' ) IN ( { doc_ids_str } ) "
)
# No need for dataset_id filter since each dataset has its own table
@ -620,7 +630,9 @@ class ClickzettaVector(BaseVector):
safe_doc_ids = [ str ( id ) . replace ( " ' " , " ' ' " ) for id in document_ids_filter ]
doc_ids_str = " , " . join ( f " ' { id } ' " for id in safe_doc_ids )
# Use json_extract_string function for ClickZetta compatibility
filter_clauses . append ( f " json_extract_string( { Field . METADATA_KEY . value } , ' $.document_id ' ) IN ( { doc_ids_str } ) " )
filter_clauses . append (
f " json_extract_string( { Field . METADATA_KEY . value } , ' $.document_id ' ) IN ( { doc_ids_str } ) "
)
# No need for dataset_id filter since each dataset has its own table
@ -724,3 +736,4 @@ class ClickzettaVectorFactory(AbstractVectorFactory):
collection_name = Dataset . gen_collection_name_by_id ( dataset . id ) . lower ( )
return ClickzettaVector ( collection_name = collection_name , config = config )