From 6cd231cfd800f63f82b78ac711499a64832611f6 Mon Sep 17 00:00:00 2001
From: Dongyu Li <544104925@qq.com>
Date: Thu, 29 May 2025 15:23:03 +0800
Subject: [PATCH] feat(kb api): retrieve api add metadata_filtering_conditions
parameters
---
api/core/rag/retrieval/dataset_retrieval.py | 3 +
api/services/hit_testing_service.py | 25 ++++++-
.../datasets/template/template.en.mdx | 69 ++++++++++++++-----
.../datasets/template/template.ja.mdx | 65 +++++++++++++----
.../datasets/template/template.zh.mdx | 37 +++++++++-
5 files changed, 164 insertions(+), 35 deletions(-)
diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py
index c4adf6de4d..e31bc519f3 100644
--- a/api/core/rag/retrieval/dataset_retrieval.py
+++ b/api/core/rag/retrieval/dataset_retrieval.py
@@ -936,6 +936,9 @@ class DatasetRetrieval:
return metadata_filter_document_ids, metadata_condition
def _replace_metadata_filter_value(self, text: str, inputs: dict) -> str:
+ if not inputs:
+ return text
+
def replacer(match):
key = match.group(1)
return str(inputs.get(key, f"{{{{{key}}}}}"))
diff --git a/api/services/hit_testing_service.py b/api/services/hit_testing_service.py
index 56e06cc33e..f76e1202e7 100644
--- a/api/services/hit_testing_service.py
+++ b/api/services/hit_testing_service.py
@@ -4,6 +4,7 @@ from typing import Any
from core.rag.datasource.retrieval_service import RetrievalService
from core.rag.models.document import Document
+from core.rag.retrieval.dataset_retrieval import DatasetRetrieval
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_database import db
from models.account import Account
@@ -34,7 +35,28 @@ class HitTestingService:
# get retrieval model , if the model is not setting , using default
if not retrieval_model:
retrieval_model = dataset.retrieval_model or default_retrieval_model
-
+ document_ids_filter = None
+ metadata_filtering_conditions = retrieval_model.get("metadata_filtering_conditions", {})
+ if metadata_filtering_conditions:
+ dataset_retrieval = DatasetRetrieval()
+
+ from core.app.app_config.entities import MetadataFilteringCondition
+ metadata_filtering_conditions = MetadataFilteringCondition(**metadata_filtering_conditions)
+
+ metadata_filter_document_ids, metadata_condition = dataset_retrieval.get_metadata_filter_condition(
+ dataset_ids=[dataset.id],
+ query=query,
+ metadata_filtering_mode="manual",
+ metadata_filtering_conditions=metadata_filtering_conditions,
+ inputs={},
+ tenant_id=None,
+ user_id=None,
+ metadata_model_config=None
+ )
+ if metadata_filter_document_ids:
+ document_ids_filter = metadata_filter_document_ids.get(dataset.id, [])
+ if metadata_condition and not document_ids_filter:
+ return cls.compact_retrieve_response(query, [])
all_documents = RetrievalService.retrieve(
retrieval_method=retrieval_model.get("search_method", "semantic_search"),
dataset_id=dataset.id,
@@ -48,6 +70,7 @@ class HitTestingService:
else None,
reranking_mode=retrieval_model.get("reranking_mode") or "reranking_model",
weights=retrieval_model.get("weights", None),
+ document_ids_filter=document_ids_filter
)
end = time.perf_counter()
diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx
index 806657c507..1a0979b412 100644
--- a/web/app/(commonLayout)/datasets/template/template.en.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.en.mdx
@@ -1841,20 +1841,45 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
Query keyword
- Retrieval model (optional, if not filled, it will be recalled according to the default method)
- - search_method (text) Search method: One of the following four keywords is required
- - keyword_search Keyword search
- - semantic_search Semantic search
- - full_text_search Full-text search
- - hybrid_search Hybrid search
- - reranking_enable (bool) Whether to enable reranking, required if the search mode is semantic_search or hybrid_search (optional)
- - reranking_mode (object) Rerank model configuration, required if reranking is enabled
- - reranking_provider_name (string) Rerank model provider
- - reranking_model_name (string) Rerank model name
- - weights (float) Semantic search weight setting in hybrid search mode
- - top_k (integer) Number of results to return (optional)
- - score_threshold_enabled (bool) Whether to enable score threshold
- - score_threshold (float) Score threshold
+ Retrieval parameters (optional; if omitted, the default retrieval method is used)
+ - search_method (text) Search method: One of the following four keywords is required
+ - keyword_search Keyword search
+ - semantic_search Semantic search
+ - full_text_search Full-text search
+ - hybrid_search Hybrid search
+ - reranking_enable (bool) Whether to enable reranking (optional in general; must be specified if the search mode is semantic_search or hybrid_search)
+ - reranking_mode (object) Rerank model configuration, required if reranking is enabled
+ - reranking_provider_name (string) Rerank model provider
+ - reranking_model_name (string) Rerank model name
+ - weights (float) Semantic search weight setting in hybrid search mode
+ - top_k (integer) Number of results to return (optional)
+ - score_threshold_enabled (bool) Whether to enable score threshold
+ - score_threshold (float) Score threshold
+ - metadata_filtering_conditions (object) Metadata filtering conditions
+ - logical_operator (string) Logical operator: and | or
+ - conditions (array[object]) Conditions list
+ - name (string) Metadata field name
+ - comparison_operator (string) Comparison operator, allowed values:
+ - String comparison:
+ - contains: Contains
+ - not contains: Does not contain
+ - start with: Starts with
+ - end with: Ends with
+ - is: Equals
+ - is not: Does not equal
+ - empty: Is empty
+ - not empty: Is not empty
+ - Numeric comparison:
+ - =: Equals
+ - ≠: Does not equal
+ - >: Greater than
+ - < : Less than
+ - ≥: Greater than or equal
+ - ≤: Less than or equal
+ - Time comparison:
+ - before: Before
+ - after: After
+ - value (string|number|null) Comparison value
Unused field
@@ -1879,7 +1904,17 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
"weights": null,
"top_k": 1,
"score_threshold_enabled": false,
- "score_threshold": null
+ "score_threshold": null,
+ "metadata_filtering_conditions": {
+ "logical_operator": "and",
+ "conditions": [
+ {
+ "name": "document_name",
+ "comparison_operator": "contains",
+ "value": "test"
+ }
+ ]
+ }
}
}'`}
>
@@ -2159,9 +2194,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
label="/datasets/{dataset_id}/documents/metadata"
targetCode={`curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/metadata' \\\n--header 'Authorization: Bearer {api_key}' \\\n--header 'Content-Type: application/json'\\\n--data-raw '{"operation_data": [{"document_id": "document_id", "metadata_list": [{"id": "id", "value": "value", "name": "name"}]}]}'`}
>
- ```bash {{ title: 'cURL' }}
- ```
-
+ ```bash {{ title: 'cURL' }}
diff --git a/web/app/(commonLayout)/datasets/template/template.ja.mdx b/web/app/(commonLayout)/datasets/template/template.ja.mdx
index bffc91316c..f194c8012e 100644
--- a/web/app/(commonLayout)/datasets/template/template.ja.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.ja.mdx
@@ -1596,20 +1596,45 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
クエリキーワード
- 検索モデル (オプション、入力されない場合はデフォルトの方法でリコールされます)
- - search_method (text) 検索方法: 以下の 4 つのキーワードのいずれかが必要です
- - keyword_search キーワード検索
- - semantic_search セマンティック検索
- - full_text_search 全文検索
- - hybrid_search ハイブリッド検索
- - reranking_enable (bool) 再ランキングを有効にするかどうか、検索モードが semantic_search または hybrid_search の場合に必須 (オプション)
- - reranking_mode (object) 再ランキングモデル構成、再ランキングが有効な場合に必須
- - reranking_provider_name (string) 再ランキングモデルプロバイダー
- - reranking_model_name (string) 再ランキングモデル名
- - weights (float) ハイブリッド検索モードでのセマンティック検索の重み設定
- - top_k (integer) 返される結果の数 (オプション)
- - score_threshold_enabled (bool) スコア閾値を有効にするかどうか
- - score_threshold (float) スコア閾値
+ 検索パラメータ(オプション、入力されない場合はデフォルトの方法でリコールされます)
+ - search_method (text) 検索方法: 以下の4つのキーワードのいずれかが必要です
+ - keyword_search キーワード検索
+ - semantic_search セマンティック検索
+ - full_text_search 全文検索
+ - hybrid_search ハイブリッド検索
+ - reranking_enable (bool) 再ランキングを有効にするかどうか、検索モードがsemantic_searchまたはhybrid_searchの場合に必須(オプション)
+ - reranking_mode (object) 再ランキングモデル構成、再ランキングが有効な場合に必須
+ - reranking_provider_name (string) 再ランキングモデルプロバイダー
+ - reranking_model_name (string) 再ランキングモデル名
+ - weights (float) ハイブリッド検索モードでのセマンティック検索の重み設定
+ - top_k (integer) 返される結果の数(オプション)
+ - score_threshold_enabled (bool) スコア閾値を有効にするかどうか
+ - score_threshold (float) スコア閾値
+ - metadata_filtering_conditions (object) メタデータフィルタリング条件
+ - logical_operator (string) 論理演算子: and | or
+ - conditions (array[object]) 条件リスト
+ - name (string) メタデータフィールド名
+ - comparison_operator (string) 比較演算子、許可される値:
+ - 文字列比較:
+ - contains: 含む
+ - not contains: 含まない
+ - start with: で始まる
+ - end with: で終わる
+ - is: 等しい
+ - is not: 等しくない
+ - empty: 空
+ - not empty: 空でない
+ - 数値比較:
+ - =: 等しい
+ - ≠: 等しくない
+ - >: より大きい
+ - < : より小さい
+ - ≥: 以上
+ - ≤: 以下
+ - 時間比較:
+ - before: より前
+ - after: より後
+ - value (string|number|null) 比較値
未使用フィールド
@@ -1634,7 +1659,17 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
"weights": null,
"top_k": 1,
"score_threshold_enabled": false,
- "score_threshold": null
+ "score_threshold": null,
+ "metadata_filtering_conditions": {
+ "logical_operator": "and",
+ "conditions": [
+ {
+ "name": "document_name",
+ "comparison_operator": "contains",
+ "value": "test"
+ }
+ ]
+ }
}
}'`}
>
diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx
index d9ae6ab7bc..95bc4d6aa3 100644
--- a/web/app/(commonLayout)/datasets/template/template.zh.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx
@@ -1896,6 +1896,31 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
- top_k (integer) 返回结果数量,非必填
- score_threshold_enabled (bool) 是否开启 score 阈值
- score_threshold (float) Score 阈值
+ - metadata_filtering_conditions (object) 元数据过滤条件
+ - logical_operator (string) 逻辑运算符: and | or
+ - conditions (array[object]) 条件列表
+ - name (string) 元数据字段名
+ - comparison_operator (string) 比较运算符,可选值:
+ - 字符串比较:
+ - contains: 包含
+ - not contains: 不包含
+ - start with: 以...开头
+ - end with: 以...结尾
+ - is: 等于
+ - is not: 不等于
+ - empty: 为空
+ - not empty: 不为空
+ - 数值比较:
+ - =: 等于
+ - ≠: 不等于
+ - >: 大于
+ - < : 小于
+ - ≥: 大于等于
+ - ≤: 小于等于
+ - 时间比较:
+ - before: 早于
+ - after: 晚于
+ - value (string|number|null) 比较值
未启用字段
@@ -1920,7 +1945,17 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
"weights": null,
"top_k": 1,
"score_threshold_enabled": false,
- "score_threshold": null
+ "score_threshold": null,
+ "metadata_filtering_conditions": {
+ "logical_operator": "and",
+ "conditions": [
+ {
+ "name": "document_name",
+ "comparison_operator": "contains",
+ "value": "test"
+ }
+ ]
+ }
}
}'`}
>