feat: couchbase integration (#6165)
Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: Elliot Scribner <elliot.scribner@couchbase.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com> Co-authored-by: Bowen Liang <bowenliang@apache.org>pull/9987/head
parent
fc37e654fc
commit
c8ef9223e5
@ -0,0 +1,34 @@
|
|||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class CouchbaseConfig(BaseModel):
    """
    Couchbase configs.

    All fields are optional with a ``None`` default; presence of the values
    is not enforced here (the vector-store client validates them when it is
    constructed).
    """

    # e.g. "couchbase://host" — passed straight to the SDK cluster connect
    COUCHBASE_CONNECTION_STRING: Optional[str] = Field(
        description="COUCHBASE connection string",
        default=None,
    )

    COUCHBASE_USER: Optional[str] = Field(
        description="COUCHBASE user",
        default=None,
    )

    COUCHBASE_PASSWORD: Optional[str] = Field(
        description="COUCHBASE password",
        default=None,
    )

    COUCHBASE_BUCKET_NAME: Optional[str] = Field(
        description="COUCHBASE bucket name",
        default=None,
    )

    COUCHBASE_SCOPE_NAME: Optional[str] = Field(
        description="COUCHBASE scope name",
        default=None,
    )
|
||||||
@ -0,0 +1,378 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from datetime import timedelta
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from couchbase import search
|
||||||
|
from couchbase.auth import PasswordAuthenticator
|
||||||
|
from couchbase.cluster import Cluster
|
||||||
|
from couchbase.management.search import SearchIndex
|
||||||
|
|
||||||
|
# needed for options -- cluster, timeout, SQL++ (N1QL) query, etc.
|
||||||
|
from couchbase.options import ClusterOptions, SearchOptions
|
||||||
|
from couchbase.vector_search import VectorQuery, VectorSearch
|
||||||
|
from flask import current_app
|
||||||
|
from pydantic import BaseModel, model_validator
|
||||||
|
|
||||||
|
from core.rag.datasource.vdb.vector_base import BaseVector
|
||||||
|
from core.rag.datasource.vdb.vector_factory import AbstractVectorFactory
|
||||||
|
from core.rag.datasource.vdb.vector_type import VectorType
|
||||||
|
from core.rag.embedding.embedding_base import Embeddings
|
||||||
|
from core.rag.models.document import Document
|
||||||
|
from extensions.ext_redis import redis_client
|
||||||
|
from models.dataset import Dataset
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CouchbaseConfig(BaseModel):
    """Connection settings for the Couchbase vector store.

    Every field is required; `validate_config` raises a ValueError naming the
    corresponding environment variable when a value is missing or empty.
    """

    connection_string: str
    user: str
    password: str
    bucket_name: str
    scope_name: str

    @model_validator(mode="before")
    @classmethod
    def validate_config(cls, values: dict) -> dict:
        """Reject missing/empty settings with the matching env-var name.

        Raises:
            ValueError: If any required setting is absent or falsy.
        """
        if not values.get("connection_string"):
            raise ValueError("config COUCHBASE_CONNECTION_STRING is required")
        if not values.get("user"):
            raise ValueError("config COUCHBASE_USER is required")
        if not values.get("password"):
            raise ValueError("config COUCHBASE_PASSWORD is required")
        if not values.get("bucket_name"):
            # Fix: the original raised "COUCHBASE_PASSWORD is required" here,
            # a copy-paste error that misreported a missing bucket name.
            raise ValueError("config COUCHBASE_BUCKET_NAME is required")
        if not values.get("scope_name"):
            raise ValueError("config COUCHBASE_SCOPE_NAME is required")
        return values
|
||||||
|
|
||||||
|
|
||||||
|
class CouchbaseVector(BaseVector):
    """Couchbase-backed vector store.

    Documents are stored as ``{"text", "embedding", "metadata"}`` JSON values
    in ``<bucket>.<scope>.<collection>``; a scoped FTS index named
    ``<collection>_search`` serves both vector and full-text queries.
    """

    def __init__(self, collection_name: str, config: CouchbaseConfig):
        super().__init__(collection_name)
        self._client_config = config

        # Connect to couchbase.
        auth = PasswordAuthenticator(config.user, config.password)
        options = ClusterOptions(auth)
        self._cluster = Cluster(config.connection_string, options)
        self._bucket = self._cluster.bucket(config.bucket_name)
        self._scope = self._bucket.scope(config.scope_name)
        self._bucket_name = config.bucket_name
        self._scope_name = config.scope_name

        # Wait until the cluster is ready for use.
        self._cluster.wait_until_ready(timedelta(seconds=5))

    def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
        """Create the collection and its search index, then insert the first batch."""
        index_id = str(uuid.uuid4()).replace("-", "")
        self._create_collection(vector_length=len(embeddings[0]), index_id=index_id)
        self.add_texts(texts, embeddings)

    def _create_collection(self, vector_length: int, index_id: str):
        """Create the collection plus its scoped FTS vector index (idempotent).

        Guarded by a redis lock and a 1-hour "already created" cache key so
        concurrent indexers don't race on collection/index creation.

        Args:
            vector_length: Dimensionality of the embedding vectors.
            index_id: UUID assigned to the new search index.
                (Renamed from ``uuid`` to avoid shadowing the stdlib module.)
        """
        lock_name = "vector_indexing_lock_{}".format(self._collection_name)
        with redis_client.lock(lock_name, timeout=20):
            collection_exist_cache_key = "vector_indexing_{}".format(self._collection_name)
            if redis_client.get(collection_exist_cache_key):
                return
            if self._collection_exists(self._collection_name):
                return
            manager = self._bucket.collections()
            manager.create_collection(self._client_config.scope_name, self._collection_name)

            index_manager = self._scope.search_indexes()

            # Template index definition; name/uuid/dims and the type-mapping
            # key are rewritten below for this collection.
            index_definition = json.loads("""
            {
              "type": "fulltext-index",
              "name": "Embeddings._default.Vector_Search",
              "uuid": "26d4db528e78b716",
              "sourceType": "gocbcore",
              "sourceName": "Embeddings",
              "sourceUUID": "2242e4a25b4decd6650c9c7b3afa1dbf",
              "planParams": {
                "maxPartitionsPerPIndex": 1024,
                "indexPartitions": 1
              },
              "params": {
                "doc_config": {
                  "docid_prefix_delim": "",
                  "docid_regexp": "",
                  "mode": "scope.collection.type_field",
                  "type_field": "type"
                },
                "mapping": {
                  "analysis": { },
                  "default_analyzer": "standard",
                  "default_datetime_parser": "dateTimeOptional",
                  "default_field": "_all",
                  "default_mapping": {
                    "dynamic": true,
                    "enabled": true
                  },
                  "default_type": "_default",
                  "docvalues_dynamic": false,
                  "index_dynamic": true,
                  "store_dynamic": true,
                  "type_field": "_type",
                  "types": {
                    "collection_name": {
                      "dynamic": true,
                      "enabled": true,
                      "properties": {
                        "embedding": {
                          "dynamic": false,
                          "enabled": true,
                          "fields": [
                            {
                              "dims": 1536,
                              "index": true,
                              "name": "embedding",
                              "similarity": "dot_product",
                              "type": "vector",
                              "vector_index_optimized_for": "recall"
                            }
                          ]
                        },
                        "metadata": {
                          "dynamic": true,
                          "enabled": true
                        },
                        "text": {
                          "dynamic": false,
                          "enabled": true,
                          "fields": [
                            {
                              "index": true,
                              "name": "text",
                              "store": true,
                              "type": "text"
                            }
                          ]
                        }
                      }
                    }
                  }
                },
                "store": {
                  "indexType": "scorch",
                  "segmentVersion": 16
                },
                "sourceParams": { }
              }
            }
            """)
            index_definition["name"] = self._collection_name + "_search"
            index_definition["uuid"] = index_id
            index_definition["params"]["mapping"]["types"]["collection_name"]["properties"]["embedding"]["fields"][0][
                "dims"
            ] = vector_length
            # Re-key the type mapping from the placeholder to "<scope>.<collection>".
            index_definition["params"]["mapping"]["types"][self._scope_name + "." + self._collection_name] = (
                index_definition["params"]["mapping"]["types"].pop("collection_name")
            )
            # Brief pauses give the newly created collection time to propagate
            # before the index references it.
            time.sleep(2)
            index_manager.upsert_index(
                SearchIndex(
                    index_definition["name"],
                    params=index_definition["params"],
                    source_name=self._bucket_name,
                ),
            )
            time.sleep(1)

            redis_client.set(collection_exist_cache_key, 1, ex=3600)

    def _collection_exists(self, name: str):
        """Return True if this vector's collection exists in the configured scope.

        NOTE(review): the check uses ``self._collection_name`` rather than the
        *name* argument (callers always pass the same value) — confirm before
        reusing for other collections.
        """
        scope_collection_map: dict[str, Any] = {}

        # Map every scope in the bucket to the names of its collections.
        for scope in self._bucket.collections().get_all_scopes():
            scope_collection_map[scope.name] = [collection.name for collection in scope.collections]

        # Check if the collection exists in the configured scope.
        return self._collection_name in scope_collection_map[self._scope_name]

    def get_type(self) -> str:
        """Identify this vector-store implementation."""
        return VectorType.COUCHBASE

    def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
        """Upsert documents paired with their embeddings.

        Returns:
            The list of document ids that were written.
        """
        uuids = self._get_uuids(documents)
        documents_to_insert = [
            {"text": doc.page_content, "embedding": vector, "metadata": doc.metadata}
            for doc, vector in zip(documents, embeddings)
        ]
        collection = self._scope.collection(self._collection_name)
        for doc_id, doc in zip(uuids, documents_to_insert):
            collection.upsert(doc_id, doc)

        return list(uuids)

    def text_exists(self, id: str) -> bool:
        """Return True if a document with META().id == *id* exists."""
        # Use a parameterized query for safety and correctness
        query = f"""
                SELECT COUNT(1) AS count FROM
                `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
                WHERE META().id = $doc_id
                """
        # Pass the id as a parameter to the query
        result = self._cluster.query(query, named_parameters={"doc_id": id}).execute()
        for row in result:
            return row["count"] > 0
        return False  # Return False if no rows are returned

    def delete_by_ids(self, ids: list[str]) -> None:
        """Best-effort delete of the given document ids; failures are logged."""
        query = f"""
            DELETE FROM `{self._bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
            WHERE META().id IN $doc_ids;
            """
        try:
            self._cluster.query(query, named_parameters={"doc_ids": ids}).execute()
        except Exception:
            # Log the full traceback, not just the exception value.
            logger.exception("Failed to delete documents by ids")

    def delete_by_document_id(self, document_id: str):
        """Delete the single document whose META().id equals *document_id*."""
        query = f"""
                DELETE FROM
                `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
                WHERE META().id = $doc_id;
                """
        self._cluster.query(query, named_parameters={"doc_id": document_id}).execute()

    def delete_by_metadata_field(self, key: str, value: str) -> None:
        """Delete all documents whose ``metadata.<key>`` equals *value*.

        NOTE(review): *key* is interpolated into the statement text because
        N1QL cannot parameterize field names — callers must never pass an
        untrusted key here. The value itself is parameterized.
        """
        query = f"""
            DELETE FROM `{self._client_config.bucket_name}`.{self._client_config.scope_name}.{self._collection_name}
            WHERE metadata.{key} = $value;
            """
        self._cluster.query(query, named_parameters={"value": value}).execute()

    def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
        """KNN search on the "embedding" field of this collection's FTS index.

        Supported kwargs: ``top_k`` (default 5) and ``score_threshold``
        (default 0.0; hits scoring below it are dropped).

        Raises:
            ValueError: If the underlying search call fails.
        """
        top_k = kwargs.get("top_k", 5)
        score_threshold = kwargs.get("score_threshold") or 0.0

        search_req = search.SearchRequest.create(
            VectorSearch.from_vector_query(
                VectorQuery(
                    "embedding",
                    query_vector,
                    top_k,
                )
            )
        )
        try:
            search_iter = self._scope.search(
                self._collection_name + "_search",
                search_req,
                SearchOptions(limit=top_k, collections=[self._collection_name], fields=["*"]),
            )

            docs = []
            # Parse the results
            for row in search_iter.rows():
                text = row.fields.pop("text")
                metadata = self._format_metadata(row.fields)
                metadata["score"] = row.score
                if row.score >= score_threshold:
                    docs.append(Document(page_content=text, metadata=metadata))
        except Exception as e:
            # Chain the original exception so callers keep the root cause.
            raise ValueError(f"Search failed with error: {e}") from e

        return docs

    def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
        """Full-text search on the "text" field. kwargs: ``top_k`` (default 2).

        Raises:
            ValueError: If the underlying search call fails.
        """
        top_k = kwargs.get("top_k", 2)
        try:
            cb_request = search.SearchRequest.create(search.QueryStringQuery("text:" + query))
            search_iter = self._scope.search(
                self._collection_name + "_search", cb_request, SearchOptions(limit=top_k, fields=["*"])
            )

            docs = []
            for row in search_iter.rows():
                text = row.fields.pop("text")
                metadata = self._format_metadata(row.fields)
                metadata["score"] = row.score
                docs.append(Document(page_content=text, metadata=metadata))

        except Exception as e:
            raise ValueError(f"Search failed with error: {e}") from e

        return docs

    def delete(self):
        """Drop this vector's collection from whichever scope owns it."""
        manager = self._bucket.collections()
        for scope in manager.get_all_scopes():
            for collection in scope.collections:
                if collection.name == self._collection_name:
                    # Fix: drop from the owning scope — the original always
                    # passed "_default" regardless of where the collection
                    # lives — and stop once it has been dropped.
                    manager.drop_collection(scope.name, self._collection_name)
                    return

    def _format_metadata(self, row_fields: dict[str, Any]) -> dict[str, Any]:
        """Helper method to format the metadata from the Couchbase Search API.

        Couchbase Search returns metadata keys prefixed with ``metadata.``;
        this strips the prefix so callers see the original keys.

        Args:
            row_fields: The raw fields of a search hit.

        Returns:
            The fields re-keyed without the ``metadata.`` prefix.
        """
        metadata = {}
        for key, value in row_fields.items():
            if key.startswith("metadata"):
                new_key = key.split("metadata" + ".")[-1]
                metadata[new_key] = value
            else:
                metadata[key] = value

        return metadata
|
||||||
|
|
||||||
|
|
||||||
|
class CouchbaseVectorFactory(AbstractVectorFactory):
    """Factory that wires a dataset to a CouchbaseVector instance."""

    def init_vector(self, dataset: Dataset, attributes: list, embeddings: Embeddings) -> CouchbaseVector:
        """Resolve the collection name for *dataset* and build the vector store.

        Reuses the class prefix recorded in the dataset's index struct when
        present; otherwise derives a fresh collection name and persists it.
        """
        if dataset.index_struct_dict:
            collection_name = dataset.index_struct_dict["vector_store"]["class_prefix"]
        else:
            collection_name = Dataset.gen_collection_name_by_id(dataset.id)
            dataset.index_struct = json.dumps(self.gen_index_struct_dict(VectorType.COUCHBASE, collection_name))

        # Connection settings come from the Flask application config.
        app_config = current_app.config
        couchbase_config = CouchbaseConfig(
            connection_string=app_config.get("COUCHBASE_CONNECTION_STRING"),
            user=app_config.get("COUCHBASE_USER"),
            password=app_config.get("COUCHBASE_PASSWORD"),
            bucket_name=app_config.get("COUCHBASE_BUCKET_NAME"),
            scope_name=app_config.get("COUCHBASE_SCOPE_NAME"),
        )
        return CouchbaseVector(collection_name=collection_name, config=couchbase_config)
|
||||||
@ -0,0 +1,50 @@
|
|||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
|
||||||
|
from core.rag.datasource.vdb.couchbase.couchbase_vector import CouchbaseConfig, CouchbaseVector
|
||||||
|
from tests.integration_tests.vdb.test_vector_store import (
|
||||||
|
AbstractVectorTest,
|
||||||
|
get_example_text,
|
||||||
|
setup_mock_redis,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_healthy_container(service_name="couchbase-server", timeout=300, check_interval=10):
    """Poll ``docker inspect`` until *service_name* reports a healthy status.

    Args:
        service_name: Name of the Docker container to inspect.
        timeout: Maximum number of seconds to wait before giving up.
        check_interval: Seconds to sleep between polls (generalized from the
            previously hard-coded 10s; default preserves old behavior).

    Returns:
        True once the container reports ``healthy``.

    Raises:
        TimeoutError: If the container is not healthy within *timeout* seconds.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        result = subprocess.run(
            ["docker", "inspect", "--format", "{{.State.Health.Status}}", service_name], capture_output=True, text=True
        )
        if result.stdout.strip() == "healthy":
            print(f"{service_name} is healthy!")
            return True
        print(f"Waiting for {service_name} to be healthy...")
        time.sleep(check_interval)
    raise TimeoutError(f"{service_name} did not become healthy in time")
|
||||||
|
|
||||||
|
|
||||||
|
class CouchbaseTest(AbstractVectorTest):
    """Integration-test harness targeting a local Couchbase server."""

    def __init__(self):
        super().__init__()
        couchbase_config = CouchbaseConfig(
            connection_string="couchbase://127.0.0.1",
            user="Administrator",
            password="password",
            bucket_name="Embeddings",
            scope_name="_default",
        )
        self.vector = CouchbaseVector(collection_name=self.collection_name, config=couchbase_config)

    def search_by_vector(self):
        # brief sleep to ensure document is indexed
        time.sleep(5)
        hits = self.vector.search_by_vector(query_vector=self.example_embedding)
        assert len(hits) == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_couchbase(setup_mock_redis):
    """Run the full vector-store test suite against the dockerized server."""
    wait_for_healthy_container("couchbase-server", timeout=60)
    suite = CouchbaseTest()
    suite.run_all_tests()
|
||||||
@ -0,0 +1,4 @@
|
|||||||
|
# Couchbase Server image with an init script baked in; the script bootstraps
# the cluster and bucket on first start (see init-cbserver.sh).
FROM couchbase/server:latest AS stage_base
COPY init-cbserver.sh /opt/couchbase/init/
RUN chmod +x /opt/couchbase/init/init-cbserver.sh
|
||||||
@ -0,0 +1,44 @@
|
|||||||
|
#!/bin/bash
# Entrypoint wrapper for the Couchbase test container.
#
# used to start couchbase server - can't get around this as docker compose only allows you to start one command - so we have to start couchbase like the standard couchbase Dockerfile would
# https://github.com/couchbase/docker/blob/master/enterprise/couchbase-server/7.2.0/Dockerfile#L88

/entrypoint.sh couchbase-server &

# track if setup is complete so we don't try to setup again
FILE=/opt/couchbase/init/setupComplete.txt

if ! [ -f "$FILE" ]; then
  # used to automatically create the cluster based on environment variables
  # https://docs.couchbase.com/server/current/cli/cbcli/couchbase-cli-cluster-init.html

  # NOTE(review): this echoes admin credentials to the container log; fine
  # for a throwaway test container, do not reuse in production.
  # Fix: all variable expansions are now quoted to avoid word splitting.
  echo "$COUCHBASE_ADMINISTRATOR_USERNAME" ":" "$COUCHBASE_ADMINISTRATOR_PASSWORD"

  # Give the server time to come up before configuring it.
  sleep 20s
  /opt/couchbase/bin/couchbase-cli cluster-init -c 127.0.0.1 \
    --cluster-username "$COUCHBASE_ADMINISTRATOR_USERNAME" \
    --cluster-password "$COUCHBASE_ADMINISTRATOR_PASSWORD" \
    --services data,index,query,fts \
    --cluster-ramsize "$COUCHBASE_RAM_SIZE" \
    --cluster-index-ramsize "$COUCHBASE_INDEX_RAM_SIZE" \
    --cluster-eventing-ramsize "$COUCHBASE_EVENTING_RAM_SIZE" \
    --cluster-fts-ramsize "$COUCHBASE_FTS_RAM_SIZE" \
    --index-storage-setting default

  sleep 2s

  # used to auto create the bucket based on environment variables
  # https://docs.couchbase.com/server/current/cli/cbcli/couchbase-cli-bucket-create.html

  /opt/couchbase/bin/couchbase-cli bucket-create -c localhost:8091 \
    --username "$COUCHBASE_ADMINISTRATOR_USERNAME" \
    --password "$COUCHBASE_ADMINISTRATOR_PASSWORD" \
    --bucket "$COUCHBASE_BUCKET" \
    --bucket-ramsize "$COUCHBASE_BUCKET_RAMSIZE" \
    --bucket-type couchbase

  # create file so we know that the cluster is setup and don't run the setup again
  touch "$FILE"
fi
# docker compose will stop the container from running unless we do this
# known issue and workaround
tail -f /dev/null
|
||||||
Loading…
Reference in New Issue