mirror of https://github.com/langgenius/dify.git
feat: Add OceanBase hybrid search features (#16652)
Co-authored-by: 李远军 <4842@9ji.com> Co-authored-by: yourchanges <yourchanges@gmail.com>
This commit is contained in:
parent
c4bb07184d
commit
6157f57872
|
@ -297,6 +297,7 @@ OCEANBASE_VECTOR_USER=root@test
|
||||||
OCEANBASE_VECTOR_PASSWORD=difyai123456
|
OCEANBASE_VECTOR_PASSWORD=difyai123456
|
||||||
OCEANBASE_VECTOR_DATABASE=test
|
OCEANBASE_VECTOR_DATABASE=test
|
||||||
OCEANBASE_MEMORY_LIMIT=6G
|
OCEANBASE_MEMORY_LIMIT=6G
|
||||||
|
OCEANBASE_ENABLE_HYBRID_SEARCH=false
|
||||||
|
|
||||||
# openGauss configuration
|
# openGauss configuration
|
||||||
OPENGAUSS_HOST=127.0.0.1
|
OPENGAUSS_HOST=127.0.0.1
|
||||||
|
|
|
@ -33,3 +33,9 @@ class OceanBaseVectorConfig(BaseSettings):
|
||||||
description="Name of the OceanBase Vector database to connect to",
|
description="Name of the OceanBase Vector database to connect to",
|
||||||
default=None,
|
default=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
OCEANBASE_ENABLE_HYBRID_SEARCH: bool = Field(
|
||||||
|
description="Enable hybrid search features (requires OceanBase >= 4.3.5.1). Set to false for compatibility "
|
||||||
|
"with older versions",
|
||||||
|
default=False,
|
||||||
|
)
|
||||||
|
|
|
@ -646,7 +646,6 @@ class DatasetRetrievalSettingApi(Resource):
|
||||||
| VectorType.BAIDU
|
| VectorType.BAIDU
|
||||||
| VectorType.VIKINGDB
|
| VectorType.VIKINGDB
|
||||||
| VectorType.UPSTASH
|
| VectorType.UPSTASH
|
||||||
| VectorType.OCEANBASE
|
|
||||||
):
|
):
|
||||||
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
|
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
|
||||||
case (
|
case (
|
||||||
|
@ -664,6 +663,7 @@ class DatasetRetrievalSettingApi(Resource):
|
||||||
| VectorType.COUCHBASE
|
| VectorType.COUCHBASE
|
||||||
| VectorType.MILVUS
|
| VectorType.MILVUS
|
||||||
| VectorType.OPENGAUSS
|
| VectorType.OPENGAUSS
|
||||||
|
| VectorType.OCEANBASE
|
||||||
):
|
):
|
||||||
return {
|
return {
|
||||||
"retrieval_method": [
|
"retrieval_method": [
|
||||||
|
@ -692,7 +692,6 @@ class DatasetRetrievalSettingMockApi(Resource):
|
||||||
| VectorType.BAIDU
|
| VectorType.BAIDU
|
||||||
| VectorType.VIKINGDB
|
| VectorType.VIKINGDB
|
||||||
| VectorType.UPSTASH
|
| VectorType.UPSTASH
|
||||||
| VectorType.OCEANBASE
|
|
||||||
):
|
):
|
||||||
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
|
return {"retrieval_method": [RetrievalMethod.SEMANTIC_SEARCH.value]}
|
||||||
case (
|
case (
|
||||||
|
@ -708,6 +707,7 @@ class DatasetRetrievalSettingMockApi(Resource):
|
||||||
| VectorType.PGVECTOR
|
| VectorType.PGVECTOR
|
||||||
| VectorType.LINDORM
|
| VectorType.LINDORM
|
||||||
| VectorType.OPENGAUSS
|
| VectorType.OPENGAUSS
|
||||||
|
| VectorType.OCEANBASE
|
||||||
):
|
):
|
||||||
return {
|
return {
|
||||||
"retrieval_method": [
|
"retrieval_method": [
|
||||||
|
|
|
@ -31,6 +31,7 @@ class OceanBaseVectorConfig(BaseModel):
|
||||||
user: str
|
user: str
|
||||||
password: str
|
password: str
|
||||||
database: str
|
database: str
|
||||||
|
enable_hybrid_search: bool = False
|
||||||
|
|
||||||
@model_validator(mode="before")
|
@model_validator(mode="before")
|
||||||
@classmethod
|
@classmethod
|
||||||
|
@ -57,6 +58,7 @@ class OceanBaseVector(BaseVector):
|
||||||
password=self._config.password,
|
password=self._config.password,
|
||||||
db_name=self._config.database,
|
db_name=self._config.database,
|
||||||
)
|
)
|
||||||
|
self._hybrid_search_enabled = self._check_hybrid_search_support() # Check if hybrid search is supported
|
||||||
|
|
||||||
def get_type(self) -> str:
|
def get_type(self) -> str:
|
||||||
return VectorType.OCEANBASE
|
return VectorType.OCEANBASE
|
||||||
|
@ -98,6 +100,16 @@ class OceanBaseVector(BaseVector):
|
||||||
columns=cols,
|
columns=cols,
|
||||||
vidxs=vidx_params,
|
vidxs=vidx_params,
|
||||||
)
|
)
|
||||||
|
try:
|
||||||
|
if self._hybrid_search_enabled:
|
||||||
|
self._client.perform_raw_text_sql(f"""ALTER TABLE {self._collection_name}
|
||||||
|
ADD FULLTEXT INDEX fulltext_index_for_col_text (text) WITH PARSER ik""")
|
||||||
|
except Exception as e:
|
||||||
|
raise Exception(
|
||||||
|
"Failed to add fulltext index to the target table, your OceanBase version must be 4.3.5.1 or above "
|
||||||
|
+ "to support fulltext index and vector index in the same table",
|
||||||
|
e,
|
||||||
|
)
|
||||||
vals = []
|
vals = []
|
||||||
params = self._client.perform_raw_text_sql("SHOW PARAMETERS LIKE '%ob_vector_memory_limit_percentage%'")
|
params = self._client.perform_raw_text_sql("SHOW PARAMETERS LIKE '%ob_vector_memory_limit_percentage%'")
|
||||||
for row in params:
|
for row in params:
|
||||||
|
@ -116,6 +128,27 @@ class OceanBaseVector(BaseVector):
|
||||||
)
|
)
|
||||||
redis_client.set(collection_exist_cache_key, 1, ex=3600)
|
redis_client.set(collection_exist_cache_key, 1, ex=3600)
|
||||||
|
|
||||||
|
def _check_hybrid_search_support(self) -> bool:
|
||||||
|
"""
|
||||||
|
Check if the current OceanBase version supports hybrid search.
|
||||||
|
Returns True if the version is >= 4.3.5.1, otherwise False.
|
||||||
|
"""
|
||||||
|
if not self._config.enable_hybrid_search:
|
||||||
|
return False
|
||||||
|
|
||||||
|
try:
|
||||||
|
from packaging import version
|
||||||
|
|
||||||
|
# return OceanBase_CE 4.3.5.1 (r101000042025031818-bxxxx) (Built Mar 18 2025 18:13:36)
|
||||||
|
result = self._client.perform_raw_text_sql("SELECT @@version_comment AS version")
|
||||||
|
ob_full_version = result.fetchone()[0]
|
||||||
|
ob_version = ob_full_version.split()[1]
|
||||||
|
logger.debug("Current OceanBase version is %s", ob_version)
|
||||||
|
return version.parse(ob_version).base_version >= version.parse("4.3.5.1").base_version
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to check OceanBase version: {str(e)}. Disabling hybrid search.")
|
||||||
|
return False
|
||||||
|
|
||||||
def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
|
def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
|
||||||
ids = self._get_uuids(documents)
|
ids = self._get_uuids(documents)
|
||||||
for id, doc, emb in zip(ids, documents, embeddings):
|
for id, doc, emb in zip(ids, documents, embeddings):
|
||||||
|
@ -130,7 +163,7 @@ class OceanBaseVector(BaseVector):
|
||||||
)
|
)
|
||||||
|
|
||||||
def text_exists(self, id: str) -> bool:
|
def text_exists(self, id: str) -> bool:
|
||||||
cur = self._client.get(table_name=self._collection_name, id=id)
|
cur = self._client.get(table_name=self._collection_name, ids=id)
|
||||||
return bool(cur.rowcount != 0)
|
return bool(cur.rowcount != 0)
|
||||||
|
|
||||||
def delete_by_ids(self, ids: list[str]) -> None:
|
def delete_by_ids(self, ids: list[str]) -> None:
|
||||||
|
@ -139,9 +172,12 @@ class OceanBaseVector(BaseVector):
|
||||||
self._client.delete(table_name=self._collection_name, ids=ids)
|
self._client.delete(table_name=self._collection_name, ids=ids)
|
||||||
|
|
||||||
def get_ids_by_metadata_field(self, key: str, value: str) -> list[str]:
|
def get_ids_by_metadata_field(self, key: str, value: str) -> list[str]:
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
cur = self._client.get(
|
cur = self._client.get(
|
||||||
table_name=self._collection_name,
|
table_name=self._collection_name,
|
||||||
where_clause=f"metadata->>'$.{key}' = '{value}'",
|
ids=None,
|
||||||
|
where_clause=[text(f"metadata->>'$.{key}' = '{value}'")],
|
||||||
output_column_name=["id"],
|
output_column_name=["id"],
|
||||||
)
|
)
|
||||||
return [row[0] for row in cur]
|
return [row[0] for row in cur]
|
||||||
|
@ -151,36 +187,84 @@ class OceanBaseVector(BaseVector):
|
||||||
self.delete_by_ids(ids)
|
self.delete_by_ids(ids)
|
||||||
|
|
||||||
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
|
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
|
||||||
return []
|
if not self._hybrid_search_enabled:
|
||||||
|
return []
|
||||||
|
|
||||||
|
try:
|
||||||
|
top_k = kwargs.get("top_k", 5)
|
||||||
|
if not isinstance(top_k, int) or top_k <= 0:
|
||||||
|
raise ValueError("top_k must be a positive integer")
|
||||||
|
|
||||||
|
document_ids_filter = kwargs.get("document_ids_filter")
|
||||||
|
where_clause = ""
|
||||||
|
if document_ids_filter:
|
||||||
|
document_ids = ", ".join(f"'{id}'" for id in document_ids_filter)
|
||||||
|
where_clause = f" AND metadata->>'$.document_id' IN ({document_ids})"
|
||||||
|
|
||||||
|
full_sql = f"""SELECT metadata, text, MATCH (text) AGAINST (:query) AS score
|
||||||
|
FROM {self._collection_name}
|
||||||
|
WHERE MATCH (text) AGAINST (:query) > 0
|
||||||
|
{where_clause}
|
||||||
|
ORDER BY score DESC
|
||||||
|
LIMIT {top_k}"""
|
||||||
|
|
||||||
|
with self._client.engine.connect() as conn:
|
||||||
|
with conn.begin():
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
result = conn.execute(text(full_sql), {"query": query})
|
||||||
|
rows = result.fetchall()
|
||||||
|
|
||||||
|
docs = []
|
||||||
|
for row in rows:
|
||||||
|
metadata_str, _text, score = row
|
||||||
|
try:
|
||||||
|
metadata = json.loads(metadata_str)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
print(f"Invalid JSON metadata: {metadata_str}")
|
||||||
|
metadata = {}
|
||||||
|
metadata["score"] = score
|
||||||
|
docs.append(Document(page_content=_text, metadata=metadata))
|
||||||
|
|
||||||
|
return docs
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Failed to fulltext search: {str(e)}.")
|
||||||
|
return []
|
||||||
|
|
||||||
def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
|
def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
|
||||||
document_ids_filter = kwargs.get("document_ids_filter")
|
document_ids_filter = kwargs.get("document_ids_filter")
|
||||||
where_clause = None
|
_where_clause = None
|
||||||
if document_ids_filter:
|
if document_ids_filter:
|
||||||
document_ids = ", ".join(f"'{id}'" for id in document_ids_filter)
|
document_ids = ", ".join(f"'{id}'" for id in document_ids_filter)
|
||||||
where_clause = f"metadata->>'$.document_id' in ({document_ids})"
|
where_clause = f"metadata->>'$.document_id' in ({document_ids})"
|
||||||
|
from sqlalchemy import text
|
||||||
|
|
||||||
|
_where_clause = [text(where_clause)]
|
||||||
ef_search = kwargs.get("ef_search", self._hnsw_ef_search)
|
ef_search = kwargs.get("ef_search", self._hnsw_ef_search)
|
||||||
if ef_search != self._hnsw_ef_search:
|
if ef_search != self._hnsw_ef_search:
|
||||||
self._client.set_ob_hnsw_ef_search(ef_search)
|
self._client.set_ob_hnsw_ef_search(ef_search)
|
||||||
self._hnsw_ef_search = ef_search
|
self._hnsw_ef_search = ef_search
|
||||||
topk = kwargs.get("top_k", 10)
|
topk = kwargs.get("top_k", 10)
|
||||||
cur = self._client.ann_search(
|
try:
|
||||||
table_name=self._collection_name,
|
cur = self._client.ann_search(
|
||||||
vec_column_name="vector",
|
table_name=self._collection_name,
|
||||||
vec_data=query_vector,
|
vec_column_name="vector",
|
||||||
topk=topk,
|
vec_data=query_vector,
|
||||||
distance_func=func.l2_distance,
|
topk=topk,
|
||||||
output_column_names=["text", "metadata"],
|
distance_func=func.l2_distance,
|
||||||
with_dist=True,
|
output_column_names=["text", "metadata"],
|
||||||
where_clause=where_clause,
|
with_dist=True,
|
||||||
)
|
where_clause=_where_clause,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
raise Exception("Failed to search by vector. ", e)
|
||||||
docs = []
|
docs = []
|
||||||
for text, metadata, distance in cur:
|
for _text, metadata, distance in cur:
|
||||||
metadata = json.loads(metadata)
|
metadata = json.loads(metadata)
|
||||||
metadata["score"] = 1 - distance / math.sqrt(2)
|
metadata["score"] = 1 - distance / math.sqrt(2)
|
||||||
docs.append(
|
docs.append(
|
||||||
Document(
|
Document(
|
||||||
page_content=text,
|
page_content=_text,
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
|
@ -554,6 +554,7 @@ OCEANBASE_VECTOR_PASSWORD=difyai123456
|
||||||
OCEANBASE_VECTOR_DATABASE=test
|
OCEANBASE_VECTOR_DATABASE=test
|
||||||
OCEANBASE_CLUSTER_NAME=difyai
|
OCEANBASE_CLUSTER_NAME=difyai
|
||||||
OCEANBASE_MEMORY_LIMIT=6G
|
OCEANBASE_MEMORY_LIMIT=6G
|
||||||
|
OCEANBASE_ENABLE_HYBRID_SEARCH=false
|
||||||
|
|
||||||
# opengauss configurations, only available when VECTOR_STORE is `opengauss`
|
# opengauss configurations, only available when VECTOR_STORE is `opengauss`
|
||||||
OPENGAUSS_HOST=opengauss
|
OPENGAUSS_HOST=opengauss
|
||||||
|
|
|
@ -373,7 +373,8 @@ services:
|
||||||
|
|
||||||
# OceanBase vector database
|
# OceanBase vector database
|
||||||
oceanbase:
|
oceanbase:
|
||||||
image: quay.io/oceanbase/oceanbase-ce:4.3.3.0-100000142024101215
|
image: oceanbase/oceanbase-ce:4.3.5.1-101000042025031818
|
||||||
|
container_name: oceanbase
|
||||||
profiles:
|
profiles:
|
||||||
- oceanbase
|
- oceanbase
|
||||||
restart: always
|
restart: always
|
||||||
|
@ -386,7 +387,9 @@ services:
|
||||||
OB_SYS_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
|
OB_SYS_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
|
||||||
OB_TENANT_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
|
OB_TENANT_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
|
||||||
OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
|
OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
|
||||||
OB_SERVER_IP: '127.0.0.1'
|
MODE: MINI
|
||||||
|
ports:
|
||||||
|
- "${OCEANBASE_VECTOR_PORT:-2881}:2881"
|
||||||
|
|
||||||
# Oracle vector database
|
# Oracle vector database
|
||||||
oracle:
|
oracle:
|
||||||
|
|
|
@ -252,6 +252,7 @@ x-shared-env: &shared-api-worker-env
|
||||||
OCEANBASE_VECTOR_DATABASE: ${OCEANBASE_VECTOR_DATABASE:-test}
|
OCEANBASE_VECTOR_DATABASE: ${OCEANBASE_VECTOR_DATABASE:-test}
|
||||||
OCEANBASE_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
|
OCEANBASE_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
|
||||||
OCEANBASE_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G}
|
OCEANBASE_MEMORY_LIMIT: ${OCEANBASE_MEMORY_LIMIT:-6G}
|
||||||
|
OCEANBASE_ENABLE_HYBRID_SEARCH: ${OCEANBASE_ENABLE_HYBRID_SEARCH:-false}
|
||||||
OPENGAUSS_HOST: ${OPENGAUSS_HOST:-opengauss}
|
OPENGAUSS_HOST: ${OPENGAUSS_HOST:-opengauss}
|
||||||
OPENGAUSS_PORT: ${OPENGAUSS_PORT:-6600}
|
OPENGAUSS_PORT: ${OPENGAUSS_PORT:-6600}
|
||||||
OPENGAUSS_USER: ${OPENGAUSS_USER:-postgres}
|
OPENGAUSS_USER: ${OPENGAUSS_USER:-postgres}
|
||||||
|
@ -804,7 +805,8 @@ services:
|
||||||
|
|
||||||
# OceanBase vector database
|
# OceanBase vector database
|
||||||
oceanbase:
|
oceanbase:
|
||||||
image: quay.io/oceanbase/oceanbase-ce:4.3.3.0-100000142024101215
|
image: oceanbase/oceanbase-ce:4.3.5.1-101000042025031818
|
||||||
|
container_name: oceanbase
|
||||||
profiles:
|
profiles:
|
||||||
- oceanbase
|
- oceanbase
|
||||||
restart: always
|
restart: always
|
||||||
|
@ -817,7 +819,9 @@ services:
|
||||||
OB_SYS_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
|
OB_SYS_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
|
||||||
OB_TENANT_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
|
OB_TENANT_PASSWORD: ${OCEANBASE_VECTOR_PASSWORD:-difyai123456}
|
||||||
OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
|
OB_CLUSTER_NAME: ${OCEANBASE_CLUSTER_NAME:-difyai}
|
||||||
OB_SERVER_IP: '127.0.0.1'
|
MODE: MINI
|
||||||
|
ports:
|
||||||
|
- "${OCEANBASE_VECTOR_PORT:-2881}:2881"
|
||||||
|
|
||||||
# Oracle vector database
|
# Oracle vector database
|
||||||
oracle:
|
oracle:
|
||||||
|
|
Loading…
Reference in New Issue