Compare commits

...

6 Commits

Author SHA1 Message Date
-LAN-
28622679f9 fix: parameterize myscale query vector and add regression test 2026-02-27 18:19:24 +08:00
-LAN-
a24d333ec0 fix: preserve MyScale text content on insert 2026-02-27 18:19:23 +08:00
-LAN-
b9092f839c Harden MyScale query parameterization 2026-02-27 18:19:23 +08:00
-LAN-
1e6de0e6ad docs(api): simplify setup README and worker guidance (#32704) 2026-02-27 18:12:52 +08:00
非法操作
9f0ee5c145 fix: the action button of structure output modal should align right (#32700) 2026-02-27 17:28:41 +08:00
dependabot[bot]
6c66e11cac chore(deps-dev): bump nltk from 3.9.2 to 3.9.3 in /api (#32691)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2026-02-27 17:20:55 +09:00
6 changed files with 133 additions and 129 deletions

View File

@@ -42,7 +42,7 @@ The scripts resolve paths relative to their location, so you can run them from a
1. Set up your application by visiting `http://localhost:3000`.
1. Optional: start the worker service (async tasks, runs from `api`).
1. Start the worker service (async and scheduler tasks, runs from `api`).
```bash
./dev/start-worker
@@ -54,86 +54,6 @@ The scripts resolve paths relative to their location, so you can run them from a
./dev/start-beat
```
### Manual commands
<details>
<summary>Show manual setup and run steps</summary>
These commands assume you start from the repository root.
1. Start the docker-compose stack.
The backend requires middleware, including PostgreSQL, Redis, and Weaviate, which can be started together using `docker-compose`.
```bash
cp docker/middleware.env.example docker/middleware.env
# Use mysql or another vector database profile if you are not using postgres/weaviate.
docker compose -f docker/docker-compose.middleware.yaml --profile postgresql --profile weaviate -p dify up -d
```
1. Copy env files.
```bash
cp api/.env.example api/.env
cp web/.env.example web/.env.local
```
1. Install UV if needed.
```bash
pip install uv
# Or on macOS
brew install uv
```
1. Install API dependencies.
```bash
cd api
uv sync --group dev
```
1. Install web dependencies.
```bash
cd web
pnpm install
cd ..
```
1. Start backend (runs migrations first, in a new terminal).
```bash
cd api
uv run flask db upgrade
uv run flask run --host 0.0.0.0 --port=5001 --debug
```
1. Start Dify [web](../web) service (in a new terminal).
```bash
cd web
pnpm dev:inspect
```
1. Set up your application by visiting `http://localhost:3000`.
1. Optional: start the worker service (async tasks, in a new terminal).
```bash
cd api
uv run celery -A app.celery worker -P threads -c 2 --loglevel INFO -Q api_token,dataset,priority_dataset,priority_pipeline,pipeline,mail,ops_trace,app_deletion,plugin,workflow_storage,conversation,workflow,schedule_poller,schedule_executor,triggered_workflow_dispatcher,trigger_refresh_executor,retention
```
1. Optional: start Celery Beat (scheduled tasks, in a new terminal).
```bash
cd api
uv run celery -A app.celery beat
```
</details>
### Environment notes
> [!IMPORTANT]

View File

@@ -33,6 +33,18 @@ class SortOrder(StrEnum):
class MyScaleVector(BaseVector):
_METADATA_KEY_WHITELIST = {
"annotation_id",
"app_id",
"batch",
"dataset_id",
"doc_hash",
"doc_id",
"document_id",
"lang",
"source",
}
def __init__(self, collection_name: str, config: MyScaleConfig, metric: str = "Cosine"):
super().__init__(collection_name)
self._config = config
@@ -45,10 +57,17 @@ class MyScaleVector(BaseVector):
password=config.password,
)
self._client.command("SET allow_experimental_object_type=1")
self._qualified_table = f"{self._config.database}.{self._collection_name}"
def get_type(self) -> str:
return VectorType.MYSCALE
@classmethod
def _validate_metadata_key(cls, key: str) -> str:
if key not in cls._METADATA_KEY_WHITELIST:
raise ValueError(f"Unsupported metadata key: {key!r}")
return key
def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
dimension = len(embeddings[0])
self._create_collection(dimension)
@@ -59,7 +78,7 @@ class MyScaleVector(BaseVector):
self._client.command(f"CREATE DATABASE IF NOT EXISTS {self._config.database}")
fts_params = f"('{self._config.fts_params}')" if self._config.fts_params else ""
sql = f"""
CREATE TABLE IF NOT EXISTS {self._config.database}.{self._collection_name}(
CREATE TABLE IF NOT EXISTS {self._qualified_table}(
id String,
text String,
vector Array(Float32),
@@ -74,73 +93,103 @@ class MyScaleVector(BaseVector):
def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
ids = []
columns = ["id", "text", "vector", "metadata"]
values = []
rows = []
for i, doc in enumerate(documents):
if doc.metadata is not None:
doc_id = doc.metadata.get("doc_id", str(uuid.uuid4()))
row = (
doc_id,
self.escape_str(doc.page_content),
embeddings[i],
json.dumps(doc.metadata) if doc.metadata else {},
rows.append(
(
doc_id,
doc.page_content,
embeddings[i],
json.dumps(doc.metadata or {}),
)
)
values.append(str(row))
ids.append(doc_id)
sql = f"""
INSERT INTO {self._config.database}.{self._collection_name}
({",".join(columns)}) VALUES {",".join(values)}
"""
self._client.command(sql)
if rows:
self._client.insert(self._qualified_table, rows, column_names=columns)
return ids
@staticmethod
def escape_str(value: Any) -> str:
return "".join(" " if c in {"\\", "'"} else c for c in str(value))
def text_exists(self, id: str) -> bool:
results = self._client.query(f"SELECT id FROM {self._config.database}.{self._collection_name} WHERE id='{id}'")
results = self._client.query(
f"SELECT id FROM {self._qualified_table} WHERE id = %(id)s LIMIT 1",
parameters={"id": id},
)
return results.row_count > 0
def delete_by_ids(self, ids: list[str]):
if not ids:
return
placeholders, params = self._build_in_params("id", ids)
self._client.command(
f"DELETE FROM {self._config.database}.{self._collection_name} WHERE id IN {str(tuple(ids))}"
f"DELETE FROM {self._qualified_table} WHERE id IN ({placeholders})",
parameters=params,
)
def get_ids_by_metadata_field(self, key: str, value: str):
safe_key = self._validate_metadata_key(key)
rows = self._client.query(
f"SELECT DISTINCT id FROM {self._config.database}.{self._collection_name} WHERE metadata.{key}='{value}'"
f"SELECT DISTINCT id FROM {self._qualified_table} WHERE metadata.{safe_key} = %(value)s",
parameters={"value": value},
).result_rows
return [row[0] for row in rows]
def delete_by_metadata_field(self, key: str, value: str):
safe_key = self._validate_metadata_key(key)
self._client.command(
f"DELETE FROM {self._config.database}.{self._collection_name} WHERE metadata.{key}='{value}'"
f"DELETE FROM {self._qualified_table} WHERE metadata.{safe_key} = %(value)s",
parameters={"value": value},
)
def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
return self._search(f"distance(vector, {str(query_vector)})", self._vec_order, **kwargs)
return self._search(
"distance(vector, %(query_vector)s)",
self._vec_order,
parameters={"query_vector": query_vector},
**kwargs,
)
def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
return self._search(f"TextSearch('enable_nlq=false')(text, '{query}')", SortOrder.DESC, **kwargs)
return self._search(
"TextSearch('enable_nlq=false')(text, %(query)s)",
SortOrder.DESC,
parameters={"query": query},
**kwargs,
)
def _search(self, dist: str, order: SortOrder, **kwargs: Any) -> list[Document]:
@staticmethod
def _build_in_params(prefix: str, values: list[str]) -> tuple[str, dict[str, str]]:
params: dict[str, str] = {}
placeholders = []
for i, value in enumerate(values):
name = f"{prefix}_{i}"
placeholders.append(f"%({name})s")
params[name] = value
return ", ".join(placeholders), params
def _search(
self,
dist: str,
order: SortOrder,
parameters: dict[str, Any] | None = None,
**kwargs: Any,
) -> list[Document]:
top_k = kwargs.get("top_k", 4)
if not isinstance(top_k, int) or top_k <= 0:
raise ValueError("top_k must be a positive integer")
score_threshold = float(kwargs.get("score_threshold") or 0.0)
where_str = (
f"WHERE dist < {1 - score_threshold}"
if self._metric.upper() == "COSINE" and order == SortOrder.ASC and score_threshold > 0.0
else ""
)
where_clauses = []
if self._metric.upper() == "COSINE" and order == SortOrder.ASC and score_threshold > 0.0:
where_clauses.append(f"dist < {1 - score_threshold}")
document_ids_filter = kwargs.get("document_ids_filter")
query_params = dict(parameters or {})
if document_ids_filter:
document_ids = ", ".join(f"'{id}'" for id in document_ids_filter)
where_str = f"{where_str} AND metadata['document_id'] in ({document_ids})"
placeholders, params = self._build_in_params("document_id", document_ids_filter)
where_clauses.append(f"metadata['document_id'] IN ({placeholders})")
query_params.update(params)
where_str = f"WHERE {' AND '.join(where_clauses)}" if where_clauses else ""
sql = f"""
SELECT text, vector, metadata, {dist} as dist FROM {self._config.database}.{self._collection_name}
SELECT text, vector, metadata, {dist} as dist FROM {self._qualified_table}
{where_str} ORDER BY dist {order.value} LIMIT {top_k}
"""
try:
@@ -150,14 +199,14 @@ class MyScaleVector(BaseVector):
vector=r["vector"],
metadata=r["metadata"],
)
for r in self._client.query(sql).named_results()
for r in self._client.query(sql, parameters=query_params).named_results()
]
except Exception:
logger.exception("Vector search operation failed")
return []
def delete(self):
self._client.command(f"DROP TABLE IF EXISTS {self._config.database}.{self._collection_name}")
self._client.command(f"DROP TABLE IF EXISTS {self._qualified_table}")
class MyScaleVectorFactory(AbstractVectorFactory):

View File

@@ -0,0 +1,32 @@
from unittest.mock import MagicMock, patch
from core.rag.datasource.vdb.myscale.myscale_vector import MyScaleConfig, MyScaleVector
@patch("core.rag.datasource.vdb.myscale.myscale_vector.get_client")
def test_search_by_vector_uses_parameterized_query(mock_get_client):
mock_client = MagicMock()
mock_get_client.return_value = mock_client
vector = MyScaleVector(
collection_name="test_collection",
config=MyScaleConfig(
host="localhost",
port=8123,
user="default",
password="",
database="dify",
fts_params="",
),
)
vector._search = MagicMock(return_value=[])
query_vector = [0.1, 0.2, 0.3]
vector.search_by_vector(query_vector, top_k=5)
vector._search.assert_called_once_with(
"distance(vector, %(query_vector)s)",
vector._vec_order,
parameters={"query_vector": query_vector},
top_k=5,
)

6
api/uv.lock generated
View File

@@ -3700,7 +3700,7 @@ wheels = [
[[package]]
name = "nltk"
version = "3.9.2"
version = "3.9.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
@@ -3708,9 +3708,9 @@ dependencies = [
{ name = "regex" },
{ name = "tqdm" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f9/76/3a5e4312c19a028770f86fd7c058cf9f4ec4321c6cf7526bab998a5b683c/nltk-3.9.2.tar.gz", hash = "sha256:0f409e9b069ca4177c1903c3e843eef90c7e92992fa4931ae607da6de49e1419", size = 2887629, upload-time = "2025-10-01T07:19:23.764Z" }
sdist = { url = "https://files.pythonhosted.org/packages/e1/8f/915e1c12df07c70ed779d18ab83d065718a926e70d3ea33eb0cd66ffb7c0/nltk-3.9.3.tar.gz", hash = "sha256:cb5945d6424a98d694c2b9a0264519fab4363711065a46aa0ae7a2195b92e71f", size = 2923673, upload-time = "2026-02-24T12:05:53.833Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/60/90/81ac364ef94209c100e12579629dc92bf7a709a84af32f8c551b02c07e94/nltk-3.9.2-py3-none-any.whl", hash = "sha256:1e209d2b3009110635ed9709a67a1a3e33a10f799490fa71cf4bec218c11c88a", size = 1513404, upload-time = "2025-10-01T07:19:21.648Z" },
{ url = "https://files.pythonhosted.org/packages/c2/7e/9af5a710a1236e4772de8dfcc6af942a561327bb9f42b5b4a24d0cf100fd/nltk-3.9.3-py3-none-any.whl", hash = "sha256:60b3db6e9995b3dd976b1f0fa7dec22069b2677e759c28eb69b62ddd44870522", size = 1525385, upload-time = "2026-02-24T12:05:46.54Z" },
]
[[package]]

View File

@@ -1,12 +1,12 @@
import type { FC } from 'react'
import type { SchemaRoot } from '../../types'
import { RiBracesLine, RiCloseLine, RiTimelineView } from '@remixicon/react'
import { useCallback, useState } from 'react'
import { useTranslation } from 'react-i18next'
import Button from '@/app/components/base/button'
import Divider from '@/app/components/base/divider'
import Toast from '@/app/components/base/toast'
import { JSON_SCHEMA_MAX_DEPTH } from '@/config'
import { cn } from '@/utils/classnames'
import { SegmentedControl } from '../../../../../base/segmented-control'
import { Type } from '../../types'
import {
@@ -35,9 +35,17 @@ enum SchemaView {
JsonSchema = 'jsonSchema',
}
const TimelineViewIcon: FC<{ className?: string }> = ({ className }) => {
return <span className={cn('i-ri-timeline-view', className)} />
}
const BracesIcon: FC<{ className?: string }> = ({ className }) => {
return <span className={cn('i-ri-braces-line', className)} />
}
const VIEW_TABS = [
{ Icon: RiTimelineView, text: 'Visual Editor', value: SchemaView.VisualEditor },
{ Icon: RiBracesLine, text: 'JSON Schema', value: SchemaView.JsonSchema },
{ Icon: TimelineViewIcon, text: 'Visual Editor', value: SchemaView.VisualEditor },
{ Icon: BracesIcon, text: 'JSON Schema', value: SchemaView.JsonSchema },
]
const DEFAULT_SCHEMA: SchemaRoot = {
@@ -203,11 +211,11 @@ const JsonSchemaConfig: FC<JsonSchemaConfigProps> = ({
<div className="flex h-full flex-col">
{/* Header */}
<div className="relative flex p-6 pb-3 pr-14">
<div className="title-2xl-semi-bold grow truncate text-text-primary">
<div className="grow truncate text-text-primary title-2xl-semi-bold">
{t('nodes.llm.jsonSchema.title', { ns: 'workflow' })}
</div>
<div className="absolute right-5 top-5 flex h-8 w-8 items-center justify-center p-1.5" onClick={onClose}>
<RiCloseLine className="h-[18px] w-[18px] text-text-tertiary" />
<span className="i-ri-close-line h-[18px] w-[18px] text-text-tertiary" />
</div>
</div>
{/* Content */}
@@ -249,7 +257,7 @@ const JsonSchemaConfig: FC<JsonSchemaConfigProps> = ({
{validationError && <ErrorMessage message={validationError} />}
</div>
{/* Footer */}
<div className="flex items-center gap-x-2 p-6 pt-5">
<div className="flex items-center justify-end gap-x-2 p-6 pt-5">
<div className="flex items-center gap-x-3">
<div className="flex items-center gap-x-2">
<Button variant="secondary" onClick={handleResetDefaults}>

View File

@@ -6817,11 +6817,6 @@
"count": 3
}
},
"app/components/workflow/nodes/llm/components/json-schema-config-modal/json-schema-config.tsx": {
"tailwindcss/enforce-consistent-class-order": {
"count": 1
}
},
"app/components/workflow/nodes/llm/components/json-schema-config-modal/json-schema-generator/generated-result.tsx": {
"style/multiline-ternary": {
"count": 2