Compare commits

...

12 Commits

Author SHA1 Message Date
-LAN-
d2ce4960f1 chore(versioning): bump version to 0.9.0 (#8911) 2024-09-30 18:33:20 +08:00
KVOJJJin
1af4ca344e Feat: add debounce for search in logs (#8924) 2024-09-30 17:18:47 +08:00
zhuhao
fa837b2dfd fix: fix the issue with the system model configuration update (#8923) 2024-09-30 17:14:13 +08:00
github-actions[bot]
824a71388a chore: translate i18n files (#8917)
Co-authored-by: JohnJyong <76649700+JohnJyong@users.noreply.github.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
2024-09-30 16:35:00 +08:00
Aurelius Huang
4585cffce1 fix: Compatible with special characters in pg full-text search. (#8921)
Co-authored-by: Aurelius Huang <cm.huang@aftership.com>
2024-09-30 16:32:23 +08:00
Yi Xiao
13046709a9 fix: line in iteration node is not straight (#8918) 2024-09-30 16:04:51 +08:00
Jyong
9d221a5e19 external knowledge api (#8913)
Co-authored-by: Yi <yxiaoisme@gmail.com>
2024-09-30 15:38:43 +08:00
zhuhao
77aef9ff1d refactor: optimize the calculation of rerank threshold and the logic for forbidden characters in model_uid (#8879) 2024-09-30 12:55:01 +08:00
zhuhao
503561f464 fix: fix the data validation consistency issue in keyword content review (#8908) 2024-09-30 12:52:18 +08:00
-LAN-
ada9d408ac refactor(api/variables): VariableError as a ValueError. (#8554) 2024-09-30 12:48:58 +08:00
-LAN-
3af65b2f45 feat(api): add version comparison logic (#8902) 2024-09-30 11:12:26 +08:00
Zhaofeng Miao
369e1e6f58 feat(website-crawl): add jina reader as additional alternative for website crawling (#8761) 2024-09-30 09:57:19 +08:00
225 changed files with 7182 additions and 1287 deletions

View File

@@ -9,7 +9,7 @@ class PackagingInfo(BaseSettings):
CURRENT_VERSION: str = Field(
description="Dify version",
default="0.8.3",
default="0.9.0",
)
COMMIT_SHA: str = Field(

View File

@@ -37,7 +37,16 @@ from .auth import activate, data_source_bearer_auth, data_source_oauth, forgot_p
from .billing import billing
# Import datasets controllers
from .datasets import data_source, datasets, datasets_document, datasets_segments, file, hit_testing, website
from .datasets import (
data_source,
datasets,
datasets_document,
datasets_segments,
external,
file,
hit_testing,
website,
)
# Import explore controllers
from .explore import (

View File

@@ -49,7 +49,7 @@ class DatasetListApi(Resource):
page = request.args.get("page", default=1, type=int)
limit = request.args.get("limit", default=20, type=int)
ids = request.args.getlist("ids")
provider = request.args.get("provider", default="vendor")
# provider = request.args.get("provider", default="vendor")
search = request.args.get("keyword", default=None, type=str)
tag_ids = request.args.getlist("tag_ids")
@@ -57,7 +57,7 @@ class DatasetListApi(Resource):
datasets, total = DatasetService.get_datasets_by_ids(ids, current_user.current_tenant_id)
else:
datasets, total = DatasetService.get_datasets(
page, limit, provider, current_user.current_tenant_id, current_user, search, tag_ids
page, limit, current_user.current_tenant_id, current_user, search, tag_ids
)
# check embedding setting
@@ -110,6 +110,26 @@ class DatasetListApi(Resource):
nullable=True,
help="Invalid indexing technique.",
)
parser.add_argument(
"external_knowledge_api_id",
type=str,
nullable=True,
required=False,
)
parser.add_argument(
"provider",
type=str,
nullable=True,
choices=Dataset.PROVIDER_LIST,
required=False,
default="vendor",
)
parser.add_argument(
"external_knowledge_id",
type=str,
nullable=True,
required=False,
)
args = parser.parse_args()
# The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
@@ -123,6 +143,9 @@ class DatasetListApi(Resource):
indexing_technique=args["indexing_technique"],
account=current_user,
permission=DatasetPermissionEnum.ONLY_ME,
provider=args["provider"],
external_knowledge_api_id=args["external_knowledge_api_id"],
external_knowledge_id=args["external_knowledge_id"],
)
except services.errors.dataset.DatasetNameDuplicateError:
raise DatasetNameDuplicateError()
@@ -211,6 +234,33 @@ class DatasetApi(Resource):
)
parser.add_argument("retrieval_model", type=dict, location="json", help="Invalid retrieval model.")
parser.add_argument("partial_member_list", type=list, location="json", help="Invalid parent user list.")
parser.add_argument(
"external_retrieval_model",
type=dict,
required=False,
nullable=True,
location="json",
help="Invalid external retrieval model.",
)
parser.add_argument(
"external_knowledge_id",
type=str,
required=False,
nullable=True,
location="json",
help="Invalid external knowledge id.",
)
parser.add_argument(
"external_knowledge_api_id",
type=str,
required=False,
nullable=True,
location="json",
help="Invalid external knowledge api id.",
)
args = parser.parse_args()
data = request.get_json()

View File

@@ -0,0 +1,239 @@
from flask import request
from flask_login import current_user
from flask_restful import Resource, marshal, reqparse
from werkzeug.exceptions import Forbidden, InternalServerError, NotFound
import services
from controllers.console import api
from controllers.console.datasets.error import DatasetNameDuplicateError
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required
from fields.dataset_fields import dataset_detail_fields
from libs.login import login_required
from services.dataset_service import DatasetService
from services.external_knowledge_service import ExternalDatasetService
from services.hit_testing_service import HitTestingService
def _validate_name(name):
if not name or len(name) < 1 or len(name) > 100:
raise ValueError("Name must be between 1 to 100 characters.")
return name
def _validate_description_length(description):
if description and len(description) > 400:
raise ValueError("Description cannot exceed 400 characters.")
return description
class ExternalApiTemplateListApi(Resource):
@setup_required
@login_required
@account_initialization_required
def get(self):
page = request.args.get("page", default=1, type=int)
limit = request.args.get("limit", default=20, type=int)
search = request.args.get("keyword", default=None, type=str)
external_knowledge_apis, total = ExternalDatasetService.get_external_knowledge_apis(
page, limit, current_user.current_tenant_id, search
)
response = {
"data": [item.to_dict() for item in external_knowledge_apis],
"has_more": len(external_knowledge_apis) == limit,
"limit": limit,
"total": total,
"page": page,
}
return response, 200
@setup_required
@login_required
@account_initialization_required
def post(self):
parser = reqparse.RequestParser()
parser.add_argument(
"name",
nullable=False,
required=True,
help="Name is required. Name must be between 1 to 100 characters.",
type=_validate_name,
)
parser.add_argument(
"settings",
type=dict,
location="json",
nullable=False,
required=True,
)
args = parser.parse_args()
ExternalDatasetService.validate_api_list(args["settings"])
# The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
if not current_user.is_dataset_editor:
raise Forbidden()
try:
external_knowledge_api = ExternalDatasetService.create_external_knowledge_api(
tenant_id=current_user.current_tenant_id, user_id=current_user.id, args=args
)
except services.errors.dataset.DatasetNameDuplicateError:
raise DatasetNameDuplicateError()
return external_knowledge_api.to_dict(), 201
class ExternalApiTemplateApi(Resource):
@setup_required
@login_required
@account_initialization_required
def get(self, external_knowledge_api_id):
external_knowledge_api_id = str(external_knowledge_api_id)
external_knowledge_api = ExternalDatasetService.get_external_knowledge_api(external_knowledge_api_id)
if external_knowledge_api is None:
raise NotFound("API template not found.")
return external_knowledge_api.to_dict(), 200
@setup_required
@login_required
@account_initialization_required
def patch(self, external_knowledge_api_id):
external_knowledge_api_id = str(external_knowledge_api_id)
parser = reqparse.RequestParser()
parser.add_argument(
"name",
nullable=False,
required=True,
help="type is required. Name must be between 1 to 100 characters.",
type=_validate_name,
)
parser.add_argument(
"settings",
type=dict,
location="json",
nullable=False,
required=True,
)
args = parser.parse_args()
ExternalDatasetService.validate_api_list(args["settings"])
external_knowledge_api = ExternalDatasetService.update_external_knowledge_api(
tenant_id=current_user.current_tenant_id,
user_id=current_user.id,
external_knowledge_api_id=external_knowledge_api_id,
args=args,
)
return external_knowledge_api.to_dict(), 200
@setup_required
@login_required
@account_initialization_required
def delete(self, external_knowledge_api_id):
external_knowledge_api_id = str(external_knowledge_api_id)
# The role of the current user in the ta table must be admin, owner, or editor
if not current_user.is_editor or current_user.is_dataset_operator:
raise Forbidden()
ExternalDatasetService.delete_external_knowledge_api(current_user.current_tenant_id, external_knowledge_api_id)
return {"result": "success"}, 200
class ExternalApiUseCheckApi(Resource):
@setup_required
@login_required
@account_initialization_required
def get(self, external_knowledge_api_id):
external_knowledge_api_id = str(external_knowledge_api_id)
external_knowledge_api_is_using, count = ExternalDatasetService.external_knowledge_api_use_check(
external_knowledge_api_id
)
return {"is_using": external_knowledge_api_is_using, "count": count}, 200
class ExternalDatasetCreateApi(Resource):
@setup_required
@login_required
@account_initialization_required
def post(self):
# The role of the current user in the ta table must be admin, owner, or editor
if not current_user.is_editor:
raise Forbidden()
parser = reqparse.RequestParser()
parser.add_argument("external_knowledge_api_id", type=str, required=True, nullable=False, location="json")
parser.add_argument("external_knowledge_id", type=str, required=True, nullable=False, location="json")
parser.add_argument(
"name",
nullable=False,
required=True,
help="name is required. Name must be between 1 to 100 characters.",
type=_validate_name,
)
parser.add_argument("description", type=str, required=False, nullable=True, location="json")
parser.add_argument("external_retrieval_model", type=dict, required=False, location="json")
args = parser.parse_args()
# The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
if not current_user.is_dataset_editor:
raise Forbidden()
try:
dataset = ExternalDatasetService.create_external_dataset(
tenant_id=current_user.current_tenant_id,
user_id=current_user.id,
args=args,
)
except services.errors.dataset.DatasetNameDuplicateError:
raise DatasetNameDuplicateError()
return marshal(dataset, dataset_detail_fields), 201
class ExternalKnowledgeHitTestingApi(Resource):
@setup_required
@login_required
@account_initialization_required
def post(self, dataset_id):
dataset_id_str = str(dataset_id)
dataset = DatasetService.get_dataset(dataset_id_str)
if dataset is None:
raise NotFound("Dataset not found.")
try:
DatasetService.check_dataset_permission(dataset, current_user)
except services.errors.account.NoPermissionError as e:
raise Forbidden(str(e))
parser = reqparse.RequestParser()
parser.add_argument("query", type=str, location="json")
parser.add_argument("external_retrieval_model", type=dict, required=False, location="json")
args = parser.parse_args()
HitTestingService.hit_testing_args_check(args)
try:
response = HitTestingService.external_retrieve(
dataset=dataset,
query=args["query"],
account=current_user,
external_retrieval_model=args["external_retrieval_model"],
)
return response
except Exception as e:
raise InternalServerError(str(e))
api.add_resource(ExternalKnowledgeHitTestingApi, "/datasets/<uuid:dataset_id>/external-hit-testing")
api.add_resource(ExternalDatasetCreateApi, "/datasets/external")
api.add_resource(ExternalApiTemplateListApi, "/datasets/external-knowledge-api")
api.add_resource(ExternalApiTemplateApi, "/datasets/external-knowledge-api/<uuid:external_knowledge_api_id>")
api.add_resource(ExternalApiUseCheckApi, "/datasets/external-knowledge-api/<uuid:external_knowledge_api_id>/use-check")
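
The five routes above form the new external-knowledge surface of the console API. Below is a minimal sketch of driving them over HTTP; the /console/api prefix, the bearer token, and the keys inside "settings" (endpoint, api_key) are assumptions, not confirmed by this diff, while the external_retrieval_model keys match the fields defined later in this changeset.

import requests

BASE = "http://localhost:5001/console/api"  # assumed console API prefix
session = requests.Session()
session.headers["Authorization"] = "Bearer <console-token>"  # placeholder

# 1) Register an external knowledge API template
resp = session.post(
    f"{BASE}/datasets/external-knowledge-api",
    json={"name": "my-kb-api", "settings": {"endpoint": "https://kb.example.com", "api_key": "<key>"}},
)
api_id = resp.json()["id"]

# 2) Create a dataset bound to an external knowledge id
session.post(
    f"{BASE}/datasets/external",
    json={"name": "my-external-dataset", "external_knowledge_api_id": api_id, "external_knowledge_id": "kb-123"},
)

# 3) Hit-test retrieval against the bound dataset
session.post(
    f"{BASE}/datasets/<dataset-uuid>/external-hit-testing",
    json={"query": "what is dify?", "external_retrieval_model": {"top_k": 2, "score_threshold": 0.0}},
)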

View File

@@ -47,6 +47,7 @@ class HitTestingApi(Resource):
parser = reqparse.RequestParser()
parser.add_argument("query", type=str, location="json")
parser.add_argument("retrieval_model", type=dict, required=False, location="json")
parser.add_argument("external_retrieval_model", type=dict, required=False, location="json")
args = parser.parse_args()
HitTestingService.hit_testing_args_check(args)
@@ -57,6 +58,7 @@ class HitTestingApi(Resource):
query=args["query"],
account=current_user,
retrieval_model=args["retrieval_model"],
external_retrieval_model=args["external_retrieval_model"],
limit=10,
)

View File

@@ -14,7 +14,9 @@ class WebsiteCrawlApi(Resource):
@account_initialization_required
def post(self):
parser = reqparse.RequestParser()
parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, nullable=True, location="json")
parser.add_argument(
"provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json"
)
parser.add_argument("url", type=str, required=True, nullable=True, location="json")
parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
args = parser.parse_args()
@@ -33,7 +35,7 @@ class WebsiteCrawlStatusApi(Resource):
@account_initialization_required
def get(self, job_id: str):
parser = reqparse.RequestParser()
parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, location="args")
parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args")
args = parser.parse_args()
# get crawl status
try:

View File

@@ -38,11 +38,52 @@ class VersionApi(Resource):
return result
content = json.loads(response.content)
result["version"] = content["version"]
result["release_date"] = content["releaseDate"]
result["release_notes"] = content["releaseNotes"]
result["can_auto_update"] = content["canAutoUpdate"]
if _has_new_version(latest_version=content["version"], current_version=f"{args.get('current_version')}"):
result["version"] = content["version"]
result["release_date"] = content["releaseDate"]
result["release_notes"] = content["releaseNotes"]
result["can_auto_update"] = content["canAutoUpdate"]
return result
def _has_new_version(*, latest_version: str, current_version: str) -> bool:
def parse_version(version: str) -> tuple:
# Split version into parts and pre-release suffix if any
parts = version.split("-")
version_parts = parts[0].split(".")
pre_release = parts[1] if len(parts) > 1 else None
# Validate version format
if len(version_parts) != 3:
raise ValueError(f"Invalid version format: {version}")
try:
# Convert version parts to integers
major, minor, patch = map(int, version_parts)
return (major, minor, patch, pre_release)
except ValueError:
raise ValueError(f"Invalid version format: {version}")
latest = parse_version(latest_version)
current = parse_version(current_version)
# Compare major, minor, and patch versions
for latest_part, current_part in zip(latest[:3], current[:3]):
if latest_part > current_part:
return True
elif latest_part < current_part:
return False
# If versions are equal, check pre-release suffixes
if latest[3] is None and current[3] is not None:
return True
elif latest[3] is not None and current[3] is None:
return False
elif latest[3] is not None and current[3] is not None:
# Simple string comparison for pre-release versions
return latest[3] > current[3]
return False
api.add_resource(VersionApi, "/version")
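
A few illustrative checks of the comparison semantics implemented above (values are illustrative, not part of the diff): stable releases compare numerically, and a release with no pre-release suffix ranks above its own pre-release.

assert _has_new_version(latest_version="0.9.0", current_version="0.8.3") is True
assert _has_new_version(latest_version="0.9.0", current_version="0.9.0") is False
assert _has_new_version(latest_version="0.9.0", current_version="0.9.0-beta1") is True
# pre-release suffixes fall back to plain string comparison:
assert _has_new_version(latest_version="0.9.0-beta2", current_version="0.9.0-beta1") is True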

View File

@@ -72,8 +72,9 @@ class DefaultModelApi(Resource):
provider=model_setting["provider"],
model=model_setting["model"],
)
except Exception:
logging.warning(f"{model_setting['model_type']} save error")
except Exception as ex:
logging.exception(f"{model_setting['model_type']} save error: {ex}")
raise ex
return {"result": "success"}

View File

@@ -28,11 +28,11 @@ class DatasetListApi(DatasetApiResource):
page = request.args.get("page", default=1, type=int)
limit = request.args.get("limit", default=20, type=int)
provider = request.args.get("provider", default="vendor")
# provider = request.args.get("provider", default="vendor")
search = request.args.get("keyword", default=None, type=str)
tag_ids = request.args.getlist("tag_ids")
datasets, total = DatasetService.get_datasets(page, limit, provider, tenant_id, current_user, search, tag_ids)
datasets, total = DatasetService.get_datasets(page, limit, tenant_id, current_user, search, tag_ids)
# check embedding setting
provider_manager = ProviderManager()
configurations = provider_manager.get_configurations(tenant_id=current_user.current_tenant_id)
@@ -82,6 +82,26 @@ class DatasetListApi(DatasetApiResource):
required=False,
nullable=False,
)
parser.add_argument(
"external_knowledge_api_id",
type=str,
nullable=True,
required=False,
default="_validate_name",
)
parser.add_argument(
"provider",
type=str,
nullable=True,
required=False,
default="vendor",
)
parser.add_argument(
"external_knowledge_id",
type=str,
nullable=True,
required=False,
)
args = parser.parse_args()
try:
@@ -91,6 +111,9 @@ class DatasetListApi(DatasetApiResource):
indexing_technique=args["indexing_technique"],
account=current_user,
permission=args["permission"],
provider=args["provider"],
external_knowledge_api_id=args["external_knowledge_api_id"],
external_knowledge_id=args["external_knowledge_id"],
)
except services.errors.dataset.DatasetNameDuplicateError:
raise DatasetNameDuplicateError()

View File

@@ -1,2 +1,2 @@
class VariableError(Exception):
class VariableError(ValueError):
pass

View File

@@ -59,7 +59,7 @@ class DatasetIndexToolCallbackHandler:
for item in resource:
dataset_retriever_resource = DatasetRetrieverResource(
message_id=self._message_id,
position=item.get("position"),
position=item.get("position") or 0,
dataset_id=item.get("dataset_id"),
dataset_name=item.get("dataset_name"),
document_id=item.get("document_id"),
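
The "or 0" guard is subtly different from a dict default: item.get("position", 0) only covers a missing key, while the retrieved resource may carry an explicit None, which the non-nullable position column rejects. A short illustration with a hypothetical resource dict:

item = {"position": None}   # hypothetical resource dict
item.get("position", 0)     # -> None (key exists, so the default is unused)
item.get("position") or 0   # -> 0    (also maps an explicit None to 0)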

View File

@@ -59,6 +59,7 @@ from core.model_runtime.model_providers.__base.large_language_model import Large
from core.model_runtime.model_providers.xinference.xinference_helper import (
XinferenceHelper,
XinferenceModelExtraParameter,
validate_model_uid,
)
from core.model_runtime.utils import helper
@@ -114,7 +115,7 @@ class XinferenceAILargeLanguageModel(LargeLanguageModel):
}
"""
try:
if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
if not validate_model_uid(credentials):
raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")
extra_param = XinferenceHelper.get_xinference_extra_parameter(

View File

@@ -15,6 +15,7 @@ from core.model_runtime.errors.invoke import (
)
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.rerank_model import RerankModel
from core.model_runtime.model_providers.xinference.xinference_helper import validate_model_uid
class XinferenceRerankModel(RerankModel):
@@ -77,10 +78,7 @@ class XinferenceRerankModel(RerankModel):
)
# score threshold check
if score_threshold is not None:
if result["relevance_score"] >= score_threshold:
rerank_documents.append(rerank_document)
else:
if score_threshold is None or result["relevance_score"] >= score_threshold:
rerank_documents.append(rerank_document)
return RerankResult(model=model, docs=rerank_documents)
@@ -94,7 +92,7 @@ class XinferenceRerankModel(RerankModel):
:return:
"""
try:
if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
if not validate_model_uid(credentials):
raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")
credentials["server_url"] = credentials["server_url"].removesuffix("/")

View File

@@ -14,6 +14,7 @@ from core.model_runtime.errors.invoke import (
)
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.speech2text_model import Speech2TextModel
from core.model_runtime.model_providers.xinference.xinference_helper import validate_model_uid
class XinferenceSpeech2TextModel(Speech2TextModel):
@@ -42,7 +43,7 @@ class XinferenceSpeech2TextModel(Speech2TextModel):
:return:
"""
try:
if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
if not validate_model_uid(credentials):
raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")
credentials["server_url"] = credentials["server_url"].removesuffix("/")

View File

@@ -17,7 +17,7 @@ from core.model_runtime.errors.invoke import (
)
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.text_embedding_model import TextEmbeddingModel
from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper
from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper, validate_model_uid
class XinferenceTextEmbeddingModel(TextEmbeddingModel):
@@ -110,7 +110,7 @@ class XinferenceTextEmbeddingModel(TextEmbeddingModel):
:return:
"""
try:
if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
if not validate_model_uid(credentials):
raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")
server_url = credentials["server_url"]

View File

@@ -15,7 +15,7 @@ from core.model_runtime.errors.invoke import (
)
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper
from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper, validate_model_uid
class XinferenceText2SpeechModel(TTSModel):
@@ -70,7 +70,7 @@ class XinferenceText2SpeechModel(TTSModel):
:return:
"""
try:
if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
if not validate_model_uid(credentials):
raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")
credentials["server_url"] = credentials["server_url"].removesuffix("/")

View File

@@ -132,3 +132,16 @@ class XinferenceHelper:
context_length=context_length,
model_family=model_family,
)
def validate_model_uid(credentials: dict) -> bool:
"""
Validate the model_uid within the credentials dictionary to ensure it does not
contain forbidden characters ("/", "?", "#").
:param credentials: model credentials
:return: True if the model_uid does not contain forbidden characters ("/", "?", "#"), else False.
"""
forbidden_characters = ["/", "?", "#"]
model_uid = credentials.get("model_uid", "")
return not any(char in forbidden_characters for char in model_uid)
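
Behavior sketch of the new helper with illustrative credential dicts; note that a missing model_uid defaults to the empty string, which passes the check.

assert validate_model_uid({"model_uid": "chatglm3"}) is True
assert validate_model_uid({"model_uid": "chat/glm3"}) is False  # "/" is forbidden
assert validate_model_uid({"model_uid": "glm?x"}) is False      # "?" is forbidden
assert validate_model_uid({}) is True                           # empty uid slips through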

View File

@@ -18,8 +18,12 @@ class KeywordsModeration(Moderation):
if not config.get("keywords"):
raise ValueError("keywords is required")
if len(config.get("keywords")) > 1000:
raise ValueError("keywords length must be less than 1000")
if len(config.get("keywords")) > 10000:
raise ValueError("keywords length must be less than 10000")
keyword_rows = config["keywords"].split("\n")
if len(keyword_rows) > 100:
raise ValueError("the number of rows for the keywords must be less than 100")
def moderation_for_inputs(self, inputs: dict, query: str = "") -> ModerationInputsResult:
flagged = False
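
The loosened validation now measures two things: the total character length of the keywords string (raised from 1,000 to 10,000) and the number of newline-separated rows (capped at 100). Illustrative configs, assuming the string input shape implied by the split above:

ok = {"keywords": "badword1\nbadword2"}                           # 2 rows, short -> passes
too_long = {"keywords": "x" * 10001}                              # > 10000 chars -> ValueError
too_many = {"keywords": "\n".join(f"kw{i}" for i in range(101))}  # 101 rows -> ValueError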

View File

@@ -10,6 +10,7 @@ from core.rag.rerank.constants.rerank_mode import RerankMode
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from extensions.ext_database import db
from models.dataset import Dataset
from services.external_knowledge_service import ExternalDatasetService
default_retrieval_model = {
"search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
@@ -34,6 +35,9 @@ class RetrievalService:
weights: Optional[dict] = None,
):
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
return []
if not dataset or dataset.available_document_count == 0 or dataset.available_segment_count == 0:
return []
all_documents = []
@@ -108,6 +112,16 @@ class RetrievalService:
)
return all_documents
@classmethod
def external_retrieve(cls, dataset_id: str, query: str, external_retrieval_model: Optional[dict] = None):
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
return []
all_documents = ExternalDatasetService.fetch_external_knowledge_retrieval(
dataset.tenant_id, dataset_id, query, external_retrieval_model
)
return all_documents
@classmethod
def keyword_search(
cls, flask_app: Flask, dataset_id: str, query: str, top_k: int, all_documents: list, exceptions: list

View File

@@ -166,7 +166,7 @@ class PGVector(BaseVector):
with self._get_cursor() as cur:
cur.execute(
f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), to_tsquery(%s)) AS score
f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score
FROM {self.table_name}
WHERE to_tsvector(text) @@ plainto_tsquery(%s)
ORDER BY score DESC
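
The fix aligns the ts_rank argument with the plainto_tsquery already used in the WHERE clause. to_tsquery parses its input as tsquery syntax, so user text containing operators like &, |, ! or : raises a syntax error; plainto_tsquery treats the input as plain text and AND-joins the lexemes. A psycopg2-style sketch (cur is an open cursor; connection setup omitted):

cur.execute("SELECT plainto_tsquery(%s)", ("C++ & AI: basics?",))  # fine
cur.execute("SELECT to_tsquery(%s)", ("C++ & AI: basics?",))       # raises: syntax error in tsquery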

View File

@@ -0,0 +1,10 @@
from pydantic import BaseModel
class DocumentContext(BaseModel):
"""
Model class for document context.
"""
content: str
score: float
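
DocumentContext is the small carrier used later in this changeset to merge internal and external retrieval results and order them by score before prompt assembly. A minimal sketch:

contexts = [
    DocumentContext(content="external passage", score=0.92),
    DocumentContext(content="internal segment", score=0.75),
]
contexts.sort(key=lambda c: c.score, reverse=True)
prompt_context = "\n".join(c.content for c in contexts)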

View File

@@ -12,6 +12,7 @@ from core.rag.extractor.entity.extract_setting import ExtractSetting
from core.rag.extractor.excel_extractor import ExcelExtractor
from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor
from core.rag.extractor.html_extractor import HtmlExtractor
from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor
from core.rag.extractor.markdown_extractor import MarkdownExtractor
from core.rag.extractor.notion_extractor import NotionExtractor
from core.rag.extractor.pdf_extractor import PdfExtractor
@@ -171,6 +172,15 @@ class ExtractProcessor:
only_main_content=extract_setting.website_info.only_main_content,
)
return extractor.extract()
elif extract_setting.website_info.provider == "jinareader":
extractor = JinaReaderWebExtractor(
url=extract_setting.website_info.url,
job_id=extract_setting.website_info.job_id,
tenant_id=extract_setting.website_info.tenant_id,
mode=extract_setting.website_info.mode,
only_main_content=extract_setting.website_info.only_main_content,
)
return extractor.extract()
else:
raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}")
else:

View File

@@ -0,0 +1,35 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
class JinaReaderWebExtractor(BaseExtractor):
"""
Crawl and scrape websites and return content in clean llm-ready markdown.
"""
def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id
self.tenant_id = tenant_id
self.mode = mode
self.only_main_content = only_main_content
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []
if self.mode == "crawl":
crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
if crawl_data is None:
return []
document = Document(
page_content=crawl_data.get("content", ""),
metadata={
"source_url": crawl_data.get("url"),
"description": crawl_data.get("description"),
"title": crawl_data.get("title"),
},
)
documents.append(document)
return documents
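
In the pipeline this extractor is constructed by ExtractProcessor when the website provider is "jinareader"; a standalone usage sketch with placeholder ids:

extractor = JinaReaderWebExtractor(
    url="https://example.com",
    job_id="<crawl-job-id>",  # placeholder
    tenant_id="<tenant-id>",  # placeholder
    mode="crawl",
    only_main_content=True,
)
docs = extractor.extract()  # [] when the crawl result is not available yet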

View File

@@ -17,6 +17,8 @@ class Document(BaseModel):
"""
metadata: Optional[dict] = Field(default_factory=dict)
provider: Optional[str] = "dify"
class BaseDocumentTransformer(ABC):
"""Abstract base class for document transformation systems.

View File

@@ -28,11 +28,16 @@ class RerankModelRunner:
docs = []
doc_id = []
unique_documents = []
for document in documents:
dify_documents = [item for item in documents if item.provider == "dify"]
external_documents = [item for item in documents if item.provider == "external"]
for document in dify_documents:
if document.metadata["doc_id"] not in doc_id:
doc_id.append(document.metadata["doc_id"])
docs.append(document.page_content)
unique_documents.append(document)
for document in external_documents:
docs.append(document.page_content)
unique_documents.append(document)
documents = unique_documents
@@ -46,14 +51,10 @@ class RerankModelRunner:
# format document
rerank_document = Document(
page_content=result.text,
metadata={
"doc_id": documents[result.index].metadata["doc_id"],
"doc_hash": documents[result.index].metadata["doc_hash"],
"document_id": documents[result.index].metadata["document_id"],
"dataset_id": documents[result.index].metadata["dataset_id"],
"score": result.score,
},
metadata=documents[result.index].metadata,
provider=documents[result.index].provider,
)
rerank_document.metadata["score"] = result.score
rerank_documents.append(rerank_document)
return rerank_documents

View File

@@ -20,6 +20,7 @@ from core.ops.utils import measure_time
from core.rag.data_post_processor.data_post_processor import DataPostProcessor
from core.rag.datasource.keyword.jieba.jieba_keyword_table_handler import JiebaKeywordTableHandler
from core.rag.datasource.retrieval_service import RetrievalService
from core.rag.entities.context_entities import DocumentContext
from core.rag.models.document import Document
from core.rag.retrieval.retrieval_methods import RetrievalMethod
from core.rag.retrieval.router.multi_dataset_function_call_router import FunctionCallMultiDatasetRouter
@@ -30,6 +31,7 @@ from core.tools.tool.dataset_retriever.dataset_retriever_tool import DatasetRetr
from extensions.ext_database import db
from models.dataset import Dataset, DatasetQuery, DocumentSegment
from models.dataset import Document as DatasetDocument
from services.external_knowledge_service import ExternalDatasetService
default_retrieval_model = {
"search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
@@ -110,7 +112,7 @@ class DatasetRetrieval:
continue
# pass if dataset is not available
if dataset and dataset.available_document_count == 0:
if dataset and dataset.available_document_count == 0 and dataset.provider != "external":
continue
available_datasets.append(dataset)
@@ -146,69 +148,93 @@ class DatasetRetrieval:
message_id,
)
document_score_list = {}
for item in all_documents:
if item.metadata.get("score"):
document_score_list[item.metadata["doc_id"]] = item.metadata["score"]
dify_documents = [item for item in all_documents if item.provider == "dify"]
external_documents = [item for item in all_documents if item.provider == "external"]
document_context_list = []
index_node_ids = [document.metadata["doc_id"] for document in all_documents]
segments = DocumentSegment.query.filter(
DocumentSegment.dataset_id.in_(dataset_ids),
DocumentSegment.completed_at.isnot(None),
DocumentSegment.status == "completed",
DocumentSegment.enabled == True,
DocumentSegment.index_node_id.in_(index_node_ids),
).all()
retrieval_resource_list = []
# deal with external documents
for item in external_documents:
document_context_list.append(DocumentContext(content=item.page_content, score=item.metadata.get("score")))
source = {
"dataset_id": item.metadata.get("dataset_id"),
"dataset_name": item.metadata.get("dataset_name"),
"document_name": item.metadata.get("title"),
"data_source_type": "external",
"retriever_from": invoke_from.to_source(),
"score": item.metadata.get("score"),
"content": item.page_content,
}
retrieval_resource_list.append(source)
document_score_list = {}
# deal with dify documents
if dify_documents:
for item in dify_documents:
if item.metadata.get("score"):
document_score_list[item.metadata["doc_id"]] = item.metadata["score"]
if segments:
index_node_id_to_position = {id: position for position, id in enumerate(index_node_ids)}
sorted_segments = sorted(
segments, key=lambda segment: index_node_id_to_position.get(segment.index_node_id, float("inf"))
)
for segment in sorted_segments:
if segment.answer:
document_context_list.append(f"question:{segment.get_sign_content()} answer:{segment.answer}")
else:
document_context_list.append(segment.get_sign_content())
if show_retrieve_source:
context_list = []
resource_number = 1
index_node_ids = [document.metadata["doc_id"] for document in dify_documents]
segments = DocumentSegment.query.filter(
DocumentSegment.dataset_id.in_(dataset_ids),
DocumentSegment.status == "completed",
DocumentSegment.enabled == True,
DocumentSegment.index_node_id.in_(index_node_ids),
).all()
if segments:
index_node_id_to_position = {id: position for position, id in enumerate(index_node_ids)}
sorted_segments = sorted(
segments, key=lambda segment: index_node_id_to_position.get(segment.index_node_id, float("inf"))
)
for segment in sorted_segments:
dataset = Dataset.query.filter_by(id=segment.dataset_id).first()
document = DatasetDocument.query.filter(
DatasetDocument.id == segment.document_id,
DatasetDocument.enabled == True,
DatasetDocument.archived == False,
).first()
if dataset and document:
source = {
"position": resource_number,
"dataset_id": dataset.id,
"dataset_name": dataset.name,
"document_id": document.id,
"document_name": document.name,
"data_source_type": document.data_source_type,
"segment_id": segment.id,
"retriever_from": invoke_from.to_source(),
"score": document_score_list.get(segment.index_node_id, None),
}
if segment.answer:
document_context_list.append(
DocumentContext(
content=f"question:{segment.get_sign_content()} answer:{segment.answer}",
score=document_score_list.get(segment.index_node_id, None),
)
)
else:
document_context_list.append(
DocumentContext(
content=segment.get_sign_content(),
score=document_score_list.get(segment.index_node_id, None),
)
)
if show_retrieve_source:
for segment in sorted_segments:
dataset = Dataset.query.filter_by(id=segment.dataset_id).first()
document = DatasetDocument.query.filter(
DatasetDocument.id == segment.document_id,
DatasetDocument.enabled == True,
DatasetDocument.archived == False,
).first()
if dataset and document:
source = {
"dataset_id": dataset.id,
"dataset_name": dataset.name,
"document_id": document.id,
"document_name": document.name,
"data_source_type": document.data_source_type,
"segment_id": segment.id,
"retriever_from": invoke_from.to_source(),
"score": document_score_list.get(segment.index_node_id, None),
}
if invoke_from.to_source() == "dev":
source["hit_count"] = segment.hit_count
source["word_count"] = segment.word_count
source["segment_position"] = segment.position
source["index_node_hash"] = segment.index_node_hash
if segment.answer:
source["content"] = f"question:{segment.content} \nanswer:{segment.answer}"
else:
source["content"] = segment.content
context_list.append(source)
resource_number += 1
if hit_callback:
hit_callback.return_retriever_resource_info(context_list)
return str("\n".join(document_context_list))
if invoke_from.to_source() == "dev":
source["hit_count"] = segment.hit_count
source["word_count"] = segment.word_count
source["segment_position"] = segment.position
source["index_node_hash"] = segment.index_node_hash
if segment.answer:
source["content"] = f"question:{segment.content} \nanswer:{segment.answer}"
else:
source["content"] = segment.content
retrieval_resource_list.append(source)
if hit_callback and retrieval_resource_list:
hit_callback.return_retriever_resource_info(retrieval_resource_list)
if document_context_list:
document_context_list = sorted(document_context_list, key=lambda x: x.score, reverse=True)
return str("\n".join([document_context.content for document_context in document_context_list]))
return ""
def single_retrieve(
@@ -256,36 +282,58 @@ class DatasetRetrieval:
# get retrieval model config
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if dataset:
retrieval_model_config = dataset.retrieval_model or default_retrieval_model
# get top k
top_k = retrieval_model_config["top_k"]
# get retrieval method
if dataset.indexing_technique == "economy":
retrieval_method = "keyword_search"
else:
retrieval_method = retrieval_model_config["search_method"]
# get reranking model
reranking_model = (
retrieval_model_config["reranking_model"] if retrieval_model_config["reranking_enable"] else None
)
# get score threshold
score_threshold = 0.0
score_threshold_enabled = retrieval_model_config.get("score_threshold_enabled")
if score_threshold_enabled:
score_threshold = retrieval_model_config.get("score_threshold")
with measure_time() as timer:
results = RetrievalService.retrieve(
retrieval_method=retrieval_method,
dataset_id=dataset.id,
results = []
if dataset.provider == "external":
external_documents = ExternalDatasetService.fetch_external_knowledge_retrieval(
tenant_id=dataset.tenant_id,
dataset_id=dataset_id,
query=query,
top_k=top_k,
score_threshold=score_threshold,
reranking_model=reranking_model,
reranking_mode=retrieval_model_config.get("reranking_mode", "reranking_model"),
weights=retrieval_model_config.get("weights", None),
external_retrieval_parameters=dataset.retrieval_model,
)
for external_document in external_documents:
document = Document(
page_content=external_document.get("content"),
metadata=external_document.get("metadata"),
provider="external",
)
document.metadata["score"] = external_document.get("score")
document.metadata["title"] = external_document.get("title")
document.metadata["dataset_id"] = dataset_id
document.metadata["dataset_name"] = dataset.name
results.append(document)
else:
retrieval_model_config = dataset.retrieval_model or default_retrieval_model
# get top k
top_k = retrieval_model_config["top_k"]
# get retrieval method
if dataset.indexing_technique == "economy":
retrieval_method = "keyword_search"
else:
retrieval_method = retrieval_model_config["search_method"]
# get reranking model
reranking_model = (
retrieval_model_config["reranking_model"]
if retrieval_model_config["reranking_enable"]
else None
)
# get score threshold
score_threshold = 0.0
score_threshold_enabled = retrieval_model_config.get("score_threshold_enabled")
if score_threshold_enabled:
score_threshold = retrieval_model_config.get("score_threshold")
with measure_time() as timer:
results = RetrievalService.retrieve(
retrieval_method=retrieval_method,
dataset_id=dataset.id,
query=query,
top_k=top_k,
score_threshold=score_threshold,
reranking_model=reranking_model,
reranking_mode=retrieval_model_config.get("reranking_mode", "reranking_model"),
weights=retrieval_model_config.get("weights", None),
)
self._on_query(query, [dataset_id], app_id, user_from, user_id)
if results:
@@ -356,7 +404,8 @@ class DatasetRetrieval:
self, documents: list[Document], message_id: Optional[str] = None, timer: Optional[dict] = None
) -> None:
"""Handle retrieval end."""
for document in documents:
dify_documents = [document for document in documents if document.provider == "dify"]
for document in dify_documents:
query = db.session.query(DocumentSegment).filter(
DocumentSegment.index_node_id == document.metadata["doc_id"]
)
@@ -409,35 +458,54 @@ class DatasetRetrieval:
if not dataset:
return []
# get retrieval model; if not set, use the default
retrieval_model = dataset.retrieval_model or default_retrieval_model
if dataset.indexing_technique == "economy":
# use keyword table query
documents = RetrievalService.retrieve(
retrieval_method="keyword_search", dataset_id=dataset.id, query=query, top_k=top_k
if dataset.provider == "external":
external_documents = ExternalDatasetService.fetch_external_knowledge_retrieval(
tenant_id=dataset.tenant_id,
dataset_id=dataset_id,
query=query,
external_retrieval_parameters=dataset.retrieval_model,
)
if documents:
all_documents.extend(documents)
else:
if top_k > 0:
# retrieval source
documents = RetrievalService.retrieve(
retrieval_method=retrieval_model["search_method"],
dataset_id=dataset.id,
query=query,
top_k=retrieval_model.get("top_k") or 2,
score_threshold=retrieval_model.get("score_threshold", 0.0)
if retrieval_model["score_threshold_enabled"]
else 0.0,
reranking_model=retrieval_model.get("reranking_model", None)
if retrieval_model["reranking_enable"]
else None,
reranking_mode=retrieval_model.get("reranking_mode") or "reranking_model",
weights=retrieval_model.get("weights", None),
for external_document in external_documents:
document = Document(
page_content=external_document.get("content"),
metadata=external_document.get("metadata"),
provider="external",
)
document.metadata["score"] = external_document.get("score")
document.metadata["title"] = external_document.get("title")
document.metadata["dataset_id"] = dataset_id
document.metadata["dataset_name"] = dataset.name
all_documents.append(document)
else:
# get retrieval model; if not set, use the default
retrieval_model = dataset.retrieval_model or default_retrieval_model
all_documents.extend(documents)
if dataset.indexing_technique == "economy":
# use keyword table query
documents = RetrievalService.retrieve(
retrieval_method="keyword_search", dataset_id=dataset.id, query=query, top_k=top_k
)
if documents:
all_documents.extend(documents)
else:
if top_k > 0:
# retrieval source
documents = RetrievalService.retrieve(
retrieval_method=retrieval_model["search_method"],
dataset_id=dataset.id,
query=query,
top_k=retrieval_model.get("top_k") or 2,
score_threshold=retrieval_model.get("score_threshold", 0.0)
if retrieval_model["score_threshold_enabled"]
else 0.0,
reranking_model=retrieval_model.get("reranking_model", None)
if retrieval_model["reranking_enable"]
else None,
reranking_mode=retrieval_model.get("reranking_mode") or "reranking_model",
weights=retrieval_model.get("weights", None),
)
all_documents.extend(documents)
def to_dataset_retriever_tool(
self,

View File

@@ -156,16 +156,34 @@ class KnowledgeRetrievalNode(BaseNode):
weights,
node_data.multiple_retrieval_config.reranking_enable,
)
context_list = []
if all_documents:
dify_documents = [item for item in all_documents if item.provider == "dify"]
external_documents = [item for item in all_documents if item.provider == "external"]
retrieval_resource_list = []
# deal with external documents
for item in external_documents:
source = {
"metadata": {
"_source": "knowledge",
"dataset_id": item.metadata.get("dataset_id"),
"dataset_name": item.metadata.get("dataset_name"),
"document_name": item.metadata.get("title"),
"data_source_type": "external",
"retriever_from": "workflow",
"score": item.metadata.get("score"),
},
"title": item.metadata.get("title"),
"content": item.page_content,
}
retrieval_resource_list.append(source)
document_score_list = {}
# deal with dify documents
if dify_documents:
document_score_list = {}
page_number_list = {}
for item in all_documents:
for item in dify_documents:
if item.metadata.get("score"):
document_score_list[item.metadata["doc_id"]] = item.metadata["score"]
index_node_ids = [document.metadata["doc_id"] for document in all_documents]
index_node_ids = [document.metadata["doc_id"] for document in dify_documents]
segments = DocumentSegment.query.filter(
DocumentSegment.dataset_id.in_(dataset_ids),
DocumentSegment.completed_at.isnot(None),
@@ -186,13 +204,10 @@ class KnowledgeRetrievalNode(BaseNode):
Document.enabled == True,
Document.archived == False,
).first()
resource_number = 1
if dataset and document:
source = {
"metadata": {
"_source": "knowledge",
"position": resource_number,
"dataset_id": dataset.id,
"dataset_name": dataset.name,
"document_id": document.id,
@@ -212,9 +227,14 @@ class KnowledgeRetrievalNode(BaseNode):
source["content"] = f"question:{segment.get_sign_content()} \nanswer:{segment.answer}"
else:
source["content"] = segment.get_sign_content()
context_list.append(source)
resource_number += 1
return context_list
retrieval_resource_list.append(source)
if retrieval_resource_list:
retrieval_resource_list = sorted(retrieval_resource_list, key=lambda x: x.get("score"), reverse=True)
position = 1
for item in retrieval_resource_list:
item["metadata"]["position"] = position
position += 1
return retrieval_resource_list
@classmethod
def _extract_variable_selector_to_variable_mapping(

View File

@@ -38,9 +38,20 @@ dataset_retrieval_model_fields = {
"score_threshold_enabled": fields.Boolean,
"score_threshold": fields.Float,
}
external_retrieval_model_fields = {
"top_k": fields.Integer,
"score_threshold": fields.Float,
}
tag_fields = {"id": fields.String, "name": fields.String, "type": fields.String}
external_knowledge_info_fields = {
"external_knowledge_id": fields.String,
"external_knowledge_api_id": fields.String,
"external_knowledge_api_name": fields.String,
"external_knowledge_api_endpoint": fields.String,
}
dataset_detail_fields = {
"id": fields.String,
"name": fields.String,
@@ -61,6 +72,8 @@ dataset_detail_fields = {
"embedding_available": fields.Boolean,
"retrieval_model_dict": fields.Nested(dataset_retrieval_model_fields),
"tags": fields.List(fields.Nested(tag_fields)),
"external_knowledge_info": fields.Nested(external_knowledge_info_fields),
"external_retrieval_model": fields.Nested(external_retrieval_model_fields, allow_null=True),
}
dataset_query_detail_fields = {

View File

@@ -0,0 +1,11 @@
from flask_restful import fields
from libs.helper import TimestampField
external_knowledge_api_query_detail_fields = {
"id": fields.String,
"name": fields.String,
"setting": fields.String,
"created_by": fields.String,
"created_at": TimestampField,
}

View File

@@ -0,0 +1,48 @@
"""update-retrieval-resource
Revision ID: 6af6a521a53e
Revises: d57ba9ebb251
Create Date: 2024-09-24 09:22:43.570120
"""
from alembic import op
import models as models
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '6af6a521a53e'
down_revision = 'd57ba9ebb251'
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('dataset_retriever_resources', schema=None) as batch_op:
batch_op.alter_column('document_id',
existing_type=sa.UUID(),
nullable=True)
batch_op.alter_column('data_source_type',
existing_type=sa.TEXT(),
nullable=True)
batch_op.alter_column('segment_id',
existing_type=sa.UUID(),
nullable=True)
# ### end Alembic commands ###
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('dataset_retriever_resources', schema=None) as batch_op:
batch_op.alter_column('segment_id',
existing_type=sa.UUID(),
nullable=False)
batch_op.alter_column('data_source_type',
existing_type=sa.TEXT(),
nullable=False)
batch_op.alter_column('document_id',
existing_type=sa.UUID(),
nullable=False)
# ### end Alembic commands ###

View File

@@ -0,0 +1,73 @@
"""external_knowledge_api
Revision ID: 33f5fac87f29
Revises: 6af6a521a53e
Create Date: 2024-09-25 04:34:57.249436
"""
from alembic import op
import models as models
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '33f5fac87f29'
down_revision = '6af6a521a53e'
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('external_knowledge_apis',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
sa.Column('name', sa.String(length=255), nullable=False),
sa.Column('description', sa.String(length=255), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('settings', sa.Text(), nullable=True),
sa.Column('created_by', models.types.StringUUID(), nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
sa.Column('updated_by', models.types.StringUUID(), nullable=True),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
sa.PrimaryKeyConstraint('id', name='external_knowledge_apis_pkey')
)
with op.batch_alter_table('external_knowledge_apis', schema=None) as batch_op:
batch_op.create_index('external_knowledge_apis_name_idx', ['name'], unique=False)
batch_op.create_index('external_knowledge_apis_tenant_idx', ['tenant_id'], unique=False)
op.create_table('external_knowledge_bindings',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('external_knowledge_api_id', models.types.StringUUID(), nullable=False),
sa.Column('dataset_id', models.types.StringUUID(), nullable=False),
sa.Column('external_knowledge_id', sa.Text(), nullable=False),
sa.Column('created_by', models.types.StringUUID(), nullable=False),
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
sa.Column('updated_by', models.types.StringUUID(), nullable=True),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
sa.PrimaryKeyConstraint('id', name='external_knowledge_bindings_pkey')
)
with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
batch_op.create_index('external_knowledge_bindings_dataset_idx', ['dataset_id'], unique=False)
batch_op.create_index('external_knowledge_bindings_external_knowledge_api_idx', ['external_knowledge_api_id'], unique=False)
batch_op.create_index('external_knowledge_bindings_external_knowledge_idx', ['external_knowledge_id'], unique=False)
batch_op.create_index('external_knowledge_bindings_tenant_idx', ['tenant_id'], unique=False)
# ### end Alembic commands ###
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
batch_op.drop_index('external_knowledge_bindings_tenant_idx')
batch_op.drop_index('external_knowledge_bindings_external_knowledge_idx')
batch_op.drop_index('external_knowledge_bindings_external_knowledge_api_idx')
batch_op.drop_index('external_knowledge_bindings_dataset_idx')
op.drop_table('external_knowledge_bindings')
with op.batch_alter_table('external_knowledge_apis', schema=None) as batch_op:
batch_op.drop_index('external_knowledge_apis_tenant_idx')
batch_op.drop_index('external_knowledge_apis_name_idx')
op.drop_table('external_knowledge_apis')
# ### end Alembic commands ###

View File

@@ -1,4 +1,4 @@
"""add-dataset-retrival-model
"""add-dataset-retrieval-model
Revision ID: fca025d3b60f
Revises: b3a09c049e8e

View File

@@ -38,6 +38,7 @@ class Dataset(db.Model):
)
INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
PROVIDER_LIST = ["vendor", "external", None]
id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
tenant_id = db.Column(StringUUID, nullable=False)
@@ -71,6 +72,14 @@ class Dataset(db.Model):
def index_struct_dict(self):
return json.loads(self.index_struct) if self.index_struct else None
@property
def external_retrieval_model(self):
default_retrieval_model = {
"top_k": 2,
"score_threshold": 0.0,
}
return self.retrieval_model or default_retrieval_model
@property
def created_by_account(self):
return db.session.get(Account, self.created_by)
@@ -162,6 +171,29 @@ class Dataset(db.Model):
return tags or []
@property
def external_knowledge_info(self):
if self.provider != "external":
return None
external_knowledge_binding = (
db.session.query(ExternalKnowledgeBindings).filter(ExternalKnowledgeBindings.dataset_id == self.id).first()
)
if not external_knowledge_binding:
return None
external_knowledge_api = (
db.session.query(ExternalKnowledgeApis)
.filter(ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id)
.first()
)
if not external_knowledge_api:
return None
return {
"external_knowledge_id": external_knowledge_binding.external_knowledge_id,
"external_knowledge_api_id": external_knowledge_api.id,
"external_knowledge_api_name": external_knowledge_api.name,
"external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
}
@staticmethod
def gen_collection_name_by_id(dataset_id: str) -> str:
normalized_dataset_id = dataset_id.replace("-", "_")
@@ -687,3 +719,77 @@ class DatasetPermission(db.Model):
tenant_id = db.Column(StringUUID, nullable=False)
has_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
class ExternalKnowledgeApis(db.Model):
__tablename__ = "external_knowledge_apis"
__table_args__ = (
db.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
db.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
db.Index("external_knowledge_apis_name_idx", "name"),
)
id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
name = db.Column(db.String(255), nullable=False)
description = db.Column(db.String(255), nullable=False)
tenant_id = db.Column(StringUUID, nullable=False)
settings = db.Column(db.Text, nullable=True)
created_by = db.Column(StringUUID, nullable=False)
created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
updated_by = db.Column(StringUUID, nullable=True)
updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
def to_dict(self):
return {
"id": self.id,
"tenant_id": self.tenant_id,
"name": self.name,
"description": self.description,
"settings": self.settings_dict,
"dataset_bindings": self.dataset_bindings,
"created_by": self.created_by,
"created_at": self.created_at.isoformat(),
}
@property
def settings_dict(self):
try:
return json.loads(self.settings) if self.settings else None
except JSONDecodeError:
return None
@property
def dataset_bindings(self):
external_knowledge_bindings = (
db.session.query(ExternalKnowledgeBindings)
.filter(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
.all()
)
dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
datasets = db.session.query(Dataset).filter(Dataset.id.in_(dataset_ids)).all()
dataset_bindings = []
for dataset in datasets:
dataset_bindings.append({"id": dataset.id, "name": dataset.name})
return dataset_bindings
class ExternalKnowledgeBindings(db.Model):
__tablename__ = "external_knowledge_bindings"
__table_args__ = (
db.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
db.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
db.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
db.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
db.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
)
id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
tenant_id = db.Column(StringUUID, nullable=False)
external_knowledge_api_id = db.Column(StringUUID, nullable=False)
dataset_id = db.Column(StringUUID, nullable=False)
external_knowledge_id = db.Column(db.Text, nullable=False)
created_by = db.Column(StringUUID, nullable=False)
created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
updated_by = db.Column(StringUUID, nullable=True)
updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))

View File

@@ -1423,10 +1423,10 @@ class DatasetRetrieverResource(db.Model):
position = db.Column(db.Integer, nullable=False)
dataset_id = db.Column(StringUUID, nullable=False)
dataset_name = db.Column(db.Text, nullable=False)
document_id = db.Column(StringUUID, nullable=False)
document_id = db.Column(StringUUID, nullable=True)
document_name = db.Column(db.Text, nullable=False)
data_source_type = db.Column(db.Text, nullable=False)
segment_id = db.Column(StringUUID, nullable=False)
data_source_type = db.Column(db.Text, nullable=True)
segment_id = db.Column(StringUUID, nullable=True)
score = db.Column(db.Float, nullable=True)
content = db.Column(db.Text, nullable=False)
hit_count = db.Column(db.Integer, nullable=True)

api/poetry.lock (generated): 1524 lines changed; file diff suppressed because it is too large.

View File

@@ -221,6 +221,7 @@ volcengine-python-sdk = {extras = ["ark"], version = "^1.0.98"}
oci = "^2.133.0"
tos = "^2.7.1"
nomic = "^3.1.2"
validators = "0.21.0"
[tool.poetry.group.indriect.dependencies]
kaleido = "0.2.1"
rank-bm25 = "~0.2.2"

View File

@@ -0,0 +1,92 @@
import datetime
import time
import click
from sqlalchemy import func
from werkzeug.exceptions import NotFound
import app
from configs import dify_config
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import Dataset, DatasetQuery, Document
@app.celery.task(queue="dataset")
def clean_unused_datasets_task():
    click.echo(click.style("Start clean unused datasets.", fg="green"))
clean_days = int(dify_config.CLEAN_DAY_SETTING)
start_at = time.perf_counter()
thirty_days_ago = datetime.datetime.now() - datetime.timedelta(days=clean_days)
page = 1
while True:
try:
# Subquery for counting new documents
document_subquery_new = (
db.session.query(Document.dataset_id, func.count(Document.id).label("document_count"))
.filter(
Document.indexing_status == "completed",
Document.enabled == True,
Document.archived == False,
Document.updated_at > thirty_days_ago,
)
.group_by(Document.dataset_id)
.subquery()
)
# Subquery for counting old documents
document_subquery_old = (
db.session.query(Document.dataset_id, func.count(Document.id).label("document_count"))
.filter(
Document.indexing_status == "completed",
Document.enabled == True,
Document.archived == False,
Document.updated_at < thirty_days_ago,
)
.group_by(Document.dataset_id)
.subquery()
)
# Main query with join and filter
datasets = (
db.session.query(Dataset)
.outerjoin(document_subquery_new, Dataset.id == document_subquery_new.c.dataset_id)
.outerjoin(document_subquery_old, Dataset.id == document_subquery_old.c.dataset_id)
.filter(
Dataset.created_at < thirty_days_ago,
func.coalesce(document_subquery_new.c.document_count, 0) == 0,
func.coalesce(document_subquery_old.c.document_count, 0) > 0,
)
.order_by(Dataset.created_at.desc())
.paginate(page=page, per_page=50)
)
except NotFound:
break
if datasets.items is None or len(datasets.items) == 0:
break
page += 1
for dataset in datasets:
dataset_query = (
db.session.query(DatasetQuery)
.filter(DatasetQuery.created_at > thirty_days_ago, DatasetQuery.dataset_id == dataset.id)
.all()
)
if not dataset_query or len(dataset_query) == 0:
try:
# remove index
index_processor = IndexProcessorFactory(dataset.doc_form).init_index_processor()
index_processor.clean(dataset, None)
# update document
update_params = {Document.enabled: False}
Document.query.filter_by(dataset_id=dataset.id).update(update_params)
db.session.commit()
click.echo(click.style("Cleaned unused dataset {} from db success!".format(dataset.id), fg="green"))
except Exception as e:
click.echo(
click.style("clean dataset index error: {} {}".format(e.__class__.__name__, str(e)), fg="red")
)
end_at = time.perf_counter()
click.echo(click.style("Cleaned unused dataset from db success latency: {}".format(end_at - start_at), fg="green"))

View File

@@ -1,10 +1,13 @@
from services.auth.firecrawl import FirecrawlAuth
from services.auth.jina import JinaAuth
class ApiKeyAuthFactory:
def __init__(self, provider: str, credentials: dict):
if provider == "firecrawl":
self.auth = FirecrawlAuth(credentials)
elif provider == "jinareader":
self.auth = JinaAuth(credentials)
else:
raise ValueError("Invalid provider")

api/services/auth/jina.py (new file, 44 lines)
View File

@@ -0,0 +1,44 @@
import json
import requests
from services.auth.api_key_auth_base import ApiKeyAuthBase
class JinaAuth(ApiKeyAuthBase):
def __init__(self, credentials: dict):
super().__init__(credentials)
auth_type = credentials.get("auth_type")
if auth_type != "bearer":
raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer")
self.api_key = credentials.get("config").get("api_key", None)
if not self.api_key:
raise ValueError("No API key provided")
def validate_credentials(self):
headers = self._prepare_headers()
options = {
"url": "https://example.com",
}
response = self._post_request("https://r.jina.ai", options, headers)
if response.status_code == 200:
return True
else:
self._handle_error(response)
def _prepare_headers(self):
return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}
def _post_request(self, url, data, headers):
return requests.post(url, headers=headers, json=data)
def _handle_error(self, response):
if response.status_code in {402, 409, 500}:
error_message = response.json().get("error", "Unknown error occurred")
raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
else:
if response.text:
error_message = json.loads(response.text).get("error", "Unknown error occurred")
raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")

View File

@@ -32,6 +32,7 @@ from models.dataset import (
DatasetQuery,
Document,
DocumentSegment,
ExternalKnowledgeBindings,
)
from models.model import UploadFile
from models.source import DataSourceOauthBinding
@@ -39,6 +40,7 @@ from services.errors.account import NoPermissionError
from services.errors.dataset import DatasetNameDuplicateError
from services.errors.document import DocumentIndexingError
from services.errors.file import FileNotExistsError
from services.external_knowledge_service import ExternalDatasetService
from services.feature_service import FeatureModel, FeatureService
from services.tag_service import TagService
from services.vector_service import VectorService
@@ -56,10 +58,8 @@ from tasks.sync_website_document_indexing_task import sync_website_document_inde
class DatasetService:
@staticmethod
-    def get_datasets(page, per_page, provider="vendor", tenant_id=None, user=None, search=None, tag_ids=None):
-        query = Dataset.query.filter(Dataset.provider == provider, Dataset.tenant_id == tenant_id).order_by(
-            Dataset.created_at.desc()
-        )
+    def get_datasets(page, per_page, tenant_id=None, user=None, search=None, tag_ids=None):
+        query = Dataset.query.filter(Dataset.tenant_id == tenant_id).order_by(Dataset.created_at.desc())
if user:
# get permitted dataset ids
@@ -137,7 +137,14 @@ class DatasetService:
@staticmethod
def create_empty_dataset(
-        tenant_id: str, name: str, indexing_technique: Optional[str], account: Account, permission: Optional[str] = None
+        tenant_id: str,
+        name: str,
+        indexing_technique: Optional[str],
+        account: Account,
+        permission: Optional[str] = None,
+        provider: str = "vendor",
+        external_knowledge_api_id: Optional[str] = None,
+        external_knowledge_id: Optional[str] = None,
):
# check if dataset name already exists
if Dataset.query.filter_by(name=name, tenant_id=tenant_id).first():
@@ -156,12 +163,28 @@ class DatasetService:
dataset.embedding_model_provider = embedding_model.provider if embedding_model else None
dataset.embedding_model = embedding_model.model if embedding_model else None
dataset.permission = permission or DatasetPermissionEnum.ONLY_ME
dataset.provider = provider
db.session.add(dataset)
db.session.flush()
if provider == "external" and external_knowledge_api_id:
external_knowledge_api = ExternalDatasetService.get_external_knowledge_api(external_knowledge_api_id)
if not external_knowledge_api:
raise ValueError("External API template not found.")
external_knowledge_binding = ExternalKnowledgeBindings(
tenant_id=tenant_id,
dataset_id=dataset.id,
external_knowledge_api_id=external_knowledge_api_id,
external_knowledge_id=external_knowledge_id,
created_by=account.id,
)
db.session.add(external_knowledge_binding)
db.session.commit()
return dataset
@staticmethod
-    def get_dataset(dataset_id):
+    def get_dataset(dataset_id) -> Dataset:
return Dataset.query.filter_by(id=dataset_id).first()
@staticmethod
@@ -202,81 +225,103 @@ class DatasetService:
     @staticmethod
     def update_dataset(dataset_id, data, user):
-        data.pop("partial_member_list", None)
-        filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
         dataset = DatasetService.get_dataset(dataset_id)
         DatasetService.check_dataset_permission(dataset, user)
-        action = None
-        if dataset.indexing_technique != data["indexing_technique"]:
-            # if update indexing_technique
-            if data["indexing_technique"] == "economy":
-                action = "remove"
-                filtered_data["embedding_model"] = None
-                filtered_data["embedding_model_provider"] = None
-                filtered_data["collection_binding_id"] = None
-            elif data["indexing_technique"] == "high_quality":
-                action = "add"
-                # get embedding model setting
-                try:
-                    model_manager = ModelManager()
-                    embedding_model = model_manager.get_model_instance(
-                        tenant_id=current_user.current_tenant_id,
-                        provider=data["embedding_model_provider"],
-                        model_type=ModelType.TEXT_EMBEDDING,
-                        model=data["embedding_model"],
-                    )
-                    filtered_data["embedding_model"] = embedding_model.model
-                    filtered_data["embedding_model_provider"] = embedding_model.provider
-                    dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
-                        embedding_model.provider, embedding_model.model
-                    )
-                    filtered_data["collection_binding_id"] = dataset_collection_binding.id
-                except LLMBadRequestError:
-                    raise ValueError(
-                        "No Embedding Model available. Please configure a valid provider "
-                        "in the Settings -> Model Provider."
-                    )
-                except ProviderTokenNotInitError as ex:
-                    raise ValueError(ex.description)
-        else:
-            if (
-                data["embedding_model_provider"] != dataset.embedding_model_provider
-                or data["embedding_model"] != dataset.embedding_model
-            ):
-                action = "update"
-                try:
-                    model_manager = ModelManager()
-                    embedding_model = model_manager.get_model_instance(
-                        tenant_id=current_user.current_tenant_id,
-                        provider=data["embedding_model_provider"],
-                        model_type=ModelType.TEXT_EMBEDDING,
-                        model=data["embedding_model"],
-                    )
-                    filtered_data["embedding_model"] = embedding_model.model
-                    filtered_data["embedding_model_provider"] = embedding_model.provider
-                    dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
-                        embedding_model.provider, embedding_model.model
-                    )
-                    filtered_data["collection_binding_id"] = dataset_collection_binding.id
-                except LLMBadRequestError:
-                    raise ValueError(
-                        "No Embedding Model available. Please configure a valid provider "
-                        "in the Settings -> Model Provider."
-                    )
-                except ProviderTokenNotInitError as ex:
-                    raise ValueError(ex.description)
-        filtered_data["updated_by"] = user.id
-        filtered_data["updated_at"] = datetime.datetime.now()
-        # update Retrieval model
-        filtered_data["retrieval_model"] = data["retrieval_model"]
-        dataset.query.filter_by(id=dataset_id).update(filtered_data)
-        db.session.commit()
-        if action:
-            deal_dataset_vector_index_task.delay(dataset_id, action)
+        if dataset.provider == "external":
+            dataset.retrieval_model = data.get("external_retrieval_model", None)
+            dataset.name = data.get("name", dataset.name)
+            dataset.description = data.get("description", "")
+            external_knowledge_id = data.get("external_knowledge_id", None)
+            db.session.add(dataset)
+            if not external_knowledge_id:
+                raise ValueError("External knowledge id is required.")
+            external_knowledge_api_id = data.get("external_knowledge_api_id", None)
+            if not external_knowledge_api_id:
+                raise ValueError("External knowledge api id is required.")
+            external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(dataset_id=dataset_id).first()
+            if (
+                external_knowledge_binding.external_knowledge_id != external_knowledge_id
+                or external_knowledge_binding.external_knowledge_api_id != external_knowledge_api_id
+            ):
+                external_knowledge_binding.external_knowledge_id = external_knowledge_id
+                external_knowledge_binding.external_knowledge_api_id = external_knowledge_api_id
+                db.session.add(external_knowledge_binding)
+            db.session.commit()
+        else:
+            data.pop("partial_member_list", None)
+            filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
+            action = None
+            if dataset.indexing_technique != data["indexing_technique"]:
+                # if update indexing_technique
+                if data["indexing_technique"] == "economy":
+                    action = "remove"
+                    filtered_data["embedding_model"] = None
+                    filtered_data["embedding_model_provider"] = None
+                    filtered_data["collection_binding_id"] = None
+                elif data["indexing_technique"] == "high_quality":
+                    action = "add"
+                    # get embedding model setting
+                    try:
+                        model_manager = ModelManager()
+                        embedding_model = model_manager.get_model_instance(
+                            tenant_id=current_user.current_tenant_id,
+                            provider=data["embedding_model_provider"],
+                            model_type=ModelType.TEXT_EMBEDDING,
+                            model=data["embedding_model"],
+                        )
+                        filtered_data["embedding_model"] = embedding_model.model
+                        filtered_data["embedding_model_provider"] = embedding_model.provider
+                        dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
+                            embedding_model.provider, embedding_model.model
+                        )
+                        filtered_data["collection_binding_id"] = dataset_collection_binding.id
+                    except LLMBadRequestError:
+                        raise ValueError(
+                            "No Embedding Model available. Please configure a valid provider "
+                            "in the Settings -> Model Provider."
+                        )
+                    except ProviderTokenNotInitError as ex:
+                        raise ValueError(ex.description)
+            else:
+                if (
+                    data["embedding_model_provider"] != dataset.embedding_model_provider
+                    or data["embedding_model"] != dataset.embedding_model
+                ):
+                    action = "update"
+                    try:
+                        model_manager = ModelManager()
+                        embedding_model = model_manager.get_model_instance(
+                            tenant_id=current_user.current_tenant_id,
+                            provider=data["embedding_model_provider"],
+                            model_type=ModelType.TEXT_EMBEDDING,
+                            model=data["embedding_model"],
+                        )
+                        filtered_data["embedding_model"] = embedding_model.model
+                        filtered_data["embedding_model_provider"] = embedding_model.provider
+                        dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
+                            embedding_model.provider, embedding_model.model
+                        )
+                        filtered_data["collection_binding_id"] = dataset_collection_binding.id
+                    except LLMBadRequestError:
+                        raise ValueError(
+                            "No Embedding Model available. Please configure a valid provider "
+                            "in the Settings -> Model Provider."
+                        )
+                    except ProviderTokenNotInitError as ex:
+                        raise ValueError(ex.description)
+            filtered_data["updated_by"] = user.id
+            filtered_data["updated_at"] = datetime.datetime.now()
+            # update Retrieval model
+            filtered_data["retrieval_model"] = data["retrieval_model"]
+            dataset.query.filter_by(id=dataset_id).update(filtered_data)
+            db.session.commit()
+            if action:
+                deal_dataset_vector_index_task.delay(dataset_id, action)
         return dataset
@staticmethod

View File

@@ -0,0 +1,26 @@
from typing import Literal, Optional, Union
from pydantic import BaseModel
class AuthorizationConfig(BaseModel):
type: Literal[None, "basic", "bearer", "custom"]
api_key: Union[None, str] = None
header: Union[None, str] = None
class Authorization(BaseModel):
type: Literal["no-auth", "api-key"]
config: Optional[AuthorizationConfig] = None
class ProcessStatusSetting(BaseModel):
request_method: str
url: str
class ExternalKnowledgeApiSetting(BaseModel):
url: str
request_method: str
headers: Optional[dict] = None
params: Optional[dict] = None

View File

@@ -0,0 +1,274 @@
import json
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Optional, Union
import httpx
import validators
# from tasks.external_document_indexing_task import external_document_indexing_task
from core.helper import ssrf_proxy
from extensions.ext_database import db
from models.dataset import (
Dataset,
ExternalKnowledgeApis,
ExternalKnowledgeBindings,
)
from services.entities.external_knowledge_entities.external_knowledge_entities import (
Authorization,
ExternalKnowledgeApiSetting,
)
from services.errors.dataset import DatasetNameDuplicateError
class ExternalDatasetService:
@staticmethod
def get_external_knowledge_apis(page, per_page, tenant_id, search=None) -> tuple[list[ExternalKnowledgeApis], int]:
query = ExternalKnowledgeApis.query.filter(ExternalKnowledgeApis.tenant_id == tenant_id).order_by(
ExternalKnowledgeApis.created_at.desc()
)
if search:
query = query.filter(ExternalKnowledgeApis.name.ilike(f"%{search}%"))
external_knowledge_apis = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)
return external_knowledge_apis.items, external_knowledge_apis.total
@classmethod
def validate_api_list(cls, api_settings: dict):
if not api_settings:
raise ValueError("api list is empty")
if "endpoint" not in api_settings and not api_settings["endpoint"]:
raise ValueError("endpoint is required")
if "api_key" not in api_settings and not api_settings["api_key"]:
raise ValueError("api_key is required")
@staticmethod
def create_external_knowledge_api(tenant_id: str, user_id: str, args: dict) -> ExternalKnowledgeApis:
ExternalDatasetService.check_endpoint_and_api_key(args.get("settings"))
external_knowledge_api = ExternalKnowledgeApis(
tenant_id=tenant_id,
created_by=user_id,
updated_by=user_id,
name=args.get("name"),
description=args.get("description", ""),
settings=json.dumps(args.get("settings"), ensure_ascii=False),
)
db.session.add(external_knowledge_api)
db.session.commit()
return external_knowledge_api
@staticmethod
def check_endpoint_and_api_key(settings: dict):
if "endpoint" not in settings or not settings["endpoint"]:
raise ValueError("endpoint is required")
if "api_key" not in settings or not settings["api_key"]:
raise ValueError("api_key is required")
endpoint = f"{settings['endpoint']}/retrieval"
api_key = settings["api_key"]
if not validators.url(endpoint):
raise ValueError(f"invalid endpoint: {endpoint}")
        try:
            response = httpx.post(endpoint, headers={"Authorization": f"Bearer {api_key}"})
        except Exception as e:
            raise ValueError(f"failed to connect to the endpoint: {endpoint}") from e
if response.status_code == 502:
raise ValueError(f"Bad Gateway: failed to connect to the endpoint: {endpoint}")
if response.status_code == 404:
raise ValueError(f"Not Found: failed to connect to the endpoint: {endpoint}")
if response.status_code == 403:
raise ValueError(f"Forbidden: Authorization failed with api_key: {api_key}")
@staticmethod
def get_external_knowledge_api(external_knowledge_api_id: str) -> ExternalKnowledgeApis:
return ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id).first()
@staticmethod
def update_external_knowledge_api(tenant_id, user_id, external_knowledge_api_id, args) -> ExternalKnowledgeApis:
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
id=external_knowledge_api_id, tenant_id=tenant_id
).first()
if external_knowledge_api is None:
raise ValueError("api template not found")
external_knowledge_api.name = args.get("name")
external_knowledge_api.description = args.get("description", "")
external_knowledge_api.settings = json.dumps(args.get("settings"), ensure_ascii=False)
external_knowledge_api.updated_by = user_id
external_knowledge_api.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
db.session.commit()
return external_knowledge_api
@staticmethod
def delete_external_knowledge_api(tenant_id: str, external_knowledge_api_id: str):
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
id=external_knowledge_api_id, tenant_id=tenant_id
).first()
if external_knowledge_api is None:
raise ValueError("api template not found")
db.session.delete(external_knowledge_api)
db.session.commit()
@staticmethod
def external_knowledge_api_use_check(external_knowledge_api_id: str) -> tuple[bool, int]:
count = ExternalKnowledgeBindings.query.filter_by(external_knowledge_api_id=external_knowledge_api_id).count()
if count > 0:
return True, count
return False, 0
@staticmethod
def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
dataset_id=dataset_id, tenant_id=tenant_id
).first()
if not external_knowledge_binding:
raise ValueError("external knowledge binding not found")
return external_knowledge_binding
@staticmethod
def document_create_args_validate(tenant_id: str, external_knowledge_api_id: str, process_parameter: dict):
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
id=external_knowledge_api_id, tenant_id=tenant_id
).first()
if external_knowledge_api is None:
raise ValueError("api template not found")
settings = json.loads(external_knowledge_api.settings)
for setting in settings:
custom_parameters = setting.get("document_process_setting")
if custom_parameters:
for parameter in custom_parameters:
if parameter.get("required", False) and not process_parameter.get(parameter.get("name")):
raise ValueError(f'{parameter.get("name")} is required')
@staticmethod
def process_external_api(
settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]]
) -> httpx.Response:
"""
        Perform the HTTP request described by the external knowledge API settings.
"""
kwargs = {
"url": settings.url,
"headers": settings.headers,
"follow_redirects": True,
}
response = getattr(ssrf_proxy, settings.request_method)(data=json.dumps(settings.params), files=files, **kwargs)
return response
@staticmethod
def assembling_headers(authorization: Authorization, headers: Optional[dict] = None) -> dict[str, Any]:
authorization = deepcopy(authorization)
if headers:
headers = deepcopy(headers)
else:
headers = {}
if authorization.type == "api-key":
if authorization.config is None:
raise ValueError("authorization config is required")
if authorization.config.api_key is None:
raise ValueError("api_key is required")
if not authorization.config.header:
authorization.config.header = "Authorization"
if authorization.config.type == "bearer":
headers[authorization.config.header] = f"Bearer {authorization.config.api_key}"
elif authorization.config.type == "basic":
headers[authorization.config.header] = f"Basic {authorization.config.api_key}"
elif authorization.config.type == "custom":
headers[authorization.config.header] = authorization.config.api_key
return headers
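A quick sketch of what the header assembly produces for bearer-style auth (the key is a placeholder; Authorization and AuthorizationConfig come from the entities module shown earlier):

# Sketch: bearer auth fills the Authorization header by default.
auth = Authorization(type="api-key", config=AuthorizationConfig(type="bearer", api_key="sk-123"))
headers = ExternalDatasetService.assembling_headers(auth, {"Content-Type": "application/json"})
# headers == {"Content-Type": "application/json", "Authorization": "Bearer sk-123"}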
@staticmethod
def get_external_knowledge_api_settings(settings: dict) -> ExternalKnowledgeApiSetting:
return ExternalKnowledgeApiSetting.parse_obj(settings)
@staticmethod
def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
# check if dataset name already exists
if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first():
raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.")
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
id=args.get("external_knowledge_api_id"), tenant_id=tenant_id
).first()
if external_knowledge_api is None:
raise ValueError("api template not found")
dataset = Dataset(
tenant_id=tenant_id,
name=args.get("name"),
description=args.get("description", ""),
provider="external",
retrieval_model=args.get("external_retrieval_model"),
created_by=user_id,
)
db.session.add(dataset)
db.session.flush()
external_knowledge_binding = ExternalKnowledgeBindings(
tenant_id=tenant_id,
dataset_id=dataset.id,
external_knowledge_api_id=args.get("external_knowledge_api_id"),
external_knowledge_id=args.get("external_knowledge_id"),
created_by=user_id,
)
db.session.add(external_knowledge_binding)
db.session.commit()
return dataset
@staticmethod
def fetch_external_knowledge_retrieval(
tenant_id: str, dataset_id: str, query: str, external_retrieval_parameters: dict
) -> list:
external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
dataset_id=dataset_id, tenant_id=tenant_id
).first()
if not external_knowledge_binding:
raise ValueError("external knowledge binding not found")
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
id=external_knowledge_binding.external_knowledge_api_id
).first()
if not external_knowledge_api:
raise ValueError("external api template not found")
settings = json.loads(external_knowledge_api.settings)
headers = {"Content-Type": "application/json"}
if settings.get("api_key"):
headers["Authorization"] = f"Bearer {settings.get('api_key')}"
score_threshold_enabled = external_retrieval_parameters.get("score_threshold_enabled") or False
score_threshold = external_retrieval_parameters.get("score_threshold", 0.0) if score_threshold_enabled else 0.0
request_params = {
"retrieval_setting": {
"top_k": external_retrieval_parameters.get("top_k"),
"score_threshold": score_threshold,
},
"query": query,
"knowledge_id": external_knowledge_binding.external_knowledge_id,
}
external_knowledge_api_setting = {
"url": f"{settings.get('endpoint')}/retrieval",
"request_method": "post",
"headers": headers,
"params": request_params,
}
response = ExternalDatasetService.process_external_api(
ExternalKnowledgeApiSetting(**external_knowledge_api_setting), None
)
if response.status_code == 200:
return response.json().get("records", [])
return []
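The retrieval call above implies a simple contract: POST {endpoint}/retrieval with retrieval_setting, query, and knowledge_id, and answer with a JSON body whose records list is returned as-is. A toy Flask sketch of a compatible endpoint, purely illustrative:

# Sketch: a stub server satisfying the contract fetch_external_knowledge_retrieval calls.
from flask import Flask, jsonify, request

app = Flask(__name__)

@app.post("/retrieval")
def retrieval():
    payload = request.get_json()
    setting = payload["retrieval_setting"]
    # A real implementation would search the knowledge base named by
    # payload["knowledge_id"]; here a single canned record is returned.
    records = [
        {"content": f"stub result for {payload['query']!r}", "title": "stub", "score": 1.0, "metadata": {}}
    ]
    records = [r for r in records if r["score"] >= setting["score_threshold"]]
    return jsonify({"records": records[: setting["top_k"]]})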

View File

@@ -19,7 +19,15 @@ default_retrieval_model = {
class HitTestingService:
@classmethod
-    def retrieve(cls, dataset: Dataset, query: str, account: Account, retrieval_model: dict, limit: int = 10) -> dict:
+    def retrieve(
+        cls,
+        dataset: Dataset,
+        query: str,
+        account: Account,
+        retrieval_model: dict,
+        external_retrieval_model: dict,
+        limit: int = 10,
+    ) -> dict:
if dataset.available_document_count == 0 or dataset.available_segment_count == 0:
return {
"query": {
@@ -62,10 +70,44 @@ class HitTestingService:
return cls.compact_retrieve_response(dataset, query, all_documents)
@classmethod
def external_retrieve(
cls,
dataset: Dataset,
query: str,
account: Account,
external_retrieval_model: dict,
) -> dict:
if dataset.provider != "external":
return {
"query": {"content": query},
"records": [],
}
start = time.perf_counter()
all_documents = RetrievalService.external_retrieve(
dataset_id=dataset.id,
query=cls.escape_query_for_search(query),
external_retrieval_model=external_retrieval_model,
)
end = time.perf_counter()
logging.debug(f"External knowledge hit testing retrieve in {end - start:0.4f} seconds")
dataset_query = DatasetQuery(
dataset_id=dataset.id, content=query, source="hit_testing", created_by_role="account", created_by=account.id
)
db.session.add(dataset_query)
db.session.commit()
return cls.compact_external_retrieve_response(dataset, query, all_documents)
@classmethod
def compact_retrieve_response(cls, dataset: Dataset, query: str, documents: list[Document]):
i = 0
records = []
for document in documents:
index_node_id = document.metadata["doc_id"]
@@ -81,7 +123,6 @@ class HitTestingService:
)
if not segment:
i += 1
continue
record = {
@@ -91,8 +132,6 @@ class HitTestingService:
records.append(record)
i += 1
return {
"query": {
"content": query,
@@ -100,6 +139,25 @@ class HitTestingService:
"records": records,
}
@classmethod
def compact_external_retrieve_response(cls, dataset: Dataset, query: str, documents: list):
records = []
if dataset.provider == "external":
for document in documents:
record = {
"content": document.get("content", None),
"title": document.get("title", None),
"score": document.get("score", None),
"metadata": document.get("metadata", None),
}
records.append(record)
return {
"query": {
"content": query,
},
"records": records,
}
@classmethod
def hit_testing_args_check(cls, args):
query = args["query"]

View File

@@ -1,6 +1,7 @@
import datetime
import json
import requests
from flask_login import current_user
from core.helper import encrypter
@@ -65,6 +66,35 @@ class WebsiteService:
time = str(datetime.datetime.now().timestamp())
redis_client.setex(website_crawl_time_cache_key, 3600, time)
return {"status": "active", "job_id": job_id}
elif provider == "jinareader":
api_key = encrypter.decrypt_token(
tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
)
crawl_sub_pages = options.get("crawl_sub_pages", False)
if not crawl_sub_pages:
response = requests.get(
f"https://r.jina.ai/{url}",
headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
)
if response.json().get("code") != 200:
raise ValueError("Failed to crawl")
return {"status": "active", "data": response.json().get("data")}
else:
response = requests.post(
"https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
json={
"url": url,
"maxPages": options.get("limit", 1),
"useSitemap": options.get("use_sitemap", True),
},
headers={
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}",
},
)
if response.json().get("code") != 200:
raise ValueError("Failed to crawl")
return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
else:
raise ValueError("Invalid provider")
@@ -93,6 +123,42 @@ class WebsiteService:
time_consuming = abs(end_time - float(start_time))
crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
redis_client.delete(website_crawl_time_cache_key)
elif provider == "jinareader":
api_key = encrypter.decrypt_token(
tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
)
response = requests.post(
"https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
json={"taskId": job_id},
)
data = response.json().get("data", {})
crawl_status_data = {
"status": data.get("status", "active"),
"job_id": job_id,
"total": len(data.get("urls", [])),
"current": len(data.get("processed", [])) + len(data.get("failed", [])),
"data": [],
"time_consuming": data.get("duration", 0) / 1000,
}
if crawl_status_data["status"] == "completed":
response = requests.post(
"https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
)
data = response.json().get("data", {})
formatted_data = [
{
"title": item.get("data", {}).get("title"),
"source_url": item.get("data", {}).get("url"),
"description": item.get("data", {}).get("description"),
"markdown": item.get("data", {}).get("content"),
}
for item in data.get("processed", {}).values()
]
crawl_status_data["data"] = formatted_data
else:
raise ValueError("Invalid provider")
return crawl_status_data
@@ -119,6 +185,40 @@ class WebsiteService:
if item.get("source_url") == url:
return item
return None
elif provider == "jinareader":
file_key = "website_files/" + job_id + ".txt"
if storage.exists(file_key):
data = storage.load_once(file_key)
if data:
data = json.loads(data.decode("utf-8"))
elif not job_id:
response = requests.get(
f"https://r.jina.ai/{url}",
headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
)
if response.json().get("code") != 200:
raise ValueError("Failed to crawl")
return response.json().get("data")
else:
api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
response = requests.post(
"https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
json={"taskId": job_id},
)
data = response.json().get("data", {})
if data.get("status") != "completed":
raise ValueError("Crawl job is not completed")
response = requests.post(
"https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
)
data = response.json().get("data", {})
for item in data.get("processed", {}).values():
if item.get("data", {}).get("url") == url:
return item.get("data", {})
else:
raise ValueError("Invalid provider")

View File

@@ -0,0 +1,93 @@
import json
import logging
import time
import click
from celery import shared_task
from core.indexing_runner import DocumentIsPausedException
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import Dataset, ExternalKnowledgeApis
from models.model import UploadFile
from services.external_knowledge_service import ExternalDatasetService
@shared_task(queue="dataset")
def external_document_indexing_task(
dataset_id: str, external_knowledge_api_id: str, data_source: dict, process_parameter: dict
):
"""
    Async process external knowledge documents.
    :param dataset_id:
    :param external_knowledge_api_id:
    :param data_source:
    :param process_parameter:

    Usage: external_document_indexing_task.delay(dataset_id, external_knowledge_api_id, data_source, process_parameter)
"""
start_at = time.perf_counter()
dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
if not dataset:
        logging.info(
            click.style("Processed external dataset: {} failed, dataset does not exist.".format(dataset_id), fg="red")
        )
return
# get external api template
external_knowledge_api = (
db.session.query(ExternalKnowledgeApis)
.filter(
ExternalKnowledgeApis.id == external_knowledge_api_id, ExternalKnowledgeApis.tenant_id == dataset.tenant_id
)
.first()
)
if not external_knowledge_api:
logging.info(
click.style(
"Processed external dataset: {} failed, api template: {} not exit.".format(
dataset_id, external_knowledge_api_id
),
fg="red",
)
)
return
files = {}
if data_source["type"] == "upload_file":
upload_file_list = data_source["info_list"]["file_info_list"]["file_ids"]
for file_id in upload_file_list:
file = (
db.session.query(UploadFile)
.filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
.first()
)
if file:
files[file.id] = (file.name, storage.load_once(file.key), file.mime_type)
try:
settings = ExternalDatasetService.get_external_knowledge_api_settings(
json.loads(external_knowledge_api.settings)
)
        # assemble headers and attach them to the request settings
        settings.headers = ExternalDatasetService.assembling_headers(settings.authorization, settings.headers)
        # do http request; process_external_api takes the settings object plus optional files
        response = ExternalDatasetService.process_external_api(settings, files)
job_id = response.json().get("job_id")
if job_id:
# save job_id to dataset
dataset.job_id = job_id
db.session.commit()
end_at = time.perf_counter()
logging.info(
click.style(
"Processed external dataset: {} successful, latency: {}".format(dataset.id, end_at - start_at),
fg="green",
)
)
except DocumentIsPausedException as ex:
logging.info(click.style(str(ex), fg="yellow"))
    except Exception:
        logging.exception("Processed external dataset: {} failed".format(dataset_id))
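A sketch of the call site for dispatching this task (the import in the service layer is currently commented out, so the arguments are inferred from the signature above):

# Sketch: queue the task asynchronously from the API layer.
external_document_indexing_task.delay(
    dataset.id,
    external_knowledge_api_id,
    {"type": "upload_file", "info_list": {"file_info_list": {"file_ids": file_ids}}},
    {},
)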

View File

@@ -0,0 +1,38 @@
import pytest
from controllers.console.version import _has_new_version
@pytest.mark.parametrize(
("latest_version", "current_version", "expected"),
[
("1.0.1", "1.0.0", True),
("1.1.0", "1.0.0", True),
("2.0.0", "1.9.9", True),
("1.0.0", "1.0.0", False),
("1.0.0", "1.0.1", False),
("1.0.0", "2.0.0", False),
("1.0.1", "1.0.0-beta", True),
("1.0.0", "1.0.0-alpha", True),
("1.0.0-beta", "1.0.0-alpha", True),
("1.0.0", "1.0.0-rc1", True),
("1.0.0", "0.9.9", True),
("1.0.0", "1.0.0-dev", True),
],
)
def test_has_new_version(latest_version, current_version, expected):
assert _has_new_version(latest_version=latest_version, current_version=current_version) == expected
def test_has_new_version_invalid_input():
with pytest.raises(ValueError):
_has_new_version(latest_version="1.0", current_version="1.0.0")
with pytest.raises(ValueError):
_has_new_version(latest_version="1.0.0", current_version="1.0")
with pytest.raises(ValueError):
_has_new_version(latest_version="invalid", current_version="1.0.0")
with pytest.raises(ValueError):
_has_new_version(latest_version="1.0.0", current_version="invalid")

View File

@@ -2,7 +2,7 @@ version: '3'
services:
# API service
api:
-    image: langgenius/dify-api:0.8.3
+    image: langgenius/dify-api:0.9.0
restart: always
environment:
# Startup mode, 'api' starts the API server.
@@ -227,7 +227,7 @@ services:
# worker service
# The Celery worker for processing the queue.
worker:
-    image: langgenius/dify-api:0.8.3
+    image: langgenius/dify-api:0.9.0
restart: always
environment:
CONSOLE_WEB_URL: ''
@@ -396,7 +396,7 @@ services:
# Frontend web application.
web:
-    image: langgenius/dify-web:0.8.3
+    image: langgenius/dify-web:0.9.0
restart: always
environment:
# The base URL of console application api server, refers to the Console base URL of WEB service if console domain is

View File

@@ -213,7 +213,7 @@ x-shared-env: &shared-api-worker-env
services:
# API service
api:
-    image: langgenius/dify-api:0.8.3
+    image: langgenius/dify-api:0.9.0
restart: always
environment:
# Use the shared environment variables.
@@ -233,7 +233,7 @@ services:
# worker service
# The Celery worker for processing the queue.
worker:
-    image: langgenius/dify-api:0.8.3
+    image: langgenius/dify-api:0.9.0
restart: always
environment:
# Use the shared environment variables.
@@ -252,7 +252,7 @@ services:
# Frontend web application.
web:
-    image: langgenius/dify-web:0.8.3
+    image: langgenius/dify-web:0.9.0
restart: always
environment:
CONSOLE_API_URL: ${CONSOLE_API_URL:-}

View File

@@ -1,6 +1,6 @@
'use client'
import type { FC, SVGProps } from 'react'
-import React, { useEffect } from 'react'
+import React, { useEffect, useMemo } from 'react'
import { usePathname } from 'next/navigation'
import useSWR from 'swr'
import { useTranslation } from 'react-i18next'
@@ -203,12 +203,23 @@ const DatasetDetailLayout: FC<IAppDetailLayoutProps> = (props) => {
datasetId,
}, apiParams => fetchDatasetRelatedApps(apiParams.datasetId))
-    const navigation = [
-        { name: t('common.datasetMenus.documents'), href: `/datasets/${datasetId}/documents`, icon: DocumentTextIcon, selectedIcon: DocumentTextSolidIcon },
-        { name: t('common.datasetMenus.hitTesting'), href: `/datasets/${datasetId}/hitTesting`, icon: TargetIcon, selectedIcon: TargetSolidIcon },
-        // { name: 'api & webhook', href: `/datasets/${datasetId}/api`, icon: CommandLineIcon, selectedIcon: CommandLineSolidIcon },
-        { name: t('common.datasetMenus.settings'), href: `/datasets/${datasetId}/settings`, icon: Cog8ToothIcon, selectedIcon: Cog8ToothSolidIcon },
-    ]
+    const navigation = useMemo(() => {
+        const baseNavigation = [
+            { name: t('common.datasetMenus.hitTesting'), href: `/datasets/${datasetId}/hitTesting`, icon: TargetIcon, selectedIcon: TargetSolidIcon },
+            // { name: 'api & webhook', href: `/datasets/${datasetId}/api`, icon: CommandLineIcon, selectedIcon: CommandLineSolidIcon },
+            { name: t('common.datasetMenus.settings'), href: `/datasets/${datasetId}/settings`, icon: Cog8ToothIcon, selectedIcon: Cog8ToothSolidIcon },
+        ]
+        if (datasetRes?.provider !== 'external') {
+            baseNavigation.unshift({
+                name: t('common.datasetMenus.documents'),
+                href: `/datasets/${datasetId}/documents`,
+                icon: DocumentTextIcon,
+                selectedIcon: DocumentTextSolidIcon,
+            })
+        }
+        return baseNavigation
+    }, [datasetRes?.provider, datasetId, t])
useEffect(() => {
if (datasetRes)
@@ -233,6 +244,7 @@ const DatasetDetailLayout: FC<IAppDetailLayoutProps> = (props) => {
icon={datasetRes?.icon || 'https://static.dify.ai/images/dataset-default-icon.png'}
icon_background={datasetRes?.icon_background || '#F5F5F5'}
desc={datasetRes?.description || '--'}
isExternal={datasetRes?.provider === 'external'}
navigation={navigation}
extraInfo={!isCurrentWorkspaceDatasetOperator ? mode => <ExtraInfo isMobile={mode === 'collapse'} relatedApps={relatedApps} /> : undefined}
iconType={datasetRes?.data_source_type === DataSourceType.NOTION ? 'notion' : 'dataset'}

View File

@@ -8,6 +8,7 @@ import { useDebounceFn } from 'ahooks'
import useSWR from 'swr'
// Components
import ExternalAPIPanel from '../../components/datasets/external-api/external-api-panel'
import Datasets from './Datasets'
import DatasetFooter from './DatasetFooter'
import ApiServer from './ApiServer'
@@ -16,6 +17,8 @@ import TabSliderNew from '@/app/components/base/tab-slider-new'
import SearchInput from '@/app/components/base/search-input'
import TagManagementModal from '@/app/components/base/tag-management'
import TagFilter from '@/app/components/base/tag-management/filter'
import Button from '@/app/components/base/button'
import { ApiConnectionMod } from '@/app/components/base/icons/src/vender/solid/development'
// Services
import { fetchDatasetApiBaseUrl } from '@/service/datasets'
@@ -24,12 +27,14 @@ import { fetchDatasetApiBaseUrl } from '@/service/datasets'
import { useTabSearchParams } from '@/hooks/use-tab-searchparams'
import { useStore as useTagStore } from '@/app/components/base/tag-management/store'
import { useAppContext } from '@/context/app-context'
import { useExternalApiPanel } from '@/context/external-api-panel-context'
const Container = () => {
const { t } = useTranslation()
const router = useRouter()
const { currentWorkspace } = useAppContext()
const showTagManagementModal = useTagStore(s => s.showTagManagementModal)
const { showExternalApiPanel, setShowExternalApiPanel } = useExternalApiPanel()
const options = useMemo(() => {
return [
@@ -66,7 +71,7 @@ const Container = () => {
useEffect(() => {
if (currentWorkspace.role === 'normal')
return router.replace('/apps')
-    }, [currentWorkspace])
+    }, [currentWorkspace, router])
return (
<div ref={containerRef} className='grow relative flex flex-col bg-gray-100 overflow-y-auto'>
@@ -80,11 +85,18 @@ const Container = () => {
<div className='flex items-center gap-2'>
<TagFilter type='knowledge' value={tagFilterValue} onChange={handleTagsChange} />
<SearchInput className='w-[200px]' value={keywords} onChange={handleKeywordsChange} />
<div className="w-[1px] h-4 bg-divider-regular" />
<Button
className='gap-0.5 shadows-shadow-xs'
onClick={() => setShowExternalApiPanel(true)}
>
<ApiConnectionMod className='w-4 h-4 text-components-button-secondary-text' />
<div className='flex px-0.5 justify-center items-center gap-1 text-components-button-secondary-text system-sm-medium'>{t('dataset.externalAPIPanelTitle')}</div>
</Button>
</div>
)}
{activeTab === 'api' && data && <ApiServer apiBaseUrl={data.api_base_url || ''} />}
</div>
{activeTab === 'dataset' && (
<>
<Datasets containerRef={containerRef} tags={tagIDs} keywords={searchKeywords} />
@@ -94,10 +106,10 @@ const Container = () => {
)}
</>
)}
{activeTab === 'api' && data && <Doc apiBaseUrl={data.api_base_url || ''} />}
</div>
{showExternalApiPanel && <ExternalAPIPanel onClose={() => setShowExternalApiPanel(false)} />}
</div>
)
}

View File

@@ -18,6 +18,7 @@ import Divider from '@/app/components/base/divider'
import RenameDatasetModal from '@/app/components/datasets/rename-modal'
import type { Tag } from '@/app/components/base/tag-management/constant'
import TagSelector from '@/app/components/base/tag-management/selector'
import CornerLabel from '@/app/components/base/corner-label'
import { useAppContext } from '@/context/app-context'
export type DatasetCardProps = {
@@ -32,6 +33,7 @@ const DatasetCard = ({
const { t } = useTranslation()
const { notify } = useContext(ToastContext)
const { push } = useRouter()
const EXTERNAL_PROVIDER = 'external' as const
const { isCurrentWorkspaceDatasetOperator } = useAppContext()
const [tags, setTags] = useState<Tag[]>(dataset.tags)
@@ -39,6 +41,7 @@ const DatasetCard = ({
const [showRenameModal, setShowRenameModal] = useState(false)
const [showConfirmDelete, setShowConfirmDelete] = useState(false)
const [confirmMessage, setConfirmMessage] = useState<string>('')
const isExternalProvider = (provider: string): boolean => provider === EXTERNAL_PROVIDER
const detectIsUsedByApp = useCallback(async () => {
try {
const { is_using: isUsedByApp } = await checkIsUsedInApp(dataset.id)
@@ -108,13 +111,16 @@ const DatasetCard = ({
return (
<>
<div
-                className='group col-span-1 bg-white border-2 border-solid border-transparent rounded-xl shadow-sm min-h-[160px] flex flex-col transition-all duration-200 ease-in-out cursor-pointer hover:shadow-lg'
+                className='group relative col-span-1 bg-white border-[0.5px] border-solid border-transparent rounded-xl shadow-sm min-h-[160px] flex flex-col transition-all duration-200 ease-in-out cursor-pointer hover:shadow-lg'
data-disable-nprogress={true}
onClick={(e) => {
e.preventDefault()
-                    push(`/datasets/${dataset.id}/documents`)
+                    isExternalProvider(dataset.provider)
+                        ? push(`/datasets/${dataset.id}/hitTesting`)
+                        : push(`/datasets/${dataset.id}/documents`)
}}
>
{isExternalProvider(dataset.provider) && <CornerLabel label='External' className='absolute right-0' labelClassName='rounded-tr-xl' />}
<div className='flex pt-[14px] px-[14px] pb-3 h-[66px] items-center gap-3 grow-0 shrink-0'>
<div className={cn(
'shrink-0 flex items-center justify-center p-2.5 bg-[#F5F8FF] rounded-md border-[0.5px] border-[#E0EAFF]',
@@ -136,13 +142,20 @@ const DatasetCard = ({
<div className='flex items-center mt-[1px] text-xs leading-[18px] text-gray-500'>
<div
className={cn('truncate', (!dataset.embedding_available || !dataset.document_count) && 'opacity-50')}
-                        title={`${dataset.document_count}${t('dataset.documentCount')} · ${Math.round(dataset.word_count / 1000)}${t('dataset.wordCount')} · ${dataset.app_count}${t('dataset.appCount')}`}
+                        title={dataset.provider === 'external' ? `${dataset.app_count}${t('dataset.appCount')}` : `${dataset.document_count}${t('dataset.documentCount')} · ${Math.round(dataset.word_count / 1000)}${t('dataset.wordCount')} · ${dataset.app_count}${t('dataset.appCount')}`}
>
-                        <span>{dataset.document_count}{t('dataset.documentCount')}</span>
-                        <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
-                        <span>{Math.round(dataset.word_count / 1000)}{t('dataset.wordCount')}</span>
-                        <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
-                        <span>{dataset.app_count}{t('dataset.appCount')}</span>
+                        {dataset.provider === 'external'
+                            ? <>
+                                <span>{dataset.app_count}{t('dataset.appCount')}</span>
+                            </>
+                            : <>
+                                <span>{dataset.document_count}{t('dataset.documentCount')}</span>
+                                <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
+                                <span>{Math.round(dataset.word_count / 1000)}{t('dataset.wordCount')}</span>
+                                <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
+                                <span>{dataset.app_count}{t('dataset.appCount')}</span>
+                            </>
+                        }
</div>
</div>
</div>

View File

@@ -4,21 +4,32 @@ import { forwardRef } from 'react'
import { useTranslation } from 'react-i18next'
import {
RiAddLine,
RiArrowRightLine,
} from '@remixicon/react'
const CreateAppCard = forwardRef<HTMLAnchorElement>((_, ref) => {
const { t } = useTranslation()
return (
-        <a ref={ref} className='group flex flex-col col-span-1 bg-gray-200 border-[0.5px] border-black/5 rounded-xl min-h-[160px] transition-all duration-200 ease-in-out cursor-pointer hover:bg-white hover:shadow-lg' href='/datasets/create'>
-            <div className='shrink-0 flex items-center p-4 pb-3'>
-                <div className='w-10 h-10 flex items-center justify-center border border-gray-200 bg-gray-100 rounded-lg'>
-                    <RiAddLine className='w-4 h-4 text-gray-500'/>
-                </div>
-                <div className='ml-3 text-sm font-semibold leading-5 text-gray-800 group-hover:text-primary-600'>{t('dataset.createDataset')}</div>
-            </div>
-            <div className='mb-1 px-4 text-xs leading-normal text-gray-500 line-clamp-4'>{t('dataset.createDatasetIntro')}</div>
-        </a>
+        <div className='flex flex-col bg-background-default-dimm border-[0.5px] border-components-panel-border rounded-xl
+            min-h-[160px] transition-all duration-200 ease-in-out'
+        >
+            <a ref={ref} className='group flex flex-grow items-start p-4 cursor-pointer' href='/datasets/create'>
+                <div className='flex items-center gap-3'>
+                    <div className='w-10 h-10 p-2 flex items-center justify-center border border-dashed border-divider-regular rounded-lg
+                        bg-background-default-lighter group-hover:border-solid group-hover:border-effects-highlight group-hover:bg-background-default-dodge'
+                    >
+                        <RiAddLine className='w-4 h-4 text-text-tertiary group-hover:text-text-accent'/>
+                    </div>
+                    <div className='system-md-semibold text-text-secondary group-hover:text-text-accent'>{t('dataset.createDataset')}</div>
+                </div>
+            </a>
+            <div className='p-4 pt-0 text-text-tertiary system-xs-regular'>{t('dataset.createDatasetIntro')}</div>
+            <a className='group flex p-4 items-center gap-1 border-t-[0.5px] border-divider-subtle rounded-b-xl cursor-pointer' href='/datasets/connect'>
+                <div className='system-xs-medium text-text-tertiary group-hover:text-text-accent'>{t('dataset.connectDataset')}</div>
+                <RiArrowRightLine className='w-3.5 h-3.5 text-text-tertiary group-hover:text-text-accent' />
+            </a>
+        </div>
)
})

View File

@@ -0,0 +1,8 @@
import React from 'react'
import ExternalKnowledgeBaseConnector from '@/app/components/datasets/external-knowledge-base/connector'
const ExternalKnowledgeBaseCreation = () => {
return <ExternalKnowledgeBaseConnector />
}
export default ExternalKnowledgeBaseCreation

View File

@@ -0,0 +1,14 @@
'use client'
import { ExternalApiPanelProvider } from '@/context/external-api-panel-context'
import { ExternalKnowledgeApiProvider } from '@/context/external-knowledge-api-context'
export default function DatasetsLayout({ children }: { children: React.ReactNode }) {
return (
<ExternalKnowledgeApiProvider>
<ExternalApiPanelProvider>
{children}
</ExternalApiPanelProvider>
</ExternalKnowledgeApiProvider>
)
}

View File

@@ -1,9 +1,7 @@
import Container from './Container'
const AppList = async () => {
-    return (
-        <Container />
-    )
+    return <Container />
}
export const metadata = {

View File

@@ -0,0 +1,11 @@
import { create } from 'zustand'
type DatasetStore = {
showExternalApiPanel: boolean
setShowExternalApiPanel: (show: boolean) => void
}
export const useDatasetStore = create<DatasetStore>(set => ({
showExternalApiPanel: false,
setShowExternalApiPanel: show => set({ showExternalApiPanel: show }),
}))

View File

@@ -1,4 +1,5 @@
import React from 'react'
import { useTranslation } from 'react-i18next'
import AppIcon from '../base/app-icon'
import Tooltip from '@/app/components/base/tooltip'
@@ -6,6 +7,7 @@ export type IAppBasicProps = {
iconType?: 'app' | 'api' | 'dataset' | 'webapp' | 'notion'
icon?: string
icon_background?: string | null
isExternal?: boolean
name: string
type: string | React.ReactNode
hoverTip?: string
@@ -52,7 +54,9 @@ const ICON_MAP = {
notion: <AppIcon innerIcon={NotionSvg} className='!border-[0.5px] !border-indigo-100 !bg-white' />,
}
-export default function AppBasic({ icon, icon_background, name, type, hoverTip, textStyle, mode = 'expand', iconType = 'app' }: IAppBasicProps) {
+export default function AppBasic({ icon, icon_background, name, isExternal, type, hoverTip, textStyle, mode = 'expand', iconType = 'app' }: IAppBasicProps) {
const { t } = useTranslation()
return (
<div className="flex items-start p-1">
{icon && icon_background && iconType === 'app' && (
@@ -83,6 +87,7 @@ export default function AppBasic({ icon, icon_background, name, type, hoverTip,
}
</div>
<div className={`text-xs font-normal text-gray-500 group-hover:text-gray-700 break-all ${textStyle?.extra ?? ''}`}>{type}</div>
<div className='text-text-tertiary system-2xs-medium-uppercase'>{isExternal ? t('dataset.externalTag') : ''}</div>
</div>}
</div>
)

View File

@@ -15,6 +15,7 @@ export type IAppDetailNavProps = {
iconType?: 'app' | 'dataset' | 'notion'
title: string
desc: string
isExternal?: boolean
icon: string
icon_background: string
navigation: Array<{
@@ -26,7 +27,7 @@ export type IAppDetailNavProps = {
extraInfo?: (modeState: string) => React.ReactNode
}
-const AppDetailNav = ({ title, desc, icon, icon_background, navigation, extraInfo, iconType = 'app' }: IAppDetailNavProps) => {
+const AppDetailNav = ({ title, desc, isExternal, icon, icon_background, navigation, extraInfo, iconType = 'app' }: IAppDetailNavProps) => {
const { appSidebarExpand, setAppSiderbarExpand } = useAppStore(useShallow(state => ({
appSidebarExpand: state.appSidebarExpand,
setAppSiderbarExpand: state.setAppSiderbarExpand,
@@ -70,6 +71,7 @@ const AppDetailNav = ({ title, desc, icon, icon_background, navigation, extraInf
icon_background={icon_background}
name={title}
type={desc}
isExternal={isExternal}
/>
)}
</div>

View File

@@ -3,6 +3,7 @@ import type { FC } from 'react'
import React, { useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { Pagination } from 'react-headless-pagination'
import { useDebounce } from 'ahooks'
import { ArrowLeftIcon, ArrowRightIcon } from '@heroicons/react/24/outline'
import Toast from '../../base/toast'
import Filter from './filter'
@@ -67,10 +68,11 @@ const Annotation: FC<Props> = ({
const [queryParams, setQueryParams] = useState<QueryParam>({})
const [currPage, setCurrPage] = React.useState<number>(0)
const debouncedQueryParams = useDebounce(queryParams, { wait: 500 })
const query = {
page: currPage + 1,
limit: APP_PAGE_LIMIT,
-        keyword: queryParams.keyword || '',
+        keyword: debouncedQueryParams.keyword || '',
}
const [controlUpdateList, setControlUpdateList] = useState(Date.now())
@@ -232,8 +234,8 @@ const Annotation: FC<Props> = ({
middlePagesSiblingCount={1}
setCurrentPage={setCurrPage}
totalPages={Math.ceil(total / APP_PAGE_LIMIT)}
-        truncatableClassName="w-8 px-0.5 text-center"
-        truncatableText="..."
+        truncableClassName="w-8 px-0.5 text-center"
+        truncableText="..."
>
<Pagination.PrevButton
disabled={currPage === 0}

View File

@@ -5,6 +5,7 @@ import {
RiDeleteBinLine,
RiEditLine,
} from '@remixicon/react'
import { useTranslation } from 'react-i18next'
import SettingsModal from '../settings-modal'
import type { DataSet } from '@/models/datasets'
import { DataSourceType } from '@/models/datasets'
@@ -33,6 +34,7 @@ const Item: FC<ItemProps> = ({
const isMobile = media === MediaType.mobile
const [showSettingsModal, setShowSettingsModal] = useState(false)
const { formatIndexingTechniqueAndMethod } = useKnowledge()
const { t } = useTranslation()
const handleSave = (newDataset: DataSet) => {
onSave(newDataset)
@@ -65,9 +67,11 @@ const Item: FC<ItemProps> = ({
<div className='grow'>
<div className='flex items-center h-[18px]'>
<div className='grow text-[13px] font-medium text-gray-800 truncate' title={config.name}>{config.name}</div>
-                    <Badge
-                        text={formatIndexingTechniqueAndMethod(config.indexing_technique, config.retrieval_model_dict?.search_method)}
-                    />
+                    {config.provider === 'external'
+                        ? <Badge text={t('dataset.externalTag')}></Badge>
+                        : <Badge
+                            text={formatIndexingTechniqueAndMethod(config.indexing_technique, config.retrieval_model_dict?.search_method)}
+                        />}
</div>
</div>
<div className='hidden rounded-lg group-hover:flex items-center justify-end absolute right-0 top-0 bottom-0 pr-2 w-[124px] bg-gradient-to-r from-white/50 to-white to-50%'>

View File

@@ -174,6 +174,20 @@ const ConfigContent: FC<Props> = ({
</div>
)
}
{
selectedDatasetsMode.mixtureInternalAndExternal && (
<div className='mt-4 system-xs-medium text-text-warning'>
{t('dataset.mixtureInternalAndExternalTip')}
</div>
)
}
{
selectedDatasetsMode.allExternal && (
<div className='mt-4 system-xs-medium text-text-warning'>
{t('dataset.allExternalTip')}
</div>
)
}
{
selectedDatasetsMode.mixtureHighQualityAndEconomic
&& (
@@ -229,15 +243,15 @@ const ConfigContent: FC<Props> = ({
/>
)
}
-                    <div className='ml-2 leading-[32px] text-[13px] font-medium text-gray-900'>{t('common.modelProvider.rerankModel.key')}</div>
+                    <div className='leading-[32px] text-text-secondary system-sm-semibold'>{t('common.modelProvider.rerankModel.key')}</div>
<Tooltip
popupContent={
<div className="w-[200px]">
{t('common.modelProvider.rerankModel.tip')}
</div>
}
-                        popupClassName='ml-0.5'
-                        triggerClassName='ml-0.5 w-3.5 h-3.5'
+                        popupClassName='ml-1'
+                        triggerClassName='ml-1 w-4 h-4'
/>
</div>
<div>

View File

@@ -39,13 +39,26 @@ const ParamsConfig = ({
useEffect(() => {
const {
allEconomic,
allHighQuality,
allHighQualityFullTextSearch,
allHighQualityVectorSearch,
allExternal,
mixtureHighQualityAndEconomic,
inconsistentEmbeddingModel,
mixtureInternalAndExternal,
} = getSelectedDatasetsMode(selectedDatasets)
const { datasets, retrieval_model, score_threshold_enabled, ...restConfigs } = datasetConfigs
let rerankEnable = restConfigs.reranking_enable
-        if (allEconomic && !restConfigs.reranking_model?.reranking_provider_name && rerankEnable === undefined)
+        if ((allEconomic && !restConfigs.reranking_model?.reranking_provider_name && rerankEnable === undefined) || allExternal)
rerankEnable = false
if (allEconomic || allHighQuality || allHighQualityFullTextSearch || allHighQualityVectorSearch || (allExternal && selectedDatasets.length === 1))
setRerankSettingModalOpen(false)
if (mixtureHighQualityAndEconomic || inconsistentEmbeddingModel || mixtureInternalAndExternal || (allExternal && selectedDatasets.length > 1))
setRerankSettingModalOpen(true)
setTempDataSetConfigs({
...getMultipleRetrievalConfig({
top_k: restConfigs.top_k,

View File

@@ -47,7 +47,7 @@ const SelectDataSet: FC<ISelectDataSetProps> = ({
const { data, has_more } = await fetchDatasets({ url: '/datasets', params: { page } })
setPage(getPage() + 1)
setIsNoMore(!has_more)
-            const newList = [...(datasets || []), ...data.filter(item => item.indexing_technique)]
+            const newList = [...(datasets || []), ...data.filter(item => item.indexing_technique || item.provider === 'external')]
setDataSets(newList)
setLoaded(true)
if (!selected.find(item => !item.name))
@@ -145,6 +145,11 @@ const SelectDataSet: FC<ISelectDataSetProps> = ({
/>
)
}
{
item.provider === 'external' && (
<Badge text={t('dataset.externalTag')} />
)
}
</div>
))}
</div>

View File

@@ -5,8 +5,10 @@ import { useTranslation } from 'react-i18next'
import { isEqual } from 'lodash-es'
import { RiCloseLine } from '@remixicon/react'
import { BookOpenIcon } from '@heroicons/react/24/outline'
import { ApiConnectionMod } from '@/app/components/base/icons/src/vender/solid/development'
import cn from '@/utils/classnames'
import IndexMethodRadio from '@/app/components/datasets/settings/index-method-radio'
import Divider from '@/app/components/base/divider'
import Button from '@/app/components/base/button'
import type { DataSet } from '@/models/datasets'
import { useToastContext } from '@/app/components/base/toast'
@@ -14,6 +16,7 @@ import { updateDatasetSetting } from '@/service/datasets'
import { useAppContext } from '@/context/app-context'
import { useModalContext } from '@/context/modal-context'
import type { RetrievalConfig } from '@/types/app'
import RetrievalSettings from '@/app/components/datasets/external-knowledge-base/create/RetrievalSettings'
import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
@@ -56,6 +59,9 @@ const SettingsModal: FC<SettingsModalProps> = ({
const { t } = useTranslation()
const { notify } = useToastContext()
const ref = useRef(null)
const [topK, setTopK] = useState(currentDataset?.external_retrieval_model.top_k ?? 2)
const [scoreThreshold, setScoreThreshold] = useState(currentDataset?.external_retrieval_model.score_threshold ?? 0.5)
const [scoreThresholdEnabled, setScoreThresholdEnabled] = useState(currentDataset?.external_retrieval_model.score_threshold_enabled ?? false)
const { setShowAccountSettingModal } = useModalContext()
const [loading, setLoading] = useState(false)
@@ -73,6 +79,15 @@ const SettingsModal: FC<SettingsModalProps> = ({
const [isHideChangedTip, setIsHideChangedTip] = useState(false)
const isRetrievalChanged = !isEqual(retrievalConfig, localeCurrentDataset?.retrieval_model_dict) || indexMethod !== localeCurrentDataset?.indexing_technique
const handleSettingsChange = (data: { top_k?: number; score_threshold?: number; score_threshold_enabled?: boolean }) => {
if (data.top_k !== undefined)
setTopK(data.top_k)
if (data.score_threshold !== undefined)
setScoreThreshold(data.score_threshold)
if (data.score_threshold_enabled !== undefined)
setScoreThresholdEnabled(data.score_threshold_enabled)
}
const handleSave = async () => {
if (loading)
return
@@ -107,10 +122,17 @@ const SettingsModal: FC<SettingsModalProps> = ({
description,
permission,
indexing_technique: indexMethod,
external_retrieval_model: {
top_k: topK,
score_threshold: scoreThreshold,
score_threshold_enabled: scoreThresholdEnabled,
},
retrieval_model: {
...postRetrievalConfig,
score_threshold: postRetrievalConfig.score_threshold_enabled ? postRetrievalConfig.score_threshold : 0,
},
external_knowledge_id: currentDataset!.external_knowledge_info.external_knowledge_id,
external_knowledge_api_id: currentDataset!.external_knowledge_info.external_knowledge_api_id,
embedding_model: localeCurrentDataset.embedding_model,
embedding_model_provider: localeCurrentDataset.embedding_model_provider,
},
@@ -178,7 +200,7 @@ const SettingsModal: FC<SettingsModalProps> = ({
}}>
<div className={cn(rowClass, 'items-center')}>
<div className={labelClass}>
{t('datasetSettings.form.name')}
<div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.name')}</div>
</div>
<input
value={localeCurrentDataset.name}
@@ -189,7 +211,7 @@ const SettingsModal: FC<SettingsModalProps> = ({
</div>
<div className={cn(rowClass)}>
<div className={labelClass}>
{t('datasetSettings.form.desc')}
<div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.desc')}</div>
</div>
<div className='w-full'>
<textarea
@@ -206,7 +228,7 @@ const SettingsModal: FC<SettingsModalProps> = ({
</div>
<div className={rowClass}>
<div className={labelClass}>
<div>{t('datasetSettings.form.permissions')}</div>
<div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.permissions')}</div>
</div>
<div className='w-full'>
<PermissionSelector
@@ -219,24 +241,25 @@ const SettingsModal: FC<SettingsModalProps> = ({
/>
</div>
</div>
<div className="w-full h-0 border-b-[0.5px] border-b-gray-200 my-2"></div>
<div className={cn(rowClass)}>
<div className={labelClass}>
{t('datasetSettings.form.indexMethod')}
{currentDataset && currentDataset.indexing_technique && (
<div className={cn(rowClass)}>
<div className={labelClass}>
<div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.indexMethod')}</div>
</div>
<div className='grow'>
<IndexMethodRadio
disable={!localeCurrentDataset?.embedding_available}
value={indexMethod}
onChange={v => setIndexMethod(v!)}
itemClassName='sm:!w-[280px]'
/>
</div>
</div>
<div className='grow'>
<IndexMethodRadio
disable={!localeCurrentDataset?.embedding_available}
value={indexMethod}
onChange={v => setIndexMethod(v!)}
itemClassName='sm:!w-[280px]'
/>
</div>
</div>
)}
{indexMethod === 'high_quality' && (
<div className={cn(rowClass)}>
<div className={labelClass}>
{t('datasetSettings.form.embeddingModel')}
<div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.embeddingModel')}</div>
</div>
<div className='w-full'>
<div className='w-full h-9 rounded-lg bg-gray-100 opacity-60'>
@@ -258,32 +281,75 @@ const SettingsModal: FC<SettingsModalProps> = ({
)}
{/* Retrieval Method Config */}
<div className={rowClass}>
<div className={cn(labelClass, 'w-auto min-w-[168px]')}>
<div>
<div>{t('datasetSettings.form.retrievalSetting.title')}</div>
<div className='leading-[18px] text-xs font-normal text-gray-500'>
<a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-[#155eef]'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
{t('datasetSettings.form.retrievalSetting.description')}
{currentDataset?.provider === 'external'
? <>
<div className={rowClass}><Divider/></div>
<div className={rowClass}>
<div className={labelClass}>
<div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.retrievalSetting.title')}</div>
</div>
<RetrievalSettings
topK={topK}
scoreThreshold={scoreThreshold}
scoreThresholdEnabled={scoreThresholdEnabled}
onChange={handleSettingsChange}
isInRetrievalSetting={true}
/>
</div>
<div className={rowClass}><Divider/></div>
<div className={rowClass}>
<div className={labelClass}>
<div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.externalKnowledgeAPI')}</div>
</div>
<div className='w-full max-w-[480px]'>
<div className='flex h-full px-3 py-2 items-center gap-1 rounded-lg bg-components-input-bg-normal'>
<ApiConnectionMod className='w-4 h-4 text-text-secondary' />
<div className='overflow-hidden text-text-secondary text-ellipsis system-sm-medium'>
{currentDataset?.external_knowledge_info.external_knowledge_api_name}
</div>
<div className='text-text-tertiary system-xs-regular'>·</div>
<div className='text-text-tertiary system-xs-regular'>{currentDataset?.external_knowledge_info.external_knowledge_api_endpoint}</div>
</div>
</div>
</div>
</div>
<div>
{indexMethod === 'high_quality'
? (
<RetrievalMethodConfig
value={retrievalConfig}
onChange={setRetrievalConfig}
/>
)
: (
<EconomicalRetrievalMethodConfig
value={retrievalConfig}
onChange={setRetrievalConfig}
/>
)}
</div>
</div>
<div className={rowClass}>
<div className={labelClass}>
<div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.externalKnowledgeID')}</div>
</div>
<div className='w-full max-w-[480px]'>
<div className='flex h-full px-3 py-2 items-center gap-1 rounded-lg bg-components-input-bg-normal'>
<div className='text-text-tertiary system-xs-regular'>{currentDataset?.external_knowledge_info.external_knowledge_id}</div>
</div>
</div>
</div>
<div className={rowClass}><Divider/></div>
</>
: <div className={rowClass}>
<div className={cn(labelClass, 'w-auto min-w-[168px]')}>
<div>
<div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.retrievalSetting.title')}</div>
<div className='leading-[18px] text-xs font-normal text-gray-500'>
<a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-[#155eef]'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
{t('datasetSettings.form.retrievalSetting.description')}
</div>
</div>
</div>
<div>
{indexMethod === 'high_quality'
? (
<RetrievalMethodConfig
value={retrievalConfig}
onChange={setRetrievalConfig}
/>
)
: (
<EconomicalRetrievalMethodConfig
value={retrievalConfig}
onChange={setRetrievalConfig}
/>
)}
</div>
</div>}
</div>
{isRetrievalChanged && !isHideChangedTip && (
<div className='absolute z-10 left-[30px] right-[30px] bottom-[76px] flex h-10 items-center px-3 rounded-lg border border-[#FEF0C7] bg-[#FFFAEB] shadow-lg justify-between'>
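For an external dataset, the update payload assembled in handleSave above reduces to roughly the following shape; the field names come from the diff, while every value here is a placeholder:

// Illustrative body for updateDatasetSetting on an external dataset.
const body = {
  name: 'my-knowledge-base',
  description: 'placeholder description',
  permission: 'only_me',
  external_retrieval_model: {
    top_k: 2,
    score_threshold: 0.5,
    score_threshold_enabled: false,
  },
  external_knowledge_id: 'external-knowledge-id-placeholder',
  external_knowledge_api_id: 'external-knowledge-api-id-placeholder',
}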


@@ -4,6 +4,7 @@ import React, { useState } from 'react'
import useSWR from 'swr'
import { usePathname } from 'next/navigation'
import { Pagination } from 'react-headless-pagination'
import { useDebounce } from 'ahooks'
import { omit } from 'lodash-es'
import dayjs from 'dayjs'
import { ArrowLeftIcon, ArrowRightIcon } from '@heroicons/react/24/outline'
@@ -59,6 +60,7 @@ const Logs: FC<ILogsProps> = ({ appDetail }) => {
sort_by: '-created_at',
})
const [currPage, setCurrPage] = React.useState<number>(0)
const debouncedQueryParams = useDebounce(queryParams, { wait: 500 })
// Get the app type first
const isChatMode = appDetail.mode !== 'completion'
@@ -66,14 +68,14 @@ const Logs: FC<ILogsProps> = ({ appDetail }) => {
const query = {
page: currPage + 1,
limit: APP_PAGE_LIMIT,
...(queryParams.period !== 'all'
...(debouncedQueryParams.period !== 'all'
? {
start: dayjs().subtract(queryParams.period as number, 'day').startOf('day').format('YYYY-MM-DD HH:mm'),
start: dayjs().subtract(debouncedQueryParams.period as number, 'day').startOf('day').format('YYYY-MM-DD HH:mm'),
end: dayjs().endOf('day').format('YYYY-MM-DD HH:mm'),
}
: {}),
...(isChatMode ? { sort_by: queryParams.sort_by } : {}),
...omit(queryParams, ['period']),
...(isChatMode ? { sort_by: debouncedQueryParams.sort_by } : {}),
...omit(debouncedQueryParams, ['period']),
}
const getWebAppType = (appType: AppMode) => {
@@ -119,8 +121,8 @@ const Logs: FC<ILogsProps> = ({ appDetail }) => {
middlePagesSiblingCount={1}
setCurrentPage={setCurrPage}
totalPages={Math.ceil(total / APP_PAGE_LIMIT)}
truncatableClassName="w-8 px-0.5 text-center"
truncatableText="..."
truncableClassName="w-8 px-0.5 text-center"
truncableText="..."
>
<Pagination.PrevButton
disabled={currPage === 0}
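This hunk and the workflow-logs hunk below apply the same debounce pattern: the raw query params still update on every keystroke, but the SWR request key is built from the debounced copy. A minimal standalone sketch of the idea using ahooks' useDebounce (the fetcher and endpoint are placeholders, not part of the diff):

import { useState } from 'react'
import useSWR from 'swr'
import { useDebounce } from 'ahooks'

// Placeholder fetcher for illustration only.
const fetcher = (url: string) => fetch(url).then(res => res.json())

const useLogSearch = () => {
  const [keyword, setKeyword] = useState('')
  // The debounced value changes only after typing pauses for 500 ms, so SWR
  // issues one request per pause instead of one per keystroke.
  const debouncedKeyword = useDebounce(keyword, { wait: 500 })
  const { data } = useSWR(`/logs?keyword=${encodeURIComponent(debouncedKeyword)}`, fetcher)
  return { keyword, setKeyword, data }
}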


@@ -4,6 +4,7 @@ import React, { useState } from 'react'
import useSWR from 'swr'
import { usePathname } from 'next/navigation'
import { Pagination } from 'react-headless-pagination'
import { useDebounce } from 'ahooks'
import { ArrowLeftIcon, ArrowRightIcon } from '@heroicons/react/24/outline'
import { Trans, useTranslation } from 'react-i18next'
import Link from 'next/link'
@@ -51,12 +52,13 @@ const Logs: FC<ILogsProps> = ({ appDetail }) => {
const { t } = useTranslation()
const [queryParams, setQueryParams] = useState<QueryParam>({ status: 'all' })
const [currPage, setCurrPage] = React.useState<number>(0)
const debouncedQueryParams = useDebounce(queryParams, { wait: 500 })
const query = {
page: currPage + 1,
limit: APP_PAGE_LIMIT,
...(queryParams.status !== 'all' ? { status: queryParams.status } : {}),
...(queryParams.keyword ? { keyword: queryParams.keyword } : {}),
...(debouncedQueryParams.status !== 'all' ? { status: debouncedQueryParams.status } : {}),
...(debouncedQueryParams.keyword ? { keyword: debouncedQueryParams.keyword } : {}),
}
const getWebAppType = (appType: AppMode) => {
@@ -93,8 +95,8 @@ const Logs: FC<ILogsProps> = ({ appDetail }) => {
middlePagesSiblingCount={1}
setCurrentPage={setCurrPage}
totalPages={Math.ceil(total / APP_PAGE_LIMIT)}
truncatableClassName="w-8 px-0.5 text-center"
truncatableText="..."
truncableClassName="w-8 px-0.5 text-center"
truncableText="..."
>
<Pagination.PrevButton
disabled={currPage === 0}


@@ -0,0 +1,21 @@
import { Corner } from '../icons/src/vender/solid/shapes'
import cn from '@/utils/classnames'
type CornerLabelProps = {
label: string
className?: string
labelClassName?: string
}
const CornerLabel: React.FC<CornerLabelProps> = ({ label, className, labelClassName }) => {
return (
<div className={cn('group/corner-label inline-flex items-start', className)}>
<Corner className='w-[13px] h-5 text-background-section group-hover/corner-label:text-background-section-burn' />
<div className={cn('flex py-1 pr-2 items-center gap-0.5 bg-background-section group-hover/corner-label:bg-background-section-burn', labelClassName)}>
<div className='text-text-tertiary system-2xs-medium-uppercase'>{label}</div>
</div>
</div>
)
}
export default CornerLabel


@@ -0,0 +1,5 @@
<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
<g id="Icon L">
<path id="Vector" fill-rule="evenodd" clip-rule="evenodd" d="M7.99996 3.33333C5.42263 3.33333 3.33329 5.42267 3.33329 8C3.33329 10.5773 5.42263 12.6667 7.99996 12.6667C9.72643 12.6667 11.2348 11.7295 12.0427 10.3329C12.227 10.0141 12.6349 9.90523 12.9536 10.0896C13.2723 10.274 13.3812 10.6818 13.1968 11.0005C12.1604 12.7921 10.2216 14 7.99996 14C4.91159 14 2.36821 11.6666 2.03658 8.66667H1.33329C0.965103 8.66667 0.666626 8.36819 0.666626 8C0.666626 7.63181 0.965103 7.33333 1.33329 7.33333H2.03658C2.36821 4.33337 4.91159 2 7.99996 2C10.2216 2 12.1604 3.20785 13.1968 4.99952C13.3812 5.31823 13.2723 5.72605 12.9536 5.91041C12.6349 6.09477 12.227 5.98585 12.0427 5.66714C11.2348 4.27054 9.72643 3.33333 7.99996 3.33333ZM7.99996 6C6.89539 6 5.99996 6.89543 5.99996 8C5.99996 9.10455 6.89539 10 7.99996 10C9.1045 10 9.99996 9.10454 9.99996 8C9.99996 6.89543 9.10451 6 7.99996 6ZM4.66663 8C4.66663 6.15905 6.15901 4.66667 7.99996 4.66667C9.61257 4.66667 10.9578 5.81184 11.2666 7.33333H14.6666C15.0348 7.33333 15.3333 7.63181 15.3333 8C15.3333 8.36819 15.0348 8.66667 14.6666 8.66667H11.2666C10.9578 10.1881 9.61257 11.3333 7.99996 11.3333C6.159 11.3333 4.66663 9.84092 4.66663 8Z" fill="#354052"/>
</g>
</svg>


@@ -0,0 +1,3 @@
<svg width="13" height="20" viewBox="0 0 13 20" fill="none" xmlns="http://www.w3.org/2000/svg">
<path id="Shape" d="M0 0H13V20C9.98017 20 7.26458 18.1615 6.14305 15.3576L0 0Z" fill="#F9FAFB"/>
</svg>


@@ -0,0 +1,38 @@
{
"icon": {
"type": "element",
"isRootNode": true,
"name": "svg",
"attributes": {
"width": "16",
"height": "16",
"viewBox": "0 0 16 16",
"fill": "none",
"xmlns": "http://www.w3.org/2000/svg"
},
"children": [
{
"type": "element",
"name": "g",
"attributes": {
"id": "Icon L"
},
"children": [
{
"type": "element",
"name": "path",
"attributes": {
"id": "Vector",
"fill-rule": "evenodd",
"clip-rule": "evenodd",
"d": "M7.99996 3.33333C5.42263 3.33333 3.33329 5.42267 3.33329 8C3.33329 10.5773 5.42263 12.6667 7.99996 12.6667C9.72643 12.6667 11.2348 11.7295 12.0427 10.3329C12.227 10.0141 12.6349 9.90523 12.9536 10.0896C13.2723 10.274 13.3812 10.6818 13.1968 11.0005C12.1604 12.7921 10.2216 14 7.99996 14C4.91159 14 2.36821 11.6666 2.03658 8.66667H1.33329C0.965103 8.66667 0.666626 8.36819 0.666626 8C0.666626 7.63181 0.965103 7.33333 1.33329 7.33333H2.03658C2.36821 4.33337 4.91159 2 7.99996 2C10.2216 2 12.1604 3.20785 13.1968 4.99952C13.3812 5.31823 13.2723 5.72605 12.9536 5.91041C12.6349 6.09477 12.227 5.98585 12.0427 5.66714C11.2348 4.27054 9.72643 3.33333 7.99996 3.33333ZM7.99996 6C6.89539 6 5.99996 6.89543 5.99996 8C5.99996 9.10455 6.89539 10 7.99996 10C9.1045 10 9.99996 9.10454 9.99996 8C9.99996 6.89543 9.10451 6 7.99996 6ZM4.66663 8C4.66663 6.15905 6.15901 4.66667 7.99996 4.66667C9.61257 4.66667 10.9578 5.81184 11.2666 7.33333H14.6666C15.0348 7.33333 15.3333 7.63181 15.3333 8C15.3333 8.36819 15.0348 8.66667 14.6666 8.66667H11.2666C10.9578 10.1881 9.61257 11.3333 7.99996 11.3333C6.159 11.3333 4.66663 9.84092 4.66663 8Z",
"fill": "currentColor"
},
"children": []
}
]
}
]
},
"name": "ApiConnectionMod"
}


@@ -0,0 +1,16 @@
// GENERATED BY script
// DO NOT EDIT IT MANUALLY
import * as React from 'react'
import data from './ApiConnectionMod.json'
import IconBase from '@/app/components/base/icons/IconBase'
import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
props,
ref,
) => <IconBase {...props} ref={ref} data={data as IconData} />)
Icon.displayName = 'ApiConnectionMod'
export default Icon


@@ -1,3 +1,4 @@
export { default as ApiConnectionMod } from './ApiConnectionMod'
export { default as ApiConnection } from './ApiConnection'
export { default as BarChartSquare02 } from './BarChartSquare02'
export { default as Container } from './Container'


@@ -0,0 +1,27 @@
{
"icon": {
"type": "element",
"isRootNode": true,
"name": "svg",
"attributes": {
"width": "13",
"height": "20",
"viewBox": "0 0 13 20",
"fill": "none",
"xmlns": "http://www.w3.org/2000/svg"
},
"children": [
{
"type": "element",
"name": "path",
"attributes": {
"id": "Shape",
"d": "M0 0H13V20C9.98017 20 7.26458 18.1615 6.14305 15.3576L0 0Z",
"fill": "currentColor"
},
"children": []
}
]
},
"name": "Corner"
}


@@ -0,0 +1,16 @@
// GENERATED BY script
// DO NOT EDIT IT MANUALLY
import * as React from 'react'
import data from './Corner.json'
import IconBase from '@/app/components/base/icons/IconBase'
import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
props,
ref,
) => <IconBase {...props} ref={ref} data={data as IconData} />)
Icon.displayName = 'Corner'
export default Icon


@@ -1,2 +1,3 @@
export { default as Corner } from './Corner'
export { default as Star04 } from './Star04'
export { default as Star06 } from './Star06'


@@ -3,5 +3,5 @@
}
.modal-panel {
@apply w-full max-w-md transform rounded-2xl bg-white p-6 text-left align-middle shadow-xl transition-all;
@apply w-full max-w-[480px] transform rounded-2xl bg-white p-6 text-left align-middle shadow-xl transition-all;
}


@@ -37,6 +37,7 @@ const ParamItem: FC<Props> = ({ className, id, name, noTooltip, tip, step = 0.1,
<span className="mx-1 text-gray-900 text-[13px] leading-[18px] font-medium">{name}</span>
{!noTooltip && (
<Tooltip
triggerClassName='w-4 h-4 shrink-0'
popupContent={<div className="w-[200px]">{tip}</div>}
/>
)}


@@ -87,7 +87,7 @@ const Select: FC<ISelectProps> = ({
<div className='group text-gray-800'>
{allowSearch
? <Combobox.Input
className={`w-full rounded-lg border-0 ${bgClassName} py-1.5 pl-3 pr-10 shadow-sm sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-not-allowed`}
className={`w-full rounded-lg border-0 ${bgClassName} py-1.5 pl-3 pr-10 shadow-sm sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 ${disabled ? 'cursor-not-allowed' : 'cursor-pointer'}`}
onChange={(event) => {
if (!disabled)
setQuery(event.target.value)

Binary file not shown. (new image asset, 2.7 KiB)

@@ -11,7 +11,7 @@ import { DataSourceType } from '@/models/datasets'
import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets'
import { fetchDataSource } from '@/service/common'
import { fetchDatasetDetail } from '@/service/datasets'
import type { NotionPage } from '@/models/common'
import { DataSourceProvider, type NotionPage } from '@/models/common'
import { useModalContext } from '@/context/modal-context'
import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
@@ -26,6 +26,7 @@ const DEFAULT_CRAWL_OPTIONS: CrawlOptions = {
excludes: '',
limit: 10,
max_depth: '',
use_sitemap: true,
}
const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
@@ -51,7 +52,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
const updateFileList = (preparedFiles: FileItem[]) => {
setFiles(preparedFiles)
}
const [fireCrawlJobId, setFireCrawlJobId] = useState('')
const [websiteCrawlProvider, setWebsiteCrawlProvider] = useState<DataSourceProvider>(DataSourceProvider.fireCrawl)
const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('')
const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => {
const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID)
@@ -137,7 +139,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
onStepChange={nextStep}
websitePages={websitePages}
updateWebsitePages={setWebsitePages}
onFireCrawlJobIdChange={setFireCrawlJobId}
onWebsiteCrawlProviderChange={setWebsiteCrawlProvider}
onWebsiteCrawlJobIdChange={setWebsiteCrawlJobId}
crawlOptions={crawlOptions}
onCrawlOptionsChange={setCrawlOptions}
/>
@@ -151,7 +154,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
files={fileList.map(file => file.file)}
notionPages={notionPages}
websitePages={websitePages}
fireCrawlJobId={fireCrawlJobId}
websiteCrawlProvider={websiteCrawlProvider}
websiteCrawlJobId={websiteCrawlJobId}
onStepChange={changeStep}
updateIndexingTypeCache={updateIndexingTypeCache}
updateResultCache={updateResultCache}


@@ -10,7 +10,7 @@ import WebsitePreview from '../website/preview'
import s from './index.module.css'
import cn from '@/utils/classnames'
import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets'
import type { NotionPage } from '@/models/common'
import type { DataSourceProvider, NotionPage } from '@/models/common'
import { DataSourceType } from '@/models/datasets'
import Button from '@/app/components/base/button'
import { NotionPageSelector } from '@/app/components/base/notion-page-selector'
@@ -33,7 +33,8 @@ type IStepOneProps = {
changeType: (type: DataSourceType) => void
websitePages?: CrawlResultItem[]
updateWebsitePages: (value: CrawlResultItem[]) => void
onFireCrawlJobIdChange: (jobId: string) => void
onWebsiteCrawlProviderChange: (provider: DataSourceProvider) => void
onWebsiteCrawlJobIdChange: (jobId: string) => void
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
}
@@ -69,7 +70,8 @@ const StepOne = ({
updateNotionPages,
websitePages = [],
updateWebsitePages,
onFireCrawlJobIdChange,
onWebsiteCrawlProviderChange,
onWebsiteCrawlJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}: IStepOneProps) => {
@@ -229,7 +231,8 @@ const StepOne = ({
onPreview={setCurrentWebsite}
checkedCrawlResult={websitePages}
onCheckedCrawlResultChange={updateWebsitePages}
onJobIdChange={onFireCrawlJobIdChange}
onCrawlProviderChange={onWebsiteCrawlProviderChange}
onJobIdChange={onWebsiteCrawlJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>


@@ -33,6 +33,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen
import Toast from '@/app/components/base/toast'
import { formatNumber } from '@/utils/format'
import type { NotionPage } from '@/models/common'
import { DataSourceProvider } from '@/models/common'
import { DataSourceType, DocForm } from '@/models/datasets'
import NotionIcon from '@/app/components/base/notion-icon'
import Switch from '@/app/components/base/switch'
@@ -63,7 +64,8 @@ type StepTwoProps = {
notionPages?: NotionPage[]
websitePages?: CrawlResultItem[]
crawlOptions?: CrawlOptions
fireCrawlJobId?: string
websiteCrawlProvider?: DataSourceProvider
websiteCrawlJobId?: string
onStepChange?: (delta: number) => void
updateIndexingTypeCache?: (type: string) => void
updateResultCache?: (res: createDocumentResponse) => void
@@ -94,7 +96,8 @@ const StepTwo = ({
notionPages = [],
websitePages = [],
crawlOptions,
fireCrawlJobId = '',
websiteCrawlProvider = DataSourceProvider.fireCrawl,
websiteCrawlJobId = '',
onStepChange,
updateIndexingTypeCache,
updateResultCache,
@@ -260,8 +263,8 @@ const StepTwo = ({
const getWebsiteInfo = () => {
return {
provider: 'firecrawl',
job_id: fireCrawlJobId,
provider: websiteCrawlProvider,
job_id: websiteCrawlJobId,
urls: websitePages.map(page => page.source_url),
only_main_content: crawlOptions?.only_main_content,
}
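With the provider threaded through, getWebsiteInfo now reports whichever crawler the user picked instead of the hard-coded 'firecrawl'. An illustrative result for a Jina Reader crawl; all values are placeholders, and the 'jinareader' identifier is inferred from the enum usage rather than shown in the diff (see the enum sketch after the website component hunk below):

// Example shape of the object getWebsiteInfo returns.
const websiteInfo = {
  provider: 'jinareader',
  job_id: 'job-id-placeholder',
  urls: ['https://example.com/docs'],
  only_main_content: true,
}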


@@ -3,6 +3,7 @@ import type { FC } from 'react'
import React from 'react'
import cn from '@/utils/classnames'
import Checkbox from '@/app/components/base/checkbox'
import Tooltip from '@/app/components/base/tooltip'
type Props = {
className?: string
@@ -10,6 +11,7 @@ type Props = {
onChange: (isChecked: boolean) => void
label: string
labelClassName?: string
tooltip?: string
}
const CheckboxWithLabel: FC<Props> = ({
@@ -18,11 +20,20 @@ const CheckboxWithLabel: FC<Props> = ({
onChange,
label,
labelClassName,
tooltip,
}) => {
return (
<label className={cn(className, 'flex items-center h-7 space-x-2')}>
<Checkbox checked={isChecked} onCheck={() => onChange(!isChecked)} />
<div className={cn(labelClassName, 'text-sm font-normal text-gray-800')}>{label}</div>
{tooltip && (
<Tooltip
popupContent={
<div className='w-[200px]'>{tooltip}</div>
}
triggerClassName='ml-0.5 w-4 h-4'
/>
)}
</label>
)
}


@@ -2,7 +2,7 @@
import type { FC } from 'react'
import React, { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import CheckboxWithLabel from './base/checkbox-with-label'
import CheckboxWithLabel from './checkbox-with-label'
import CrawledResultItem from './crawled-result-item'
import cn from '@/utils/classnames'
import type { CrawlResultItem } from '@/models/datasets'


@@ -2,13 +2,13 @@
import type { FC } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import UrlInput from '../base/url-input'
import OptionsWrap from '../base/options-wrap'
import CrawledResult from '../base/crawled-result'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import Header from './header'
import UrlInput from './base/url-input'
import OptionsWrap from './base/options-wrap'
import Options from './options'
import CrawledResult from './crawled-result'
import Crawling from './crawling'
import ErrorMessage from './base/error-message'
import cn from '@/utils/classnames'
import { useModalContext } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'


@@ -2,8 +2,8 @@
import type { FC } from 'react'
import React, { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import CheckboxWithLabel from './base/checkbox-with-label'
import Field from './base/field'
import CheckboxWithLabel from '../base/checkbox-with-label'
import Field from '../base/field'
import cn from '@/utils/classnames'
import type { CrawlOptions } from '@/models/datasets'


@@ -0,0 +1,6 @@
.jinaLogo {
@apply w-4 h-4 bg-center bg-no-repeat inline-block;
background-color: #F5FAFF;
background-image: url(../assets/jina.png);
background-size: 16px;
}


@@ -1,8 +1,12 @@
'use client'
import type { FC } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import s from './index.module.css'
import NoData from './no-data'
import Firecrawl from './firecrawl'
import JinaReader from './jina-reader'
import cn from '@/utils/classnames'
import { useModalContext } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import { fetchDataSources } from '@/service/datasets'
@@ -12,6 +16,7 @@ type Props = {
onPreview: (payload: CrawlResultItem) => void
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
onCrawlProviderChange: (provider: DataSourceProvider) => void
onJobIdChange: (jobId: string) => void
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
@@ -21,17 +26,32 @@ const Website: FC<Props> = ({
onPreview,
checkedCrawlResult,
onCheckedCrawlResultChange,
onCrawlProviderChange,
onJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}) => {
const { t } = useTranslation()
const { setShowAccountSettingModal } = useModalContext()
const [isLoaded, setIsLoaded] = useState(false)
const [isSetFirecrawlApiKey, setIsSetFirecrawlApiKey] = useState(false)
const [selectedProvider, setSelectedProvider] = useState<DataSourceProvider>(DataSourceProvider.jinaReader)
const [sources, setSources] = useState<DataSourceItem[]>([])
useEffect(() => {
onCrawlProviderChange(selectedProvider)
}, [selectedProvider, onCrawlProviderChange])
const checkSetApiKey = useCallback(async () => {
const res = await fetchDataSources() as any
const isFirecrawlSet = res.sources.some((item: DataSourceItem) => item.provider === DataSourceProvider.fireCrawl)
setIsSetFirecrawlApiKey(isFirecrawlSet)
setSources(res.sources)
// If users have configured one of the providers, select it.
const availableProviders = res.sources.filter((item: DataSourceItem) =>
[DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider),
)
if (availableProviders.length > 0)
setSelectedProvider(availableProviders[0].provider)
}, [])
useEffect(() => {
@@ -52,20 +72,66 @@ const Website: FC<Props> = ({
return (
<div>
{isSetFirecrawlApiKey
? (
<Firecrawl
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)
: (
<NoData onConfig={handleOnConfig} />
)}
<div className="mb-4">
<div className="font-medium text-gray-700 mb-2 h-6">
{t('datasetCreation.stepOne.website.chooseProvider')}
</div>
<div className="flex space-x-2">
<button
className={`px-4 py-2 text-sm font-medium rounded-md flex items-center justify-center ${
selectedProvider === DataSourceProvider.jinaReader
? 'bg-primary-50 text-primary-600'
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
}`}
onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
>
<span className={cn(s.jinaLogo, 'mr-2')} />
<span>Jina Reader</span>
</button>
<button
className={`px-4 py-2 text-sm font-medium rounded-md ${
selectedProvider === DataSourceProvider.fireCrawl
? 'bg-primary-50 text-primary-600'
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
}`}
onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)}
>
🔥 Firecrawl
</button>
</div>
</div>
{
selectedProvider === DataSourceProvider.fireCrawl
? sources.find(source => source.provider === DataSourceProvider.fireCrawl)
? (
<Firecrawl
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)
: (
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
)
: sources.find(source => source.provider === DataSourceProvider.jinaReader)
? (
<JinaReader
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)
: (
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
)
}
</div>
)
}
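DataSourceProvider is imported from '@/models/common', but its definition is not part of this diff. Judging from the hard-coded 'firecrawl' string that getWebsiteInfo sent before this change, it presumably maps each provider to its wire identifier, roughly:

// Assumed shape, inferred from usage in this diff; not the actual source.
enum DataSourceProvider {
  fireCrawl = 'firecrawl',
  jinaReader = 'jinareader',
}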


@@ -0,0 +1,42 @@
'use client'
import type { FC } from 'react'
import React from 'react'
import { useTranslation } from 'react-i18next'
import { Settings01 } from '@/app/components/base/icons/src/vender/line/general'
import { BookOpen01 } from '@/app/components/base/icons/src/vender/line/education'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
type Props = {
onSetting: () => void
}
const Header: FC<Props> = ({
onSetting,
}) => {
const { t } = useTranslation()
return (
<div className='flex h-6 items-center justify-between'>
<div className='flex items-center'>
<div className='text-base font-medium text-gray-700'>{t(`${I18N_PREFIX}.jinaReaderTitle`)}</div>
<div className='ml-2 mr-1 w-px h-3.5 bg-gray-200'></div>
<div
className='p-1 rounded-md hover:bg-black/5 cursor-pointer'
onClick={onSetting}
>
<Settings01 className='w-3.5 h-3.5 text-gray-500' />
</div>
</div>
<a
href='https://jina.ai/reader'
target='_blank' rel='noopener noreferrer'
className='flex items-center text-xs text-primary-600'
>
<BookOpen01 className='mr-1 w-3.5 h-3.5 text-primary-600' />
{t(`${I18N_PREFIX}.jinaReaderDoc`)}
</a>
</div>
)
}
export default React.memo(Header)


@@ -0,0 +1,232 @@
'use client'
import type { FC } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import UrlInput from '../base/url-input'
import OptionsWrap from '../base/options-wrap'
import CrawledResult from '../base/crawled-result'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import Header from './header'
import Options from './options'
import cn from '@/utils/classnames'
import { useModalContext } from '@/context/modal-context'
import Toast from '@/app/components/base/toast'
import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets'
import { sleep } from '@/utils'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
type Props = {
onPreview: (payload: CrawlResultItem) => void
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
onJobIdChange: (jobId: string) => void
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
}
enum Step {
init = 'init',
running = 'running',
finished = 'finished',
}
const JinaReader: FC<Props> = ({
onPreview,
checkedCrawlResult,
onCheckedCrawlResultChange,
onJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}) => {
const { t } = useTranslation()
const [step, setStep] = useState<Step>(Step.init)
const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
useEffect(() => {
if (step !== Step.init)
setControlFoldOptions(Date.now())
}, [step])
const { setShowAccountSettingModal } = useModalContext()
const handleSetting = useCallback(() => {
setShowAccountSettingModal({
payload: 'data-source',
})
}, [setShowAccountSettingModal])
const checkValid = useCallback((url: string) => {
let errorMsg = ''
if (!url) {
errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
field: 'url',
})
}
if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)
if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
field: t(`${I18N_PREFIX}.limit`),
})
}
return {
isValid: !errorMsg,
errorMsg,
}
}, [crawlOptions, t])
const isInit = step === Step.init
const isCrawlFinished = step === Step.finished
const isRunning = step === Step.running
const [crawlResult, setCrawlResult] = useState<{
current: number
total: number
data: CrawlResultItem[]
time_consuming: number | string
} | undefined>(undefined)
const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
const showError = isCrawlFinished && crawlErrorMessage
const waitForCrawlFinished = useCallback(async (jobId: string) => {
try {
const res = await checkJinaReaderTaskStatus(jobId) as any
if (res.status === 'completed') {
return {
isError: false,
data: {
...res,
total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
},
}
}
if (res.status === 'failed' || !res.status) {
return {
isError: true,
errorMessage: res.message,
data: {
data: [],
},
}
}
// update the progress
setCrawlResult({
...res,
total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
})
onCheckedCrawlResultChange(res.data || []) // default select the crawl result
await sleep(2500)
return await waitForCrawlFinished(jobId)
}
catch (e: any) {
const errorBody = await e.json()
return {
isError: true,
errorMessage: errorBody.message,
data: {
data: [],
},
}
}
}, [crawlOptions.limit, onCheckedCrawlResultChange])
const handleRun = useCallback(async (url: string) => {
const { isValid, errorMsg } = checkValid(url)
if (!isValid) {
Toast.notify({
message: errorMsg!,
type: 'error',
})
return
}
setStep(Step.running)
try {
const startTime = Date.now()
const res = await createJinaReaderTask({
url,
options: crawlOptions,
}) as any
if (res.data) {
const data = {
current: 1,
total: 1,
data: [{
title: res.data.title,
markdown: res.data.content,
description: res.data.description,
source_url: res.data.url,
}],
time_consuming: (Date.now() - startTime) / 1000,
}
setCrawlResult(data)
onCheckedCrawlResultChange(data.data || [])
setCrawlErrorMessage('')
}
else if (res.job_id) {
const jobId = res.job_id
onJobIdChange(jobId)
const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
if (isError) {
setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
}
else {
setCrawlResult(data)
onCheckedCrawlResultChange(data.data || []) // default select the crawl result
setCrawlErrorMessage('')
}
}
}
catch (e) {
setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
console.log(e)
}
finally {
setStep(Step.finished)
}
}, [checkValid, crawlOptions, onJobIdChange, t, waitForCrawlFinished])
return (
<div>
<Header onSetting={handleSetting} />
<div className={cn('mt-2 p-4 pb-0 rounded-xl border border-gray-200')}>
<UrlInput onRun={handleRun} isRunning={isRunning} />
<OptionsWrap
className={cn('mt-4')}
controlFoldOptions={controlFoldOptions}
>
<Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
</OptionsWrap>
{!isInit && (
<div className='mt-3 relative left-[-16px] w-[calc(100%_+_32px)] rounded-b-xl'>
{isRunning
&& <Crawling
className='mt-2'
crawledNum={crawlResult?.current || 0}
totalNum={crawlResult?.total || parseFloat(crawlOptions.limit as string) || 0}
/>}
{showError && (
<ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
)}
{isCrawlFinished && !showError
&& <CrawledResult
className='mb-2'
list={crawlResult?.data || []}
checkedList={checkedCrawlResult}
onSelectedChange={onCheckedCrawlResultChange}
onPreview={onPreview}
usedTime={parseFloat(crawlResult?.time_consuming as string) || 0}
/>
}
</div>
)}
</div>
</div>
)
}
export default React.memo(JinaReader)
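waitForCrawlFinished above polls recursively with a fixed 2.5-second sleep until the task reports completed or failed. The same control flow as a small generic helper (names and types here are assumptions for illustration, not part of the diff):

type PollStatus<T> = { done: boolean; failed?: boolean; value?: T }

const sleep = (ms: number) => new Promise<void>(resolve => setTimeout(resolve, ms))

// Calls `check` every `intervalMs` until it reports a terminal state,
// mirroring the recursive loop in JinaReader.
async function pollUntilDone<T>(
  check: () => Promise<PollStatus<T>>,
  intervalMs = 2500,
): Promise<T> {
  for (;;) {
    const res = await check()
    if (res.failed)
      throw new Error('crawl task failed')
    if (res.done)
      return res.value as T
    await sleep(intervalMs)
  }
}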


@@ -0,0 +1,59 @@
'use client'
import type { FC } from 'react'
import React, { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import CheckboxWithLabel from '../base/checkbox-with-label'
import Field from '../base/field'
import cn from '@/utils/classnames'
import type { CrawlOptions } from '@/models/datasets'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
type Props = {
className?: string
payload: CrawlOptions
onChange: (payload: CrawlOptions) => void
}
const Options: FC<Props> = ({
className = '',
payload,
onChange,
}) => {
const { t } = useTranslation()
const handleChange = useCallback((key: keyof CrawlOptions) => {
return (value: any) => {
onChange({
...payload,
[key]: value,
})
}
}, [payload, onChange])
return (
<div className={cn(className, ' space-y-2')}>
<CheckboxWithLabel
label={t(`${I18N_PREFIX}.crawlSubPage`)}
isChecked={payload.crawl_sub_pages}
onChange={handleChange('crawl_sub_pages')}
/>
<CheckboxWithLabel
label={t(`${I18N_PREFIX}.useSitemap`)}
isChecked={payload.use_sitemap}
onChange={handleChange('use_sitemap')}
tooltip={t(`${I18N_PREFIX}.useSitemapTooltip`) as string}
/>
<div className='flex justify-between space-x-4'>
<Field
className='grow shrink-0'
label={t(`${I18N_PREFIX}.limit`)}
value={payload.limit}
onChange={handleChange('limit')}
isNumber
isRequired
/>
</div>
</div>
)
}
export default React.memo(Options)


@@ -2,35 +2,56 @@
import type { FC } from 'react'
import React from 'react'
import { useTranslation } from 'react-i18next'
import s from './index.module.css'
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
import Button from '@/app/components/base/button'
import { DataSourceProvider } from '@/models/common'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
type Props = {
onConfig: () => void
provider: DataSourceProvider
}
const NoData: FC<Props> = ({
onConfig,
provider,
}) => {
const { t } = useTranslation()
const providerConfig = {
[DataSourceProvider.jinaReader]: {
emoji: <span className={s.jinaLogo} />,
title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`),
description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`),
},
[DataSourceProvider.fireCrawl]: {
emoji: '🔥',
title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
},
}
const currentProvider = providerConfig[provider]
return (
<div className='max-w-[640px] p-6 rounded-2xl bg-gray-50'>
<div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'>
🔥
</div>
<div className='my-2'>
<span className='text-gray-700 font-semibold'>{t(`${I18N_PREFIX}.fireCrawlNotConfigured`)}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span>
<div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'>
{t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`)}
<>
<div className='max-w-[640px] p-6 rounded-2xl bg-gray-50 mt-4'>
<div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'>
{currentProvider.emoji}
</div>
<div className='my-2'>
<span className='text-gray-700 font-semibold'>{currentProvider.title}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span>
<div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'>
{currentProvider.description}
</div>
</div>
<Button variant='primary' onClick={onConfig}>
{t(`${I18N_PREFIX}.configure`)}
</Button>
</div>
<Button variant='primary' onClick={onConfig}>
{t(`${I18N_PREFIX}.configure`)}
</Button>
</div>
</>
)
}
export default React.memo(NoData)

Some files were not shown because too many files have changed in this diff.