Mirror of https://github.com/langgenius/dify.git, synced 2026-01-10 08:14:14 +00:00

Compare commits: feat/login...0.9.0 (12 commits)
Commits:
d2ce4960f1
1af4ca344e
fa837b2dfd
824a71388a
4585cffce1
13046709a9
9d221a5e19
77aef9ff1d
503561f464
ada9d408ac
3af65b2f45
369e1e6f58
@@ -9,7 +9,7 @@ class PackagingInfo(BaseSettings):
     CURRENT_VERSION: str = Field(
         description="Dify version",
-        default="0.8.3",
+        default="0.9.0",
     )

     COMMIT_SHA: str = Field(
@@ -37,7 +37,16 @@ from .auth import activate, data_source_bearer_auth, data_source_oauth, forgot_p
 from .billing import billing

 # Import datasets controllers
-from .datasets import data_source, datasets, datasets_document, datasets_segments, file, hit_testing, website
+from .datasets import (
+    data_source,
+    datasets,
+    datasets_document,
+    datasets_segments,
+    external,
+    file,
+    hit_testing,
+    website,
+)

 # Import explore controllers
 from .explore import (
@@ -49,7 +49,7 @@ class DatasetListApi(Resource):
         page = request.args.get("page", default=1, type=int)
         limit = request.args.get("limit", default=20, type=int)
         ids = request.args.getlist("ids")
-        provider = request.args.get("provider", default="vendor")
+        # provider = request.args.get("provider", default="vendor")
         search = request.args.get("keyword", default=None, type=str)
         tag_ids = request.args.getlist("tag_ids")
@@ -57,7 +57,7 @@ class DatasetListApi(Resource):
             datasets, total = DatasetService.get_datasets_by_ids(ids, current_user.current_tenant_id)
         else:
             datasets, total = DatasetService.get_datasets(
-                page, limit, provider, current_user.current_tenant_id, current_user, search, tag_ids
+                page, limit, current_user.current_tenant_id, current_user, search, tag_ids
             )

         # check embedding setting
@@ -110,6 +110,26 @@ class DatasetListApi(Resource):
             nullable=True,
             help="Invalid indexing technique.",
         )
+        parser.add_argument(
+            "external_knowledge_api_id",
+            type=str,
+            nullable=True,
+            required=False,
+        )
+        parser.add_argument(
+            "provider",
+            type=str,
+            nullable=True,
+            choices=Dataset.PROVIDER_LIST,
+            required=False,
+            default="vendor",
+        )
+        parser.add_argument(
+            "external_knowledge_id",
+            type=str,
+            nullable=True,
+            required=False,
+        )
         args = parser.parse_args()

         # The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
@@ -123,6 +143,9 @@ class DatasetListApi(Resource):
                 indexing_technique=args["indexing_technique"],
                 account=current_user,
                 permission=DatasetPermissionEnum.ONLY_ME,
+                provider=args["provider"],
+                external_knowledge_api_id=args["external_knowledge_api_id"],
+                external_knowledge_id=args["external_knowledge_id"],
             )
         except services.errors.dataset.DatasetNameDuplicateError:
             raise DatasetNameDuplicateError()
@@ -211,6 +234,33 @@ class DatasetApi(Resource):
         )
         parser.add_argument("retrieval_model", type=dict, location="json", help="Invalid retrieval model.")
         parser.add_argument("partial_member_list", type=list, location="json", help="Invalid parent user list.")
+
+        parser.add_argument(
+            "external_retrieval_model",
+            type=dict,
+            required=False,
+            nullable=True,
+            location="json",
+            help="Invalid external retrieval model.",
+        )
+
+        parser.add_argument(
+            "external_knowledge_id",
+            type=str,
+            required=False,
+            nullable=True,
+            location="json",
+            help="Invalid external knowledge id.",
+        )
+
+        parser.add_argument(
+            "external_knowledge_api_id",
+            type=str,
+            required=False,
+            nullable=True,
+            location="json",
+            help="Invalid external knowledge api id.",
+        )
         args = parser.parse_args()
         data = request.get_json()
api/controllers/console/datasets/external.py (new file, 239 lines)
@@ -0,0 +1,239 @@
from flask import request
from flask_login import current_user
from flask_restful import Resource, marshal, reqparse
from werkzeug.exceptions import Forbidden, InternalServerError, NotFound

import services
from controllers.console import api
from controllers.console.datasets.error import DatasetNameDuplicateError
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required
from fields.dataset_fields import dataset_detail_fields
from libs.login import login_required
from services.dataset_service import DatasetService
from services.external_knowledge_service import ExternalDatasetService
from services.hit_testing_service import HitTestingService


def _validate_name(name):
    if not name or len(name) < 1 or len(name) > 100:
        raise ValueError("Name must be between 1 to 100 characters.")
    return name


def _validate_description_length(description):
    if description and len(description) > 400:
        raise ValueError("Description cannot exceed 400 characters.")
    return description


class ExternalApiTemplateListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        page = request.args.get("page", default=1, type=int)
        limit = request.args.get("limit", default=20, type=int)
        search = request.args.get("keyword", default=None, type=str)

        external_knowledge_apis, total = ExternalDatasetService.get_external_knowledge_apis(
            page, limit, current_user.current_tenant_id, search
        )
        response = {
            "data": [item.to_dict() for item in external_knowledge_apis],
            "has_more": len(external_knowledge_apis) == limit,
            "limit": limit,
            "total": total,
            "page": page,
        }
        return response, 200

    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="Name is required. Name must be between 1 to 100 characters.",
            type=_validate_name,
        )
        parser.add_argument(
            "settings",
            type=dict,
            location="json",
            nullable=False,
            required=True,
        )
        args = parser.parse_args()

        ExternalDatasetService.validate_api_list(args["settings"])

        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            external_knowledge_api = ExternalDatasetService.create_external_knowledge_api(
                tenant_id=current_user.current_tenant_id, user_id=current_user.id, args=args
            )
        except services.errors.dataset.DatasetNameDuplicateError:
            raise DatasetNameDuplicateError()

        return external_knowledge_api.to_dict(), 201


class ExternalApiTemplateApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, external_knowledge_api_id):
        external_knowledge_api_id = str(external_knowledge_api_id)
        external_knowledge_api = ExternalDatasetService.get_external_knowledge_api(external_knowledge_api_id)
        if external_knowledge_api is None:
            raise NotFound("API template not found.")

        return external_knowledge_api.to_dict(), 200

    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, external_knowledge_api_id):
        external_knowledge_api_id = str(external_knowledge_api_id)

        parser = reqparse.RequestParser()
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="Name is required. Name must be between 1 to 100 characters.",
            type=_validate_name,
        )
        parser.add_argument(
            "settings",
            type=dict,
            location="json",
            nullable=False,
            required=True,
        )
        args = parser.parse_args()
        ExternalDatasetService.validate_api_list(args["settings"])

        external_knowledge_api = ExternalDatasetService.update_external_knowledge_api(
            tenant_id=current_user.current_tenant_id,
            user_id=current_user.id,
            external_knowledge_api_id=external_knowledge_api_id,
            args=args,
        )

        return external_knowledge_api.to_dict(), 200

    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, external_knowledge_api_id):
        external_knowledge_api_id = str(external_knowledge_api_id)

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor or current_user.is_dataset_operator:
            raise Forbidden()

        ExternalDatasetService.delete_external_knowledge_api(current_user.current_tenant_id, external_knowledge_api_id)
        return {"result": "success"}, 200


class ExternalApiUseCheckApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, external_knowledge_api_id):
        external_knowledge_api_id = str(external_knowledge_api_id)

        external_knowledge_api_is_using, count = ExternalDatasetService.external_knowledge_api_use_check(
            external_knowledge_api_id
        )
        return {"is_using": external_knowledge_api_is_using, "count": count}, 200


class ExternalDatasetCreateApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument("external_knowledge_api_id", type=str, required=True, nullable=False, location="json")
        parser.add_argument("external_knowledge_id", type=str, required=True, nullable=False, location="json")
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="name is required. Name must be between 1 to 100 characters.",
            type=_validate_name,
        )
        parser.add_argument("description", type=str, required=False, nullable=True, location="json")
        parser.add_argument("external_retrieval_model", type=dict, required=False, location="json")

        args = parser.parse_args()

        # The role of the current user in the ta table must be admin, owner, editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            dataset = ExternalDatasetService.create_external_dataset(
                tenant_id=current_user.current_tenant_id,
                user_id=current_user.id,
                args=args,
            )
        except services.errors.dataset.DatasetNameDuplicateError:
            raise DatasetNameDuplicateError()

        return marshal(dataset, dataset_detail_fields), 201


class ExternalKnowledgeHitTestingApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self, dataset_id):
        dataset_id_str = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id_str)
        if dataset is None:
            raise NotFound("Dataset not found.")

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument("query", type=str, location="json")
        parser.add_argument("external_retrieval_model", type=dict, required=False, location="json")
        args = parser.parse_args()

        HitTestingService.hit_testing_args_check(args)

        try:
            response = HitTestingService.external_retrieve(
                dataset=dataset,
                query=args["query"],
                account=current_user,
                external_retrieval_model=args["external_retrieval_model"],
            )

            return response
        except Exception as e:
            raise InternalServerError(str(e))


api.add_resource(ExternalKnowledgeHitTestingApi, "/datasets/<uuid:dataset_id>/external-hit-testing")
api.add_resource(ExternalDatasetCreateApi, "/datasets/external")
api.add_resource(ExternalApiTemplateListApi, "/datasets/external-knowledge-api")
api.add_resource(ExternalApiTemplateApi, "/datasets/external-knowledge-api/<uuid:external_knowledge_api_id>")
api.add_resource(ExternalApiUseCheckApi, "/datasets/external-knowledge-api/<uuid:external_knowledge_api_id>/use-check")
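The new controller registers five resources under the console API. A minimal sketch of how a console client might exercise two of them, assuming a local instance with the console API mounted at /console/api and a valid console access token; the base URL, token, settings payload shape (validated by ExternalDatasetService.validate_api_list, not shown in this diff), and all ids below are hypothetical placeholders:

import requests

BASE = "http://localhost:5001/console/api"
headers = {"Authorization": "Bearer <console-access-token>"}

# Register an external knowledge API template (POST /datasets/external-knowledge-api)
resp = requests.post(
    f"{BASE}/datasets/external-knowledge-api",
    headers=headers,
    json={"name": "my-knowledge-api", "settings": {"endpoint": "https://kb.example.com", "api_key": "<key>"}},
)
api_template = resp.json()  # 201 with the template's to_dict() payload

# Bind it to a new external dataset (POST /datasets/external)
resp = requests.post(
    f"{BASE}/datasets/external",
    headers=headers,
    json={
        "name": "external-kb",
        "external_knowledge_api_id": api_template["id"],
        "external_knowledge_id": "kb-123",
    },
)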
@@ -47,6 +47,7 @@ class HitTestingApi(Resource):
         parser = reqparse.RequestParser()
         parser.add_argument("query", type=str, location="json")
         parser.add_argument("retrieval_model", type=dict, required=False, location="json")
+        parser.add_argument("external_retrieval_model", type=dict, required=False, location="json")
         args = parser.parse_args()

         HitTestingService.hit_testing_args_check(args)
@@ -57,6 +58,7 @@ class HitTestingApi(Resource):
                 query=args["query"],
                 account=current_user,
                 retrieval_model=args["retrieval_model"],
+                external_retrieval_model=args["external_retrieval_model"],
                 limit=10,
             )
@@ -14,7 +14,9 @@ class WebsiteCrawlApi(Resource):
     @account_initialization_required
     def post(self):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, nullable=True, location="json")
+        parser.add_argument(
+            "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json"
+        )
         parser.add_argument("url", type=str, required=True, nullable=True, location="json")
         parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
         args = parser.parse_args()
@@ -33,7 +35,7 @@ class WebsiteCrawlStatusApi(Resource):
     @account_initialization_required
     def get(self, job_id: str):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl"], required=True, location="args")
+        parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args")
         args = parser.parse_args()
         # get crawl status
         try:
@@ -38,11 +38,52 @@ class VersionApi(Resource):
             return result

         content = json.loads(response.content)
-        result["version"] = content["version"]
-        result["release_date"] = content["releaseDate"]
-        result["release_notes"] = content["releaseNotes"]
-        result["can_auto_update"] = content["canAutoUpdate"]
+        if _has_new_version(latest_version=content["version"], current_version=f"{args.get('current_version')}"):
+            result["version"] = content["version"]
+            result["release_date"] = content["releaseDate"]
+            result["release_notes"] = content["releaseNotes"]
+            result["can_auto_update"] = content["canAutoUpdate"]
         return result


+def _has_new_version(*, latest_version: str, current_version: str) -> bool:
+    def parse_version(version: str) -> tuple:
+        # Split version into parts and pre-release suffix if any
+        parts = version.split("-")
+        version_parts = parts[0].split(".")
+        pre_release = parts[1] if len(parts) > 1 else None
+
+        # Validate version format
+        if len(version_parts) != 3:
+            raise ValueError(f"Invalid version format: {version}")
+
+        try:
+            # Convert version parts to integers
+            major, minor, patch = map(int, version_parts)
+            return (major, minor, patch, pre_release)
+        except ValueError:
+            raise ValueError(f"Invalid version format: {version}")
+
+    latest = parse_version(latest_version)
+    current = parse_version(current_version)
+
+    # Compare major, minor, and patch versions
+    for latest_part, current_part in zip(latest[:3], current[:3]):
+        if latest_part > current_part:
+            return True
+        elif latest_part < current_part:
+            return False
+
+    # If versions are equal, check pre-release suffixes
+    if latest[3] is None and current[3] is not None:
+        return True
+    elif latest[3] is not None and current[3] is None:
+        return False
+    elif latest[3] is not None and current[3] is not None:
+        # Simple string comparison for pre-release versions
+        return latest[3] > current[3]
+
+    return False
+
+
 api.add_resource(VersionApi, "/version")
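A quick sanity check of the comparison logic above; standalone, assuming _has_new_version is importable from this module:

assert _has_new_version(latest_version="0.9.0", current_version="0.8.3") is True
assert _has_new_version(latest_version="0.9.0", current_version="0.9.0") is False
# A release counts as newer than its own pre-release:
assert _has_new_version(latest_version="0.9.0", current_version="0.9.0-beta") is True
# Pre-release suffixes fall back to plain string comparison:
assert _has_new_version(latest_version="0.9.0-rc2", current_version="0.9.0-rc1") is True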
@@ -72,8 +72,9 @@ class DefaultModelApi(Resource):
                     provider=model_setting["provider"],
                     model=model_setting["model"],
                 )
-            except Exception:
-                logging.warning(f"{model_setting['model_type']} save error")
+            except Exception as ex:
+                logging.exception(f"{model_setting['model_type']} save error: {ex}")
+                raise ex

         return {"result": "success"}
@@ -28,11 +28,11 @@ class DatasetListApi(DatasetApiResource):

         page = request.args.get("page", default=1, type=int)
         limit = request.args.get("limit", default=20, type=int)
-        provider = request.args.get("provider", default="vendor")
+        # provider = request.args.get("provider", default="vendor")
         search = request.args.get("keyword", default=None, type=str)
         tag_ids = request.args.getlist("tag_ids")

-        datasets, total = DatasetService.get_datasets(page, limit, provider, tenant_id, current_user, search, tag_ids)
+        datasets, total = DatasetService.get_datasets(page, limit, tenant_id, current_user, search, tag_ids)
         # check embedding setting
         provider_manager = ProviderManager()
         configurations = provider_manager.get_configurations(tenant_id=current_user.current_tenant_id)
@@ -82,6 +82,26 @@ class DatasetListApi(DatasetApiResource):
             required=False,
             nullable=False,
         )
+        parser.add_argument(
+            "external_knowledge_api_id",
+            type=str,
+            nullable=True,
+            required=False,
+            default="_validate_name",
+        )
+        parser.add_argument(
+            "provider",
+            type=str,
+            nullable=True,
+            required=False,
+            default="vendor",
+        )
+        parser.add_argument(
+            "external_knowledge_id",
+            type=str,
+            nullable=True,
+            required=False,
+        )
         args = parser.parse_args()

         try:
@@ -91,6 +111,9 @@ class DatasetListApi(DatasetApiResource):
                 indexing_technique=args["indexing_technique"],
                 account=current_user,
                 permission=args["permission"],
+                provider=args["provider"],
+                external_knowledge_api_id=args["external_knowledge_api_id"],
+                external_knowledge_id=args["external_knowledge_id"],
             )
         except services.errors.dataset.DatasetNameDuplicateError:
             raise DatasetNameDuplicateError()
@@ -1,2 +1,2 @@
-class VariableError(Exception):
+class VariableError(ValueError):
     pass
@@ -59,7 +59,7 @@ class DatasetIndexToolCallbackHandler:
         for item in resource:
             dataset_retriever_resource = DatasetRetrieverResource(
                 message_id=self._message_id,
-                position=item.get("position"),
+                position=item.get("position") or 0,
                 dataset_id=item.get("dataset_id"),
                 dataset_name=item.get("dataset_name"),
                 document_id=item.get("document_id"),
@@ -59,6 +59,7 @@ from core.model_runtime.model_providers.__base.large_language_model import Large
 from core.model_runtime.model_providers.xinference.xinference_helper import (
     XinferenceHelper,
     XinferenceModelExtraParameter,
+    validate_model_uid,
 )
 from core.model_runtime.utils import helper
@@ -114,7 +115,7 @@ class XinferenceAILargeLanguageModel(LargeLanguageModel):
         }
         """
         try:
-            if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
+            if not validate_model_uid(credentials):
                 raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")

             extra_param = XinferenceHelper.get_xinference_extra_parameter(
@@ -15,6 +15,7 @@ from core.model_runtime.errors.invoke import (
 )
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.rerank_model import RerankModel
+from core.model_runtime.model_providers.xinference.xinference_helper import validate_model_uid


 class XinferenceRerankModel(RerankModel):
@@ -77,10 +78,7 @@ class XinferenceRerankModel(RerankModel):
             )

             # score threshold check
-            if score_threshold is not None:
-                if result["relevance_score"] >= score_threshold:
-                    rerank_documents.append(rerank_document)
-            else:
-                rerank_documents.append(rerank_document)
+            if score_threshold is None or result["relevance_score"] >= score_threshold:
+                rerank_documents.append(rerank_document)

         return RerankResult(model=model, docs=rerank_documents)
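The refactor collapses the nested branch into a single condition; both forms admit the same documents. A minimal standalone check of the equivalence, with hypothetical scores:

def old_keep(score_threshold, score):
    # Original branching: threshold configured -> compare; otherwise keep everything.
    if score_threshold is not None:
        return score >= score_threshold
    return True

def new_keep(score_threshold, score):
    # Collapsed form from the diff above.
    return score_threshold is None or score >= score_threshold

for threshold in (None, 0.5):
    for score in (0.3, 0.5, 0.9):
        assert old_keep(threshold, score) == new_keep(threshold, score)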
@@ -94,7 +92,7 @@ class XinferenceRerankModel(RerankModel):
         :return:
         """
         try:
-            if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
+            if not validate_model_uid(credentials):
                 raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")

             credentials["server_url"] = credentials["server_url"].removesuffix("/")
@@ -14,6 +14,7 @@ from core.model_runtime.errors.invoke import (
 )
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.speech2text_model import Speech2TextModel
+from core.model_runtime.model_providers.xinference.xinference_helper import validate_model_uid


 class XinferenceSpeech2TextModel(Speech2TextModel):
@@ -42,7 +43,7 @@ class XinferenceSpeech2TextModel(Speech2TextModel):
         :return:
         """
         try:
-            if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
+            if not validate_model_uid(credentials):
                 raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")

             credentials["server_url"] = credentials["server_url"].removesuffix("/")
@@ -17,7 +17,7 @@ from core.model_runtime.errors.invoke import (
 )
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.text_embedding_model import TextEmbeddingModel
-from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper
+from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper, validate_model_uid


 class XinferenceTextEmbeddingModel(TextEmbeddingModel):
@@ -110,7 +110,7 @@ class XinferenceTextEmbeddingModel(TextEmbeddingModel):
         :return:
         """
         try:
-            if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
+            if not validate_model_uid(credentials):
                 raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")

             server_url = credentials["server_url"]
@@ -15,7 +15,7 @@ from core.model_runtime.errors.invoke import (
 )
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
-from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper
+from core.model_runtime.model_providers.xinference.xinference_helper import XinferenceHelper, validate_model_uid


 class XinferenceText2SpeechModel(TTSModel):
@@ -70,7 +70,7 @@ class XinferenceText2SpeechModel(TTSModel):
         :return:
         """
         try:
-            if "/" in credentials["model_uid"] or "?" in credentials["model_uid"] or "#" in credentials["model_uid"]:
+            if not validate_model_uid(credentials):
                 raise CredentialsValidateFailedError("model_uid should not contain /, ?, or #")

             credentials["server_url"] = credentials["server_url"].removesuffix("/")
@@ -132,3 +132,16 @@ class XinferenceHelper:
             context_length=context_length,
             model_family=model_family,
         )
+
+
+def validate_model_uid(credentials: dict) -> bool:
+    """
+    Validate the model_uid within the credentials dictionary to ensure it does not
+    contain forbidden characters ("/", "?", "#").
+
+    :param credentials: model credentials
+    :return: True if the model_uid does not contain forbidden characters ("/", "?", "#"), else False.
+    """
+    forbidden_characters = ["/", "?", "#"]
+    model_uid = credentials.get("model_uid", "")
+    return not any(char in forbidden_characters for char in model_uid)
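Behavior of the new helper, which the five Xinference model classes above now share instead of repeating the inline character checks; assuming it is imported from xinference_helper:

assert validate_model_uid({"model_uid": "my-model"}) is True
assert validate_model_uid({"model_uid": "my/model"}) is False    # "/" is forbidden
assert validate_model_uid({"model_uid": "model?x=1"}) is False   # "?" is forbidden
assert validate_model_uid({"model_uid": "model#frag"}) is False  # "#" is forbidden
assert validate_model_uid({}) is True  # missing model_uid defaults to "" and passes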
@@ -18,8 +18,12 @@ class KeywordsModeration(Moderation):
         if not config.get("keywords"):
             raise ValueError("keywords is required")

-        if len(config.get("keywords")) > 1000:
-            raise ValueError("keywords length must be less than 1000")
+        if len(config.get("keywords")) > 10000:
+            raise ValueError("keywords length must be less than 10000")
+
+        keywords_row_len = config["keywords"].split("\n")
+        if len(keywords_row_len) > 100:
+            raise ValueError("the number of rows for the keywords must be less than 100")

     def moderation_for_inputs(self, inputs: dict, query: str = "") -> ModerationInputsResult:
         flagged = False
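The commit raises the total-length cap from 1000 to 10000 characters while adding a new cap of 100 newline-separated rows. A standalone sketch of the resulting validation:

def check_keywords(keywords: str) -> None:
    # Mirrors the updated checks: presence, then total length, then row count.
    if not keywords:
        raise ValueError("keywords is required")
    if len(keywords) > 10000:
        raise ValueError("keywords length must be less than 10000")
    if len(keywords.split("\n")) > 100:
        raise ValueError("the number of rows for the keywords must be less than 100")

check_keywords("spam\nads\nscam")          # passes: 3 rows, well under both caps
# check_keywords("\n".join(["kw"] * 101))  # would raise: 101 rows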
@@ -10,6 +10,7 @@ from core.rag.rerank.constants.rerank_mode import RerankMode
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 from extensions.ext_database import db
 from models.dataset import Dataset
+from services.external_knowledge_service import ExternalDatasetService

 default_retrieval_model = {
     "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
@@ -34,6 +35,9 @@ class RetrievalService:
         weights: Optional[dict] = None,
     ):
         dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
         if not dataset:
             return []
+
+        if not dataset or dataset.available_document_count == 0 or dataset.available_segment_count == 0:
+            return []
         all_documents = []
@@ -108,6 +112,16 @@ class RetrievalService:
             )
         return all_documents

+    @classmethod
+    def external_retrieve(cls, dataset_id: str, query: str, external_retrieval_model: Optional[dict] = None):
+        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
+        if not dataset:
+            return []
+        all_documents = ExternalDatasetService.fetch_external_knowledge_retrieval(
+            dataset.tenant_id, dataset_id, query, external_retrieval_model
+        )
+        return all_documents
+
     @classmethod
     def keyword_search(
         cls, flask_app: Flask, dataset_id: str, query: str, top_k: int, all_documents: list, exceptions: list
@@ -166,7 +166,7 @@ class PGVector(BaseVector):

         with self._get_cursor() as cur:
             cur.execute(
-                f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), to_tsquery(%s)) AS score
+                f"""SELECT meta, text, ts_rank(to_tsvector(coalesce(text, '')), plainto_tsquery(%s)) AS score
                 FROM {self.table_name}
                 WHERE to_tsvector(text) @@ plainto_tsquery(%s)
                 ORDER BY score DESC
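The switch matters because to_tsquery expects tsquery syntax and raises a syntax error on raw multi-word user input, while plainto_tsquery tokenizes plain text itself; it also makes the ranking expression consistent with the plainto_tsquery already used in the WHERE clause. A sketch against a local PostgreSQL, with hypothetical connection details:

import psycopg2  # assumes psycopg2 is installed and a local database named "test" exists

conn = psycopg2.connect("dbname=test")
with conn.cursor() as cur:
    # plainto_tsquery tolerates raw user input:
    cur.execute("SELECT plainto_tsquery(%s)", ("external knowledge api",))
    print(cur.fetchone()[0])  # 'extern' & 'knowledg' & 'api'

    # to_tsquery on the same unquoted multi-word input raises a tsquery syntax error:
    try:
        cur.execute("SELECT to_tsquery(%s)", ("external knowledge api",))
    except psycopg2.errors.SyntaxError:
        conn.rollback()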
api/core/rag/entities/context_entities.py (new file, 10 lines)
@@ -0,0 +1,10 @@
from pydantic import BaseModel


class DocumentContext(BaseModel):
    """
    Model class for document context.
    """

    content: str
    score: float
@@ -12,6 +12,7 @@ from core.rag.extractor.entity.extract_setting import ExtractSetting
 from core.rag.extractor.excel_extractor import ExcelExtractor
 from core.rag.extractor.firecrawl.firecrawl_web_extractor import FirecrawlWebExtractor
 from core.rag.extractor.html_extractor import HtmlExtractor
+from core.rag.extractor.jina_reader_extractor import JinaReaderWebExtractor
 from core.rag.extractor.markdown_extractor import MarkdownExtractor
 from core.rag.extractor.notion_extractor import NotionExtractor
 from core.rag.extractor.pdf_extractor import PdfExtractor
@@ -171,6 +172,15 @@ class ExtractProcessor:
                     only_main_content=extract_setting.website_info.only_main_content,
                 )
                 return extractor.extract()
+            elif extract_setting.website_info.provider == "jinareader":
+                extractor = JinaReaderWebExtractor(
+                    url=extract_setting.website_info.url,
+                    job_id=extract_setting.website_info.job_id,
+                    tenant_id=extract_setting.website_info.tenant_id,
+                    mode=extract_setting.website_info.mode,
+                    only_main_content=extract_setting.website_info.only_main_content,
+                )
+                return extractor.extract()
             else:
                 raise ValueError(f"Unsupported website provider: {extract_setting.website_info.provider}")
         else:
api/core/rag/extractor/jina_reader_extractor.py (new file, 35 lines)
@@ -0,0 +1,35 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService


class JinaReaderWebExtractor(BaseExtractor):
    """
    Crawl and scrape websites and return content in clean llm-ready markdown.
    """

    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
        """Initialize with url, api_key, base_url and mode."""
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """Extract content from the URL."""
        documents = []
        if self.mode == "crawl":
            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "jinareader", self._url, self.tenant_id)
            if crawl_data is None:
                return []
            document = Document(
                page_content=crawl_data.get("content", ""),
                metadata={
                    "source_url": crawl_data.get("url"),
                    "description": crawl_data.get("description"),
                    "title": crawl_data.get("title"),
                },
            )
            documents.append(document)
        return documents
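A usage sketch for the new extractor; it delegates to WebsiteService, so it needs to run inside an initialized Dify Flask application context against a completed jinareader crawl job. The url, job id, and tenant id below are placeholders:

extractor = JinaReaderWebExtractor(
    url="https://example.com/docs",
    job_id="<crawl-job-id>",
    tenant_id="<tenant-id>",
    mode="crawl",
    only_main_content=True,
)
for doc in extractor.extract():
    # Each Document carries the crawled markdown plus source_url/description/title metadata.
    print(doc.metadata.get("title"), len(doc.page_content))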
@@ -17,6 +17,8 @@ class Document(BaseModel):
     """

     metadata: Optional[dict] = Field(default_factory=dict)

+    provider: Optional[str] = "dify"
+

 class BaseDocumentTransformer(ABC):
     """Abstract base class for document transformation systems.
@@ -28,11 +28,16 @@ class RerankModelRunner:
         docs = []
         doc_id = []
         unique_documents = []
-        for document in documents:
+        dify_documents = [item for item in documents if item.provider == "dify"]
+        external_documents = [item for item in documents if item.provider == "external"]
+        for document in dify_documents:
             if document.metadata["doc_id"] not in doc_id:
                 doc_id.append(document.metadata["doc_id"])
                 docs.append(document.page_content)
                 unique_documents.append(document)
+        for document in external_documents:
+            docs.append(document.page_content)
+            unique_documents.append(document)

         documents = unique_documents
@@ -46,14 +51,10 @@ class RerankModelRunner:
             # format document
             rerank_document = Document(
                 page_content=result.text,
-                metadata={
-                    "doc_id": documents[result.index].metadata["doc_id"],
-                    "doc_hash": documents[result.index].metadata["doc_hash"],
-                    "document_id": documents[result.index].metadata["document_id"],
-                    "dataset_id": documents[result.index].metadata["dataset_id"],
-                    "score": result.score,
-                },
+                metadata=documents[result.index].metadata,
+                provider=documents[result.index].provider,
             )
+            rerank_document.metadata["score"] = result.score
             rerank_documents.append(rerank_document)

         return rerank_documents
@@ -20,6 +20,7 @@ from core.ops.utils import measure_time
 from core.rag.data_post_processor.data_post_processor import DataPostProcessor
 from core.rag.datasource.keyword.jieba.jieba_keyword_table_handler import JiebaKeywordTableHandler
 from core.rag.datasource.retrieval_service import RetrievalService
+from core.rag.entities.context_entities import DocumentContext
 from core.rag.models.document import Document
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 from core.rag.retrieval.router.multi_dataset_function_call_router import FunctionCallMultiDatasetRouter
@@ -30,6 +31,7 @@ from core.tools.tool.dataset_retriever.dataset_retriever_tool import DatasetRetr
 from extensions.ext_database import db
 from models.dataset import Dataset, DatasetQuery, DocumentSegment
 from models.dataset import Document as DatasetDocument
+from services.external_knowledge_service import ExternalDatasetService

 default_retrieval_model = {
     "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
@@ -110,7 +112,7 @@ class DatasetRetrieval:
                 continue

             # pass if dataset is not available
-            if dataset and dataset.available_document_count == 0:
+            if dataset and dataset.available_document_count == 0 and dataset.provider != "external":
                 continue

             available_datasets.append(dataset)
@@ -146,69 +148,93 @@ class DatasetRetrieval:
             message_id,
         )

-        document_score_list = {}
-        for item in all_documents:
-            if item.metadata.get("score"):
-                document_score_list[item.metadata["doc_id"]] = item.metadata["score"]
-
-        document_context_list = []
-        index_node_ids = [document.metadata["doc_id"] for document in all_documents]
-        segments = DocumentSegment.query.filter(
-            DocumentSegment.dataset_id.in_(dataset_ids),
-            DocumentSegment.completed_at.isnot(None),
-            DocumentSegment.status == "completed",
-            DocumentSegment.enabled == True,
-            DocumentSegment.index_node_id.in_(index_node_ids),
-        ).all()
-
-        if segments:
-            index_node_id_to_position = {id: position for position, id in enumerate(index_node_ids)}
-            sorted_segments = sorted(
-                segments, key=lambda segment: index_node_id_to_position.get(segment.index_node_id, float("inf"))
-            )
-            for segment in sorted_segments:
-                if segment.answer:
-                    document_context_list.append(f"question:{segment.get_sign_content()} answer:{segment.answer}")
-                else:
-                    document_context_list.append(segment.get_sign_content())
-            if show_retrieve_source:
-                context_list = []
-                resource_number = 1
-                for segment in sorted_segments:
-                    dataset = Dataset.query.filter_by(id=segment.dataset_id).first()
-                    document = DatasetDocument.query.filter(
-                        DatasetDocument.id == segment.document_id,
-                        DatasetDocument.enabled == True,
-                        DatasetDocument.archived == False,
-                    ).first()
-                    if dataset and document:
-                        source = {
-                            "position": resource_number,
-                            "dataset_id": dataset.id,
-                            "dataset_name": dataset.name,
-                            "document_id": document.id,
-                            "document_name": document.name,
-                            "data_source_type": document.data_source_type,
-                            "segment_id": segment.id,
-                            "retriever_from": invoke_from.to_source(),
-                            "score": document_score_list.get(segment.index_node_id, None),
-                        }
-                        if invoke_from.to_source() == "dev":
-                            source["hit_count"] = segment.hit_count
-                            source["word_count"] = segment.word_count
-                            source["segment_position"] = segment.position
-                            source["index_node_hash"] = segment.index_node_hash
-                        if segment.answer:
-                            source["content"] = f"question:{segment.content} \nanswer:{segment.answer}"
-                        else:
-                            source["content"] = segment.content
-                        context_list.append(source)
-                        resource_number += 1
-                if hit_callback:
-                    hit_callback.return_retriever_resource_info(context_list)
-
-        return str("\n".join(document_context_list))
+        dify_documents = [item for item in all_documents if item.provider == "dify"]
+        external_documents = [item for item in all_documents if item.provider == "external"]
+        document_context_list = []
+        retrieval_resource_list = []
+        # deal with external documents
+        for item in external_documents:
+            document_context_list.append(DocumentContext(content=item.page_content, score=item.metadata.get("score")))
+            source = {
+                "dataset_id": item.metadata.get("dataset_id"),
+                "dataset_name": item.metadata.get("dataset_name"),
+                "document_name": item.metadata.get("title"),
+                "data_source_type": "external",
+                "retriever_from": invoke_from.to_source(),
+                "score": item.metadata.get("score"),
+                "content": item.page_content,
+            }
+            retrieval_resource_list.append(source)
+        document_score_list = {}
+        # deal with dify documents
+        if dify_documents:
+            for item in dify_documents:
+                if item.metadata.get("score"):
+                    document_score_list[item.metadata["doc_id"]] = item.metadata["score"]
+
+            index_node_ids = [document.metadata["doc_id"] for document in dify_documents]
+            segments = DocumentSegment.query.filter(
+                DocumentSegment.dataset_id.in_(dataset_ids),
+                DocumentSegment.status == "completed",
+                DocumentSegment.enabled == True,
+                DocumentSegment.index_node_id.in_(index_node_ids),
+            ).all()
+
+            if segments:
+                index_node_id_to_position = {id: position for position, id in enumerate(index_node_ids)}
+                sorted_segments = sorted(
+                    segments, key=lambda segment: index_node_id_to_position.get(segment.index_node_id, float("inf"))
+                )
+                for segment in sorted_segments:
+                    if segment.answer:
+                        document_context_list.append(
+                            DocumentContext(
+                                content=f"question:{segment.get_sign_content()} answer:{segment.answer}",
+                                score=document_score_list.get(segment.index_node_id, None),
+                            )
+                        )
+                    else:
+                        document_context_list.append(
+                            DocumentContext(
+                                content=segment.get_sign_content(),
+                                score=document_score_list.get(segment.index_node_id, None),
+                            )
+                        )
+                if show_retrieve_source:
+                    for segment in sorted_segments:
+                        dataset = Dataset.query.filter_by(id=segment.dataset_id).first()
+                        document = DatasetDocument.query.filter(
+                            DatasetDocument.id == segment.document_id,
+                            DatasetDocument.enabled == True,
+                            DatasetDocument.archived == False,
+                        ).first()
+                        if dataset and document:
+                            source = {
+                                "dataset_id": dataset.id,
+                                "dataset_name": dataset.name,
+                                "document_id": document.id,
+                                "document_name": document.name,
+                                "data_source_type": document.data_source_type,
+                                "segment_id": segment.id,
+                                "retriever_from": invoke_from.to_source(),
+                                "score": document_score_list.get(segment.index_node_id, None),
+                            }
+                            if invoke_from.to_source() == "dev":
+                                source["hit_count"] = segment.hit_count
+                                source["word_count"] = segment.word_count
+                                source["segment_position"] = segment.position
+                                source["index_node_hash"] = segment.index_node_hash
+                            if segment.answer:
+                                source["content"] = f"question:{segment.content} \nanswer:{segment.answer}"
+                            else:
+                                source["content"] = segment.content
+                            retrieval_resource_list.append(source)
+        if hit_callback and retrieval_resource_list:
+            hit_callback.return_retriever_resource_info(retrieval_resource_list)
+        if document_context_list:
+            document_context_list = sorted(document_context_list, key=lambda x: x.score, reverse=True)
+            return str("\n".join([document_context.content for document_context in document_context_list]))
+        return ""

     def single_retrieve(
@@ -256,36 +282,58 @@ class DatasetRetrieval:
         # get retrieval model config
         dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
         if dataset:
-            retrieval_model_config = dataset.retrieval_model or default_retrieval_model
-
-            # get top k
-            top_k = retrieval_model_config["top_k"]
-            # get retrieval method
-            if dataset.indexing_technique == "economy":
-                retrieval_method = "keyword_search"
-            else:
-                retrieval_method = retrieval_model_config["search_method"]
-            # get reranking model
-            reranking_model = (
-                retrieval_model_config["reranking_model"] if retrieval_model_config["reranking_enable"] else None
-            )
-            # get score threshold
-            score_threshold = 0.0
-            score_threshold_enabled = retrieval_model_config.get("score_threshold_enabled")
-            if score_threshold_enabled:
-                score_threshold = retrieval_model_config.get("score_threshold")
-
-            with measure_time() as timer:
-                results = RetrievalService.retrieve(
-                    retrieval_method=retrieval_method,
-                    dataset_id=dataset.id,
-                    query=query,
-                    top_k=top_k,
-                    score_threshold=score_threshold,
-                    reranking_model=reranking_model,
-                    reranking_mode=retrieval_model_config.get("reranking_mode", "reranking_model"),
-                    weights=retrieval_model_config.get("weights", None),
-                )
+            results = []
+            if dataset.provider == "external":
+                external_documents = ExternalDatasetService.fetch_external_knowledge_retrieval(
+                    tenant_id=dataset.tenant_id,
+                    dataset_id=dataset_id,
+                    query=query,
+                    external_retrieval_parameters=dataset.retrieval_model,
+                )
+                for external_document in external_documents:
+                    document = Document(
+                        page_content=external_document.get("content"),
+                        metadata=external_document.get("metadata"),
+                        provider="external",
+                    )
+                    document.metadata["score"] = external_document.get("score")
+                    document.metadata["title"] = external_document.get("title")
+                    document.metadata["dataset_id"] = dataset_id
+                    document.metadata["dataset_name"] = dataset.name
+                    results.append(document)
+            else:
+                retrieval_model_config = dataset.retrieval_model or default_retrieval_model
+
+                # get top k
+                top_k = retrieval_model_config["top_k"]
+                # get retrieval method
+                if dataset.indexing_technique == "economy":
+                    retrieval_method = "keyword_search"
+                else:
+                    retrieval_method = retrieval_model_config["search_method"]
+                # get reranking model
+                reranking_model = (
+                    retrieval_model_config["reranking_model"]
+                    if retrieval_model_config["reranking_enable"]
+                    else None
+                )
+                # get score threshold
+                score_threshold = 0.0
+                score_threshold_enabled = retrieval_model_config.get("score_threshold_enabled")
+                if score_threshold_enabled:
+                    score_threshold = retrieval_model_config.get("score_threshold")
+
+                with measure_time() as timer:
+                    results = RetrievalService.retrieve(
+                        retrieval_method=retrieval_method,
+                        dataset_id=dataset.id,
+                        query=query,
+                        top_k=top_k,
+                        score_threshold=score_threshold,
+                        reranking_model=reranking_model,
+                        reranking_mode=retrieval_model_config.get("reranking_mode", "reranking_model"),
+                        weights=retrieval_model_config.get("weights", None),
+                    )
             self._on_query(query, [dataset_id], app_id, user_from, user_id)

             if results:
@@ -356,7 +404,8 @@ class DatasetRetrieval:
         self, documents: list[Document], message_id: Optional[str] = None, timer: Optional[dict] = None
     ) -> None:
         """Handle retrieval end."""
-        for document in documents:
+        dify_documents = [document for document in documents if document.provider == "dify"]
+        for document in dify_documents:
             query = db.session.query(DocumentSegment).filter(
                 DocumentSegment.index_node_id == document.metadata["doc_id"]
             )
@@ -409,35 +458,54 @@ class DatasetRetrieval:
         if not dataset:
             return []

-        # get retrieval model , if the model is not setting , using default
-        retrieval_model = dataset.retrieval_model or default_retrieval_model
-
-        if dataset.indexing_technique == "economy":
-            # use keyword table query
-            documents = RetrievalService.retrieve(
-                retrieval_method="keyword_search", dataset_id=dataset.id, query=query, top_k=top_k
-            )
-            if documents:
-                all_documents.extend(documents)
-        else:
-            if top_k > 0:
-                # retrieval source
-                documents = RetrievalService.retrieve(
-                    retrieval_method=retrieval_model["search_method"],
-                    dataset_id=dataset.id,
-                    query=query,
-                    top_k=retrieval_model.get("top_k") or 2,
-                    score_threshold=retrieval_model.get("score_threshold", 0.0)
-                    if retrieval_model["score_threshold_enabled"]
-                    else 0.0,
-                    reranking_model=retrieval_model.get("reranking_model", None)
-                    if retrieval_model["reranking_enable"]
-                    else None,
-                    reranking_mode=retrieval_model.get("reranking_mode") or "reranking_model",
-                    weights=retrieval_model.get("weights", None),
-                )
-
-                all_documents.extend(documents)
+        if dataset.provider == "external":
+            external_documents = ExternalDatasetService.fetch_external_knowledge_retrieval(
+                tenant_id=dataset.tenant_id,
+                dataset_id=dataset_id,
+                query=query,
+                external_retrieval_parameters=dataset.retrieval_model,
+            )
+            for external_document in external_documents:
+                document = Document(
+                    page_content=external_document.get("content"),
+                    metadata=external_document.get("metadata"),
+                    provider="external",
+                )
+                document.metadata["score"] = external_document.get("score")
+                document.metadata["title"] = external_document.get("title")
+                document.metadata["dataset_id"] = dataset_id
+                document.metadata["dataset_name"] = dataset.name
+                all_documents.append(document)
+        else:
+            # get retrieval model , if the model is not setting , using default
+            retrieval_model = dataset.retrieval_model or default_retrieval_model
+
+            if dataset.indexing_technique == "economy":
+                # use keyword table query
+                documents = RetrievalService.retrieve(
+                    retrieval_method="keyword_search", dataset_id=dataset.id, query=query, top_k=top_k
+                )
+                if documents:
+                    all_documents.extend(documents)
+            else:
+                if top_k > 0:
+                    # retrieval source
+                    documents = RetrievalService.retrieve(
+                        retrieval_method=retrieval_model["search_method"],
+                        dataset_id=dataset.id,
+                        query=query,
+                        top_k=retrieval_model.get("top_k") or 2,
+                        score_threshold=retrieval_model.get("score_threshold", 0.0)
+                        if retrieval_model["score_threshold_enabled"]
+                        else 0.0,
+                        reranking_model=retrieval_model.get("reranking_model", None)
+                        if retrieval_model["reranking_enable"]
+                        else None,
+                        reranking_mode=retrieval_model.get("reranking_mode") or "reranking_model",
+                        weights=retrieval_model.get("weights", None),
+                    )

-        all_documents.extend(documents)
+                    all_documents.extend(documents)

     def to_dataset_retriever_tool(
         self,
@@ -156,16 +156,34 @@ class KnowledgeRetrievalNode(BaseNode):
                 weights,
                 node_data.multiple_retrieval_config.reranking_enable,
             )
-
-        context_list = []
-        if all_documents:
+        dify_documents = [item for item in all_documents if item.provider == "dify"]
+        external_documents = [item for item in all_documents if item.provider == "external"]
+        retrieval_resource_list = []
+        # deal with external documents
+        for item in external_documents:
+            source = {
+                "metadata": {
+                    "_source": "knowledge",
+                    "dataset_id": item.metadata.get("dataset_id"),
+                    "dataset_name": item.metadata.get("dataset_name"),
+                    "document_name": item.metadata.get("title"),
+                    "data_source_type": "external",
+                    "retriever_from": "workflow",
+                    "score": item.metadata.get("score"),
+                },
+                "title": item.metadata.get("title"),
+                "content": item.page_content,
+            }
+            retrieval_resource_list.append(source)
+        document_score_list = {}
+        # deal with dify documents
+        if dify_documents:
             document_score_list = {}
             page_number_list = {}
-            for item in all_documents:
+            for item in dify_documents:
                 if item.metadata.get("score"):
                     document_score_list[item.metadata["doc_id"]] = item.metadata["score"]

-            index_node_ids = [document.metadata["doc_id"] for document in all_documents]
+            index_node_ids = [document.metadata["doc_id"] for document in dify_documents]
             segments = DocumentSegment.query.filter(
                 DocumentSegment.dataset_id.in_(dataset_ids),
                 DocumentSegment.completed_at.isnot(None),
@@ -186,13 +204,10 @@ class KnowledgeRetrievalNode(BaseNode):
                     Document.enabled == True,
                     Document.archived == False,
                 ).first()
-
-                resource_number = 1
                 if dataset and document:
                     source = {
                         "metadata": {
                             "_source": "knowledge",
-                            "position": resource_number,
                             "dataset_id": dataset.id,
                             "dataset_name": dataset.name,
                             "document_id": document.id,
@@ -212,9 +227,14 @@ class KnowledgeRetrievalNode(BaseNode):
                         source["content"] = f"question:{segment.get_sign_content()} \nanswer:{segment.answer}"
                     else:
                         source["content"] = segment.get_sign_content()
-                    context_list.append(source)
-                    resource_number += 1
-        return context_list
+                    retrieval_resource_list.append(source)
+        if retrieval_resource_list:
+            retrieval_resource_list = sorted(retrieval_resource_list, key=lambda x: x.get("score"), reverse=True)
+            position = 1
+            for item in retrieval_resource_list:
+                item["metadata"]["position"] = position
+                position += 1
+        return retrieval_resource_list

     @classmethod
     def _extract_variable_selector_to_variable_mapping(
@@ -38,9 +38,20 @@ dataset_retrieval_model_fields = {
     "score_threshold_enabled": fields.Boolean,
     "score_threshold": fields.Float,
 }
+external_retrieval_model_fields = {
+    "top_k": fields.Integer,
+    "score_threshold": fields.Float,
+}

 tag_fields = {"id": fields.String, "name": fields.String, "type": fields.String}

+external_knowledge_info_fields = {
+    "external_knowledge_id": fields.String,
+    "external_knowledge_api_id": fields.String,
+    "external_knowledge_api_name": fields.String,
+    "external_knowledge_api_endpoint": fields.String,
+}
+
 dataset_detail_fields = {
     "id": fields.String,
     "name": fields.String,
@@ -61,6 +72,8 @@ dataset_detail_fields = {
     "embedding_available": fields.Boolean,
     "retrieval_model_dict": fields.Nested(dataset_retrieval_model_fields),
     "tags": fields.List(fields.Nested(tag_fields)),
+    "external_knowledge_info": fields.Nested(external_knowledge_info_fields),
+    "external_retrieval_model": fields.Nested(external_retrieval_model_fields, allow_null=True),
 }

 dataset_query_detail_fields = {
api/fields/external_dataset_fields.py (new file, 11 lines)
@@ -0,0 +1,11 @@
from flask_restful import fields

from libs.helper import TimestampField

external_knowledge_api_query_detail_fields = {
    "id": fields.String,
    "name": fields.String,
    "setting": fields.String,
    "created_by": fields.String,
    "created_at": TimestampField,
}
@@ -0,0 +1,48 @@ (new migration file)
"""update-retrieval-resource

Revision ID: 6af6a521a53e
Revises: ec3df697ebbb
Create Date: 2024-09-24 09:22:43.570120

"""
from alembic import op
import models as models
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = '6af6a521a53e'
down_revision = 'd57ba9ebb251'
branch_labels = None
depends_on = None


def upgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('dataset_retriever_resources', schema=None) as batch_op:
        batch_op.alter_column('document_id',
               existing_type=sa.UUID(),
               nullable=True)
        batch_op.alter_column('data_source_type',
               existing_type=sa.TEXT(),
               nullable=True)
        batch_op.alter_column('segment_id',
               existing_type=sa.UUID(),
               nullable=True)
    # ### end Alembic commands ###


def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('dataset_retriever_resources', schema=None) as batch_op:
        batch_op.alter_column('segment_id',
               existing_type=sa.UUID(),
               nullable=False)
        batch_op.alter_column('data_source_type',
               existing_type=sa.TEXT(),
               nullable=False)
        batch_op.alter_column('document_id',
               existing_type=sa.UUID(),
               nullable=False)

    # ### end Alembic commands ###
@@ -0,0 +1,73 @@
|
||||
"""external_knowledge_api
|
||||
|
||||
Revision ID: 33f5fac87f29
|
||||
Revises: 6af6a521a53e
|
||||
Create Date: 2024-09-25 04:34:57.249436
|
||||
|
||||
"""
|
||||
from alembic import op
|
||||
import models as models
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = '33f5fac87f29'
|
||||
down_revision = '6af6a521a53e'
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade():
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
op.create_table('external_knowledge_apis',
|
||||
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
|
||||
sa.Column('name', sa.String(length=255), nullable=False),
|
||||
sa.Column('description', sa.String(length=255), nullable=False),
|
||||
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
|
||||
sa.Column('settings', sa.Text(), nullable=True),
|
||||
sa.Column('created_by', models.types.StringUUID(), nullable=False),
|
||||
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
|
||||
sa.Column('updated_by', models.types.StringUUID(), nullable=True),
|
||||
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
|
||||
sa.PrimaryKeyConstraint('id', name='external_knowledge_apis_pkey')
|
||||
)
|
||||
with op.batch_alter_table('external_knowledge_apis', schema=None) as batch_op:
|
||||
batch_op.create_index('external_knowledge_apis_name_idx', ['name'], unique=False)
|
||||
batch_op.create_index('external_knowledge_apis_tenant_idx', ['tenant_id'], unique=False)
|
||||
|
||||
op.create_table('external_knowledge_bindings',
|
||||
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
|
||||
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
|
||||
sa.Column('external_knowledge_api_id', models.types.StringUUID(), nullable=False),
|
||||
sa.Column('dataset_id', models.types.StringUUID(), nullable=False),
|
||||
sa.Column('external_knowledge_id', sa.Text(), nullable=False),
|
||||
sa.Column('created_by', models.types.StringUUID(), nullable=False),
|
||||
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
|
||||
sa.Column('updated_by', models.types.StringUUID(), nullable=True),
|
||||
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
|
||||
sa.PrimaryKeyConstraint('id', name='external_knowledge_bindings_pkey')
|
||||
)
|
||||
with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
|
||||
batch_op.create_index('external_knowledge_bindings_dataset_idx', ['dataset_id'], unique=False)
|
||||
batch_op.create_index('external_knowledge_bindings_external_knowledge_api_idx', ['external_knowledge_api_id'], unique=False)
|
||||
batch_op.create_index('external_knowledge_bindings_external_knowledge_idx', ['external_knowledge_id'], unique=False)
|
||||
batch_op.create_index('external_knowledge_bindings_tenant_idx', ['tenant_id'], unique=False)
|
||||
|
||||
# ### end Alembic commands ###
|
||||
|
||||
|
||||
def downgrade():
|
||||
# ### commands auto generated by Alembic - please adjust! ###
|
||||
with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
|
||||
batch_op.drop_index('external_knowledge_bindings_tenant_idx')
|
||||
batch_op.drop_index('external_knowledge_bindings_external_knowledge_idx')
|
||||
batch_op.drop_index('external_knowledge_bindings_external_knowledge_api_idx')
|
||||
batch_op.drop_index('external_knowledge_bindings_dataset_idx')
|
||||
|
||||
op.drop_table('external_knowledge_bindings')
|
||||
with op.batch_alter_table('external_knowledge_apis', schema=None) as batch_op:
|
||||
batch_op.drop_index('external_knowledge_apis_tenant_idx')
|
||||
batch_op.drop_index('external_knowledge_apis_name_idx')
|
||||
|
||||
op.drop_table('external_knowledge_apis')
|
||||
# ### end Alembic commands ###
@@ -1,4 +1,4 @@
"""add-dataset-retrival-model
"""add-dataset-retrieval-model

Revision ID: fca025d3b60f
Revises: b3a09c049e8e

@@ -38,6 +38,7 @@ class Dataset(db.Model):
    )

    INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
    PROVIDER_LIST = ["vendor", "external", None]

    id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
@@ -71,6 +72,14 @@ class Dataset(db.Model):
    def index_struct_dict(self):
        return json.loads(self.index_struct) if self.index_struct else None

    @property
    def external_retrieval_model(self):
        default_retrieval_model = {
            "top_k": 2,
            "score_threshold": 0.0,
        }
        return self.retrieval_model or default_retrieval_model

    @property
    def created_by_account(self):
        return db.session.get(Account, self.created_by)
@@ -162,6 +171,29 @@ class Dataset(db.Model):

        return tags or []

    @property
    def external_knowledge_info(self):
        if self.provider != "external":
            return None
        external_knowledge_binding = (
            db.session.query(ExternalKnowledgeBindings).filter(ExternalKnowledgeBindings.dataset_id == self.id).first()
        )
        if not external_knowledge_binding:
            return None
        external_knowledge_api = (
            db.session.query(ExternalKnowledgeApis)
            .filter(ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id)
            .first()
        )
        if not external_knowledge_api:
            return None
        return {
            "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
            "external_knowledge_api_id": external_knowledge_api.id,
            "external_knowledge_api_name": external_knowledge_api.name,
            "external_knowledge_api_endpoint": json.loads(external_knowledge_api.settings).get("endpoint", ""),
        }

    @staticmethod
    def gen_collection_name_by_id(dataset_id: str) -> str:
        normalized_dataset_id = dataset_id.replace("-", "_")
@@ -687,3 +719,77 @@ class DatasetPermission(db.Model):
    tenant_id = db.Column(StringUUID, nullable=False)
    has_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))


class ExternalKnowledgeApis(db.Model):
    __tablename__ = "external_knowledge_apis"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
        db.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_apis_name_idx", "name"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    name = db.Column(db.String(255), nullable=False)
    description = db.Column(db.String(255), nullable=False)
    tenant_id = db.Column(StringUUID, nullable=False)
    settings = db.Column(db.Text, nullable=True)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))

    def to_dict(self):
        return {
            "id": self.id,
            "tenant_id": self.tenant_id,
            "name": self.name,
            "description": self.description,
            "settings": self.settings_dict,
            "dataset_bindings": self.dataset_bindings,
            "created_by": self.created_by,
            "created_at": self.created_at.isoformat(),
        }

    @property
    def settings_dict(self):
        try:
            return json.loads(self.settings) if self.settings else None
        except JSONDecodeError:
            return None

    @property
    def dataset_bindings(self):
        external_knowledge_bindings = (
            db.session.query(ExternalKnowledgeBindings)
            .filter(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
            .all()
        )
        dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
        datasets = db.session.query(Dataset).filter(Dataset.id.in_(dataset_ids)).all()
        dataset_bindings = []
        for dataset in datasets:
            dataset_bindings.append({"id": dataset.id, "name": dataset.name})

        return dataset_bindings


class ExternalKnowledgeBindings(db.Model):
    __tablename__ = "external_knowledge_bindings"
    __table_args__ = (
        db.PrimaryKeyConstraint("id", name="external_knowledge_bindings_pkey"),
        db.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
        db.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
        db.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
        db.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
    )

    id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
    tenant_id = db.Column(StringUUID, nullable=False)
    external_knowledge_api_id = db.Column(StringUUID, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    external_knowledge_id = db.Column(db.Text, nullable=False)
    created_by = db.Column(StringUUID, nullable=False)
    created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
    updated_by = db.Column(StringUUID, nullable=True)
    updated_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
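
For orientation, a minimal sketch of how these two tables relate: a binding row links a dataset to an external API row, whose JSON settings carry the endpoint. The helper name is hypothetical and not part of this changeset; the session and model usage mirrors the code above.

# Hypothetical helper: resolve the endpoint bound to a dataset.
def resolve_external_endpoint(dataset_id: str):
    binding = (
        db.session.query(ExternalKnowledgeBindings)
        .filter(ExternalKnowledgeBindings.dataset_id == dataset_id)
        .first()
    )
    if not binding:
        return None
    api = db.session.get(ExternalKnowledgeApis, binding.external_knowledge_api_id)
    if not api or not api.settings_dict:
        return None
    return api.settings_dict.get("endpoint")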
@@ -1423,10 +1423,10 @@ class DatasetRetrieverResource(db.Model):
    position = db.Column(db.Integer, nullable=False)
    dataset_id = db.Column(StringUUID, nullable=False)
    dataset_name = db.Column(db.Text, nullable=False)
    document_id = db.Column(StringUUID, nullable=False)
    document_id = db.Column(StringUUID, nullable=True)
    document_name = db.Column(db.Text, nullable=False)
    data_source_type = db.Column(db.Text, nullable=False)
    segment_id = db.Column(StringUUID, nullable=False)
    data_source_type = db.Column(db.Text, nullable=True)
    segment_id = db.Column(StringUUID, nullable=True)
    score = db.Column(db.Float, nullable=True)
    content = db.Column(db.Text, nullable=False)
    hit_count = db.Column(db.Integer, nullable=True)
api/poetry.lock (generated, 1524 lines): diff suppressed because it is too large.
@@ -221,6 +221,7 @@ volcengine-python-sdk = {extras = ["ark"], version = "^1.0.98"}
oci = "^2.133.0"
tos = "^2.7.1"
nomic = "^3.1.2"
validators = "0.21.0"
[tool.poetry.group.indriect.dependencies]
kaleido = "0.2.1"
rank-bm25 = "~0.2.2"
api/schedule/clean_unused_messages_task.py (new file, 92 lines):
@@ -0,0 +1,92 @@
import datetime
import time

import click
from sqlalchemy import func
from werkzeug.exceptions import NotFound

import app
from configs import dify_config
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import Dataset, DatasetQuery, Document


@app.celery.task(queue="dataset")
def clean_unused_message_task():
    click.echo(click.style("Start cleaning unused messages.", fg="green"))
    clean_days = int(dify_config.CLEAN_DAY_SETTING)
    start_at = time.perf_counter()
    thirty_days_ago = datetime.datetime.now() - datetime.timedelta(days=clean_days)
    page = 1
    while True:
        try:
            # Subquery for counting new documents
            document_subquery_new = (
                db.session.query(Document.dataset_id, func.count(Document.id).label("document_count"))
                .filter(
                    Document.indexing_status == "completed",
                    Document.enabled == True,
                    Document.archived == False,
                    Document.updated_at > thirty_days_ago,
                )
                .group_by(Document.dataset_id)
                .subquery()
            )

            # Subquery for counting old documents
            document_subquery_old = (
                db.session.query(Document.dataset_id, func.count(Document.id).label("document_count"))
                .filter(
                    Document.indexing_status == "completed",
                    Document.enabled == True,
                    Document.archived == False,
                    Document.updated_at < thirty_days_ago,
                )
                .group_by(Document.dataset_id)
                .subquery()
            )

            # Main query with join and filter
            datasets = (
                db.session.query(Dataset)
                .outerjoin(document_subquery_new, Dataset.id == document_subquery_new.c.dataset_id)
                .outerjoin(document_subquery_old, Dataset.id == document_subquery_old.c.dataset_id)
                .filter(
                    Dataset.created_at < thirty_days_ago,
                    func.coalesce(document_subquery_new.c.document_count, 0) == 0,
                    func.coalesce(document_subquery_old.c.document_count, 0) > 0,
                )
                .order_by(Dataset.created_at.desc())
                .paginate(page=page, per_page=50)
            )

        except NotFound:
            break
        if datasets.items is None or len(datasets.items) == 0:
            break
        page += 1
        for dataset in datasets:
            dataset_query = (
                db.session.query(DatasetQuery)
                .filter(DatasetQuery.created_at > thirty_days_ago, DatasetQuery.dataset_id == dataset.id)
                .all()
            )
            if not dataset_query or len(dataset_query) == 0:
                try:
                    # remove index
                    index_processor = IndexProcessorFactory(dataset.doc_form).init_index_processor()
                    index_processor.clean(dataset, None)

                    # update document
                    update_params = {Document.enabled: False}

                    Document.query.filter_by(dataset_id=dataset.id).update(update_params)
                    db.session.commit()
                    click.echo(click.style("Cleaned unused dataset {} from db successfully!".format(dataset.id), fg="green"))
                except Exception as e:
                    click.echo(
                        click.style("clean dataset index error: {} {}".format(e.__class__.__name__, str(e)), fg="red")
                    )
    end_at = time.perf_counter()
    click.echo(click.style("Cleaned unused datasets from db; latency: {}".format(end_at - start_at), fg="green"))
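
A task like this is normally driven by a beat schedule. A minimal sketch, assuming the `app.celery` instance above and a daily cadence; the entry name, task path, and run time are illustrative, not part of this changeset.

# Hypothetical beat entry: run the cleanup once a day at 03:00.
from celery.schedules import crontab

import app

app.celery.conf.beat_schedule = {
    "clean-unused-messages": {
        # task path assumed from the module location above
        "task": "schedule.clean_unused_messages_task.clean_unused_message_task",
        "schedule": crontab(minute="0", hour="3"),
    },
}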
@@ -1,10 +1,13 @@
from services.auth.firecrawl import FirecrawlAuth
from services.auth.jina import JinaAuth


class ApiKeyAuthFactory:
    def __init__(self, provider: str, credentials: dict):
        if provider == "firecrawl":
            self.auth = FirecrawlAuth(credentials)
        elif provider == "jinareader":
            self.auth = JinaAuth(credentials)
        else:
            raise ValueError("Invalid provider")
api/services/auth/jina.py (new file, 44 lines):
@@ -0,0 +1,44 @@
import json

import requests

from services.auth.api_key_auth_base import ApiKeyAuthBase


class JinaAuth(ApiKeyAuthBase):
    def __init__(self, credentials: dict):
        super().__init__(credentials)
        auth_type = credentials.get("auth_type")
        if auth_type != "bearer":
            raise ValueError("Invalid auth type, Jina Reader auth type must be Bearer")
        self.api_key = credentials.get("config").get("api_key", None)

        if not self.api_key:
            raise ValueError("No API key provided")

    def validate_credentials(self):
        headers = self._prepare_headers()
        options = {
            "url": "https://example.com",
        }
        response = self._post_request("https://r.jina.ai", options, headers)
        if response.status_code == 200:
            return True
        else:
            self._handle_error(response)

    def _prepare_headers(self):
        return {"Content-Type": "application/json", "Authorization": f"Bearer {self.api_key}"}

    def _post_request(self, url, data, headers):
        return requests.post(url, headers=headers, json=data)

    def _handle_error(self, response):
        if response.status_code in {402, 409, 500}:
            error_message = response.json().get("error", "Unknown error occurred")
            raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
        else:
            if response.text:
                error_message = json.loads(response.text).get("error", "Unknown error occurred")
                raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
            raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")
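
A quick usage sketch of the factory and auth class above; the factory module path is assumed from the imports in this diff, and the key value is a placeholder.

# Hypothetical credential check for a Jina Reader API key.
from services.auth.api_key_auth_factory import ApiKeyAuthFactory  # path assumed

credentials = {"auth_type": "bearer", "config": {"api_key": "jina_placeholder"}}
factory = ApiKeyAuthFactory("jinareader", credentials)
assert factory.auth.validate_credentials() is True  # raises on error responses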
@@ -32,6 +32,7 @@ from models.dataset import (
    DatasetQuery,
    Document,
    DocumentSegment,
    ExternalKnowledgeBindings,
)
from models.model import UploadFile
from models.source import DataSourceOauthBinding
@@ -39,6 +40,7 @@ from services.errors.account import NoPermissionError
from services.errors.dataset import DatasetNameDuplicateError
from services.errors.document import DocumentIndexingError
from services.errors.file import FileNotExistsError
from services.external_knowledge_service import ExternalDatasetService
from services.feature_service import FeatureModel, FeatureService
from services.tag_service import TagService
from services.vector_service import VectorService
@@ -56,10 +58,8 @@ from tasks.sync_website_document_indexing_task import sync_website_document_inde

class DatasetService:
    @staticmethod
    def get_datasets(page, per_page, provider="vendor", tenant_id=None, user=None, search=None, tag_ids=None):
        query = Dataset.query.filter(Dataset.provider == provider, Dataset.tenant_id == tenant_id).order_by(
            Dataset.created_at.desc()
        )
    def get_datasets(page, per_page, tenant_id=None, user=None, search=None, tag_ids=None):
        query = Dataset.query.filter(Dataset.tenant_id == tenant_id).order_by(Dataset.created_at.desc())

        if user:
            # get permitted dataset ids
@@ -137,7 +137,14 @@ class DatasetService:

    @staticmethod
    def create_empty_dataset(
        tenant_id: str, name: str, indexing_technique: Optional[str], account: Account, permission: Optional[str] = None
        tenant_id: str,
        name: str,
        indexing_technique: Optional[str],
        account: Account,
        permission: Optional[str] = None,
        provider: str = "vendor",
        external_knowledge_api_id: Optional[str] = None,
        external_knowledge_id: Optional[str] = None,
    ):
        # check if dataset name already exists
        if Dataset.query.filter_by(name=name, tenant_id=tenant_id).first():
@@ -156,12 +163,28 @@ class DatasetService:
        dataset.embedding_model_provider = embedding_model.provider if embedding_model else None
        dataset.embedding_model = embedding_model.model if embedding_model else None
        dataset.permission = permission or DatasetPermissionEnum.ONLY_ME
        dataset.provider = provider
        db.session.add(dataset)
        db.session.flush()

        if provider == "external" and external_knowledge_api_id:
            external_knowledge_api = ExternalDatasetService.get_external_knowledge_api(external_knowledge_api_id)
            if not external_knowledge_api:
                raise ValueError("External API template not found.")
            external_knowledge_binding = ExternalKnowledgeBindings(
                tenant_id=tenant_id,
                dataset_id=dataset.id,
                external_knowledge_api_id=external_knowledge_api_id,
                external_knowledge_id=external_knowledge_id,
                created_by=account.id,
            )
            db.session.add(external_knowledge_binding)

        db.session.commit()
        return dataset

    @staticmethod
    def get_dataset(dataset_id):
    def get_dataset(dataset_id) -> Dataset:
        return Dataset.query.filter_by(id=dataset_id).first()

    @staticmethod
@@ -202,81 +225,103 @@ class DatasetService:

    @staticmethod
    def update_dataset(dataset_id, data, user):
        data.pop("partial_member_list", None)
        filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
        dataset = DatasetService.get_dataset(dataset_id)

        DatasetService.check_dataset_permission(dataset, user)
        action = None
        if dataset.indexing_technique != data["indexing_technique"]:
            # if update indexing_technique
            if data["indexing_technique"] == "economy":
                action = "remove"
                filtered_data["embedding_model"] = None
                filtered_data["embedding_model_provider"] = None
                filtered_data["collection_binding_id"] = None
            elif data["indexing_technique"] == "high_quality":
                action = "add"
                # get embedding model setting
                try:
                    model_manager = ModelManager()
                    embedding_model = model_manager.get_model_instance(
                        tenant_id=current_user.current_tenant_id,
                        provider=data["embedding_model_provider"],
                        model_type=ModelType.TEXT_EMBEDDING,
                        model=data["embedding_model"],
                    )
                    filtered_data["embedding_model"] = embedding_model.model
                    filtered_data["embedding_model_provider"] = embedding_model.provider
                    dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
                        embedding_model.provider, embedding_model.model
                    )
                    filtered_data["collection_binding_id"] = dataset_collection_binding.id
                except LLMBadRequestError:
                    raise ValueError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in the Settings -> Model Provider."
                    )
                except ProviderTokenNotInitError as ex:
                    raise ValueError(ex.description)
        else:
        if dataset.provider == "external":
            dataset.retrieval_model = data.get("external_retrieval_model", None)
            dataset.name = data.get("name", dataset.name)
            dataset.description = data.get("description", "")
            external_knowledge_id = data.get("external_knowledge_id", None)
            db.session.add(dataset)
            if not external_knowledge_id:
                raise ValueError("External knowledge id is required.")
            external_knowledge_api_id = data.get("external_knowledge_api_id", None)
            if not external_knowledge_api_id:
                raise ValueError("External knowledge api id is required.")
            external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(dataset_id=dataset_id).first()
            if (
                data["embedding_model_provider"] != dataset.embedding_model_provider
                or data["embedding_model"] != dataset.embedding_model
                external_knowledge_binding.external_knowledge_id != external_knowledge_id
                or external_knowledge_binding.external_knowledge_api_id != external_knowledge_api_id
            ):
                action = "update"
                try:
                    model_manager = ModelManager()
                    embedding_model = model_manager.get_model_instance(
                        tenant_id=current_user.current_tenant_id,
                        provider=data["embedding_model_provider"],
                        model_type=ModelType.TEXT_EMBEDDING,
                        model=data["embedding_model"],
                    )
                    filtered_data["embedding_model"] = embedding_model.model
                    filtered_data["embedding_model_provider"] = embedding_model.provider
                    dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
                        embedding_model.provider, embedding_model.model
                    )
                    filtered_data["collection_binding_id"] = dataset_collection_binding.id
                except LLMBadRequestError:
                    raise ValueError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in the Settings -> Model Provider."
                    )
                except ProviderTokenNotInitError as ex:
                    raise ValueError(ex.description)
                external_knowledge_binding.external_knowledge_id = external_knowledge_id
                external_knowledge_binding.external_knowledge_api_id = external_knowledge_api_id
                db.session.add(external_knowledge_binding)
            db.session.commit()
        else:
            data.pop("partial_member_list", None)
            filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
            action = None
            if dataset.indexing_technique != data["indexing_technique"]:
                # if update indexing_technique
                if data["indexing_technique"] == "economy":
                    action = "remove"
                    filtered_data["embedding_model"] = None
                    filtered_data["embedding_model_provider"] = None
                    filtered_data["collection_binding_id"] = None
                elif data["indexing_technique"] == "high_quality":
                    action = "add"
                    # get embedding model setting
                    try:
                        model_manager = ModelManager()
                        embedding_model = model_manager.get_model_instance(
                            tenant_id=current_user.current_tenant_id,
                            provider=data["embedding_model_provider"],
                            model_type=ModelType.TEXT_EMBEDDING,
                            model=data["embedding_model"],
                        )
                        filtered_data["embedding_model"] = embedding_model.model
                        filtered_data["embedding_model_provider"] = embedding_model.provider
                        dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
                            embedding_model.provider, embedding_model.model
                        )
                        filtered_data["collection_binding_id"] = dataset_collection_binding.id
                    except LLMBadRequestError:
                        raise ValueError(
                            "No Embedding Model available. Please configure a valid provider "
                            "in the Settings -> Model Provider."
                        )
                    except ProviderTokenNotInitError as ex:
                        raise ValueError(ex.description)
            else:
                if (
                    data["embedding_model_provider"] != dataset.embedding_model_provider
                    or data["embedding_model"] != dataset.embedding_model
                ):
                    action = "update"
                    try:
                        model_manager = ModelManager()
                        embedding_model = model_manager.get_model_instance(
                            tenant_id=current_user.current_tenant_id,
                            provider=data["embedding_model_provider"],
                            model_type=ModelType.TEXT_EMBEDDING,
                            model=data["embedding_model"],
                        )
                        filtered_data["embedding_model"] = embedding_model.model
                        filtered_data["embedding_model_provider"] = embedding_model.provider
                        dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
                            embedding_model.provider, embedding_model.model
                        )
                        filtered_data["collection_binding_id"] = dataset_collection_binding.id
                    except LLMBadRequestError:
                        raise ValueError(
                            "No Embedding Model available. Please configure a valid provider "
                            "in the Settings -> Model Provider."
                        )
                    except ProviderTokenNotInitError as ex:
                        raise ValueError(ex.description)

        filtered_data["updated_by"] = user.id
        filtered_data["updated_at"] = datetime.datetime.now()
            filtered_data["updated_by"] = user.id
            filtered_data["updated_at"] = datetime.datetime.now()

        # update Retrieval model
        filtered_data["retrieval_model"] = data["retrieval_model"]
            # update Retrieval model
            filtered_data["retrieval_model"] = data["retrieval_model"]

        dataset.query.filter_by(id=dataset_id).update(filtered_data)
            dataset.query.filter_by(id=dataset_id).update(filtered_data)

        db.session.commit()
        if action:
            deal_dataset_vector_index_task.delay(dataset_id, action)
            db.session.commit()
            if action:
                deal_dataset_vector_index_task.delay(dataset_id, action)
        return dataset

    @staticmethod
@@ -0,0 +1,26 @@
from typing import Literal, Optional, Union

from pydantic import BaseModel


class AuthorizationConfig(BaseModel):
    type: Literal[None, "basic", "bearer", "custom"]
    api_key: Union[None, str] = None
    header: Union[None, str] = None


class Authorization(BaseModel):
    type: Literal["no-auth", "api-key"]
    config: Optional[AuthorizationConfig] = None


class ProcessStatusSetting(BaseModel):
    request_method: str
    url: str


class ExternalKnowledgeApiSetting(BaseModel):
    url: str
    request_method: str
    headers: Optional[dict] = None
    params: Optional[dict] = None
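
As a quick illustration of how these models compose before a request is made; the endpoint and key values are placeholders, not code from this changeset.

# Hypothetical config: bearer auth plus a request setting.
auth = Authorization(
    type="api-key",
    config=AuthorizationConfig(type="bearer", api_key="sk-placeholder"),
)
setting = ExternalKnowledgeApiSetting(
    url="https://kb.example.com/retrieval",  # illustrative endpoint
    request_method="post",
    headers={"Content-Type": "application/json"},
)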
api/services/external_knowledge_service.py (new file, 274 lines):
@@ -0,0 +1,274 @@
import json
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Optional, Union

import httpx
import validators

# from tasks.external_document_indexing_task import external_document_indexing_task
from core.helper import ssrf_proxy
from extensions.ext_database import db
from models.dataset import (
    Dataset,
    ExternalKnowledgeApis,
    ExternalKnowledgeBindings,
)
from services.entities.external_knowledge_entities.external_knowledge_entities import (
    Authorization,
    ExternalKnowledgeApiSetting,
)
from services.errors.dataset import DatasetNameDuplicateError


class ExternalDatasetService:
    @staticmethod
    def get_external_knowledge_apis(page, per_page, tenant_id, search=None) -> tuple[list[ExternalKnowledgeApis], int]:
        query = ExternalKnowledgeApis.query.filter(ExternalKnowledgeApis.tenant_id == tenant_id).order_by(
            ExternalKnowledgeApis.created_at.desc()
        )
        if search:
            query = query.filter(ExternalKnowledgeApis.name.ilike(f"%{search}%"))

        external_knowledge_apis = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)

        return external_knowledge_apis.items, external_knowledge_apis.total

    @classmethod
    def validate_api_list(cls, api_settings: dict):
        if not api_settings:
            raise ValueError("api list is empty")
        if "endpoint" not in api_settings or not api_settings["endpoint"]:
            raise ValueError("endpoint is required")
        if "api_key" not in api_settings or not api_settings["api_key"]:
            raise ValueError("api_key is required")

    @staticmethod
    def create_external_knowledge_api(tenant_id: str, user_id: str, args: dict) -> ExternalKnowledgeApis:
        ExternalDatasetService.check_endpoint_and_api_key(args.get("settings"))
        external_knowledge_api = ExternalKnowledgeApis(
            tenant_id=tenant_id,
            created_by=user_id,
            updated_by=user_id,
            name=args.get("name"),
            description=args.get("description", ""),
            settings=json.dumps(args.get("settings"), ensure_ascii=False),
        )

        db.session.add(external_knowledge_api)
        db.session.commit()
        return external_knowledge_api

    @staticmethod
    def check_endpoint_and_api_key(settings: dict):
        if "endpoint" not in settings or not settings["endpoint"]:
            raise ValueError("endpoint is required")
        if "api_key" not in settings or not settings["api_key"]:
            raise ValueError("api_key is required")

        endpoint = f"{settings['endpoint']}/retrieval"
        api_key = settings["api_key"]
        if not validators.url(endpoint):
            raise ValueError(f"invalid endpoint: {endpoint}")
        try:
            response = httpx.post(endpoint, headers={"Authorization": f"Bearer {api_key}"})
        except Exception as e:
            raise ValueError(f"failed to connect to the endpoint: {endpoint}") from e
        if response.status_code == 502:
            raise ValueError(f"Bad Gateway: failed to connect to the endpoint: {endpoint}")
        if response.status_code == 404:
            raise ValueError(f"Not Found: failed to connect to the endpoint: {endpoint}")
        if response.status_code == 403:
            raise ValueError(f"Forbidden: Authorization failed with api_key: {api_key}")

    @staticmethod
    def get_external_knowledge_api(external_knowledge_api_id: str) -> ExternalKnowledgeApis:
        return ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id).first()

    @staticmethod
    def update_external_knowledge_api(tenant_id, user_id, external_knowledge_api_id, args) -> ExternalKnowledgeApis:
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_api_id, tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")

        external_knowledge_api.name = args.get("name")
        external_knowledge_api.description = args.get("description", "")
        external_knowledge_api.settings = json.dumps(args.get("settings"), ensure_ascii=False)
        external_knowledge_api.updated_by = user_id
        external_knowledge_api.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()

        return external_knowledge_api

    @staticmethod
    def delete_external_knowledge_api(tenant_id: str, external_knowledge_api_id: str):
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_api_id, tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")

        db.session.delete(external_knowledge_api)
        db.session.commit()

    @staticmethod
    def external_knowledge_api_use_check(external_knowledge_api_id: str) -> tuple[bool, int]:
        count = ExternalKnowledgeBindings.query.filter_by(external_knowledge_api_id=external_knowledge_api_id).count()
        if count > 0:
            return True, count
        return False, 0

    @staticmethod
    def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
        external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
            dataset_id=dataset_id, tenant_id=tenant_id
        ).first()
        if not external_knowledge_binding:
            raise ValueError("external knowledge binding not found")
        return external_knowledge_binding

    @staticmethod
    def document_create_args_validate(tenant_id: str, external_knowledge_api_id: str, process_parameter: dict):
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_api_id, tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")
        settings = json.loads(external_knowledge_api.settings)
        for setting in settings:
            custom_parameters = setting.get("document_process_setting")
            if custom_parameters:
                for parameter in custom_parameters:
                    if parameter.get("required", False) and not process_parameter.get(parameter.get("name")):
                        raise ValueError(f'{parameter.get("name")} is required')

    @staticmethod
    def process_external_api(
        settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]]
    ) -> httpx.Response:
        """
        do http request depending on api bundle
        """

        kwargs = {
            "url": settings.url,
            "headers": settings.headers,
            "follow_redirects": True,
        }

        response = getattr(ssrf_proxy, settings.request_method)(data=json.dumps(settings.params), files=files, **kwargs)

        return response

    @staticmethod
    def assembling_headers(authorization: Authorization, headers: Optional[dict] = None) -> dict[str, Any]:
        authorization = deepcopy(authorization)
        if headers:
            headers = deepcopy(headers)
        else:
            headers = {}
        if authorization.type == "api-key":
            if authorization.config is None:
                raise ValueError("authorization config is required")

            if authorization.config.api_key is None:
                raise ValueError("api_key is required")

            if not authorization.config.header:
                authorization.config.header = "Authorization"

            if authorization.config.type == "bearer":
                headers[authorization.config.header] = f"Bearer {authorization.config.api_key}"
            elif authorization.config.type == "basic":
                headers[authorization.config.header] = f"Basic {authorization.config.api_key}"
            elif authorization.config.type == "custom":
                headers[authorization.config.header] = authorization.config.api_key

        return headers

    @staticmethod
    def get_external_knowledge_api_settings(settings: dict) -> ExternalKnowledgeApiSetting:
        return ExternalKnowledgeApiSetting.parse_obj(settings)

    @staticmethod
    def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
        # check if dataset name already exists
        if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first():
            raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.")
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=args.get("external_knowledge_api_id"), tenant_id=tenant_id
        ).first()

        if external_knowledge_api is None:
            raise ValueError("api template not found")

        dataset = Dataset(
            tenant_id=tenant_id,
            name=args.get("name"),
            description=args.get("description", ""),
            provider="external",
            retrieval_model=args.get("external_retrieval_model"),
            created_by=user_id,
        )

        db.session.add(dataset)
        db.session.flush()

        external_knowledge_binding = ExternalKnowledgeBindings(
            tenant_id=tenant_id,
            dataset_id=dataset.id,
            external_knowledge_api_id=args.get("external_knowledge_api_id"),
            external_knowledge_id=args.get("external_knowledge_id"),
            created_by=user_id,
        )
        db.session.add(external_knowledge_binding)

        db.session.commit()

        return dataset

    @staticmethod
    def fetch_external_knowledge_retrieval(
        tenant_id: str, dataset_id: str, query: str, external_retrieval_parameters: dict
    ) -> list:
        external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
            dataset_id=dataset_id, tenant_id=tenant_id
        ).first()
        if not external_knowledge_binding:
            raise ValueError("external knowledge binding not found")

        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_binding.external_knowledge_api_id
        ).first()
        if not external_knowledge_api:
            raise ValueError("external api template not found")

        settings = json.loads(external_knowledge_api.settings)
        headers = {"Content-Type": "application/json"}
        if settings.get("api_key"):
            headers["Authorization"] = f"Bearer {settings.get('api_key')}"
        score_threshold_enabled = external_retrieval_parameters.get("score_threshold_enabled") or False
        score_threshold = external_retrieval_parameters.get("score_threshold", 0.0) if score_threshold_enabled else 0.0
        request_params = {
            "retrieval_setting": {
                "top_k": external_retrieval_parameters.get("top_k"),
                "score_threshold": score_threshold,
            },
            "query": query,
            "knowledge_id": external_knowledge_binding.external_knowledge_id,
        }

        external_knowledge_api_setting = {
            "url": f"{settings.get('endpoint')}/retrieval",
            "request_method": "post",
            "headers": headers,
            "params": request_params,
        }
        response = ExternalDatasetService.process_external_api(
            ExternalKnowledgeApiSetting(**external_knowledge_api_setting), None
        )
        if response.status_code == 200:
            return response.json().get("records", [])
        return []
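
For context, a sketch of how a caller might exercise this retrieval path end to end. The response shape ({"records": [...]}) follows the code above; the ids and query are placeholders.

# Hypothetical call: query an external knowledge base bound to a dataset.
records = ExternalDatasetService.fetch_external_knowledge_retrieval(
    tenant_id="tenant-uuid",      # placeholder
    dataset_id="dataset-uuid",    # placeholder
    query="how do refunds work?",
    external_retrieval_parameters={"top_k": 2, "score_threshold_enabled": False},
)
for record in records:
    print(record.get("score"), record.get("content"))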
@@ -19,7 +19,15 @@ default_retrieval_model = {

class HitTestingService:
    @classmethod
    def retrieve(cls, dataset: Dataset, query: str, account: Account, retrieval_model: dict, limit: int = 10) -> dict:
    def retrieve(
        cls,
        dataset: Dataset,
        query: str,
        account: Account,
        retrieval_model: dict,
        external_retrieval_model: dict,
        limit: int = 10,
    ) -> dict:
        if dataset.available_document_count == 0 or dataset.available_segment_count == 0:
            return {
                "query": {
@@ -62,10 +70,44 @@ class HitTestingService:

        return cls.compact_retrieve_response(dataset, query, all_documents)

    @classmethod
    def external_retrieve(
        cls,
        dataset: Dataset,
        query: str,
        account: Account,
        external_retrieval_model: dict,
    ) -> dict:
        if dataset.provider != "external":
            return {
                "query": {"content": query},
                "records": [],
            }

        start = time.perf_counter()

        all_documents = RetrievalService.external_retrieve(
            dataset_id=dataset.id,
            query=cls.escape_query_for_search(query),
            external_retrieval_model=external_retrieval_model,
        )

        end = time.perf_counter()
        logging.debug(f"External knowledge hit testing retrieve in {end - start:0.4f} seconds")

        dataset_query = DatasetQuery(
            dataset_id=dataset.id, content=query, source="hit_testing", created_by_role="account", created_by=account.id
        )

        db.session.add(dataset_query)
        db.session.commit()

        return cls.compact_external_retrieve_response(dataset, query, all_documents)

    @classmethod
    def compact_retrieve_response(cls, dataset: Dataset, query: str, documents: list[Document]):
        i = 0
        records = []

        for document in documents:
            index_node_id = document.metadata["doc_id"]

@@ -81,7 +123,6 @@ class HitTestingService:
            )

            if not segment:
                i += 1
                continue

            record = {
@@ -91,8 +132,6 @@ class HitTestingService:

            records.append(record)

            i += 1

        return {
            "query": {
                "content": query,
@@ -100,6 +139,25 @@ class HitTestingService:
            "records": records,
        }

    @classmethod
    def compact_external_retrieve_response(cls, dataset: Dataset, query: str, documents: list):
        records = []
        if dataset.provider == "external":
            for document in documents:
                record = {
                    "content": document.get("content", None),
                    "title": document.get("title", None),
                    "score": document.get("score", None),
                    "metadata": document.get("metadata", None),
                }
                records.append(record)
            return {
                "query": {
                    "content": query,
                },
                "records": records,
            }

    @classmethod
    def hit_testing_args_check(cls, args):
        query = args["query"]

@@ -1,6 +1,7 @@
import datetime
import json

import requests
from flask_login import current_user

from core.helper import encrypter
@@ -65,6 +66,35 @@ class WebsiteService:
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            if not crawl_sub_pages:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "data": response.json().get("data")}
            else:
                response = requests.post(
                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
                    json={
                        "url": url,
                        "maxPages": options.get("limit", 1),
                        "useSitemap": options.get("use_sitemap", True),
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {api_key}",
                    },
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
        else:
            raise ValueError("Invalid provider")
@@ -93,6 +123,42 @@ class WebsiteService:
            time_consuming = abs(end_time - float(start_time))
            crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
            redis_client.delete(website_crawl_time_cache_key)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            response = requests.post(
                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                json={"taskId": job_id},
            )
            data = response.json().get("data", {})
            crawl_status_data = {
                "status": data.get("status", "active"),
                "job_id": job_id,
                "total": len(data.get("urls", [])),
                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
                "data": [],
                "time_consuming": data.get("duration", 0) / 1000,
            }

            if crawl_status_data["status"] == "completed":
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                formatted_data = [
                    {
                        "title": item.get("data", {}).get("title"),
                        "source_url": item.get("data", {}).get("url"),
                        "description": item.get("data", {}).get("description"),
                        "markdown": item.get("data", {}).get("content"),
                    }
                    for item in data.get("processed", {}).values()
                ]
                crawl_status_data["data"] = formatted_data
        else:
            raise ValueError("Invalid provider")
        return crawl_status_data
@@ -119,6 +185,40 @@ class WebsiteService:
            if item.get("source_url") == url:
                return item
            return None
        elif provider == "jinareader":
            # decrypt the key once up front; every branch below needs it for the Jina API calls
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
                data = storage.load_once(file_key)
                if data:
                    data = json.loads(data.decode("utf-8"))
            elif not job_id:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return response.json().get("data")
            else:
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id},
                )
                data = response.json().get("data", {})
                if data.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")

                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                for item in data.get("processed", {}).values():
                    if item.get("data", {}).get("url") == url:
                        return item.get("data", {})
        else:
            raise ValueError("Invalid provider")
api/tasks/external_document_indexing_task.py (new file, 93 lines):
@@ -0,0 +1,93 @@
import json
import logging
import time

import click
from celery import shared_task

from core.indexing_runner import DocumentIsPausedException
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import Dataset, ExternalKnowledgeApis
from models.model import UploadFile
from services.external_knowledge_service import ExternalDatasetService


@shared_task(queue="dataset")
def external_document_indexing_task(
    dataset_id: str, external_knowledge_api_id: str, data_source: dict, process_parameter: dict
):
    """
    Async process document
    :param dataset_id:
    :param external_knowledge_api_id:
    :param data_source:
    :param process_parameter:
    Usage: external_document_indexing_task.delay(dataset_id, external_knowledge_api_id, data_source, process_parameter)
    """
    start_at = time.perf_counter()

    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    if not dataset:
        logging.info(
            click.style("Processed external dataset: {} failed, dataset does not exist.".format(dataset_id), fg="red")
        )
        return

    # get external api template
    external_knowledge_api = (
        db.session.query(ExternalKnowledgeApis)
        .filter(
            ExternalKnowledgeApis.id == external_knowledge_api_id, ExternalKnowledgeApis.tenant_id == dataset.tenant_id
        )
        .first()
    )

    if not external_knowledge_api:
        logging.info(
            click.style(
                "Processed external dataset: {} failed, api template: {} does not exist.".format(
                    dataset_id, external_knowledge_api_id
                ),
                fg="red",
            )
        )
        return
    files = {}
    if data_source["type"] == "upload_file":
        upload_file_list = data_source["info_list"]["file_info_list"]["file_ids"]
        for file_id in upload_file_list:
            file = (
                db.session.query(UploadFile)
                .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                .first()
            )
            if file:
                files[file.id] = (file.name, storage.load_once(file.key), file.mime_type)
    try:
        settings = ExternalDatasetService.get_external_knowledge_api_settings(
            json.loads(external_knowledge_api.settings)
        )
        # assemble headers and attach them to the request settings, since
        # process_external_api reads headers from the settings object
        headers = ExternalDatasetService.assembling_headers(settings.authorization, settings.headers)
        settings.headers = headers

        # do http request
        response = ExternalDatasetService.process_external_api(settings, files)
        job_id = response.json().get("job_id")
        if job_id:
            # save job_id to dataset
            dataset.job_id = job_id
            db.session.commit()

        end_at = time.perf_counter()
        logging.info(
            click.style(
                "Processed external dataset: {} successful, latency: {}".format(dataset.id, end_at - start_at),
                fg="green",
            )
        )
    except DocumentIsPausedException as ex:
        logging.info(click.style(str(ex), fg="yellow"))

    except Exception:
        pass
api/tests/unit_tests/controllers/test_compare_versions.py (new file, 38 lines):
@@ -0,0 +1,38 @@
import pytest

from controllers.console.version import _has_new_version


@pytest.mark.parametrize(
    ("latest_version", "current_version", "expected"),
    [
        ("1.0.1", "1.0.0", True),
        ("1.1.0", "1.0.0", True),
        ("2.0.0", "1.9.9", True),
        ("1.0.0", "1.0.0", False),
        ("1.0.0", "1.0.1", False),
        ("1.0.0", "2.0.0", False),
        ("1.0.1", "1.0.0-beta", True),
        ("1.0.0", "1.0.0-alpha", True),
        ("1.0.0-beta", "1.0.0-alpha", True),
        ("1.0.0", "1.0.0-rc1", True),
        ("1.0.0", "0.9.9", True),
        ("1.0.0", "1.0.0-dev", True),
    ],
)
def test_has_new_version(latest_version, current_version, expected):
    assert _has_new_version(latest_version=latest_version, current_version=current_version) == expected


def test_has_new_version_invalid_input():
    with pytest.raises(ValueError):
        _has_new_version(latest_version="1.0", current_version="1.0.0")

    with pytest.raises(ValueError):
        _has_new_version(latest_version="1.0.0", current_version="1.0")

    with pytest.raises(ValueError):
        _has_new_version(latest_version="invalid", current_version="1.0.0")

    with pytest.raises(ValueError):
        _has_new_version(latest_version="1.0.0", current_version="invalid")
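
These cases imply strict three-part semver with pre-release ordering. A minimal sketch of a comparison that satisfies them, assuming the packaging library; this is an illustration, not the controller's actual implementation.

# Hypothetical comparison consistent with the test matrix above.
import re

from packaging import version


def has_new_version_sketch(latest_version: str, current_version: str) -> bool:
    pattern = r"^\d+\.\d+\.\d+(-\w+(\.\d+)?)?$"  # require MAJOR.MINOR.PATCH
    for value in (latest_version, current_version):
        if not re.match(pattern, value):
            raise ValueError(f"Invalid version format: {value}")
    # packaging treats 1.0.0-beta as a pre-release of 1.0.0, matching the cases above
    return version.parse(latest_version) > version.parse(current_version)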
@@ -2,7 +2,7 @@ version: '3'
services:
  # API service
  api:
    image: langgenius/dify-api:0.8.3
    image: langgenius/dify-api:0.9.0
    restart: always
    environment:
      # Startup mode, 'api' starts the API server.
@@ -227,7 +227,7 @@ services:
  # worker service
  # The Celery worker for processing the queue.
  worker:
    image: langgenius/dify-api:0.8.3
    image: langgenius/dify-api:0.9.0
    restart: always
    environment:
      CONSOLE_WEB_URL: ''
@@ -396,7 +396,7 @@ services:

  # Frontend web application.
  web:
    image: langgenius/dify-web:0.8.3
    image: langgenius/dify-web:0.9.0
    restart: always
    environment:
      # The base URL of console application api server, refers to the Console base URL of WEB service if console domain is

@@ -213,7 +213,7 @@ x-shared-env: &shared-api-worker-env
services:
  # API service
  api:
    image: langgenius/dify-api:0.8.3
    image: langgenius/dify-api:0.9.0
    restart: always
    environment:
      # Use the shared environment variables.
@@ -233,7 +233,7 @@ services:
  # worker service
  # The Celery worker for processing the queue.
  worker:
    image: langgenius/dify-api:0.8.3
    image: langgenius/dify-api:0.9.0
    restart: always
    environment:
      # Use the shared environment variables.
@@ -252,7 +252,7 @@ services:

  # Frontend web application.
  web:
    image: langgenius/dify-web:0.8.3
    image: langgenius/dify-web:0.9.0
    restart: always
    environment:
      CONSOLE_API_URL: ${CONSOLE_API_URL:-}
@@ -1,6 +1,6 @@
'use client'
import type { FC, SVGProps } from 'react'
import React, { useEffect } from 'react'
import React, { useEffect, useMemo } from 'react'
import { usePathname } from 'next/navigation'
import useSWR from 'swr'
import { useTranslation } from 'react-i18next'
@@ -203,12 +203,23 @@ const DatasetDetailLayout: FC<IAppDetailLayoutProps> = (props) => {
    datasetId,
  }, apiParams => fetchDatasetRelatedApps(apiParams.datasetId))

  const navigation = [
    { name: t('common.datasetMenus.documents'), href: `/datasets/${datasetId}/documents`, icon: DocumentTextIcon, selectedIcon: DocumentTextSolidIcon },
    { name: t('common.datasetMenus.hitTesting'), href: `/datasets/${datasetId}/hitTesting`, icon: TargetIcon, selectedIcon: TargetSolidIcon },
    // { name: 'api & webhook', href: `/datasets/${datasetId}/api`, icon: CommandLineIcon, selectedIcon: CommandLineSolidIcon },
    { name: t('common.datasetMenus.settings'), href: `/datasets/${datasetId}/settings`, icon: Cog8ToothIcon, selectedIcon: Cog8ToothSolidIcon },
  ]
  const navigation = useMemo(() => {
    const baseNavigation = [
      { name: t('common.datasetMenus.hitTesting'), href: `/datasets/${datasetId}/hitTesting`, icon: TargetIcon, selectedIcon: TargetSolidIcon },
      // { name: 'api & webhook', href: `/datasets/${datasetId}/api`, icon: CommandLineIcon, selectedIcon: CommandLineSolidIcon },
      { name: t('common.datasetMenus.settings'), href: `/datasets/${datasetId}/settings`, icon: Cog8ToothIcon, selectedIcon: Cog8ToothSolidIcon },
    ]

    if (datasetRes?.provider !== 'external') {
      baseNavigation.unshift({
        name: t('common.datasetMenus.documents'),
        href: `/datasets/${datasetId}/documents`,
        icon: DocumentTextIcon,
        selectedIcon: DocumentTextSolidIcon,
      })
    }
    return baseNavigation
  }, [datasetRes?.provider, datasetId, t])

  useEffect(() => {
    if (datasetRes)
@@ -233,6 +244,7 @@ const DatasetDetailLayout: FC<IAppDetailLayoutProps> = (props) => {
        icon={datasetRes?.icon || 'https://static.dify.ai/images/dataset-default-icon.png'}
        icon_background={datasetRes?.icon_background || '#F5F5F5'}
        desc={datasetRes?.description || '--'}
        isExternal={datasetRes?.provider === 'external'}
        navigation={navigation}
        extraInfo={!isCurrentWorkspaceDatasetOperator ? mode => <ExtraInfo isMobile={mode === 'collapse'} relatedApps={relatedApps} /> : undefined}
        iconType={datasetRes?.data_source_type === DataSourceType.NOTION ? 'notion' : 'dataset'}
@@ -8,6 +8,7 @@ import { useDebounceFn } from 'ahooks'
import useSWR from 'swr'

// Components
import ExternalAPIPanel from '../../components/datasets/external-api/external-api-panel'
import Datasets from './Datasets'
import DatasetFooter from './DatasetFooter'
import ApiServer from './ApiServer'
@@ -16,6 +17,8 @@ import TabSliderNew from '@/app/components/base/tab-slider-new'
import SearchInput from '@/app/components/base/search-input'
import TagManagementModal from '@/app/components/base/tag-management'
import TagFilter from '@/app/components/base/tag-management/filter'
import Button from '@/app/components/base/button'
import { ApiConnectionMod } from '@/app/components/base/icons/src/vender/solid/development'

// Services
import { fetchDatasetApiBaseUrl } from '@/service/datasets'
@@ -24,12 +27,14 @@ import { fetchDatasetApiBaseUrl } from '@/service/datasets'
import { useTabSearchParams } from '@/hooks/use-tab-searchparams'
import { useStore as useTagStore } from '@/app/components/base/tag-management/store'
import { useAppContext } from '@/context/app-context'
import { useExternalApiPanel } from '@/context/external-api-panel-context'

const Container = () => {
  const { t } = useTranslation()
  const router = useRouter()
  const { currentWorkspace } = useAppContext()
  const showTagManagementModal = useTagStore(s => s.showTagManagementModal)
  const { showExternalApiPanel, setShowExternalApiPanel } = useExternalApiPanel()

  const options = useMemo(() => {
    return [
@@ -66,7 +71,7 @@ const Container = () => {
  useEffect(() => {
    if (currentWorkspace.role === 'normal')
      return router.replace('/apps')
  }, [currentWorkspace])
  }, [currentWorkspace, router])

  return (
    <div ref={containerRef} className='grow relative flex flex-col bg-gray-100 overflow-y-auto'>
@@ -80,11 +85,18 @@ const Container = () => {
            <div className='flex items-center gap-2'>
              <TagFilter type='knowledge' value={tagFilterValue} onChange={handleTagsChange} />
              <SearchInput className='w-[200px]' value={keywords} onChange={handleKeywordsChange} />
              <div className="w-[1px] h-4 bg-divider-regular" />
              <Button
                className='gap-0.5 shadows-shadow-xs'
                onClick={() => setShowExternalApiPanel(true)}
              >
                <ApiConnectionMod className='w-4 h-4 text-components-button-secondary-text' />
                <div className='flex px-0.5 justify-center items-center gap-1 text-components-button-secondary-text system-sm-medium'>{t('dataset.externalAPIPanelTitle')}</div>
              </Button>
            </div>
          )}
          {activeTab === 'api' && data && <ApiServer apiBaseUrl={data.api_base_url || ''} />}
        </div>

        {activeTab === 'dataset' && (
          <>
            <Datasets containerRef={containerRef} tags={tagIDs} keywords={searchKeywords} />
@@ -94,10 +106,10 @@ const Container = () => {
          )}
        </>
      )}

      {activeTab === 'api' && data && <Doc apiBaseUrl={data.api_base_url || ''} />}
    </div>

      {showExternalApiPanel && <ExternalAPIPanel onClose={() => setShowExternalApiPanel(false)} />}
    </div>
  )
}

@@ -18,6 +18,7 @@ import Divider from '@/app/components/base/divider'
 import RenameDatasetModal from '@/app/components/datasets/rename-modal'
 import type { Tag } from '@/app/components/base/tag-management/constant'
 import TagSelector from '@/app/components/base/tag-management/selector'
+import CornerLabel from '@/app/components/base/corner-label'
 import { useAppContext } from '@/context/app-context'

 export type DatasetCardProps = {

@@ -32,6 +33,7 @@ const DatasetCard = ({
   const { t } = useTranslation()
   const { notify } = useContext(ToastContext)
   const { push } = useRouter()
+  const EXTERNAL_PROVIDER = 'external' as const

   const { isCurrentWorkspaceDatasetOperator } = useAppContext()
   const [tags, setTags] = useState<Tag[]>(dataset.tags)

@@ -39,6 +41,7 @@ const DatasetCard = ({
   const [showRenameModal, setShowRenameModal] = useState(false)
   const [showConfirmDelete, setShowConfirmDelete] = useState(false)
   const [confirmMessage, setConfirmMessage] = useState<string>('')
+  const isExternalProvider = (provider: string): boolean => provider === EXTERNAL_PROVIDER
   const detectIsUsedByApp = useCallback(async () => {
     try {
       const { is_using: isUsedByApp } = await checkIsUsedInApp(dataset.id)

@@ -108,13 +111,16 @@ const DatasetCard = ({
   return (
     <>
       <div
-        className='group col-span-1 bg-white border-2 border-solid border-transparent rounded-xl shadow-sm min-h-[160px] flex flex-col transition-all duration-200 ease-in-out cursor-pointer hover:shadow-lg'
+        className='group relative col-span-1 bg-white border-[0.5px] border-solid border-transparent rounded-xl shadow-sm min-h-[160px] flex flex-col transition-all duration-200 ease-in-out cursor-pointer hover:shadow-lg'
         data-disable-nprogress={true}
         onClick={(e) => {
           e.preventDefault()
-          push(`/datasets/${dataset.id}/documents`)
+          isExternalProvider(dataset.provider)
+            ? push(`/datasets/${dataset.id}/hitTesting`)
+            : push(`/datasets/${dataset.id}/documents`)
         }}
       >
+        {isExternalProvider(dataset.provider) && <CornerLabel label='External' className='absolute right-0' labelClassName='rounded-tr-xl' />}
         <div className='flex pt-[14px] px-[14px] pb-3 h-[66px] items-center gap-3 grow-0 shrink-0'>
           <div className={cn(
             'shrink-0 flex items-center justify-center p-2.5 bg-[#F5F8FF] rounded-md border-[0.5px] border-[#E0EAFF]',

@@ -136,13 +142,20 @@ const DatasetCard = ({
   <div className='flex items-center mt-[1px] text-xs leading-[18px] text-gray-500'>
     <div
       className={cn('truncate', (!dataset.embedding_available || !dataset.document_count) && 'opacity-50')}
-      title={`${dataset.document_count}${t('dataset.documentCount')} · ${Math.round(dataset.word_count / 1000)}${t('dataset.wordCount')} · ${dataset.app_count}${t('dataset.appCount')}`}
+      title={dataset.provider === 'external' ? `${dataset.app_count}${t('dataset.appCount')}` : `${dataset.document_count}${t('dataset.documentCount')} · ${Math.round(dataset.word_count / 1000)}${t('dataset.wordCount')} · ${dataset.app_count}${t('dataset.appCount')}`}
     >
-      <span>{dataset.document_count}{t('dataset.documentCount')}</span>
-      <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
-      <span>{Math.round(dataset.word_count / 1000)}{t('dataset.wordCount')}</span>
-      <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
-      <span>{dataset.app_count}{t('dataset.appCount')}</span>
+      {dataset.provider === 'external'
+        ? <>
+          <span>{dataset.app_count}{t('dataset.appCount')}</span>
+        </>
+        : <>
+          <span>{dataset.document_count}{t('dataset.documentCount')}</span>
+          <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
+          <span>{Math.round(dataset.word_count / 1000)}{t('dataset.wordCount')}</span>
+          <span className='shrink-0 mx-0.5 w-1 text-gray-400'>·</span>
+          <span>{dataset.app_count}{t('dataset.appCount')}</span>
+        </>
+      }
     </div>
   </div>
 </div>

@@ -4,21 +4,32 @@ import { forwardRef } from 'react'
 import { useTranslation } from 'react-i18next'
 import {
   RiAddLine,
+  RiArrowRightLine,
 } from '@remixicon/react'

 const CreateAppCard = forwardRef<HTMLAnchorElement>((_, ref) => {
   const { t } = useTranslation()

   return (
-    <a ref={ref} className='group flex flex-col col-span-1 bg-gray-200 border-[0.5px] border-black/5 rounded-xl min-h-[160px] transition-all duration-200 ease-in-out cursor-pointer hover:bg-white hover:shadow-lg' href='/datasets/create'>
-      <div className='shrink-0 flex items-center p-4 pb-3'>
-        <div className='w-10 h-10 flex items-center justify-center border border-gray-200 bg-gray-100 rounded-lg'>
-          <RiAddLine className='w-4 h-4 text-gray-500'/>
-        </div>
-        <div className='ml-3 text-sm font-semibold leading-5 text-gray-800 group-hover:text-primary-600'>{t('dataset.createDataset')}</div>
-      </div>
-      <div className='mb-1 px-4 text-xs leading-normal text-gray-500 line-clamp-4'>{t('dataset.createDatasetIntro')}</div>
-    </a>
+    <div className='flex flex-col bg-background-default-dimm border-[0.5px] border-components-panel-border rounded-xl
+      min-h-[160px] transition-all duration-200 ease-in-out'
+    >
+      <a ref={ref} className='group flex flex-grow items-start p-4 cursor-pointer' href='/datasets/create'>
+        <div className='flex items-center gap-3'>
+          <div className='w-10 h-10 p-2 flex items-center justify-center border border-dashed border-divider-regular rounded-lg
+            bg-background-default-lighter group-hover:border-solid group-hover:border-effects-highlight group-hover:bg-background-default-dodge'
+          >
+            <RiAddLine className='w-4 h-4 text-text-tertiary group-hover:text-text-accent'/>
+          </div>
+          <div className='system-md-semibold text-text-secondary group-hover:text-text-accent'>{t('dataset.createDataset')}</div>
+        </div>
+      </a>
+      <div className='p-4 pt-0 text-text-tertiary system-xs-regular'>{t('dataset.createDatasetIntro')}</div>
+      <a className='group flex p-4 items-center gap-1 border-t-[0.5px] border-divider-subtle rounded-b-xl cursor-pointer' href='/datasets/connect'>
+        <div className='system-xs-medium text-text-tertiary group-hover:text-text-accent'>{t('dataset.connectDataset')}</div>
+        <RiArrowRightLine className='w-3.5 h-3.5 text-text-tertiary group-hover:text-text-accent' />
+      </a>
+    </div>
   )
 })

web/app/(commonLayout)/datasets/connect/page.tsx (new file, 8 lines)
@@ -0,0 +1,8 @@
+import React from 'react'
+import ExternalKnowledgeBaseConnector from '@/app/components/datasets/external-knowledge-base/connector'
+
+const ExternalKnowledgeBaseCreation = () => {
+  return <ExternalKnowledgeBaseConnector />
+}
+
+export default ExternalKnowledgeBaseCreation

web/app/(commonLayout)/datasets/layout.tsx (new file, 14 lines)
@@ -0,0 +1,14 @@
+'use client'
+
+import { ExternalApiPanelProvider } from '@/context/external-api-panel-context'
+import { ExternalKnowledgeApiProvider } from '@/context/external-knowledge-api-context'
+
+export default function DatasetsLayout({ children }: { children: React.ReactNode }) {
+  return (
+    <ExternalKnowledgeApiProvider>
+      <ExternalApiPanelProvider>
+        {children}
+      </ExternalApiPanelProvider>
+    </ExternalKnowledgeApiProvider>
+  )
+}

@@ -1,9 +1,7 @@
 import Container from './Container'

 const AppList = async () => {
-  return (
-    <Container />
-  )
+  return <Container />
 }

 export const metadata = {

web/app/(commonLayout)/datasets/store.ts (new file, 11 lines)
@@ -0,0 +1,11 @@
+import { create } from 'zustand'
+
+type DatasetStore = {
+  showExternalApiPanel: boolean
+  setShowExternalApiPanel: (show: boolean) => void
+}
+
+export const useDatasetStore = create<DatasetStore>(set => ({
+  showExternalApiPanel: false,
+  setShowExternalApiPanel: show => set({ showExternalApiPanel: show }),
+}))
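
The new store.ts exposes the panel flag as a plain zustand store, alongside the React context used elsewhere in this release. A minimal, hypothetical consumer (the component name is invented for illustration; only the store API above is real):

import React from 'react'
import { useDatasetStore } from './store'

// Hypothetical toggle; each field is selected separately so the component
// only re-renders when the slice it reads actually changes.
const ExternalApiPanelToggle = () => {
  const showExternalApiPanel = useDatasetStore(s => s.showExternalApiPanel)
  const setShowExternalApiPanel = useDatasetStore(s => s.setShowExternalApiPanel)

  return (
    <button onClick={() => setShowExternalApiPanel(!showExternalApiPanel)}>
      {showExternalApiPanel ? 'Hide external API panel' : 'Show external API panel'}
    </button>
  )
}

export default ExternalApiPanelToggle
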
@@ -1,4 +1,5 @@
 import React from 'react'
+import { useTranslation } from 'react-i18next'
 import AppIcon from '../base/app-icon'
 import Tooltip from '@/app/components/base/tooltip'

@@ -6,6 +7,7 @@ export type IAppBasicProps = {
   iconType?: 'app' | 'api' | 'dataset' | 'webapp' | 'notion'
   icon?: string
   icon_background?: string | null
+  isExternal?: boolean
   name: string
   type: string | React.ReactNode
   hoverTip?: string

@@ -52,7 +54,9 @@ const ICON_MAP = {
   notion: <AppIcon innerIcon={NotionSvg} className='!border-[0.5px] !border-indigo-100 !bg-white' />,
 }

-export default function AppBasic({ icon, icon_background, name, type, hoverTip, textStyle, mode = 'expand', iconType = 'app' }: IAppBasicProps) {
+export default function AppBasic({ icon, icon_background, name, isExternal, type, hoverTip, textStyle, mode = 'expand', iconType = 'app' }: IAppBasicProps) {
+  const { t } = useTranslation()
+
   return (
     <div className="flex items-start p-1">
       {icon && icon_background && iconType === 'app' && (

@@ -83,6 +87,7 @@ export default function AppBasic({ icon, icon_background, name, type, hoverTip,
   }
   </div>
   <div className={`text-xs font-normal text-gray-500 group-hover:text-gray-700 break-all ${textStyle?.extra ?? ''}`}>{type}</div>
+  <div className='text-text-tertiary system-2xs-medium-uppercase'>{isExternal ? t('dataset.externalTag') : ''}</div>
 </div>}
 </div>
 )

@@ -15,6 +15,7 @@ export type IAppDetailNavProps = {
   iconType?: 'app' | 'dataset' | 'notion'
   title: string
   desc: string
+  isExternal?: boolean
   icon: string
   icon_background: string
   navigation: Array<{

@@ -26,7 +27,7 @@ export type IAppDetailNavProps = {
   extraInfo?: (modeState: string) => React.ReactNode
 }

-const AppDetailNav = ({ title, desc, icon, icon_background, navigation, extraInfo, iconType = 'app' }: IAppDetailNavProps) => {
+const AppDetailNav = ({ title, desc, isExternal, icon, icon_background, navigation, extraInfo, iconType = 'app' }: IAppDetailNavProps) => {
   const { appSidebarExpand, setAppSiderbarExpand } = useAppStore(useShallow(state => ({
     appSidebarExpand: state.appSidebarExpand,
     setAppSiderbarExpand: state.setAppSiderbarExpand,

@@ -70,6 +71,7 @@ const AppDetailNav = ({ title, desc, icon, icon_background, navigation, extraInf
   icon_background={icon_background}
   name={title}
   type={desc}
+  isExternal={isExternal}
 />
 )}
 </div>

@@ -3,6 +3,7 @@ import type { FC } from 'react'
 import React, { useEffect, useState } from 'react'
 import { useTranslation } from 'react-i18next'
 import { Pagination } from 'react-headless-pagination'
+import { useDebounce } from 'ahooks'
 import { ArrowLeftIcon, ArrowRightIcon } from '@heroicons/react/24/outline'
 import Toast from '../../base/toast'
 import Filter from './filter'

@@ -67,10 +68,11 @@ const Annotation: FC<Props> = ({

   const [queryParams, setQueryParams] = useState<QueryParam>({})
   const [currPage, setCurrPage] = React.useState<number>(0)
+  const debouncedQueryParams = useDebounce(queryParams, { wait: 500 })
   const query = {
     page: currPage + 1,
     limit: APP_PAGE_LIMIT,
-    keyword: queryParams.keyword || '',
+    keyword: debouncedQueryParams.keyword || '',
   }

   const [controlUpdateList, setControlUpdateList] = useState(Date.now())

@@ -232,8 +234,8 @@ const Annotation: FC<Props> = ({
   middlePagesSiblingCount={1}
   setCurrentPage={setCurrPage}
   totalPages={Math.ceil(total / APP_PAGE_LIMIT)}
-  truncatableClassName="w-8 px-0.5 text-center"
-  truncatableText="..."
+  truncableClassName="w-8 px-0.5 text-center"
+  truncableText="..."
 >
   <Pagination.PrevButton
     disabled={currPage === 0}

@@ -5,6 +5,7 @@ import {
   RiDeleteBinLine,
   RiEditLine,
 } from '@remixicon/react'
+import { useTranslation } from 'react-i18next'
 import SettingsModal from '../settings-modal'
 import type { DataSet } from '@/models/datasets'
 import { DataSourceType } from '@/models/datasets'

@@ -33,6 +34,7 @@ const Item: FC<ItemProps> = ({
   const isMobile = media === MediaType.mobile
   const [showSettingsModal, setShowSettingsModal] = useState(false)
   const { formatIndexingTechniqueAndMethod } = useKnowledge()
+  const { t } = useTranslation()

   const handleSave = (newDataset: DataSet) => {
     onSave(newDataset)

@@ -65,9 +67,11 @@ const Item: FC<ItemProps> = ({
   <div className='grow'>
     <div className='flex items-center h-[18px]'>
       <div className='grow text-[13px] font-medium text-gray-800 truncate' title={config.name}>{config.name}</div>
-      <Badge
-        text={formatIndexingTechniqueAndMethod(config.indexing_technique, config.retrieval_model_dict?.search_method)}
-      />
+      {config.provider === 'external'
+        ? <Badge text={t('dataset.externalTag')}></Badge>
+        : <Badge
+          text={formatIndexingTechniqueAndMethod(config.indexing_technique, config.retrieval_model_dict?.search_method)}
+        />}
     </div>
   </div>
   <div className='hidden rounded-lg group-hover:flex items-center justify-end absolute right-0 top-0 bottom-0 pr-2 w-[124px] bg-gradient-to-r from-white/50 to-white to-50%'>

@@ -174,6 +174,20 @@ const ConfigContent: FC<Props> = ({
   </div>
 )
 }
+{
+  selectedDatasetsMode.mixtureInternalAndExternal && (
+    <div className='mt-4 system-xs-medium text-text-warning'>
+      {t('dataset.mixtureInternalAndExternalTip')}
+    </div>
+  )
+}
+{
+  selectedDatasetsMode.allExternal && (
+    <div className='mt-4 system-xs-medium text-text-warning'>
+      {t('dataset.allExternalTip')}
+    </div>
+  )
+}
 {
   selectedDatasetsMode.mixtureHighQualityAndEconomic
   && (

@@ -229,15 +243,15 @@ const ConfigContent: FC<Props> = ({
   />
 )
 }
-<div className='ml-2 leading-[32px] text-[13px] font-medium text-gray-900'>{t('common.modelProvider.rerankModel.key')}</div>
+<div className='leading-[32px] text-text-secondary system-sm-semibold'>{t('common.modelProvider.rerankModel.key')}</div>
 <Tooltip
   popupContent={
     <div className="w-[200px]">
       {t('common.modelProvider.rerankModel.tip')}
     </div>
   }
-  popupClassName='ml-0.5'
-  triggerClassName='ml-0.5 w-3.5 h-3.5'
+  popupClassName='ml-1'
+  triggerClassName='ml-1 w-4 h-4'
 />
 </div>
 <div>

@@ -39,13 +39,26 @@ const ParamsConfig = ({
   useEffect(() => {
     const {
       allEconomic,
+      allHighQuality,
+      allHighQualityFullTextSearch,
+      allHighQualityVectorSearch,
+      allExternal,
       mixtureHighQualityAndEconomic,
       inconsistentEmbeddingModel,
+      mixtureInternalAndExternal,
    } = getSelectedDatasetsMode(selectedDatasets)
    const { datasets, retrieval_model, score_threshold_enabled, ...restConfigs } = datasetConfigs
    let rerankEnable = restConfigs.reranking_enable

-    if (allEconomic && !restConfigs.reranking_model?.reranking_provider_name && rerankEnable === undefined)
+    if ((allEconomic && !restConfigs.reranking_model?.reranking_provider_name && rerankEnable === undefined) || allExternal)
      rerankEnable = false

+    if (allEconomic || allHighQuality || allHighQualityFullTextSearch || allHighQualityVectorSearch || (allExternal && selectedDatasets.length === 1))
+      setRerankSettingModalOpen(false)
+
+    if (mixtureHighQualityAndEconomic || inconsistentEmbeddingModel || mixtureInternalAndExternal || (allExternal && selectedDatasets.length > 1))
+      setRerankSettingModalOpen(true)
+
    setTempDataSetConfigs({
      ...getMultipleRetrievalConfig({
        top_k: restConfigs.top_k,
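
For orientation, the flags destructured from getSelectedDatasetsMode above imply a return shape along these lines (a sketch inferred from the destructuring only; the real definition is not part of this diff):

// Inferred field list; each flag describes the mix of currently selected datasets.
type SelectedDatasetsMode = {
  allEconomic: boolean
  allHighQuality: boolean
  allHighQualityFullTextSearch: boolean
  allHighQualityVectorSearch: boolean
  allExternal: boolean
  mixtureHighQualityAndEconomic: boolean
  inconsistentEmbeddingModel: boolean
  mixtureInternalAndExternal: boolean
}
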
@@ -47,7 +47,7 @@ const SelectDataSet: FC<ISelectDataSetProps> = ({
   const { data, has_more } = await fetchDatasets({ url: '/datasets', params: { page } })
   setPage(getPage() + 1)
   setIsNoMore(!has_more)
-  const newList = [...(datasets || []), ...data.filter(item => item.indexing_technique)]
+  const newList = [...(datasets || []), ...data.filter(item => item.indexing_technique || item.provider === 'external')]
   setDataSets(newList)
   setLoaded(true)
   if (!selected.find(item => !item.name))

@@ -145,6 +145,11 @@ const SelectDataSet: FC<ISelectDataSetProps> = ({
   />
 )
 }
+{
+  item.provider === 'external' && (
+    <Badge text={t('dataset.externalTag')} />
+  )
+}
 </div>
 ))}
 </div>

@@ -5,8 +5,10 @@ import { useTranslation } from 'react-i18next'
 import { isEqual } from 'lodash-es'
 import { RiCloseLine } from '@remixicon/react'
 import { BookOpenIcon } from '@heroicons/react/24/outline'
+import { ApiConnectionMod } from '@/app/components/base/icons/src/vender/solid/development'
 import cn from '@/utils/classnames'
 import IndexMethodRadio from '@/app/components/datasets/settings/index-method-radio'
+import Divider from '@/app/components/base/divider'
 import Button from '@/app/components/base/button'
 import type { DataSet } from '@/models/datasets'
 import { useToastContext } from '@/app/components/base/toast'

@@ -14,6 +16,7 @@ import { updateDatasetSetting } from '@/service/datasets'
 import { useAppContext } from '@/context/app-context'
 import { useModalContext } from '@/context/modal-context'
 import type { RetrievalConfig } from '@/types/app'
+import RetrievalSettings from '@/app/components/datasets/external-knowledge-base/create/RetrievalSettings'
 import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
 import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
 import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'

@@ -56,6 +59,9 @@ const SettingsModal: FC<SettingsModalProps> = ({
   const { t } = useTranslation()
   const { notify } = useToastContext()
   const ref = useRef(null)
+  const [topK, setTopK] = useState(currentDataset?.external_retrieval_model.top_k ?? 2)
+  const [scoreThreshold, setScoreThreshold] = useState(currentDataset?.external_retrieval_model.score_threshold ?? 0.5)
+  const [scoreThresholdEnabled, setScoreThresholdEnabled] = useState(currentDataset?.external_retrieval_model.score_threshold_enabled ?? false)

   const { setShowAccountSettingModal } = useModalContext()
   const [loading, setLoading] = useState(false)

@@ -73,6 +79,15 @@ const SettingsModal: FC<SettingsModalProps> = ({
   const [isHideChangedTip, setIsHideChangedTip] = useState(false)
   const isRetrievalChanged = !isEqual(retrievalConfig, localeCurrentDataset?.retrieval_model_dict) || indexMethod !== localeCurrentDataset?.indexing_technique

+  const handleSettingsChange = (data: { top_k?: number; score_threshold?: number; score_threshold_enabled?: boolean }) => {
+    if (data.top_k !== undefined)
+      setTopK(data.top_k)
+    if (data.score_threshold !== undefined)
+      setScoreThreshold(data.score_threshold)
+    if (data.score_threshold_enabled !== undefined)
+      setScoreThresholdEnabled(data.score_threshold_enabled)
+  }
+
   const handleSave = async () => {
     if (loading)
       return

@@ -107,10 +122,17 @@ const SettingsModal: FC<SettingsModalProps> = ({
   description,
   permission,
   indexing_technique: indexMethod,
+  external_retrieval_model: {
+    top_k: topK,
+    score_threshold: scoreThreshold,
+    score_threshold_enabled: scoreThresholdEnabled,
+  },
   retrieval_model: {
     ...postRetrievalConfig,
     score_threshold: postRetrievalConfig.score_threshold_enabled ? postRetrievalConfig.score_threshold : 0,
   },
+  external_knowledge_id: currentDataset!.external_knowledge_info.external_knowledge_id,
+  external_knowledge_api_id: currentDataset!.external_knowledge_info.external_knowledge_api_id,
   embedding_model: localeCurrentDataset.embedding_model,
   embedding_model_provider: localeCurrentDataset.embedding_model_provider,
 },

@@ -178,7 +200,7 @@ const SettingsModal: FC<SettingsModalProps> = ({
 }}>
 <div className={cn(rowClass, 'items-center')}>
   <div className={labelClass}>
-    {t('datasetSettings.form.name')}
+    <div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.name')}</div>
   </div>
   <input
     value={localeCurrentDataset.name}

@@ -189,7 +211,7 @@ const SettingsModal: FC<SettingsModalProps> = ({
 </div>
 <div className={cn(rowClass)}>
   <div className={labelClass}>
-    {t('datasetSettings.form.desc')}
+    <div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.desc')}</div>
   </div>
   <div className='w-full'>
     <textarea

@@ -206,7 +228,7 @@ const SettingsModal: FC<SettingsModalProps> = ({
 </div>
 <div className={rowClass}>
   <div className={labelClass}>
-    <div>{t('datasetSettings.form.permissions')}</div>
+    <div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.permissions')}</div>
   </div>
   <div className='w-full'>
     <PermissionSelector

@@ -219,24 +241,25 @@ const SettingsModal: FC<SettingsModalProps> = ({
   />
 </div>
 </div>
-<div className="w-full h-0 border-b-[0.5px] border-b-gray-200 my-2"></div>
-<div className={cn(rowClass)}>
-  <div className={labelClass}>
-    {t('datasetSettings.form.indexMethod')}
-  </div>
-  <div className='grow'>
-    <IndexMethodRadio
-      disable={!localeCurrentDataset?.embedding_available}
-      value={indexMethod}
-      onChange={v => setIndexMethod(v!)}
-      itemClassName='sm:!w-[280px]'
-    />
-  </div>
-</div>
+{currentDataset && currentDataset.indexing_technique && (
+  <div className={cn(rowClass)}>
+    <div className={labelClass}>
+      <div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.indexMethod')}</div>
+    </div>
+    <div className='grow'>
+      <IndexMethodRadio
+        disable={!localeCurrentDataset?.embedding_available}
+        value={indexMethod}
+        onChange={v => setIndexMethod(v!)}
+        itemClassName='sm:!w-[280px]'
+      />
+    </div>
+  </div>
+)}
 {indexMethod === 'high_quality' && (
   <div className={cn(rowClass)}>
     <div className={labelClass}>
-      {t('datasetSettings.form.embeddingModel')}
+      <div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.embeddingModel')}</div>
     </div>
     <div className='w-full'>
       <div className='w-full h-9 rounded-lg bg-gray-100 opacity-60'>

@@ -258,32 +281,75 @@ const SettingsModal: FC<SettingsModalProps> = ({
 )}

 {/* Retrieval Method Config */}
-<div className={rowClass}>
-  <div className={cn(labelClass, 'w-auto min-w-[168px]')}>
-    <div>
-      <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
-      <div className='leading-[18px] text-xs font-normal text-gray-500'>
-        <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-[#155eef]'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
-        {t('datasetSettings.form.retrievalSetting.description')}
-      </div>
-    </div>
-  </div>
-  <div>
-    {indexMethod === 'high_quality'
-      ? (
-        <RetrievalMethodConfig
-          value={retrievalConfig}
-          onChange={setRetrievalConfig}
-        />
-      )
-      : (
-        <EconomicalRetrievalMethodConfig
-          value={retrievalConfig}
-          onChange={setRetrievalConfig}
-        />
-      )}
-  </div>
-</div>
+{currentDataset?.provider === 'external'
+  ? <>
+    <div className={rowClass}><Divider/></div>
+    <div className={rowClass}>
+      <div className={labelClass}>
+        <div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.retrievalSetting.title')}</div>
+      </div>
+      <RetrievalSettings
+        topK={topK}
+        scoreThreshold={scoreThreshold}
+        scoreThresholdEnabled={scoreThresholdEnabled}
+        onChange={handleSettingsChange}
+        isInRetrievalSetting={true}
+      />
+    </div>
+    <div className={rowClass}><Divider/></div>
+    <div className={rowClass}>
+      <div className={labelClass}>
+        <div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.externalKnowledgeAPI')}</div>
+      </div>
+      <div className='w-full max-w-[480px]'>
+        <div className='flex h-full px-3 py-2 items-center gap-1 rounded-lg bg-components-input-bg-normal'>
+          <ApiConnectionMod className='w-4 h-4 text-text-secondary' />
+          <div className='overflow-hidden text-text-secondary text-ellipsis system-sm-medium'>
+            {currentDataset?.external_knowledge_info.external_knowledge_api_name}
+          </div>
+          <div className='text-text-tertiary system-xs-regular'>·</div>
+          <div className='text-text-tertiary system-xs-regular'>{currentDataset?.external_knowledge_info.external_knowledge_api_endpoint}</div>
+        </div>
+      </div>
+    </div>
+    <div className={rowClass}>
+      <div className={labelClass}>
+        <div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.externalKnowledgeID')}</div>
+      </div>
+      <div className='w-full max-w-[480px]'>
+        <div className='flex h-full px-3 py-2 items-center gap-1 rounded-lg bg-components-input-bg-normal'>
+          <div className='text-text-tertiary system-xs-regular'>{currentDataset?.external_knowledge_info.external_knowledge_id}</div>
+        </div>
+      </div>
+    </div>
+    <div className={rowClass}><Divider/></div>
+  </>
+  : <div className={rowClass}>
+    <div className={cn(labelClass, 'w-auto min-w-[168px]')}>
+      <div>
+        <div className='text-text-secondary system-sm-semibold'>{t('datasetSettings.form.retrievalSetting.title')}</div>
+        <div className='leading-[18px] text-xs font-normal text-gray-500'>
+          <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-[#155eef]'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
+          {t('datasetSettings.form.retrievalSetting.description')}
+        </div>
+      </div>
+    </div>
+    <div>
+      {indexMethod === 'high_quality'
+        ? (
+          <RetrievalMethodConfig
+            value={retrievalConfig}
+            onChange={setRetrievalConfig}
+          />
+        )
+        : (
+          <EconomicalRetrievalMethodConfig
+            value={retrievalConfig}
+            onChange={setRetrievalConfig}
+          />
+        )}
+    </div>
+  </div>}
 </div>
 {isRetrievalChanged && !isHideChangedTip && (
   <div className='absolute z-10 left-[30px] right-[30px] bottom-[76px] flex h-10 items-center px-3 rounded-lg border border-[#FEF0C7] bg-[#FFFAEB] shadow-lg justify-between'>
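
The external branch threads three new retrieval fields plus the knowledge references through the updateDatasetSetting payload. Their shapes, as inferred from handleSettingsChange and the payload built in handleSave above (a sketch, not the service's declared types):

// Inferred from the payload above; names match the wire fields.
type ExternalRetrievalModel = {
  top_k: number
  score_threshold: number
  score_threshold_enabled: boolean
}

// Only the two identifiers sent on save; external_knowledge_info itself
// also carries api_name/api_endpoint, as the read-only boxes above show.
type ExternalKnowledgeRef = {
  external_knowledge_id: string
  external_knowledge_api_id: string
}
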
@@ -4,6 +4,7 @@ import React, { useState } from 'react'
 import useSWR from 'swr'
 import { usePathname } from 'next/navigation'
 import { Pagination } from 'react-headless-pagination'
+import { useDebounce } from 'ahooks'
 import { omit } from 'lodash-es'
 import dayjs from 'dayjs'
 import { ArrowLeftIcon, ArrowRightIcon } from '@heroicons/react/24/outline'

@@ -59,6 +60,7 @@ const Logs: FC<ILogsProps> = ({ appDetail }) => {
   sort_by: '-created_at',
 })
 const [currPage, setCurrPage] = React.useState<number>(0)
+const debouncedQueryParams = useDebounce(queryParams, { wait: 500 })

 // Get the app type first
 const isChatMode = appDetail.mode !== 'completion'

@@ -66,14 +68,14 @@ const Logs: FC<ILogsProps> = ({ appDetail }) => {
 const query = {
   page: currPage + 1,
   limit: APP_PAGE_LIMIT,
-  ...(queryParams.period !== 'all'
+  ...(debouncedQueryParams.period !== 'all'
     ? {
-      start: dayjs().subtract(queryParams.period as number, 'day').startOf('day').format('YYYY-MM-DD HH:mm'),
+      start: dayjs().subtract(debouncedQueryParams.period as number, 'day').startOf('day').format('YYYY-MM-DD HH:mm'),
       end: dayjs().endOf('day').format('YYYY-MM-DD HH:mm'),
     }
     : {}),
-  ...(isChatMode ? { sort_by: queryParams.sort_by } : {}),
-  ...omit(queryParams, ['period']),
+  ...(isChatMode ? { sort_by: debouncedQueryParams.sort_by } : {}),
+  ...omit(debouncedQueryParams, ['period']),
 }

 const getWebAppType = (appType: AppMode) => {

@@ -119,8 +121,8 @@ const Logs: FC<ILogsProps> = ({ appDetail }) => {
   middlePagesSiblingCount={1}
   setCurrentPage={setCurrPage}
   totalPages={Math.ceil(total / APP_PAGE_LIMIT)}
-  truncatableClassName="w-8 px-0.5 text-center"
-  truncatableText="..."
+  truncableClassName="w-8 px-0.5 text-center"
+  truncableText="..."
 >
   <Pagination.PrevButton
     disabled={currPage === 0}
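
The same debounce-the-filters pattern is applied to the workflow log list in the next hunk. As a standalone sketch of the idea (only useDebounce from ahooks and useSWR are real APIs here; QueryParam and fetchLogs are stand-ins):

import { useState } from 'react'
import useSWR from 'swr'
import { useDebounce } from 'ahooks'

type QueryParam = { keyword?: string; status?: string }

// Stand-in for the real service call.
declare function fetchLogs(query: QueryParam & { page: number }): Promise<unknown>

const useDebouncedLogs = () => {
  const [queryParams, setQueryParams] = useState<QueryParam>({ status: 'all' })
  // The debounced copy settles 500 ms after the last change, so the SWR key
  // (and therefore the request) only updates when typing pauses.
  const debouncedQueryParams = useDebounce(queryParams, { wait: 500 })

  const { data } = useSWR({ page: 1, ...debouncedQueryParams }, fetchLogs)
  return { data, setQueryParams }
}
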
@@ -4,6 +4,7 @@ import React, { useState } from 'react'
 import useSWR from 'swr'
 import { usePathname } from 'next/navigation'
 import { Pagination } from 'react-headless-pagination'
+import { useDebounce } from 'ahooks'
 import { ArrowLeftIcon, ArrowRightIcon } from '@heroicons/react/24/outline'
 import { Trans, useTranslation } from 'react-i18next'
 import Link from 'next/link'

@@ -51,12 +52,13 @@ const Logs: FC<ILogsProps> = ({ appDetail }) => {
 const { t } = useTranslation()
 const [queryParams, setQueryParams] = useState<QueryParam>({ status: 'all' })
 const [currPage, setCurrPage] = React.useState<number>(0)
+const debouncedQueryParams = useDebounce(queryParams, { wait: 500 })

 const query = {
   page: currPage + 1,
   limit: APP_PAGE_LIMIT,
-  ...(queryParams.status !== 'all' ? { status: queryParams.status } : {}),
-  ...(queryParams.keyword ? { keyword: queryParams.keyword } : {}),
+  ...(debouncedQueryParams.status !== 'all' ? { status: debouncedQueryParams.status } : {}),
+  ...(debouncedQueryParams.keyword ? { keyword: debouncedQueryParams.keyword } : {}),
 }

 const getWebAppType = (appType: AppMode) => {

@@ -93,8 +95,8 @@ const Logs: FC<ILogsProps> = ({ appDetail }) => {
   middlePagesSiblingCount={1}
   setCurrentPage={setCurrPage}
   totalPages={Math.ceil(total / APP_PAGE_LIMIT)}
-  truncatableClassName="w-8 px-0.5 text-center"
-  truncatableText="..."
+  truncableClassName="w-8 px-0.5 text-center"
+  truncableText="..."
 >
   <Pagination.PrevButton
     disabled={currPage === 0}

web/app/components/base/corner-label/index.tsx (new file, 21 lines)
@@ -0,0 +1,21 @@
+import { Corner } from '../icons/src/vender/solid/shapes'
+import cn from '@/utils/classnames'
+
+type CornerLabelProps = {
+  label: string
+  className?: string
+  labelClassName?: string
+}
+
+const CornerLabel: React.FC<CornerLabelProps> = ({ label, className, labelClassName }) => {
+  return (
+    <div className={cn('group/corner-label inline-flex items-start', className)}>
+      <Corner className='w-[13px] h-5 text-background-section group-hover/corner-label:text-background-section-burn' />
+      <div className={cn('flex py-1 pr-2 items-center gap-0.5 bg-background-section group-hover/corner-label:bg-background-section-burn', labelClassName)}>
+        <div className='text-text-tertiary system-2xs-medium-uppercase'>{label}</div>
+      </div>
+    </div>
+  )
+}
+
+export default CornerLabel
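
Usage follows the DatasetCard change earlier in this diff; a hypothetical host component showing the pattern (the parent must be positioned so the absolutely-placed label can pin to its corner):

import React from 'react'
import CornerLabel from '@/app/components/base/corner-label'

// Hypothetical card wrapper; mirrors how DatasetCard renders the label above.
const ExternalCard: React.FC<{ isExternal: boolean }> = ({ isExternal }) => (
  <div className='relative rounded-xl min-h-[160px]'>
    {isExternal && <CornerLabel label='External' className='absolute right-0' labelClassName='rounded-tr-xl' />}
  </div>
)

export default ExternalCard
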
@@ -0,0 +1,5 @@
+<svg width="16" height="16" viewBox="0 0 16 16" fill="none" xmlns="http://www.w3.org/2000/svg">
+  <g id="Icon L">
+    <path id="Vector" fill-rule="evenodd" clip-rule="evenodd" d="M7.99996 3.33333C5.42263 3.33333 3.33329 5.42267 3.33329 8C3.33329 10.5773 5.42263 12.6667 7.99996 12.6667C9.72643 12.6667 11.2348 11.7295 12.0427 10.3329C12.227 10.0141 12.6349 9.90523 12.9536 10.0896C13.2723 10.274 13.3812 10.6818 13.1968 11.0005C12.1604 12.7921 10.2216 14 7.99996 14C4.91159 14 2.36821 11.6666 2.03658 8.66667H1.33329C0.965103 8.66667 0.666626 8.36819 0.666626 8C0.666626 7.63181 0.965103 7.33333 1.33329 7.33333H2.03658C2.36821 4.33337 4.91159 2 7.99996 2C10.2216 2 12.1604 3.20785 13.1968 4.99952C13.3812 5.31823 13.2723 5.72605 12.9536 5.91041C12.6349 6.09477 12.227 5.98585 12.0427 5.66714C11.2348 4.27054 9.72643 3.33333 7.99996 3.33333ZM7.99996 6C6.89539 6 5.99996 6.89543 5.99996 8C5.99996 9.10455 6.89539 10 7.99996 10C9.1045 10 9.99996 9.10454 9.99996 8C9.99996 6.89543 9.10451 6 7.99996 6ZM4.66663 8C4.66663 6.15905 6.15901 4.66667 7.99996 4.66667C9.61257 4.66667 10.9578 5.81184 11.2666 7.33333H14.6666C15.0348 7.33333 15.3333 7.63181 15.3333 8C15.3333 8.36819 15.0348 8.66667 14.6666 8.66667H11.2666C10.9578 10.1881 9.61257 11.3333 7.99996 11.3333C6.159 11.3333 4.66663 9.84092 4.66663 8Z" fill="#354052"/>
+  </g>
+</svg>
@@ -0,0 +1,3 @@
+<svg width="13" height="20" viewBox="0 0 13 20" fill="none" xmlns="http://www.w3.org/2000/svg">
+  <path id="Shape" d="M0 0H13V20C9.98017 20 7.26458 18.1615 6.14305 15.3576L0 0Z" fill="#F9FAFB"/>
+</svg>
@@ -0,0 +1,38 @@
+{
+  "icon": {
+    "type": "element",
+    "isRootNode": true,
+    "name": "svg",
+    "attributes": {
+      "width": "16",
+      "height": "16",
+      "viewBox": "0 0 16 16",
+      "fill": "none",
+      "xmlns": "http://www.w3.org/2000/svg"
+    },
+    "children": [
+      {
+        "type": "element",
+        "name": "g",
+        "attributes": {
+          "id": "Icon L"
+        },
+        "children": [
+          {
+            "type": "element",
+            "name": "path",
+            "attributes": {
+              "id": "Vector",
+              "fill-rule": "evenodd",
+              "clip-rule": "evenodd",
+              "d": "M7.99996 3.33333C5.42263 3.33333 3.33329 5.42267 3.33329 8C3.33329 10.5773 5.42263 12.6667 7.99996 12.6667C9.72643 12.6667 11.2348 11.7295 12.0427 10.3329C12.227 10.0141 12.6349 9.90523 12.9536 10.0896C13.2723 10.274 13.3812 10.6818 13.1968 11.0005C12.1604 12.7921 10.2216 14 7.99996 14C4.91159 14 2.36821 11.6666 2.03658 8.66667H1.33329C0.965103 8.66667 0.666626 8.36819 0.666626 8C0.666626 7.63181 0.965103 7.33333 1.33329 7.33333H2.03658C2.36821 4.33337 4.91159 2 7.99996 2C10.2216 2 12.1604 3.20785 13.1968 4.99952C13.3812 5.31823 13.2723 5.72605 12.9536 5.91041C12.6349 6.09477 12.227 5.98585 12.0427 5.66714C11.2348 4.27054 9.72643 3.33333 7.99996 3.33333ZM7.99996 6C6.89539 6 5.99996 6.89543 5.99996 8C5.99996 9.10455 6.89539 10 7.99996 10C9.1045 10 9.99996 9.10454 9.99996 8C9.99996 6.89543 9.10451 6 7.99996 6ZM4.66663 8C4.66663 6.15905 6.15901 4.66667 7.99996 4.66667C9.61257 4.66667 10.9578 5.81184 11.2666 7.33333H14.6666C15.0348 7.33333 15.3333 7.63181 15.3333 8C15.3333 8.36819 15.0348 8.66667 14.6666 8.66667H11.2666C10.9578 10.1881 9.61257 11.3333 7.99996 11.3333C6.159 11.3333 4.66663 9.84092 4.66663 8Z",
+              "fill": "currentColor"
+            },
+            "children": []
+          }
+        ]
+      }
+    ]
+  },
+  "name": "ApiConnectionMod"
+}
@@ -0,0 +1,16 @@
+// GENERATE BY script
+// DON NOT EDIT IT MANUALLY
+
+import * as React from 'react'
+import data from './ApiConnectionMod.json'
+import IconBase from '@/app/components/base/icons/IconBase'
+import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
+
+const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
+  props,
+  ref,
+) => <IconBase {...props} ref={ref} data={data as IconData} />)
+
+Icon.displayName = 'ApiConnectionMod'
+
+export default Icon
@@ -1,3 +1,4 @@
+export { default as ApiConnectionMod } from './ApiConnectionMod'
 export { default as ApiConnection } from './ApiConnection'
 export { default as BarChartSquare02 } from './BarChartSquare02'
 export { default as Container } from './Container'

@@ -0,0 +1,27 @@
+{
+  "icon": {
+    "type": "element",
+    "isRootNode": true,
+    "name": "svg",
+    "attributes": {
+      "width": "13",
+      "height": "20",
+      "viewBox": "0 0 13 20",
+      "fill": "none",
+      "xmlns": "http://www.w3.org/2000/svg"
+    },
+    "children": [
+      {
+        "type": "element",
+        "name": "path",
+        "attributes": {
+          "id": "Shape",
+          "d": "M0 0H13V20C9.98017 20 7.26458 18.1615 6.14305 15.3576L0 0Z",
+          "fill": "currentColor"
+        },
+        "children": []
+      }
+    ]
+  },
+  "name": "Corner"
+}
@@ -0,0 +1,16 @@
+// GENERATE BY script
+// DON NOT EDIT IT MANUALLY
+
+import * as React from 'react'
+import data from './Corner.json'
+import IconBase from '@/app/components/base/icons/IconBase'
+import type { IconBaseProps, IconData } from '@/app/components/base/icons/IconBase'
+
+const Icon = React.forwardRef<React.MutableRefObject<SVGElement>, Omit<IconBaseProps, 'data'>>((
+  props,
+  ref,
+) => <IconBase {...props} ref={ref} data={data as IconData} />)
+
+Icon.displayName = 'Corner'
+
+export default Icon
@@ -1,2 +1,3 @@
+export { default as Corner } from './Corner'
 export { default as Star04 } from './Star04'
 export { default as Star06 } from './Star06'

@@ -3,5 +3,5 @@
 }

 .modal-panel {
-  @apply w-full max-w-md transform rounded-2xl bg-white p-6 text-left align-middle shadow-xl transition-all;
+  @apply w-full max-w-[480px] transform rounded-2xl bg-white p-6 text-left align-middle shadow-xl transition-all;
 }

@@ -37,6 +37,7 @@ const ParamItem: FC<Props> = ({ className, id, name, noTooltip, tip, step = 0.1,
 <span className="mx-1 text-gray-900 text-[13px] leading-[18px] font-medium">{name}</span>
 {!noTooltip && (
   <Tooltip
+    triggerClassName='w-4 h-4 shrink-0'
     popupContent={<div className="w-[200px]">{tip}</div>}
   />
 )}

@@ -87,7 +87,7 @@ const Select: FC<ISelectProps> = ({
 <div className='group text-gray-800'>
   {allowSearch
     ? <Combobox.Input
-      className={`w-full rounded-lg border-0 ${bgClassName} py-1.5 pl-3 pr-10 shadow-sm sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-not-allowed`}
+      className={`w-full rounded-lg border-0 ${bgClassName} py-1.5 pl-3 pr-10 shadow-sm sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 ${disabled ? 'cursor-not-allowed' : 'cursor-pointer'}`}
       onChange={(event) => {
         if (!disabled)
           setQuery(event.target.value)

web/app/components/datasets/create/assets/jina.png (new binary file)
Binary file not shown.

@@ -11,7 +11,7 @@ import { DataSourceType } from '@/models/datasets'
 import type { CrawlOptions, CrawlResultItem, DataSet, FileItem, createDocumentResponse } from '@/models/datasets'
 import { fetchDataSource } from '@/service/common'
 import { fetchDatasetDetail } from '@/service/datasets'
-import type { NotionPage } from '@/models/common'
+import { DataSourceProvider, type NotionPage } from '@/models/common'
 import { useModalContext } from '@/context/modal-context'
 import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks'

@@ -26,6 +26,7 @@ const DEFAULT_CRAWL_OPTIONS: CrawlOptions = {
   excludes: '',
   limit: 10,
   max_depth: '',
+  use_sitemap: true,
 }

 const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {

@@ -51,7 +52,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
 const updateFileList = (preparedFiles: FileItem[]) => {
   setFiles(preparedFiles)
 }
-const [fireCrawlJobId, setFireCrawlJobId] = useState('')
+const [websiteCrawlProvider, setWebsiteCrawlProvider] = useState<DataSourceProvider>(DataSourceProvider.fireCrawl)
+const [websiteCrawlJobId, setWebsiteCrawlJobId] = useState('')

 const updateFile = (fileItem: FileItem, progress: number, list: FileItem[]) => {
   const targetIndex = list.findIndex(file => file.fileID === fileItem.fileID)

@@ -137,7 +139,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
   onStepChange={nextStep}
   websitePages={websitePages}
   updateWebsitePages={setWebsitePages}
-  onFireCrawlJobIdChange={setFireCrawlJobId}
+  onWebsiteCrawlProviderChange={setWebsiteCrawlProvider}
+  onWebsiteCrawlJobIdChange={setWebsiteCrawlJobId}
   crawlOptions={crawlOptions}
   onCrawlOptionsChange={setCrawlOptions}
 />

@@ -151,7 +154,8 @@ const DatasetUpdateForm = ({ datasetId }: DatasetUpdateFormProps) => {
   files={fileList.map(file => file.file)}
   notionPages={notionPages}
   websitePages={websitePages}
-  fireCrawlJobId={fireCrawlJobId}
+  websiteCrawlProvider={websiteCrawlProvider}
+  websiteCrawlJobId={websiteCrawlJobId}
   onStepChange={changeStep}
   updateIndexingTypeCache={updateIndexingTypeCache}
   updateResultCache={updateResultCache}

@@ -10,7 +10,7 @@ import WebsitePreview from '../website/preview'
 import s from './index.module.css'
 import cn from '@/utils/classnames'
 import type { CrawlOptions, CrawlResultItem, FileItem } from '@/models/datasets'
-import type { NotionPage } from '@/models/common'
+import type { DataSourceProvider, NotionPage } from '@/models/common'
 import { DataSourceType } from '@/models/datasets'
 import Button from '@/app/components/base/button'
 import { NotionPageSelector } from '@/app/components/base/notion-page-selector'

@@ -33,7 +33,8 @@ type IStepOneProps = {
   changeType: (type: DataSourceType) => void
   websitePages?: CrawlResultItem[]
   updateWebsitePages: (value: CrawlResultItem[]) => void
-  onFireCrawlJobIdChange: (jobId: string) => void
+  onWebsiteCrawlProviderChange: (provider: DataSourceProvider) => void
+  onWebsiteCrawlJobIdChange: (jobId: string) => void
   crawlOptions: CrawlOptions
   onCrawlOptionsChange: (payload: CrawlOptions) => void
 }

@@ -69,7 +70,8 @@ const StepOne = ({
   updateNotionPages,
   websitePages = [],
   updateWebsitePages,
-  onFireCrawlJobIdChange,
+  onWebsiteCrawlProviderChange,
+  onWebsiteCrawlJobIdChange,
   crawlOptions,
   onCrawlOptionsChange,
 }: IStepOneProps) => {

@@ -229,7 +231,8 @@ const StepOne = ({
   onPreview={setCurrentWebsite}
   checkedCrawlResult={websitePages}
   onCheckedCrawlResultChange={updateWebsitePages}
-  onJobIdChange={onFireCrawlJobIdChange}
+  onCrawlProviderChange={onWebsiteCrawlProviderChange}
+  onJobIdChange={onWebsiteCrawlJobIdChange}
   crawlOptions={crawlOptions}
   onCrawlOptionsChange={onCrawlOptionsChange}
 />

@@ -33,6 +33,7 @@ import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/componen
 import Toast from '@/app/components/base/toast'
 import { formatNumber } from '@/utils/format'
 import type { NotionPage } from '@/models/common'
+import { DataSourceProvider } from '@/models/common'
 import { DataSourceType, DocForm } from '@/models/datasets'
 import NotionIcon from '@/app/components/base/notion-icon'
 import Switch from '@/app/components/base/switch'

@@ -63,7 +64,8 @@ type StepTwoProps = {
   notionPages?: NotionPage[]
   websitePages?: CrawlResultItem[]
   crawlOptions?: CrawlOptions
-  fireCrawlJobId?: string
+  websiteCrawlProvider?: DataSourceProvider
+  websiteCrawlJobId?: string
   onStepChange?: (delta: number) => void
   updateIndexingTypeCache?: (type: string) => void
   updateResultCache?: (res: createDocumentResponse) => void

@@ -94,7 +96,8 @@ const StepTwo = ({
   notionPages = [],
   websitePages = [],
   crawlOptions,
-  fireCrawlJobId = '',
+  websiteCrawlProvider = DataSourceProvider.fireCrawl,
+  websiteCrawlJobId = '',
   onStepChange,
   updateIndexingTypeCache,
   updateResultCache,

@@ -260,8 +263,8 @@ const StepTwo = ({

 const getWebsiteInfo = () => {
   return {
-    provider: 'firecrawl',
-    job_id: fireCrawlJobId,
+    provider: websiteCrawlProvider,
+    job_id: websiteCrawlJobId,
     urls: websitePages.map(page => page.source_url),
     only_main_content: crawlOptions?.only_main_content,
   }
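
For reference, DataSourceProvider comes from @/models/common; the replaced 'firecrawl' literal above implies the enum's value for fireCrawl. A sketch of its shape (the exact string for jinaReader is an assumption, not shown in this diff):

// Sketch only: fireCrawl's value is implied by the old hard-coded payload;
// jinaReader's string value is assumed.
enum DataSourceProvider {
  fireCrawl = 'firecrawl',
  jinaReader = 'jinareader',
}
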
@@ -3,6 +3,7 @@ import type { FC } from 'react'
 import React from 'react'
 import cn from '@/utils/classnames'
 import Checkbox from '@/app/components/base/checkbox'
+import Tooltip from '@/app/components/base/tooltip'

 type Props = {
   className?: string

@@ -10,6 +11,7 @@ type Props = {
   onChange: (isChecked: boolean) => void
   label: string
   labelClassName?: string
+  tooltip?: string
 }

 const CheckboxWithLabel: FC<Props> = ({

@@ -18,11 +20,20 @@ const CheckboxWithLabel: FC<Props> = ({
   onChange,
   label,
   labelClassName,
+  tooltip,
 }) => {
   return (
     <label className={cn(className, 'flex items-center h-7 space-x-2')}>
       <Checkbox checked={isChecked} onCheck={() => onChange(!isChecked)} />
       <div className={cn(labelClassName, 'text-sm font-normal text-gray-800')}>{label}</div>
+      {tooltip && (
+        <Tooltip
+          popupContent={
+            <div className='w-[200px]'>{tooltip}</div>
+          }
+          triggerClassName='ml-0.5 w-4 h-4'
+        />
+      )}
     </label>
   )
 }
@@ -2,7 +2,7 @@
 import type { FC } from 'react'
 import React, { useCallback } from 'react'
 import { useTranslation } from 'react-i18next'
-import CheckboxWithLabel from './base/checkbox-with-label'
+import CheckboxWithLabel from './checkbox-with-label'
 import CrawledResultItem from './crawled-result-item'
 import cn from '@/utils/classnames'
 import type { CrawlResultItem } from '@/models/datasets'
@@ -2,13 +2,13 @@
 import type { FC } from 'react'
 import React, { useCallback, useEffect, useState } from 'react'
 import { useTranslation } from 'react-i18next'
+import UrlInput from '../base/url-input'
+import OptionsWrap from '../base/options-wrap'
+import CrawledResult from '../base/crawled-result'
+import Crawling from '../base/crawling'
+import ErrorMessage from '../base/error-message'
 import Header from './header'
-import UrlInput from './base/url-input'
-import OptionsWrap from './base/options-wrap'
-import Options from './options'
-import CrawledResult from './crawled-result'
-import Crawling from './crawling'
-import ErrorMessage from './base/error-message'
 import cn from '@/utils/classnames'
 import { useModalContext } from '@/context/modal-context'
 import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'

@@ -2,8 +2,8 @@
 import type { FC } from 'react'
 import React, { useCallback } from 'react'
 import { useTranslation } from 'react-i18next'
-import CheckboxWithLabel from './base/checkbox-with-label'
-import Field from './base/field'
+import CheckboxWithLabel from '../base/checkbox-with-label'
+import Field from '../base/field'
 import cn from '@/utils/classnames'
 import type { CrawlOptions } from '@/models/datasets'

@@ -0,0 +1,6 @@
+.jinaLogo {
+  @apply w-4 h-4 bg-center bg-no-repeat inline-block;
+  background-color: #F5FAFF;
+  background-image: url(../assets/jina.png);
+  background-size: 16px;
+}
@@ -1,8 +1,12 @@
|
||||
'use client'
|
||||
import type { FC } from 'react'
|
||||
import React, { useCallback, useEffect, useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import s from './index.module.css'
|
||||
import NoData from './no-data'
|
||||
import Firecrawl from './firecrawl'
|
||||
import JinaReader from './jina-reader'
|
||||
import cn from '@/utils/classnames'
|
||||
import { useModalContext } from '@/context/modal-context'
|
||||
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
|
||||
import { fetchDataSources } from '@/service/datasets'
|
||||
@@ -12,6 +16,7 @@ type Props = {
|
||||
onPreview: (payload: CrawlResultItem) => void
|
||||
checkedCrawlResult: CrawlResultItem[]
|
||||
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
|
||||
onCrawlProviderChange: (provider: DataSourceProvider) => void
|
||||
onJobIdChange: (jobId: string) => void
|
||||
crawlOptions: CrawlOptions
|
||||
onCrawlOptionsChange: (payload: CrawlOptions) => void
|
||||
@@ -21,17 +26,32 @@ const Website: FC<Props> = ({
|
||||
onPreview,
|
||||
checkedCrawlResult,
|
||||
onCheckedCrawlResultChange,
|
||||
onCrawlProviderChange,
|
||||
onJobIdChange,
|
||||
crawlOptions,
|
||||
onCrawlOptionsChange,
|
||||
}) => {
|
||||
const { t } = useTranslation()
|
||||
const { setShowAccountSettingModal } = useModalContext()
|
||||
const [isLoaded, setIsLoaded] = useState(false)
|
||||
const [isSetFirecrawlApiKey, setIsSetFirecrawlApiKey] = useState(false)
|
||||
const [selectedProvider, setSelectedProvider] = useState<DataSourceProvider>(DataSourceProvider.jinaReader)
|
||||
const [sources, setSources] = useState<DataSourceItem[]>([])
|
||||
|
||||
useEffect(() => {
|
||||
onCrawlProviderChange(selectedProvider)
|
||||
}, [selectedProvider, onCrawlProviderChange])
|
||||
|
||||
const checkSetApiKey = useCallback(async () => {
|
||||
const res = await fetchDataSources() as any
|
||||
const isFirecrawlSet = res.sources.some((item: DataSourceItem) => item.provider === DataSourceProvider.fireCrawl)
|
||||
setIsSetFirecrawlApiKey(isFirecrawlSet)
|
||||
setSources(res.sources)
|
||||
|
||||
// If users have configured one of the providers, select it.
|
||||
const availableProviders = res.sources.filter((item: DataSourceItem) =>
|
||||
[DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider),
|
||||
)
|
||||
|
||||
if (availableProviders.length > 0)
|
||||
setSelectedProvider(availableProviders[0].provider)
|
||||
}, [])
|
||||
|
||||
useEffect(() => {
|
||||
@@ -52,20 +72,66 @@ const Website: FC<Props> = ({
|
||||
|
||||
return (
|
||||
<div>
|
||||
{isSetFirecrawlApiKey
|
||||
? (
|
||||
<Firecrawl
|
||||
onPreview={onPreview}
|
||||
checkedCrawlResult={checkedCrawlResult}
|
||||
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
|
||||
onJobIdChange={onJobIdChange}
|
||||
crawlOptions={crawlOptions}
|
||||
onCrawlOptionsChange={onCrawlOptionsChange}
|
||||
/>
|
||||
)
|
||||
: (
|
||||
<NoData onConfig={handleOnConfig} />
|
||||
)}
|
||||
<div className="mb-4">
|
||||
<div className="font-medium text-gray-700 mb-2 h-6">
|
||||
{t('datasetCreation.stepOne.website.chooseProvider')}
|
||||
</div>
|
||||
<div className="flex space-x-2">
|
||||
<button
|
||||
className={`px-4 py-2 text-sm font-medium rounded-md flex items-center justify-center ${
|
||||
selectedProvider === DataSourceProvider.jinaReader
|
||||
? 'bg-primary-50 text-primary-600'
|
||||
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
|
||||
}`}
|
||||
onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
|
||||
>
|
||||
<span className={cn(s.jinaLogo, 'mr-2')} />
|
||||
<span>Jina Reader</span>
|
||||
</button>
|
||||
<button
|
||||
className={`px-4 py-2 text-sm font-medium rounded-md ${
|
||||
selectedProvider === DataSourceProvider.fireCrawl
|
||||
? 'bg-primary-50 text-primary-600'
|
||||
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
|
||||
}`}
|
||||
onClick={() => setSelectedProvider(DataSourceProvider.fireCrawl)}
|
||||
>
|
||||
🔥 Firecrawl
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{
|
||||
selectedProvider === DataSourceProvider.fireCrawl
|
||||
? sources.find(source => source.provider === DataSourceProvider.fireCrawl)
|
||||
? (
|
||||
<Firecrawl
|
||||
onPreview={onPreview}
|
||||
checkedCrawlResult={checkedCrawlResult}
|
||||
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
|
||||
onJobIdChange={onJobIdChange}
|
||||
crawlOptions={crawlOptions}
|
||||
onCrawlOptionsChange={onCrawlOptionsChange}
|
||||
/>
|
||||
)
|
||||
: (
|
||||
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
|
||||
)
|
||||
: sources.find(source => source.provider === DataSourceProvider.jinaReader)
|
||||
? (
|
||||
<JinaReader
|
||||
onPreview={onPreview}
|
||||
checkedCrawlResult={checkedCrawlResult}
|
||||
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
|
||||
onJobIdChange={onJobIdChange}
|
||||
crawlOptions={crawlOptions}
|
||||
onCrawlOptionsChange={onCrawlOptionsChange}
|
||||
/>
|
||||
)
|
||||
: (
|
||||
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
|
||||
)
|
||||
}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
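The `fetchDataSources` response above is cast to `any`, so the shape the component relies on stays implicit. A minimal sketch of the assumed types and the auto-selection rule follows; everything except `DataSourceProvider` (which lives in `@/models/common`) is illustrative, and the enum string values are assumptions.

// Sketch only: the response shape website/index.tsx appears to assume.
enum DataSourceProvider {
  fireCrawl = 'firecrawl', // assumed values; see @/models/common for the real enum
  jinaReader = 'jinareader',
}

type DataSourceItem = {
  provider: DataSourceProvider
}

// Auto-selection rule extracted from checkSetApiKey: the first configured crawler wins.
function pickProvider(sources: DataSourceItem[]): DataSourceProvider | undefined {
  const available = sources.filter(item =>
    [DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider))
  return available[0]?.provider
}
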
@@ -0,0 +1,42 @@
'use client'
import type { FC } from 'react'
import React from 'react'
import { useTranslation } from 'react-i18next'
import { Settings01 } from '@/app/components/base/icons/src/vender/line/general'
import { BookOpen01 } from '@/app/components/base/icons/src/vender/line/education'

const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  onSetting: () => void
}

const Header: FC<Props> = ({
  onSetting,
}) => {
  const { t } = useTranslation()

  return (
    <div className='flex h-6 items-center justify-between'>
      <div className='flex items-center'>
        <div className='text-base font-medium text-gray-700'>{t(`${I18N_PREFIX}.jinaReaderTitle`)}</div>
        <div className='ml-2 mr-1 w-px h-3.5 bg-gray-200'></div>
        <div
          className='p-1 rounded-md hover:bg-black/5 cursor-pointer'
          onClick={onSetting}
        >
          <Settings01 className='w-3.5 h-3.5 text-gray-500' />
        </div>
      </div>
      <a
        href='https://jina.ai/reader'
        target='_blank' rel='noopener noreferrer'
        className='flex items-center text-xs text-primary-600'
      >
        <BookOpen01 className='mr-1 w-3.5 h-3.5 text-primary-600' />
        {t(`${I18N_PREFIX}.jinaReaderDoc`)}
      </a>
    </div>
  )
}
export default React.memo(Header)

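A hedged wiring sketch for this header: the parent opens the data-source settings modal when the gear icon is clicked, mirroring the `handleSetting` callback defined in jina-reader/index.tsx below. `HeaderDemo` is illustrative; `useModalContext` and the `'data-source'` payload come from the code that follows.

import type { FC } from 'react'
import React from 'react'
import { useModalContext } from '@/context/modal-context'
import Header from './header'

// Illustrative parent only: route the gear click into the account-settings modal.
const HeaderDemo: FC = () => {
  const { setShowAccountSettingModal } = useModalContext()
  return <Header onSetting={() => setShowAccountSettingModal({ payload: 'data-source' })} />
}
export default HeaderDemo
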
232 web/app/components/datasets/create/website/jina-reader/index.tsx Normal file
@@ -0,0 +1,232 @@
'use client'
import type { FC } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import UrlInput from '../base/url-input'
import OptionsWrap from '../base/options-wrap'
import CrawledResult from '../base/crawled-result'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import Header from './header'
import Options from './options'
import cn from '@/utils/classnames'
import { useModalContext } from '@/context/modal-context'
import Toast from '@/app/components/base/toast'
import { checkJinaReaderTaskStatus, createJinaReaderTask } from '@/service/datasets'
import { sleep } from '@/utils'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'

const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  onPreview: (payload: CrawlResultItem) => void
  checkedCrawlResult: CrawlResultItem[]
  onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
  onJobIdChange: (jobId: string) => void
  crawlOptions: CrawlOptions
  onCrawlOptionsChange: (payload: CrawlOptions) => void
}

enum Step {
  init = 'init',
  running = 'running',
  finished = 'finished',
}

const JinaReader: FC<Props> = ({
  onPreview,
  checkedCrawlResult,
  onCheckedCrawlResultChange,
  onJobIdChange,
  crawlOptions,
  onCrawlOptionsChange,
}) => {
  const { t } = useTranslation()
  const [step, setStep] = useState<Step>(Step.init)
  const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
  useEffect(() => {
    if (step !== Step.init)
      setControlFoldOptions(Date.now())
  }, [step])
  const { setShowAccountSettingModal } = useModalContext()
  const handleSetting = useCallback(() => {
    setShowAccountSettingModal({
      payload: 'data-source',
    })
  }, [setShowAccountSettingModal])

  const checkValid = useCallback((url: string) => {
    let errorMsg = ''
    if (!url) {
      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
        field: 'url',
      })
    }

    if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
      errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)

    if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
        field: t(`${I18N_PREFIX}.limit`),
      })
    }

    return {
      isValid: !errorMsg,
      errorMsg,
    }
  }, [crawlOptions, t])

  const isInit = step === Step.init
  const isCrawlFinished = step === Step.finished
  const isRunning = step === Step.running
  const [crawlResult, setCrawlResult] = useState<{
    current: number
    total: number
    data: CrawlResultItem[]
    time_consuming: number | string
  } | undefined>(undefined)
  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
  const showError = isCrawlFinished && crawlErrorMessage

  const waitForCrawlFinished = useCallback(async (jobId: string) => {
    try {
      const res = await checkJinaReaderTaskStatus(jobId) as any
      if (res.status === 'completed') {
        return {
          isError: false,
          data: {
            ...res,
            total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
          },
        }
      }
      if (res.status === 'failed' || !res.status) {
        return {
          isError: true,
          errorMessage: res.message,
          data: {
            data: [],
          },
        }
      }
      // still running: surface the progress, then poll again after a short delay
      setCrawlResult({
        ...res,
        total: Math.min(res.total, parseFloat(crawlOptions.limit as string)),
      })
      onCheckedCrawlResultChange(res.data || []) // default select the crawl result
      await sleep(2500)
      return await waitForCrawlFinished(jobId)
    }
    catch (e: any) {
      const errorBody = await e.json()
      return {
        isError: true,
        errorMessage: errorBody.message,
        data: {
          data: [],
        },
      }
    }
  }, [crawlOptions.limit])

  const handleRun = useCallback(async (url: string) => {
    const { isValid, errorMsg } = checkValid(url)
    if (!isValid) {
      Toast.notify({
        message: errorMsg!,
        type: 'error',
      })
      return
    }
    setStep(Step.running)
    try {
      const startTime = Date.now()
      const res = await createJinaReaderTask({
        url,
        options: crawlOptions,
      }) as any

      if (res.data) {
        // single-page crawl: the task returns the page content directly
        const data = {
          current: 1,
          total: 1,
          data: [{
            title: res.data.title,
            markdown: res.data.content,
            description: res.data.description,
            source_url: res.data.url,
          }],
          time_consuming: (Date.now() - startTime) / 1000,
        }
        setCrawlResult(data)
        onCheckedCrawlResultChange(data.data || [])
        setCrawlErrorMessage('')
      }
      else if (res.job_id) {
        // multi-page crawl: poll the job until it finishes
        const jobId = res.job_id
        onJobIdChange(jobId)
        const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
        if (isError) {
          setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
        }
        else {
          setCrawlResult(data)
          onCheckedCrawlResultChange(data.data || []) // default select the crawl result
          setCrawlErrorMessage('')
        }
      }
    }
    catch (e) {
      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
      console.error(e)
    }
    finally {
      setStep(Step.finished)
    }
  }, [checkValid, crawlOptions, onJobIdChange, t, waitForCrawlFinished])

  return (
    <div>
      <Header onSetting={handleSetting} />
      <div className={cn('mt-2 p-4 pb-0 rounded-xl border border-gray-200')}>
        <UrlInput onRun={handleRun} isRunning={isRunning} />
        <OptionsWrap
          className={cn('mt-4')}
          controlFoldOptions={controlFoldOptions}
        >
          <Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
        </OptionsWrap>

        {!isInit && (
          <div className='mt-3 relative left-[-16px] w-[calc(100%_+_32px)] rounded-b-xl'>
            {isRunning
              && <Crawling
                className='mt-2'
                crawledNum={crawlResult?.current || 0}
                totalNum={crawlResult?.total || parseFloat(crawlOptions.limit as string) || 0}
              />}
            {showError && (
              <ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
            )}
            {isCrawlFinished && !showError
              && <CrawledResult
                className='mb-2'
                list={crawlResult?.data || []}
                checkedList={checkedCrawlResult}
                onSelectedChange={onCheckedCrawlResultChange}
                onPreview={onPreview}
                usedTime={parseFloat(crawlResult?.time_consuming as string) || 0}
              />
            }
          </div>
        )}
      </div>
    </div>
  )
}
export default React.memo(JinaReader)

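`waitForCrawlFinished` above recurses every 2.5 seconds until the task reports `completed` or `failed`. The same control flow, reduced to a framework-free sketch: the response shape is assumed from the component's usage, and this `sleep` mirrors the helper imported from `@/utils`.

// Generic polling loop, equivalent in control flow to waitForCrawlFinished minus React state.
const sleep = (ms: number) => new Promise<void>(resolve => setTimeout(resolve, ms))

type TaskStatus = { status?: 'completed' | 'failed' | string } // assumed response shape

async function pollUntilDone<T extends TaskStatus>(
  check: () => Promise<T>,
  intervalMs = 2500,
): Promise<T> {
  for (;;) {
    const res = await check()
    if (res.status === 'completed')
      return res // done: hand the payload back to the caller
    if (res.status === 'failed' || !res.status)
      throw new Error('crawl task failed') // same condition the component treats as an error
    await sleep(intervalMs) // still running: wait, then poll again
  }
}
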
@@ -0,0 +1,59 @@
'use client'
import type { FC } from 'react'
import React, { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import CheckboxWithLabel from '../base/checkbox-with-label'
import Field from '../base/field'
import cn from '@/utils/classnames'
import type { CrawlOptions } from '@/models/datasets'

const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  className?: string
  payload: CrawlOptions
  onChange: (payload: CrawlOptions) => void
}

const Options: FC<Props> = ({
  className = '',
  payload,
  onChange,
}) => {
  const { t } = useTranslation()

  // curried updater: one callback per CrawlOptions field, all writing into the same payload
  const handleChange = useCallback((key: keyof CrawlOptions) => {
    return (value: any) => {
      onChange({
        ...payload,
        [key]: value,
      })
    }
  }, [payload, onChange])
  return (
    <div className={cn(className, 'space-y-2')}>
      <CheckboxWithLabel
        label={t(`${I18N_PREFIX}.crawlSubPage`)}
        isChecked={payload.crawl_sub_pages}
        onChange={handleChange('crawl_sub_pages')}
      />
      <CheckboxWithLabel
        label={t(`${I18N_PREFIX}.useSitemap`)}
        isChecked={payload.use_sitemap}
        onChange={handleChange('use_sitemap')}
        tooltip={t(`${I18N_PREFIX}.useSitemapTooltip`) as string}
      />
      <div className='flex justify-between space-x-4'>
        <Field
          className='grow shrink-0'
          label={t(`${I18N_PREFIX}.limit`)}
          value={payload.limit}
          onChange={handleChange('limit')}
          isNumber
          isRequired
        />
      </div>
    </div>
  )
}
export default React.memo(Options)

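Only three `CrawlOptions` fields are surfaced by this panel. A hedged sketch of that subset and of the curried-updater pattern used by `handleChange`; the real `CrawlOptions` in `@/models/datasets` may carry more fields, and `makeFieldSetter` is an illustrative name.

// Illustrative subset: just the fields Options reads and writes.
type CrawlOptionsSubset = {
  crawl_sub_pages: boolean
  use_sitemap: boolean
  limit: number | string
}

// Curried updater as in handleChange: fix the key once, then accept the new value.
const makeFieldSetter = (
  payload: CrawlOptionsSubset,
  onChange: (p: CrawlOptionsSubset) => void,
) => (key: keyof CrawlOptionsSubset) => (value: any) =>
  onChange({ ...payload, [key]: value })
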
@@ -2,35 +2,56 @@
import type { FC } from 'react'
import React from 'react'
import { useTranslation } from 'react-i18next'
import s from './index.module.css'
import { Icon3Dots } from '@/app/components/base/icons/src/vender/line/others'
import Button from '@/app/components/base/button'
import { DataSourceProvider } from '@/models/common'

const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  onConfig: () => void
  provider: DataSourceProvider
}

const NoData: FC<Props> = ({
  onConfig,
  provider,
}) => {
  const { t } = useTranslation()

  // per-provider copy for the "not configured" card
  const providerConfig = {
    [DataSourceProvider.jinaReader]: {
      emoji: <span className={s.jinaLogo} />,
      title: t(`${I18N_PREFIX}.jinaReaderNotConfigured`),
      description: t(`${I18N_PREFIX}.jinaReaderNotConfiguredDescription`),
    },
    [DataSourceProvider.fireCrawl]: {
      emoji: '🔥',
      title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
      description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
    },
  }

  const currentProvider = providerConfig[provider]

  return (
    <>
      <div className='max-w-[640px] p-6 rounded-2xl bg-gray-50 mt-4'>
        <div className='flex w-11 h-11 items-center justify-center bg-gray-50 rounded-xl border-[0.5px] border-gray-100 shadow-lg'>
          {currentProvider.emoji}
        </div>
        <div className='my-2'>
          <span className='text-gray-700 font-semibold'>{currentProvider.title}<Icon3Dots className='inline relative -top-3 -left-1.5' /></span>
          <div className='mt-1 pb-3 text-gray-500 text-[13px] font-normal'>
            {currentProvider.description}
          </div>
        </div>
        <Button variant='primary' onClick={onConfig}>
          {t(`${I18N_PREFIX}.configure`)}
        </Button>
      </div>
    </>
  )
}
export default React.memo(NoData)

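The `providerConfig` lookup above is keyed by enum members but left untyped. A hedged sketch of how a `Record` annotation would make the lookup exhaustive; the enum string values are assumed, and the copy strings are placeholders.

import type { ReactNode } from 'react'

enum DataSourceProvider { fireCrawl = 'firecrawl', jinaReader = 'jinareader' } // assumed values

type ProviderCopy = {
  emoji: ReactNode
  title: string
  description: string
}

// Record<DataSourceProvider, ProviderCopy> forces an entry per provider:
// adding a new enum member without matching copy becomes a compile error.
const providerCopy: Record<DataSourceProvider, ProviderCopy> = {
  [DataSourceProvider.fireCrawl]: { emoji: '🔥', title: 'Firecrawl is not configured', description: 'Configure an API key first.' },
  [DataSourceProvider.jinaReader]: { emoji: '📖', title: 'Jina Reader is not configured', description: 'Configure an API key first.' },
}
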
Some files were not shown because too many files have changed in this diff.