Mirror of https://github.com/langgenius/dify.git (synced 2026-01-07 23:04:12 +00:00)

Compare commits: 14 commits, 0.10.1 ... feat/exter
| SHA1 |
|---|
| 37f7d5732a |
| dcb033d221 |
| 9f894bb3b3 |
| 89e81873c4 |
| 9ca0e56a8a |
| e7c77d961b |
| a63e15081f |
| 0724640bbb |
| cb70e12827 |
| 067b956b2c |
| e7762b731c |
| f6c8390b0b |
| 4fd57929df |
| 517cdb2ca4 |
@@ -37,7 +37,7 @@ from .auth import activate, data_source_bearer_auth, data_source_oauth, forgot_p
 from .billing import billing
 
 # Import datasets controllers
-from .datasets import data_source, datasets, datasets_document, datasets_segments, file, hit_testing, website
+from .datasets import data_source, datasets, datasets_document, datasets_segments, external, file, hit_testing, website
 
 # Import explore controllers
 from .explore import (
@@ -110,6 +110,26 @@ class DatasetListApi(Resource):
             nullable=True,
             help="Invalid indexing technique.",
         )
+        parser.add_argument(
+            "external_api_template_id",
+            type=str,
+            nullable=True,
+            required=False,
+        )
+        parser.add_argument(
+            "provider",
+            type=str,
+            nullable=True,
+            choices=Dataset.PROVIDER_LIST,
+            required=False,
+            default="vendor",
+        )
+        parser.add_argument(
+            "external_knowledge_id",
+            type=str,
+            nullable=True,
+            required=False,
+        )
         args = parser.parse_args()
 
         # The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
@@ -123,6 +143,9 @@ class DatasetListApi(Resource):
                 indexing_technique=args["indexing_technique"],
                 account=current_user,
                 permission=DatasetPermissionEnum.ONLY_ME,
+                provider=args["provider"],
+                external_api_template_id=args["external_api_template_id"],
+                external_knowledge_id=args["external_knowledge_id"],
             )
         except services.errors.dataset.DatasetNameDuplicateError:
             raise DatasetNameDuplicateError()
api/controllers/console/datasets/external.py (new file, 254 lines)
@@ -0,0 +1,254 @@
from flask import request
from flask_login import current_user
from flask_restful import Resource, marshal, reqparse
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import ProviderNotInitializeError
from controllers.console.datasets.error import DatasetNameDuplicateError
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required
from fields.dataset_fields import dataset_detail_fields
from libs.login import login_required
from services.external_knowledge_service import ExternalDatasetService


def _validate_name(name):
    if not name or len(name) < 1 or len(name) > 100:
        raise ValueError("Name must be between 1 to 100 characters.")
    return name


def _validate_description_length(description):
    if len(description) > 400:
        raise ValueError("Description cannot exceed 400 characters.")
    return description


class ExternalApiTemplateListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        page = request.args.get("page", default=1, type=int)
        limit = request.args.get("limit", default=20, type=int)
        search = request.args.get("keyword", default=None, type=str)

        api_templates, total = ExternalDatasetService.get_external_api_templates(
            page, limit, current_user.current_tenant_id, search
        )
        response = {
            "data": [item.to_dict() for item in api_templates],
            "has_more": len(api_templates) == limit,
            "limit": limit,
            "total": total,
            "page": page,
        }
        return response, 200

    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="Name is required. Name must be between 1 to 100 characters.",
            type=_validate_name,
        )
        parser.add_argument(
            "description",
            nullable=False,
            required=True,
            help="Description is required. Description must be between 1 to 400 characters.",
            type=_validate_description_length,
        )
        parser.add_argument(
            "settings",
            type=dict,
            location="json",
            nullable=False,
            required=True,
        )
        args = parser.parse_args()

        ExternalDatasetService.validate_api_list(args["settings"])

        # The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            api_template = ExternalDatasetService.create_api_template(
                tenant_id=current_user.current_tenant_id, user_id=current_user.id, args=args
            )
        except services.errors.dataset.DatasetNameDuplicateError:
            raise DatasetNameDuplicateError()

        return api_template.to_dict(), 201


class ExternalApiTemplateApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, api_template_id):
        api_template_id = str(api_template_id)
        api_template = ExternalDatasetService.get_api_template(api_template_id)
        if api_template is None:
            raise NotFound("API template not found.")

        return api_template.to_dict(), 200

    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, api_template_id):
        api_template_id = str(api_template_id)

        parser = reqparse.RequestParser()
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="Name is required. Name must be between 1 to 100 characters.",
            type=_validate_name,
        )
        parser.add_argument(
            "description",
            nullable=False,
            required=True,
            help="Description is required. Description must be between 1 to 400 characters.",
            type=_validate_description_length,
        )
        parser.add_argument(
            "settings",
            type=dict,
            location="json",
            nullable=False,
            required=True,
        )
        args = parser.parse_args()
        ExternalDatasetService.validate_api_list(args["settings"])

        api_template = ExternalDatasetService.update_api_template(
            tenant_id=current_user.current_tenant_id,
            user_id=current_user.id,
            api_template_id=api_template_id,
            args=args,
        )

        return api_template.to_dict(), 200

    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, api_template_id):
        api_template_id = str(api_template_id)

        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor or current_user.is_dataset_operator:
            raise Forbidden()

        ExternalDatasetService.delete_api_template(current_user.current_tenant_id, api_template_id)
        return {"result": "success"}, 204


class ExternalApiUseCheckApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, api_template_id):
        api_template_id = str(api_template_id)

        external_api_template_is_using = ExternalDatasetService.external_api_template_use_check(api_template_id)
        return {"is_using": external_api_template_is_using}, 200


class ExternalDatasetInitApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument("api_template_id", type=str, required=True, nullable=True, location="json")
        # parser.add_argument('name', nullable=False, required=True,
        #                     help='name is required. Name must be between 1 to 100 characters.',
        #                     type=_validate_name)
        # parser.add_argument('description', type=str, required=True, nullable=True, location='json')
        parser.add_argument("data_source", type=dict, required=True, nullable=True, location="json")
        parser.add_argument("process_parameter", type=dict, required=True, nullable=True, location="json")

        args = parser.parse_args()

        # The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        # validate args
        ExternalDatasetService.document_create_args_validate(
            current_user.current_tenant_id, args["api_template_id"], args["process_parameter"]
        )

        try:
            dataset, documents, batch = ExternalDatasetService.init_external_dataset(
                tenant_id=current_user.current_tenant_id,
                user_id=current_user.id,
                args=args,
            )
        except Exception as ex:
            raise ProviderNotInitializeError(ex.description)
        response = {"dataset": dataset, "documents": documents, "batch": batch}

        return response


class ExternalDatasetCreateApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        # The role of the current user in the ta table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument("external_api_template_id", type=str, required=True, nullable=False, location="json")
        parser.add_argument("external_knowledge_id", type=str, required=True, nullable=False, location="json")
        parser.add_argument(
            "name",
            nullable=False,
            required=True,
            help="name is required. Name must be between 1 to 100 characters.",
            type=_validate_name,
        )
        parser.add_argument("description", type=str, required=True, nullable=True, location="json")

        args = parser.parse_args()

        # The role of the current user in the ta table must be admin, owner, or editor, or dataset_operator
        if not current_user.is_dataset_editor:
            raise Forbidden()

        try:
            dataset = ExternalDatasetService.create_external_dataset(
                tenant_id=current_user.current_tenant_id,
                user_id=current_user.id,
                args=args,
            )
        except services.errors.dataset.DatasetNameDuplicateError:
            raise DatasetNameDuplicateError()

        return marshal(dataset, dataset_detail_fields), 201


api.add_resource(ExternalApiTemplateListApi, "/datasets/external-api-template")
api.add_resource(ExternalApiTemplateApi, "/datasets/external-api-template/<uuid:api_template_id>")
api.add_resource(ExternalApiUseCheckApi, "/datasets/external-api-template/<uuid:api_template_id>/use-check")
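A quick sketch of how a console client could exercise the template routes registered above. The base URL and bearer-token header are assumptions for illustration (the console API normally rides on a logged-in session); the payload keys mirror the reqparse arguments.

import requests  # assumed available; any HTTP client works

BASE = "http://localhost:5001/console/api"  # hypothetical local console URL
HEADERS = {"Authorization": "Bearer <console-access-token>"}  # placeholder auth

# Create an external API template; keys mirror the parser in the POST handler.
payload = {
    "name": "my-external-kb",
    "description": "Bridge to an external knowledge store.",
    "settings": {"endpoint": "https://kb.example.com", "api_key": "<key>"},
}
template = requests.post(f"{BASE}/datasets/external-api-template", json=payload, headers=HEADERS).json()

# Check whether the template is bound to any dataset before deleting it.
in_use = requests.get(
    f"{BASE}/datasets/external-api-template/{template['id']}/use-check", headers=HEADERS
).json()["is_using"]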
@@ -47,6 +47,7 @@ class HitTestingApi(Resource):
         parser = reqparse.RequestParser()
         parser.add_argument("query", type=str, location="json")
         parser.add_argument("retrieval_model", type=dict, required=False, location="json")
+        parser.add_argument("external_retrival_model", type=dict, required=False, location="json")
         args = parser.parse_args()
 
         HitTestingService.hit_testing_args_check(args)
@@ -57,6 +58,7 @@ class HitTestingApi(Resource):
                 query=args["query"],
                 account=current_user,
                 retrieval_model=args["retrieval_model"],
+                external_retrieval_model=args["external_retrival_model"],
                 limit=10,
             )
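For reference, a request body the extended hit-testing endpoint would accept; the field name external_retrival_model (spelled as in the code) only takes effect for datasets whose provider is "external". Values are illustrative.

hit_testing_body = {
    "query": "What is our refund policy?",
    "retrieval_model": {"search_method": "semantic_search", "top_k": 5},
    # Consulted only when the dataset provider is "external".
    "external_retrival_model": {"top_k": 5, "score_threshold": 0.5},
}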
api/controllers/console/datasets/test_external.py (new file, 49 lines)
@@ -0,0 +1,49 @@
from flask import request
from flask_login import current_user
from flask_restful import Resource, marshal, reqparse
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import ProviderNotInitializeError
from controllers.console.datasets.error import DatasetNameDuplicateError
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required
from fields.dataset_fields import dataset_detail_fields
from libs.login import login_required
from services.external_knowledge_service import ExternalDatasetService


class TestExternalApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self):
        parser = reqparse.RequestParser()
        parser.add_argument(
            "top_k",
            nullable=False,
            required=True,
            type=int,
        )
        parser.add_argument(
            "score_threshold",
            nullable=False,
            required=True,
            type=float,
        )
        args = parser.parse_args()
        result = ExternalDatasetService.test_external_knowledge_retrival(
            args["top_k"], args["score_threshold"]
        )
        return result, 200


api.add_resource(TestExternalApi, "/dify/external-knowledge/retrival-documents")
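This stub registers the same path that fetch_external_knowledge_retrival (in the service added below) POSTs to, so its parser doubles as a hint at the provider-side contract. An inferred, non-normative example of that request body:

# Body POSTed to <endpoint>/dify/external-knowledge/retrival-documents;
# "query" is injected by the service, the rest comes from the caller's
# external retrieval settings. Inferred from the parsers above.
provider_request = {
    "query": "What is our refund policy?",
    "top_k": 5,
    "score_threshold": 0.5,
}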
@@ -82,6 +82,26 @@ class DatasetListApi(DatasetApiResource):
             required=False,
             nullable=False,
         )
+        parser.add_argument(
+            "external_api_template_id",
+            type=str,
+            nullable=True,
+            required=False,
+        )
+        parser.add_argument(
+            "provider",
+            type=str,
+            nullable=True,
+            required=False,
+            default="vendor",
+        )
+        parser.add_argument(
+            "external_knowledge_id",
+            type=str,
+            nullable=True,
+            required=False,
+        )
         args = parser.parse_args()
 
         try:
@@ -91,6 +111,9 @@ class DatasetListApi(DatasetApiResource):
                 indexing_technique=args["indexing_technique"],
                 account=current_user,
                 permission=args["permission"],
+                provider=args["provider"],
+                external_api_template_id=args["external_api_template_id"],
+                external_knowledge_id=args["external_knowledge_id"],
             )
         except services.errors.dataset.DatasetNameDuplicateError:
             raise DatasetNameDuplicateError()
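A hedged sketch of creating an external dataset through the service API with the new fields; the route and token scheme follow Dify's existing dataset service API, and all values are placeholders.

import requests

resp = requests.post(
    "http://localhost:5001/v1/datasets",  # assumed service-API base URL
    headers={"Authorization": "Bearer <dataset-api-key>"},
    json={
        "name": "external-kb",
        "provider": "external",
        "external_api_template_id": "<template-uuid>",
        "external_knowledge_id": "<remote-kb-id>",
    },
)
resp.raise_for_status()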
@@ -10,6 +10,7 @@ from core.rag.rerank.constants.rerank_mode import RerankMode
 from core.rag.retrieval.retrieval_methods import RetrievalMethod
 from extensions.ext_database import db
 from models.dataset import Dataset
+from services.external_knowledge_service import ExternalDatasetService
 
 default_retrieval_model = {
     "search_method": RetrievalMethod.SEMANTIC_SEARCH.value,
@@ -22,91 +23,90 @@ default_retrieval_model = {
 
 class RetrievalService:
     @classmethod
-    def retrieve(
-        cls,
-        retrieval_method: str,
-        dataset_id: str,
-        query: str,
-        top_k: int,
-        score_threshold: Optional[float] = 0.0,
-        reranking_model: Optional[dict] = None,
-        reranking_mode: Optional[str] = "reranking_model",
-        weights: Optional[dict] = None,
-    ):
-        dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
-        if not dataset or dataset.available_document_count == 0 or dataset.available_segment_count == 0:
+    def retrieve(cls, retrival_method: str, dataset_id: str, query: str,
+                 top_k: int, score_threshold: Optional[float] = .0,
+                 reranking_model: Optional[dict] = None, reranking_mode: Optional[str] = 'reranking_model',
+                 weights: Optional[dict] = None, provider: Optional[str] = None,
+                 external_retrieval_model: Optional[dict] = None):
+        dataset = db.session.query(Dataset).filter(
+            Dataset.id == dataset_id
+        ).first()
+        if not dataset:
             return []
-        all_documents = []
-        threads = []
-        exceptions = []
-        # retrieval_model source with keyword
-        if retrieval_method == "keyword_search":
-            keyword_thread = threading.Thread(
-                target=RetrievalService.keyword_search,
-                kwargs={
-                    "flask_app": current_app._get_current_object(),
-                    "dataset_id": dataset_id,
-                    "query": query,
-                    "top_k": top_k,
-                    "all_documents": all_documents,
-                    "exceptions": exceptions,
-                },
-            )
-            threads.append(keyword_thread)
-            keyword_thread.start()
-        # retrieval_model source with semantic
-        if RetrievalMethod.is_support_semantic_search(retrieval_method):
-            embedding_thread = threading.Thread(
-                target=RetrievalService.embedding_search,
-                kwargs={
-                    "flask_app": current_app._get_current_object(),
-                    "dataset_id": dataset_id,
-                    "query": query,
-                    "top_k": top_k,
-                    "score_threshold": score_threshold,
-                    "reranking_model": reranking_model,
-                    "all_documents": all_documents,
-                    "retrieval_method": retrieval_method,
-                    "exceptions": exceptions,
-                },
-            )
-            threads.append(embedding_thread)
-            embedding_thread.start()
+        if provider == 'external':
+            all_documents = ExternalDatasetService.fetch_external_knowledge_retrival(
+                dataset.tenant_id,
+                dataset_id,
+                query,
+                external_retrieval_model
+            )
+        else:
+            if not dataset or dataset.available_document_count == 0 or dataset.available_segment_count == 0:
+                return []
+            all_documents = []
+            threads = []
+            exceptions = []
+            # retrieval_model source with keyword
+            if retrival_method == 'keyword_search':
+                keyword_thread = threading.Thread(target=RetrievalService.keyword_search, kwargs={
+                    'flask_app': current_app._get_current_object(),
+                    'dataset_id': dataset_id,
+                    'query': query,
+                    'top_k': top_k,
+                    'all_documents': all_documents,
+                    'exceptions': exceptions,
+                })
+                threads.append(keyword_thread)
+                keyword_thread.start()
+            # retrieval_model source with semantic
+            if RetrievalMethod.is_support_semantic_search(retrival_method):
+                embedding_thread = threading.Thread(target=RetrievalService.embedding_search, kwargs={
+                    'flask_app': current_app._get_current_object(),
+                    'dataset_id': dataset_id,
+                    'query': query,
+                    'top_k': top_k,
+                    'score_threshold': score_threshold,
+                    'reranking_model': reranking_model,
+                    'all_documents': all_documents,
+                    'retrival_method': retrival_method,
+                    'exceptions': exceptions,
+                })
+                threads.append(embedding_thread)
+                embedding_thread.start()
 
-        # retrieval source with full text
-        if RetrievalMethod.is_support_fulltext_search(retrieval_method):
-            full_text_index_thread = threading.Thread(
-                target=RetrievalService.full_text_index_search,
-                kwargs={
-                    "flask_app": current_app._get_current_object(),
-                    "dataset_id": dataset_id,
-                    "query": query,
-                    "retrieval_method": retrieval_method,
-                    "score_threshold": score_threshold,
-                    "top_k": top_k,
-                    "reranking_model": reranking_model,
-                    "all_documents": all_documents,
-                    "exceptions": exceptions,
-                },
-            )
-            threads.append(full_text_index_thread)
-            full_text_index_thread.start()
+            # retrieval source with full text
+            if RetrievalMethod.is_support_fulltext_search(retrival_method):
+                full_text_index_thread = threading.Thread(target=RetrievalService.full_text_index_search, kwargs={
+                    'flask_app': current_app._get_current_object(),
+                    'dataset_id': dataset_id,
+                    'query': query,
+                    'retrival_method': retrival_method,
+                    'score_threshold': score_threshold,
+                    'top_k': top_k,
+                    'reranking_model': reranking_model,
+                    'all_documents': all_documents,
+                    'exceptions': exceptions,
+                })
+                threads.append(full_text_index_thread)
+                full_text_index_thread.start()
 
-        for thread in threads:
-            thread.join()
+            for thread in threads:
+                thread.join()
 
-        if exceptions:
-            exception_message = ";\n".join(exceptions)
-            raise Exception(exception_message)
+            if exceptions:
+                exception_message = ';\n'.join(exceptions)
+                raise Exception(exception_message)
 
-        if retrieval_method == RetrievalMethod.HYBRID_SEARCH.value:
-            data_post_processor = DataPostProcessor(
-                str(dataset.tenant_id), reranking_mode, reranking_model, weights, False
-            )
-            all_documents = data_post_processor.invoke(
-                query=query, documents=all_documents, score_threshold=score_threshold, top_n=top_k
-            )
-        return all_documents
+            if retrival_method == RetrievalMethod.HYBRID_SEARCH.value:
+                data_post_processor = DataPostProcessor(str(dataset.tenant_id), reranking_mode,
+                                                        reranking_model, weights, False)
+                all_documents = data_post_processor.invoke(
+                    query=query,
+                    documents=all_documents,
+                    score_threshold=score_threshold,
+                    top_n=top_k
+                )
+        return all_documents
 
     @classmethod
     def keyword_search(
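The new branch short-circuits the local keyword/semantic/full-text threads whenever the dataset's provider is "external" and delegates to the external service instead. A minimal usage sketch, assuming a Flask app context and placeholder IDs:

docs = RetrievalService.retrieve(
    retrival_method="semantic_search",  # parameter name as in the new signature
    dataset_id="<dataset-uuid>",
    query="refund policy",
    top_k=5,
    provider="external",                # triggers the external fetch path
    external_retrieval_model={"top_k": 5, "score_threshold": 0.5},
)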
api/fields/external_dataset_fields.py (new file, 11 lines)
@@ -0,0 +1,11 @@
from flask_restful import fields

from libs.helper import TimestampField

api_template_query_detail_fields = {
    "id": fields.String,
    "name": fields.String,
    "setting": fields.String,
    "created_by": fields.String,
    "created_at": TimestampField,
}
@@ -0,0 +1,74 @@
"""external_knowledge

Revision ID: ec3df697ebbb
Revises: 675b5321501b
Create Date: 2024-09-18 06:59:54.048478

"""
from alembic import op
import models as models
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql

# revision identifiers, used by Alembic.
revision = 'ec3df697ebbb'
down_revision = '675b5321501b'
branch_labels = None
depends_on = None


def upgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    op.create_table('external_api_templates',
        sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
        sa.Column('name', sa.String(length=255), nullable=False),
        sa.Column('description', sa.String(length=255), nullable=False),
        sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
        sa.Column('settings', sa.Text(), nullable=True),
        sa.Column('created_by', models.types.StringUUID(), nullable=False),
        sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
        sa.Column('updated_by', models.types.StringUUID(), nullable=True),
        sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
        sa.PrimaryKeyConstraint('id', name='external_api_template_pkey')
    )
    with op.batch_alter_table('external_api_templates', schema=None) as batch_op:
        batch_op.create_index('external_api_templates_name_idx', ['name'], unique=False)
        batch_op.create_index('external_api_templates_tenant_idx', ['tenant_id'], unique=False)

    op.create_table('external_knowledge_bindings',
        sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
        sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
        sa.Column('external_api_template_id', models.types.StringUUID(), nullable=False),
        sa.Column('dataset_id', models.types.StringUUID(), nullable=False),
        sa.Column('external_knowledge_id', sa.Text(), nullable=False),
        sa.Column('created_by', models.types.StringUUID(), nullable=False),
        sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
        sa.Column('updated_by', models.types.StringUUID(), nullable=True),
        sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
        sa.PrimaryKeyConstraint('id', name='external_knowledge_bindings_pkey')
    )
    with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
        batch_op.create_index('external_knowledge_bindings_dataset_idx', ['dataset_id'], unique=False)
        batch_op.create_index('external_knowledge_bindings_external_api_template_idx', ['external_api_template_id'], unique=False)
        batch_op.create_index('external_knowledge_bindings_external_knowledge_idx', ['external_knowledge_id'], unique=False)
        batch_op.create_index('external_knowledge_bindings_tenant_idx', ['tenant_id'], unique=False)

    # ### end Alembic commands ###


def downgrade():
    # ### commands auto generated by Alembic - please adjust! ###
    with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
        batch_op.drop_index('external_knowledge_bindings_tenant_idx')
        batch_op.drop_index('external_knowledge_bindings_external_knowledge_idx')
        batch_op.drop_index('external_knowledge_bindings_external_api_template_idx')
        batch_op.drop_index('external_knowledge_bindings_dataset_idx')

    op.drop_table('external_knowledge_bindings')
    with op.batch_alter_table('external_api_templates', schema=None) as batch_op:
        batch_op.drop_index('external_api_templates_tenant_idx')
        batch_op.drop_index('external_api_templates_name_idx')

    op.drop_table('external_api_templates')
    # ### end Alembic commands ###
@@ -37,7 +37,8 @@ class Dataset(db.Model):
         db.Index("retrieval_model_idx", "retrieval_model", postgresql_using="gin"),
     )
 
-    INDEXING_TECHNIQUE_LIST = ["high_quality", "economy", None]
+    INDEXING_TECHNIQUE_LIST = ['high_quality', 'economy', None]
+    PROVIDER_LIST = ['vendor', 'external', None]
 
     id = db.Column(StringUUID, server_default=db.text("uuid_generate_v4()"))
     tenant_id = db.Column(StringUUID, nullable=False)
@@ -687,3 +688,67 @@ class DatasetPermission(db.Model):
     tenant_id = db.Column(StringUUID, nullable=False)
     has_permission = db.Column(db.Boolean, nullable=False, server_default=db.text("true"))
     created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
+
+
+class ExternalApiTemplates(db.Model):
+    __tablename__ = 'external_api_templates'
+    __table_args__ = (
+        db.PrimaryKeyConstraint('id', name='external_api_template_pkey'),
+        db.Index('external_api_templates_tenant_idx', 'tenant_id'),
+        db.Index('external_api_templates_name_idx', 'name'),
+    )
+
+    id = db.Column(StringUUID, nullable=False,
+                   server_default=db.text('uuid_generate_v4()'))
+    name = db.Column(db.String(255), nullable=False)
+    description = db.Column(db.String(255), nullable=False)
+    tenant_id = db.Column(StringUUID, nullable=False)
+    settings = db.Column(db.Text, nullable=True)
+    created_by = db.Column(StringUUID, nullable=False)
+    created_at = db.Column(db.DateTime, nullable=False,
+                           server_default=db.text('CURRENT_TIMESTAMP(0)'))
+    updated_by = db.Column(StringUUID, nullable=True)
+    updated_at = db.Column(db.DateTime, nullable=False,
+                           server_default=db.text('CURRENT_TIMESTAMP(0)'))
+
+    def to_dict(self):
+        return {
+            'id': self.id,
+            'tenant_id': self.tenant_id,
+            'name': self.name,
+            'description': self.description,
+            'settings': self.settings_dict,
+            'created_by': self.created_by,
+            'created_at': self.created_at.isoformat(),
+        }
+
+    @property
+    def settings_dict(self):
+        try:
+            return json.loads(self.settings) if self.settings else None
+        except JSONDecodeError:
+            return None
+
+
+class ExternalKnowledgeBindings(db.Model):
+    __tablename__ = 'external_knowledge_bindings'
+    __table_args__ = (
+        db.PrimaryKeyConstraint('id', name='external_knowledge_bindings_pkey'),
+        db.Index('external_knowledge_bindings_tenant_idx', 'tenant_id'),
+        db.Index('external_knowledge_bindings_dataset_idx', 'dataset_id'),
+        db.Index('external_knowledge_bindings_external_knowledge_idx', 'external_knowledge_id'),
+        db.Index('external_knowledge_bindings_external_api_template_idx', 'external_api_template_id'),
+    )
+
+    id = db.Column(StringUUID, nullable=False,
+                   server_default=db.text('uuid_generate_v4()'))
+    tenant_id = db.Column(StringUUID, nullable=False)
+    external_api_template_id = db.Column(StringUUID, nullable=False)
+    dataset_id = db.Column(StringUUID, nullable=False)
+    external_knowledge_id = db.Column(db.Text, nullable=False)
+    created_by = db.Column(StringUUID, nullable=False)
+    created_at = db.Column(db.DateTime, nullable=False,
+                           server_default=db.text('CURRENT_TIMESTAMP(0)'))
+    updated_by = db.Column(StringUUID, nullable=True)
+    updated_at = db.Column(db.DateTime, nullable=False,
+                           server_default=db.text('CURRENT_TIMESTAMP(0)'))
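A sketch of how the two new models resolve a dataset to its remote knowledge base, assuming a Flask-SQLAlchemy session and placeholder IDs:

binding = ExternalKnowledgeBindings.query.filter_by(
    dataset_id="<dataset-uuid>", tenant_id="<tenant-uuid>"
).first()
if binding:
    template = ExternalApiTemplates.query.get(binding.external_api_template_id)
    settings = template.settings_dict  # JSON `settings` column parsed, or None
    remote_kb_id = binding.external_knowledge_id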
api/schedule/clean_unused_messages_task.py (new file, 92 lines)
@@ -0,0 +1,92 @@
import datetime
import time

import click
from sqlalchemy import func
from werkzeug.exceptions import NotFound

import app
from configs import dify_config
from core.rag.index_processor.index_processor_factory import IndexProcessorFactory
from extensions.ext_database import db
from models.dataset import Dataset, DatasetQuery, Document


@app.celery.task(queue="dataset")
def clean_unused_message_task():
    click.echo(click.style("Start clean unused messages.", fg="green"))
    clean_days = int(dify_config.CLEAN_DAY_SETTING)
    start_at = time.perf_counter()
    thirty_days_ago = datetime.datetime.now() - datetime.timedelta(days=clean_days)
    page = 1
    while True:
        try:
            # Subquery for counting new documents
            document_subquery_new = (
                db.session.query(Document.dataset_id, func.count(Document.id).label("document_count"))
                .filter(
                    Document.indexing_status == "completed",
                    Document.enabled == True,
                    Document.archived == False,
                    Document.updated_at > thirty_days_ago,
                )
                .group_by(Document.dataset_id)
                .subquery()
            )

            # Subquery for counting old documents
            document_subquery_old = (
                db.session.query(Document.dataset_id, func.count(Document.id).label("document_count"))
                .filter(
                    Document.indexing_status == "completed",
                    Document.enabled == True,
                    Document.archived == False,
                    Document.updated_at < thirty_days_ago,
                )
                .group_by(Document.dataset_id)
                .subquery()
            )

            # Main query with join and filter
            datasets = (
                db.session.query(Dataset)
                .outerjoin(document_subquery_new, Dataset.id == document_subquery_new.c.dataset_id)
                .outerjoin(document_subquery_old, Dataset.id == document_subquery_old.c.dataset_id)
                .filter(
                    Dataset.created_at < thirty_days_ago,
                    func.coalesce(document_subquery_new.c.document_count, 0) == 0,
                    func.coalesce(document_subquery_old.c.document_count, 0) > 0,
                )
                .order_by(Dataset.created_at.desc())
                .paginate(page=page, per_page=50)
            )

        except NotFound:
            break
        if datasets.items is None or len(datasets.items) == 0:
            break
        page += 1
        for dataset in datasets:
            dataset_query = (
                db.session.query(DatasetQuery)
                .filter(DatasetQuery.created_at > thirty_days_ago, DatasetQuery.dataset_id == dataset.id)
                .all()
            )
            if not dataset_query or len(dataset_query) == 0:
                try:
                    # remove index
                    index_processor = IndexProcessorFactory(dataset.doc_form).init_index_processor()
                    index_processor.clean(dataset, None)

                    # update document
                    update_params = {Document.enabled: False}

                    Document.query.filter_by(dataset_id=dataset.id).update(update_params)
                    db.session.commit()
                    click.echo(click.style("Cleaned unused dataset {} from db success!".format(dataset.id), fg="green"))
                except Exception as e:
                    click.echo(
                        click.style("clean dataset index error: {} {}".format(e.__class__.__name__, str(e)), fg="red")
                    )
    end_at = time.perf_counter()
    click.echo(click.style("Cleaned unused dataset from db success latency: {}".format(end_at - start_at), fg="green"))
@@ -32,6 +32,7 @@ from models.dataset import (
     DatasetQuery,
     Document,
     DocumentSegment,
+    ExternalKnowledgeBindings,
 )
 from models.model import UploadFile
 from models.source import DataSourceOauthBinding
@@ -39,6 +40,7 @@ from services.errors.account import NoPermissionError
 from services.errors.dataset import DatasetNameDuplicateError
 from services.errors.document import DocumentIndexingError
 from services.errors.file import FileNotExistsError
+from services.external_knowledge_service import ExternalDatasetService
 from services.feature_service import FeatureModel, FeatureService
 from services.tag_service import TagService
 from services.vector_service import VectorService
@@ -137,7 +139,14 @@ class DatasetService:
 
     @staticmethod
     def create_empty_dataset(
-        tenant_id: str, name: str, indexing_technique: Optional[str], account: Account, permission: Optional[str] = None
+        tenant_id: str,
+        name: str,
+        indexing_technique: Optional[str],
+        account: Account,
+        permission: Optional[str] = None,
+        provider: str = "vendor",
+        external_api_template_id: Optional[str] = None,
+        external_knowledge_id: Optional[str] = None,
     ):
         # check if dataset name already exists
         if Dataset.query.filter_by(name=name, tenant_id=tenant_id).first():
@@ -156,7 +165,23 @@ class DatasetService:
         dataset.embedding_model_provider = embedding_model.provider if embedding_model else None
         dataset.embedding_model = embedding_model.model if embedding_model else None
         dataset.permission = permission or DatasetPermissionEnum.ONLY_ME
+        dataset.provider = provider
         db.session.add(dataset)
+        db.session.flush()
+
+        if provider == "external" and external_api_template_id:
+            external_api_template = ExternalDatasetService.get_api_template(external_api_template_id)
+            if not external_api_template:
+                raise ValueError("External API template not found.")
+            external_knowledge_binding = ExternalKnowledgeBindings(
+                tenant_id=tenant_id,
+                dataset_id=dataset.id,
+                external_api_template_id=external_api_template_id,
+                external_knowledge_id=external_knowledge_id,
+                created_by=account.id,
+            )
+            db.session.add(external_knowledge_binding)
+
         db.session.commit()
         return dataset
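Usage sketch for the widened signature; `account` stands in for a loaded Account instance, and the external arguments only take effect together with provider="external":

dataset = DatasetService.create_empty_dataset(
    tenant_id=account.current_tenant_id,
    name="external-kb",
    indexing_technique=None,
    account=account,
    provider="external",
    external_api_template_id="<template-uuid>",  # must resolve to a stored template
    external_knowledge_id="<remote-kb-id>",
)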
@@ -0,0 +1,28 @@
from typing import Literal, Optional, Union

from pydantic import BaseModel


class AuthorizationConfig(BaseModel):
    type: Literal[None, "basic", "bearer", "custom"]
    api_key: Union[None, str] = None
    header: Union[None, str] = None


class Authorization(BaseModel):
    type: Literal["no-auth", "api-key"]
    config: Optional[AuthorizationConfig] = None


class ProcessStatusSetting(BaseModel):
    request_method: str
    url: str


class ApiTemplateSetting(BaseModel):
    method: str
    url: str
    request_method: str
    api_token: str
    headers: Optional[dict] = None
    params: Optional[dict] = None
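How these entities are meant to compose, based on assembling_headers() in the service below; all values are placeholders:

auth = Authorization(
    type="api-key",
    config=AuthorizationConfig(type="bearer", api_key="<key>"),
)
setting = ApiTemplateSetting(
    method="POST",
    url="https://kb.example.com/dify/external-knowledge/retrival-documents",
    request_method="post",
    api_token="<key>",
)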
api/services/external_knowledge_service.py (new file, 308 lines)
@@ -0,0 +1,308 @@
import json
import random
import time
from copy import deepcopy
from datetime import datetime, timezone
from typing import Any, Optional, Union

import httpx

from core.helper import ssrf_proxy
from extensions.ext_database import db
from models.dataset import (
    Dataset,
    Document,
    ExternalApiTemplates,
    ExternalKnowledgeBindings,
)
from models.model import UploadFile
from services.entities.external_knowledge_entities.external_knowledge_entities import ApiTemplateSetting, Authorization
from services.errors.dataset import DatasetNameDuplicateError

# from tasks.external_document_indexing_task import external_document_indexing_task


class ExternalDatasetService:
    @staticmethod
    def get_external_api_templates(page, per_page, tenant_id, search=None) -> tuple[list[ExternalApiTemplates], int]:
        query = ExternalApiTemplates.query.filter(ExternalApiTemplates.tenant_id == tenant_id).order_by(
            ExternalApiTemplates.created_at.desc()
        )
        if search:
            query = query.filter(ExternalApiTemplates.name.ilike(f"%{search}%"))

        api_templates = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)

        return api_templates.items, api_templates.total

    @classmethod
    def validate_api_list(cls, api_settings: dict):
        if not api_settings:
            raise ValueError("api list is empty")
        if "endpoint" not in api_settings or not api_settings["endpoint"]:
            raise ValueError("endpoint is required")
        if "api_key" not in api_settings or not api_settings["api_key"]:
            raise ValueError("api_key is required")

    @staticmethod
    def create_api_template(tenant_id: str, user_id: str, args: dict) -> ExternalApiTemplates:
        api_template = ExternalApiTemplates(
            tenant_id=tenant_id,
            created_by=user_id,
            updated_by=user_id,
            name=args.get("name"),
            description=args.get("description", ""),
            settings=json.dumps(args.get("settings"), ensure_ascii=False),
        )

        db.session.add(api_template)
        db.session.commit()
        return api_template

    @staticmethod
    def get_api_template(api_template_id: str) -> ExternalApiTemplates:
        return ExternalApiTemplates.query.filter_by(id=api_template_id).first()

    @staticmethod
    def update_api_template(tenant_id, user_id, api_template_id, args) -> ExternalApiTemplates:
        api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
        if api_template is None:
            raise ValueError("api template not found")

        api_template.name = args.get("name")
        api_template.description = args.get("description", "")
        api_template.settings = json.dumps(args.get("settings"), ensure_ascii=False)
        api_template.updated_by = user_id
        api_template.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()

        return api_template

    @staticmethod
    def delete_api_template(tenant_id: str, api_template_id: str):
        api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
        if api_template is None:
            raise ValueError("api template not found")

        db.session.delete(api_template)
        db.session.commit()

    @staticmethod
    def external_api_template_use_check(api_template_id: str) -> bool:
        count = ExternalKnowledgeBindings.query.filter_by(external_api_template_id=api_template_id).count()
        if count > 0:
            return True
        return False

    @staticmethod
    def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
        external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
            dataset_id=dataset_id, tenant_id=tenant_id
        ).first()
        if not external_knowledge_binding:
            raise ValueError("external knowledge binding not found")
        return external_knowledge_binding

    @staticmethod
    def document_create_args_validate(tenant_id: str, api_template_id: str, process_parameter: dict):
        api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
        if api_template is None:
            raise ValueError("api template not found")
        settings = json.loads(api_template.settings)
        for setting in settings:
            custom_parameters = setting.get("document_process_setting")
            if custom_parameters:
                for parameter in custom_parameters:
                    if parameter.get("required", False) and not process_parameter.get(parameter.get("name")):
                        raise ValueError(f'{parameter.get("name")} is required')

    @staticmethod
    def init_external_dataset(tenant_id: str, user_id: str, args: dict, created_from: str = "web"):
        api_template_id = args.get("api_template_id")

        data_source = args.get("data_source")
        if data_source is None:
            raise ValueError("data source is required")

        process_parameter = args.get("process_parameter")
        api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
        if api_template is None:
            raise ValueError("api template not found")

        dataset = Dataset(
            tenant_id=tenant_id,
            name=args.get("name"),
            description=args.get("description", ""),
            provider="external",
            created_by=user_id,
        )

        db.session.add(dataset)
        db.session.flush()

        document = Document.query.filter_by(dataset_id=dataset.id).order_by(Document.position.desc()).first()

        position = document.position + 1 if document else 1

        batch = time.strftime("%Y%m%d%H%M%S") + str(random.randint(100000, 999999))
        document_ids = []
        if data_source["type"] == "upload_file":
            upload_file_list = data_source["info_list"]["file_info_list"]["file_ids"]
            for file_id in upload_file_list:
                file = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                    .first()
                )
                if file:
                    data_source_info = {
                        "upload_file_id": file_id,
                    }
                    document = Document(
                        tenant_id=dataset.tenant_id,
                        dataset_id=dataset.id,
                        position=position,
                        data_source_type=data_source["type"],
                        data_source_info=json.dumps(data_source_info),
                        batch=batch,
                        name=file.name,
                        created_from=created_from,
                        created_by=user_id,
                    )
                    position += 1
                    db.session.add(document)
                    db.session.flush()
                    document_ids.append(document.id)
        db.session.commit()
        # external_document_indexing_task.delay(dataset.id, api_template_id, data_source, process_parameter)

        return dataset, document_ids, batch

    @staticmethod
    def process_external_api(settings: ApiTemplateSetting, files: Union[None, dict[str, Any]]) -> httpx.Response:
        """
        do http request depending on api bundle
        """

        kwargs = {
            "url": settings.url,
            "headers": settings.headers,
            "follow_redirects": True,
        }

        response = getattr(ssrf_proxy, settings.request_method)(data=settings.params, files=files, **kwargs)

        return response

    @staticmethod
    def assembling_headers(authorization: Authorization, headers: Optional[dict] = None) -> dict[str, Any]:
        authorization = deepcopy(authorization)
        if headers:
            headers = deepcopy(headers)
        else:
            headers = {}
        if authorization.type == "api-key":
            if authorization.config is None:
                raise ValueError("authorization config is required")

            if authorization.config.api_key is None:
                raise ValueError("api_key is required")

            if not authorization.config.header:
                authorization.config.header = "Authorization"

            if authorization.config.type == "bearer":
                headers[authorization.config.header] = f"Bearer {authorization.config.api_key}"
            elif authorization.config.type == "basic":
                headers[authorization.config.header] = f"Basic {authorization.config.api_key}"
            elif authorization.config.type == "custom":
                headers[authorization.config.header] = authorization.config.api_key

        return headers

    @staticmethod
    def get_api_template_settings(settings: dict) -> ApiTemplateSetting:
        return ApiTemplateSetting.parse_obj(settings)

    @staticmethod
    def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
        # check if dataset name already exists
        if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first():
            raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.")
        api_template = ExternalApiTemplates.query.filter_by(
            id=args.get("external_api_template_id"), tenant_id=tenant_id
        ).first()

        if api_template is None:
            raise ValueError("api template not found")

        dataset = Dataset(
            tenant_id=tenant_id,
            name=args.get("name"),
            description=args.get("description", ""),
            provider="external",
            created_by=user_id,
        )

        db.session.add(dataset)
        db.session.flush()

        external_knowledge_binding = ExternalKnowledgeBindings(
            tenant_id=tenant_id,
            dataset_id=dataset.id,
            external_api_template_id=args.get("external_api_template_id"),
            external_knowledge_id=args.get("external_knowledge_id"),
            created_by=user_id,
        )
        db.session.add(external_knowledge_binding)

        db.session.commit()

        return dataset

    @staticmethod
    def fetch_external_knowledge_retrival(
        tenant_id: str, dataset_id: str, query: str, external_retrival_parameters: dict
    ):
        external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
            dataset_id=dataset_id, tenant_id=tenant_id
        ).first()
        if not external_knowledge_binding:
            raise ValueError("external knowledge binding not found")

        external_api_template = ExternalApiTemplates.query.filter_by(
            id=external_knowledge_binding.external_api_template_id
        ).first()
        if not external_api_template:
            raise ValueError("external api template not found")

        settings = json.loads(external_api_template.settings)
        headers = {}
        if settings.get("api_token"):
            headers["Authorization"] = f"Bearer {settings.get('api_token')}"

        external_retrival_parameters["query"] = query

        api_template_setting = {
            "url": f"{settings.get('endpoint')}/dify/external-knowledge/retrival-documents",
            "request_method": "post",
            "headers": headers,
            "params": external_retrival_parameters,
        }
        response = ExternalDatasetService.process_external_api(ApiTemplateSetting(**api_template_setting), None)
        return response.json()

    @staticmethod
    def test_external_knowledge_retrival(top_k: int, score_threshold: float):
        api_template_setting = {
            "url": f"{settings.get('endpoint')}/dify/external-knowledge/retrival-documents",
            "request_method": "post",
            "headers": settings.get("headers"),
            "params": {
                "top_k": top_k,
                "score_threshold": score_threshold,
            },
        }
        response = ExternalDatasetService.process_external_api(ApiTemplateSetting(**api_template_setting), None)
        return response.json()
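The shape of the `settings` JSON this service stores and reads back, inferred from validate_api_list() and fetch_external_knowledge_retrival(); only the first two keys are enforced:

template_settings = {
    "endpoint": "https://kb.example.com",  # required
    "api_key": "<key>",                    # required
    "api_token": "<bearer-token>",         # optional; becomes an Authorization header
    "headers": {"Content-Type": "application/json"},  # optional extra headers
}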
@@ -19,7 +19,15 @@ default_retrieval_model = {
 
 class HitTestingService:
     @classmethod
-    def retrieve(cls, dataset: Dataset, query: str, account: Account, retrieval_model: dict, limit: int = 10) -> dict:
+    def retrieve(
+        cls,
+        dataset: Dataset,
+        query: str,
+        account: Account,
+        retrieval_model: dict,
+        external_retrieval_model: dict,
+        limit: int = 10,
+    ) -> dict:
         if dataset.available_document_count == 0 or dataset.available_segment_count == 0:
             return {
                 "query": {
@@ -48,6 +56,8 @@ class HitTestingService:
                 else None,
                 reranking_mode=retrieval_model.get("reranking_mode") or "reranking_model",
                 weights=retrieval_model.get("weights", None),
+                provider=dataset.provider,
+                external_retrieval_model=external_retrieval_model,
             )
 
         end = time.perf_counter()
api/tasks/external_document_indexing_task.py (new file, 85 lines)
@@ -0,0 +1,85 @@
import json
import logging
import time

import click
from celery import shared_task

from core.indexing_runner import DocumentIsPausedException
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import Dataset, ExternalApiTemplates
from models.model import UploadFile
from services.external_knowledge_service import ExternalDatasetService


@shared_task(queue="dataset")
def external_document_indexing_task(dataset_id: str, api_template_id: str, data_source: dict, process_parameter: dict):
    """
    Async process document
    :param dataset_id:
    :param api_template_id:
    :param data_source:
    :param process_parameter:

    Usage: external_document_indexing_task.delay(dataset_id, document_id)
    """
    start_at = time.perf_counter()

    dataset = db.session.query(Dataset).filter(Dataset.id == dataset_id).first()
    if not dataset:
        logging.info(
            click.style("Processed external dataset: {} failed, dataset not exist.".format(dataset_id), fg="red")
        )
        return

    # get external api template
    api_template = (
        db.session.query(ExternalApiTemplates)
        .filter(ExternalApiTemplates.id == api_template_id, ExternalApiTemplates.tenant_id == dataset.tenant_id)
        .first()
    )

    if not api_template:
        logging.info(
            click.style(
                "Processed external dataset: {} failed, api template: {} not exist.".format(dataset_id, api_template_id),
                fg="red",
            )
        )
        return
    files = {}
    if data_source["type"] == "upload_file":
        upload_file_list = data_source["info_list"]["file_info_list"]["file_ids"]
        for file_id in upload_file_list:
            file = (
                db.session.query(UploadFile)
                .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                .first()
            )
            if file:
                files[file.id] = (file.name, storage.load_once(file.key), file.mime_type)
    try:
        settings = ExternalDatasetService.get_api_template_settings(json.loads(api_template.settings))
        # assemble headers
        headers = ExternalDatasetService.assembling_headers(settings.authorization, settings.headers)

        # do http request
        response = ExternalDatasetService.process_external_api(settings, headers, process_parameter, files)
        job_id = response.json().get("job_id")
        if job_id:
            # save job_id to dataset
            dataset.job_id = job_id
            db.session.commit()

        end_at = time.perf_counter()
        logging.info(
            click.style(
                "Processed external dataset: {} successful, latency: {}".format(dataset.id, end_at - start_at),
                fg="green",
            )
        )
    except DocumentIsPausedException as ex:
        logging.info(click.style(str(ex), fg="yellow"))

    except Exception:
        pass
docker/unstructured.yaml (new file, 24 lines)
@@ -0,0 +1,24 @@
# unstructured
# (if used, you need to set ETL_TYPE to Unstructured in the api & worker service.)
unstructured:
  image: downloads.unstructured.io/unstructured-io/unstructured-api:latest
  profiles:
    - unstructured
  restart: always
  volumes:
    - ./volumes/unstructured:/app/data

networks:
  # create a network between sandbox, api and ssrf_proxy, and can not access outside.
  ssrf_proxy_network:
    driver: bridge
    internal: true
  milvus:
    driver: bridge
  opensearch-net:
    driver: bridge
    internal: true

volumes:
  oradata: