diff --git a/ChatQnA/benchmark/performance/vllm/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/vllm/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml new file mode 100644 index 000000000..d587c217f --- /dev/null +++ b/ChatQnA/benchmark/performance/vllm/with_rerank/eight_gaudi/eight_gaudi_with_rerank.yaml @@ -0,0 +1,507 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 8 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 8 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 63 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1280' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tei-gaudi:latest + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 8 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/performance/vllm/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/vllm/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml new file mode 100644 index 000000000..b9d023c3a --- /dev/null +++ b/ChatQnA/benchmark/performance/vllm/with_rerank/four_gaudi/tuned_four_gaudi_with_rerank.yaml @@ -0,0 +1,507 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 31 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1280' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tei-gaudi:latest + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/performance/vllm/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/vllm/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml new file mode 100644 index 000000000..9a7da2f45 --- /dev/null +++ b/ChatQnA/benchmark/performance/vllm/with_rerank/single_gaudi/tuned_single_gaudi_with_rerank.yaml @@ -0,0 +1,507 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 7 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1280' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tei-gaudi:latest + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/performance/vllm/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml b/ChatQnA/benchmark/performance/vllm/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml new file mode 100644 index 000000000..58cbfaf80 --- /dev/null +++ b/ChatQnA/benchmark/performance/vllm/with_rerank/two_gaudi/tuned_two_gaudi_with_rerank.yaml @@ -0,0 +1,507 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 15 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1280' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tei-gaudi:latest + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/performance/vllm/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/vllm/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml new file mode 100644 index 000000000..28999d4fe --- /dev/null +++ b/ChatQnA/benchmark/performance/vllm/without_rerank/eight_gaudi/tuned_eight_gaudi_without_rerank.yaml @@ -0,0 +1,421 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 8 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna-without-rerank:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 8 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 64 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1280' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 8 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/performance/vllm/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/vllm/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml new file mode 100644 index 000000000..91c8b3a7a --- /dev/null +++ b/ChatQnA/benchmark/performance/vllm/without_rerank/four_gaudi/tuned_four_gaudi_without_rerank.yaml @@ -0,0 +1,421 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna-without-rerank:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 32 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1280' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 4 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/performance/vllm/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/vllm/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml new file mode 100644 index 000000000..9c3e4ef6b --- /dev/null +++ b/ChatQnA/benchmark/performance/vllm/without_rerank/single_gaudi/tuned_single_gaudi_without_rerank.yaml @@ -0,0 +1,421 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna-without-rerank:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 8 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1280' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- diff --git a/ChatQnA/benchmark/performance/vllm/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml b/ChatQnA/benchmark/performance/vllm/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml new file mode 100644 index 000000000..2baa1f110 --- /dev/null +++ b/ChatQnA/benchmark/performance/vllm/without_rerank/two_gaudi/tuned_two_gaudi_without_rerank.yaml @@ -0,0 +1,421 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: chatqna-opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna-without-rerank:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 16 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(LLM_MODEL_ID) + - --max-input-length + - '1280' + - --max-total-tokens + - '2048' + - --max-batch-total-tokens + - '65536' + - --max-batch-prefill-tokens + - '4096' + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tgi-gaudi:2.0.5 + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +---