GenAIExamples/FaqGen/kubernetes/intel/hpu/gaudi/manifest/faqgen.yaml

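# This manifest deploys the FaqGen pipeline as three tiers, each fronted by
# its own Service: a TGI serving backend on a Gaudi HPU (faq-tgi-deploy), the
# FaqGen LLM microservice (faq-micro-deploy), and the FaqGen megaservice
# (faq-mega-server-deploy). Both "insert-your-huggingface-token-here"
# placeholders must be replaced with a valid Hugging Face token before
# applying, since meta-llama/Meta-Llama-3-8B-Instruct is a gated model.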
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
apiVersion: apps/v1
kind: Deployment
metadata:
  name: faq-tgi-deploy
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: faq-tgi-deploy
  template:
    metadata:
      annotations:
        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
      labels:
        app: faq-tgi-deploy
    spec:
      hostIPC: true
      containers:
        - name: faq-tgi-deploy-demo
          env:
            # Replace the placeholder with a real token; the model served
            # below is gated on Hugging Face.
            - name: HUGGING_FACE_HUB_TOKEN
              value: "insert-your-huggingface-token-here"
            - name: OMPI_MCA_btl_vader_single_copy_mechanism
              value: none
            - name: PT_HPU_ENABLE_LAZY_COLLECTIVES
              value: 'true'
            - name: runtime
              value: habana
            - name: HABANA_VISIBLE_DEVICES
              value: all
            - name: PREFILL_BATCH_BUCKET_SIZE
              value: "1"
            - name: BATCH_BUCKET_SIZE
              value: "8"
            - name: PORT
              value: "80"
            - name: ENABLE_HPU_GRAPH
              value: 'true'
            - name: LIMIT_HPU_GRAPH
              value: 'true'
            - name: USE_FLASH_ATTENTION
              value: 'true'
            - name: FLASH_ATTENTION_RECOMPUTE
              value: 'true'
          image: ghcr.io/huggingface/tgi-gaudi:2.0.6
          imagePullPolicy: IfNotPresent
          securityContext:
            capabilities:
              add:
                - SYS_NICE
          args:
            - --model-id
            - 'meta-llama/Meta-Llama-3-8B-Instruct'
            - --cuda_graphs
            - '0'   # CUDA graphs are a GPU feature; 0 disables them on HPU
            - --max-input-length
            - '3096'
            - --max-total-tokens
            - '4096'
            - --max-batch-total-tokens
            - '65536'
            - --max-batch-prefill-tokens
            - '4096'
          volumeMounts:
            - mountPath: /data
              name: model-volume
            - mountPath: /dev/shm
              name: shm
          ports:
            - containerPort: 80
          resources:
            limits:
              habana.ai/gaudi: 1   # one Gaudi card, scheduled via the Habana device plugin
      serviceAccountName: default
      volumes:
        - name: model-volume
          hostPath:
            path: /mnt/models
            type: Directory
        - name: shm
          emptyDir:
            medium: Memory
            sizeLimit: 1Gi
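# The hostPath volume above pins the model cache to /mnt/models on whichever
# node schedules the pod. On a multi-node cluster a PersistentVolumeClaim is
# the usual alternative; a sketch (the claim name "model-pvc" is hypothetical,
# not defined anywhere in this manifest):
#
#        - name: model-volume
#          persistentVolumeClaim:
#            claimName: model-pvc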
---
kind: Service
apiVersion: v1
metadata:
  name: faq-tgi-svc
spec:
  type: ClusterIP
  selector:
    app: faq-tgi-deploy
  ports:
    - name: service
      port: 8010
      targetPort: 80
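# faq-tgi-svc exposes the TGI backend inside the cluster on port 8010 and
# forwards to container port 80; this is the address TGI_LLM_ENDPOINT below
# points at.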
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: faq-micro-deploy
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: faq-micro-deploy
  template:
    metadata:
      annotations:
        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
      labels:
        app: faq-micro-deploy
    spec:
      hostIPC: true
      containers:
        - name: faq-micro-deploy
          env:
            - name: TGI_LLM_ENDPOINT
              value: "http://faq-tgi-svc.default.svc.cluster.local:8010"
            - name: HUGGINGFACEHUB_API_TOKEN
              value: "insert-your-huggingface-token-here"
          image: opea/llm-faqgen-tgi:latest
          imagePullPolicy: IfNotPresent
          args: null
          ports:
            - containerPort: 9000
      serviceAccountName: default
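# This deployment defines no probes. A readiness probe would keep traffic off
# the pod until the microservice is up; a sketch, assuming the image exposes
# OPEA's /v1/health_check endpoint (an assumption, not confirmed by this
# manifest):
#
#          readinessProbe:
#            httpGet:
#              path: /v1/health_check   # assumed health endpoint
#              port: 9000
#            initialDelaySeconds: 5
#            periodSeconds: 10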
---
kind: Service
apiVersion: v1
metadata:
  name: faq-micro-svc
spec:
  type: ClusterIP
  selector:
    app: faq-micro-deploy
  ports:
    - name: service
      port: 9003
      targetPort: 9000
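# faq-micro-svc (port 9003 -> container port 9000) is the in-cluster address
# the megaservice below reaches via LLM_SERVICE_HOST_IP and LLM_SERVICE_PORT.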
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: faq-mega-server-deploy
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: faq-mega-server-deploy
  template:
    metadata:
      annotations:
        sidecar.istio.io/rewriteAppHTTPProbers: 'true'
      labels:
        app: faq-mega-server-deploy
    spec:
      hostIPC: true
      containers:
        - name: faq-mega-server-deploy
          env:
            - name: LLM_SERVICE_HOST_IP
              value: faq-micro-svc
            - name: LLM_SERVICE_PORT
              value: "9003"
            - name: MEGA_SERVICE_HOST_IP
              value: faq-mega-server-svc
            - name: MEGA_SERVICE_PORT
              value: "7777"
          image: opea/faqgen:latest
          imagePullPolicy: IfNotPresent
          args: null
          ports:
            - containerPort: 7777
      serviceAccountName: default
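# LLM_SERVICE_HOST_IP/LLM_SERVICE_PORT point the megaservice at faq-micro-svc
# above; MEGA_SERVICE_HOST_IP/MEGA_SERVICE_PORT are its own service name and
# the port it binds (7777, matching containerPort).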
---
kind: Service
apiVersion: v1
metadata:
  name: faq-mega-server-svc
spec:
  type: NodePort
  selector:
    app: faq-mega-server-deploy
  ports:
    - name: service
      port: 7779
      targetPort: 7777
      nodePort: 30779
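# Once applied (kubectl apply -f faqgen.yaml), the megaservice is reachable
# from outside the cluster on NodePort 30779. A smoke test, assuming the
# standard OPEA FaqGen route /v1/faqgen (the path and payload shape are
# assumptions, not defined in this manifest):
#
#   curl http://<node-ip>:30779/v1/faqgen \
#     -H "Content-Type: application/json" \
#     -d '{"messages": "Text to generate FAQs from."}'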