Support Llama3.2 vision and vision guard model (#753)

* Support llama3.2 models

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix issues

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* update code and doc

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* add llama vision guard support

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* add llama guard prompt format utils

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* add tp support

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* add wheel

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix accuracy issue

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* update tp service code

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* update dockerfile

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* support lvm tp serving

Signed-off-by: letonghan <letong.han@intel.com>

* update dockerfile

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* add run tp script

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix max_new_tokens

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* update run tp script

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* refine code and doc

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* install transformers from local wheel

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* update code using official transformers

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* remove unnecessary code

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove blank line

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix precommit issues

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix cd issue

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
Signed-off-by: letonghan <letong.han@intel.com>
Co-authored-by: letonghan <letong.han@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
lvliang-intel
2024-09-30 16:27:41 +08:00
committed by GitHub
parent 405a2fc68a
commit 534c227a6e
30 changed files with 7190 additions and 13 deletions

View File

@@ -11,3 +11,15 @@ services:
build:
dockerfile: comps/lvms/predictionguard/Dockerfile
image: ${REGISTRY:-opea}/lvm-predictionguard:${TAG:-latest}
lvm-llama-vision:
build:
dockerfile: comps/lvms/llama-vision/Dockerfile
image: ${REGISTRY:-opea}/lvm-llama-vision:${TAG:-latest}
lvm-llama-vision-tp:
build:
dockerfile: comps/lvms/llama-vision/Dockerfile_tp
image: ${REGISTRY:-opea}/lvm-llama-vision-tp:${TAG:-latest}
lvm-llama-vision-guard:
build:
dockerfile: comps/lvms/llama-vision/Dockerfile_guard
image: ${REGISTRY:-opea}/lvm-llama-vision-guard:${TAG:-latest}

View File

@@ -266,7 +266,7 @@ def create_llm_dependency_deployment_and_service(resource_requirements=None, rep
deployment = create_k8s_resources(
name="llm-dependency-deploy",
replicas=7,
image="ghcr.io/huggingface/tgi-gaudi:2.0.4",
image="ghcr.io/huggingface/tgi-gaudi:2.0.5",
container_ports=[80],
node_selector={"node-type": "chatqna-opea"},
resources=resource_requirements,

View File

@@ -10,7 +10,7 @@ services:
- "6337:6337"
- "6338:6338"
tgi_gaudi_service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
container_name: tgi-service
ports:
- "8088:80"

View File

@@ -36,8 +36,8 @@ pip install -r requirements.txt
export HF_TOKEN=${your_hf_api_token}
volume=$PWD/data
model_id="meta-llama/Meta-Llama-Guard-2-8B"
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker run -p 8088:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=$HF_TOKEN ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id $model_id --max-input-length 1024 --max-total-tokens 2048
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker run -p 8088:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=$HF_TOKEN ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model_id --max-input-length 1024 --max-total-tokens 2048
```
### 1.3 Verify the TGI Gaudi Service

View File

@@ -5,7 +5,7 @@ version: "3.8"
services:
tgi_gaudi_service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
container_name: tgi-service
ports:
- "8088:80"

View File

@@ -31,9 +31,9 @@ volume=$PWD/data
# Build the Docker run command based on the number of cards
if [ "$num_cards" -eq 1 ]; then
docker_cmd="docker run -d --name="ChatQnA_server" -p $port_number:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:1.2.1 --model-id $model_name"
docker_cmd="docker run -d --name="ChatQnA_server" -p $port_number:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model_name"
else
docker_cmd="docker run -d --name="ChatQnA_server" -p $port_number:80 -v $volume:/data --runtime=habana -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:1.2.1 --model-id $model_name --sharded true --num-shard $num_cards"
docker_cmd="docker run -d --name="ChatQnA_server" -p $port_number:80 -v $volume:/data --runtime=habana -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:2.0.5 --max-input-tokens 4096 --max-total-tokens 8192 --model-id $model_name --sharded true --num-shard $num_cards"
fi
# Execute the Docker run command

View File

@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# HABANA environment
FROM vault.habana.ai/gaudi-docker/1.17.1/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as hpu
ENV LANG=en_US.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
git-lfs \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
RUN git lfs install
COPY comps /home/user/comps
RUN cd /home/user/comps/lvms/llama-vision/ && \
pip install --no-cache-dir -r requirements.txt && \
pip install --upgrade Pillow
ENV PYTHONPATH=/root:/home/user
WORKDIR /home/user/comps/lvms/llama-vision/
ENTRYPOINT ["python", "lvm.py"]

View File

@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# HABANA environment
FROM vault.habana.ai/gaudi-docker/1.17.1/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as hpu
ENV LANG=en_US.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
git-lfs \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
RUN git lfs install
COPY comps /home/user/comps
RUN cd /home/user/comps/lvms/llama-vision/ && \
pip install --no-cache-dir -r requirements.txt && \
pip install --upgrade Pillow
ENV PYTHONPATH=/root:/home/user
WORKDIR /home/user/comps/lvms/llama-vision/
ENTRYPOINT ["python", "lvm_guard.py"]

View File

@@ -0,0 +1,34 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# HABANA environment
FROM vault.habana.ai/gaudi-docker/1.17.1/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as hpu
ENV LANG=en_US.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
git-lfs \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
RUN git lfs install
COPY comps /home/user/comps
RUN pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.1
RUN pip install git+https://github.com/huggingface/optimum-habana@v1.13.2
RUN cd /home/user/comps/lvms/llama-vision/ && \
pip install --no-cache-dir --upgrade pip && \
bash update && \
pip install -r /home/user/comps/lvms/llama-vision/requirements_tp.txt
ENV PYTHONPATH=/root:/home/user
WORKDIR /home/user/comps/lvms/llama-vision/
ENTRYPOINT ["bash", "run_tp.sh"]

View File

@@ -0,0 +1,72 @@
# LVM Microservice
Visual Question Answering (VQA) is one of the multimodal tasks empowered by LVMs (Large Vision Models). This microservice supports visual Q&A by using Llama Vision as the base large vision model. It accepts two inputs, a prompt and an image, and outputs the answer to the prompt about the image.
## 🚀 Start Microservice with Docker
### Build Images
#### Build Llama Vision Model
```bash
cd ../../../
docker build -t opea/lvm-llama-vision:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llama-vision/Dockerfile .
```
#### Build Llama Vision Model with DeepSpeed
If you need to build the image for the 90B model, use the following command:
```bash
docker build -t opea/lvm-llama-vision-tp:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llama-vision/Dockerfile_tp .
```
#### Build Llama Vision Guard Model
```bash
cd ../../../
docker build -t opea/lvm-llama-vision-guard:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llama-vision/Dockerfile_guard .
```
### Start Llama LVM Service
#### Start Llama Vision Model Service
```bash
export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
docker run -it -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLAMA_VISION_MODEL_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice opea/lvm-llama-vision:latest
```
#### Start Llama Vision Model Service with DeepSpeed
If you need to run the 90B model, use the following command:
```bash
export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
export WORLD_SIZE=4
export no_proxy=localhost,127.0.0.1
docker run -it -p 9599:9599 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MODEL_ID="meta-llama/Llama-3.2-90B-Vision-Instruct" -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e WORLD_SIZE=$WORLD_SIZE --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice opea/lvm-llama-vision-tp:latest
```
#### Start Llama Vision Guard Model Service
```bash
export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
docker run -it -p 9499:9499 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLAMA_VISION_GUARD_MODEL_ID="meta-llama/Llama-Guard-3-11B-Vision" -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice opea/lvm-llama-vision-guard:latest
```
### Test
```bash
# Use curl
# curl Llama Vision 11B Model Service
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?", "max_new_tokens": 128}' -H 'Content-Type: application/json'
# curl Llama Vision Guard Model Service
http_proxy="" curl http://localhost:9499/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?", "max_new_tokens": 128}' -H 'Content-Type: application/json'
# curl Llama Vision 90B Model Service
http_proxy="" curl http://localhost:9599/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?", "max_new_tokens": 128}' -H 'Content-Type: application/json'
```
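
The same request can also be issued from Python. The sketch below assumes the 11B service is reachable on `localhost:9399` and that the response body carries the answer in a `text` field (as produced by `TextDoc`); adjust the port for the guard (9499) or 90B (9599) services.
```python
# Minimal Python client sketch for the LVM endpoint; the image path is only an example.
import base64
import requests

with open("view.jpg", "rb") as f:  # any local image
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {"image": img_b64, "prompt": "What is this?", "max_new_tokens": 128}
resp = requests.post("http://localhost:9399/v1/lvm", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json().get("text", ""))  # the answer is expected under "text"
```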

View File

@@ -0,0 +1,571 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
# Automatic Tensor Parallelism
import re
from typing import Optional
import torch
from deepspeed import comm as dist
from deepspeed.accelerator import get_accelerator
from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list
from torch import nn
from .fusedqkv_utils import prepare_tp_fused_qkvw, require_tp_fused_qkvw
from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce
from .replace_policy import replace_policies
def move(tensor, device):
if tensor.is_meta:
return torch.empty_like(tensor, device=device)
else:
# Using new tensors help in freeing memory (after split for example) was done before by calling clone().
# Using copy=True instead of clone() will help in case of cpu --> cpu.
# Otherwise to() will not create a new copy for the view of the full tensor, and it will not be de-referenced.
return tensor.to(device, copy=True)
class ReplaceWithTensorSlicing:
def __init__(self, mp_group=None, mp_size=1, out_dim=1, in_dim=0):
if mp_group is not None:
self.gpu_index = dist.get_rank(group=mp_group)
else:
self.gpu_index = 0
self.out_dim = out_dim
self.in_dim = in_dim
self.mp_size = mp_size
def merge_assert(self, dim1, dim2):
assert (
dim1 > dim2
), "Merging tensors is not allowed here! Please use deepspeed load_checkpoint\
for merging your checkpoints before replacing the transformer layer with\
inference-kernels"
def strided_copy(
self,
dst: Optional[torch.Tensor],
src: Optional[torch.Tensor],
num_splits: int,
int8: bool = False,
allocate_tensor: bool = False,
):
if src is None:
return src
src_shape = src.shape
dst_shape = dst.shape
outer_dim = 0 if int8 else -1
if allocate_tensor:
dst = torch.empty_like(dst)
src_split = torch.split(src.data, src.shape[outer_dim] // num_splits, dim=outer_dim)
if len(src_shape) == 2 and len(dst_shape) == 2:
if src_shape[outer_dim] == dst_shape[self.out_dim]:
try:
dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(src.shape)
except:
print(dst.shape, src.shape)
exit()
dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
if hasattr(src, "scale"):
dst.scale = src.scale
return dst
self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim])
qkv_size = dst_shape[self.out_dim] // num_splits
qkv_split = [torch.split(src_s, qkv_size, dim=outer_dim) for src_s in src_split]
weight_split = [
torch.cat([qkv_s[i] for qkv_s in qkv_split], axis=outer_dim) for i in range(len(qkv_split[0]))
]
dst = (
dst.reshape(-1)
.data.copy_(weight_split[self.gpu_index].contiguous().reshape(-1))
.reshape(weight_split[self.gpu_index].shape)
)
else:
if src_shape[0] == dst_shape[0]:
return torch.nn.parameter.Parameter(src)
qkv_size = dst_shape[0] // num_splits
qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split]
bias_split = [torch.cat([qkv_s[i] for qkv_s in qkv_split], axis=0) for i in range(len(qkv_split[0]))]
dst.data.copy_(bias_split[self.gpu_index].contiguous())
dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
if hasattr(src, "scale"):
dst.scale = src.scale
return dst
def copy(self, dst, src, int8=False, allocate_tensor=False):
if src is None:
return src
assert not dst.data.is_meta # the torch.Tensor.copy_ method used below will silently fail on meta tensors
if allocate_tensor:
dst = torch.empty_like(dst)
outer_dim = 0 if int8 else 1
inner_dim = 1 if int8 else 0
src_shape = src.shape
dst_shape = dst.shape
if len(src_shape) == 2 and len(dst_shape) == 2:
if src_shape[inner_dim] == dst_shape[self.in_dim] and src_shape[outer_dim] == dst_shape[self.out_dim]:
dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(src.shape)
else:
if src_shape[inner_dim] != dst_shape[self.in_dim]:
self.merge_assert(src_shape[inner_dim], dst_shape[self.in_dim])
dst.data.copy_(
src[:, self.gpu_index * dst_shape[self.in_dim] : (self.gpu_index + 1) * dst_shape[self.in_dim]]
if inner_dim == 1
else src[
self.gpu_index * dst_shape[self.in_dim] : (self.gpu_index + 1) * dst_shape[self.in_dim], :
]
)
else:
self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim])
dst.data.copy_(
src[
:, self.gpu_index * dst_shape[self.out_dim] : (self.gpu_index + 1) * dst_shape[self.out_dim]
]
if outer_dim == 1
else src[
self.gpu_index * dst_shape[self.out_dim] : (self.gpu_index + 1) * dst_shape[self.out_dim], :
]
)
else:
if src_shape[0] == dst_shape[0]:
dst = src if src.dtype == dst.dtype else dst.data.copy_(src)
else:
dst.data.copy_(src[self.gpu_index * dst_shape[-1] : (self.gpu_index + 1) * dst_shape[-1]])
dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
if hasattr(src, "scale"):
dst.scale = src.scale
return dst
class Loading:
def is_load_module(module):
load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm]
load_layer_names = [
"LPLayerNorm",
"SharedEmbedding",
"OPTLearnedPositionalEmbedding",
"LlamaRMSNorm",
"FalconLinear",
"MistralRMSNorm",
"T5LayerNorm",
"MixtralRMSNorm",
"Qwen2RMSNorm",
]
return module.__class__ in load_layers or module._get_name() in load_layer_names
def load_buffer(module, state_dict, prefix):
for name in module._buffers.keys():
if module._buffers[name].data.is_meta:
module._buffers[name] = torch.nn.parameter.Parameter(
data=torch.empty_like(module._buffers[name].data, device="cpu"),
requires_grad=module._buffers[name].data.requires_grad,
)
if prefix + name in state_dict.keys():
module._buffers[name].data.copy_(state_dict[prefix + name])
def load(module, state_dict, prefix, mp_group=None):
mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group)
if hasattr(module, "weight"):
if module.weight.data.is_meta:
# meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here
module.weight = torch.nn.parameter.Parameter(
data=torch.empty_like(module.weight.data, device="cpu"),
requires_grad=module.weight.data.requires_grad,
)
if "query_key_value" in prefix:
module.weight = mp_replace.strided_copy(
module.weight.data, state_dict[prefix + "weight"], num_splits=3
)
else:
module.weight = mp_replace.copy(module.weight.data, state_dict[prefix + "weight"])
else:
if hasattr(module, "norm") and hasattr(module.norm, "weight"):
if module.norm.weight.data.is_meta:
# meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here
module.norm.weight = torch.nn.parameter.Parameter(
data=torch.empty_like(module.norm.weight.data, device="cpu"),
requires_grad=module.norm.weight.data.requires_grad,
)
module.norm.weight = mp_replace.copy(module.norm.weight.data, state_dict[prefix + "weight"])
if prefix + "bias" in state_dict.keys():
if hasattr(module, "bias"):
if module.bias.data.is_meta:
# meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here
module.bias = torch.nn.parameter.Parameter(
data=torch.empty_like(module.bias.data, device="cpu"),
requires_grad=module.bias.data.requires_grad,
)
module.bias = mp_replace.copy(module.bias, state_dict[prefix + "bias"])
else:
if hasattr(module, "norm") and hasattr(module.norm, "bias"):
if module.norm.bias.data.is_meta:
# meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here
module.norm.bias = torch.nn.parameter.Parameter(
data=torch.empty_like(module.norm.bias.data, device="cpu"),
requires_grad=module.norm.bias.data.requires_grad,
)
module.norm.bias = mp_replace.copy(module.norm.bias, state_dict[prefix + "bias"])
class AutoTP:
def __init__(self, module, all_reduce_linears, prefix, state_dict, linear_layer_setting, orig_layer_impl):
self.module = module
self.all_reduce_linears = all_reduce_linears
self.prefix = prefix
self.state_dict = state_dict
self.mp_size = None
self.mp_group = None
self.linear_layer_setting = linear_layer_setting
self.orig_layer_impl = orig_layer_impl
self.linear_policies = None
self.conv_linear_layer = False
def in_module_list(module, module_list):
for item in module_list:
if type(item).__name__ == type(module).__name__:
return True
return False
def get_module_list(model):
mlist = []
for child in model.children():
if isinstance(child, nn.ModuleList):
for module in child.children():
if not mlist:
mlist = [module]
elif not AutoTP.in_module_list(module, mlist):
mlist = mlist + [module]
else:
mlist = mlist + AutoTP.get_module_list(child)
return mlist
def supported(model):
unsupported = ["deberta", "flaubert", "fsmt", "gpt2", "led", "longformer", "xlm", "xlnet"]
model = str(model)
key = re.search(r": (.*?)Model", model)
if key is None:
key = re.search(r": (.*?)Stack", model)
if key is None:
key = re.match(r"(.*?)Model", model)
assert key is not None, "Not able to determine model policy automatically. Please provide policy."
if key.group(1).lower() in unsupported:
return False
return True
def get_layers(parent, module):
layer_list = []
for key, submodule in module._modules.items():
if isinstance(submodule, nn.Linear):
layer_list = layer_list + [parent + "." + key]
elif isinstance(submodule, nn.LayerNorm) or key == "LayerNorm" or key == "layer_norm":
layer_list = layer_list + ["ln"]
else:
layer_list = layer_list + AutoTP.get_layers(key, submodule)
return layer_list
def update_policy_list(policy_list, new_module, new_gems):
if len(policy_list):
for i, policy in enumerate(policy_list):
# if module already exists in policy, combine gems and remove duplicates
if policy[0] == type(new_module):
new_gems = set(new_gems + policy[1])
policy_list[i] = tuple([type(new_module), new_gems])
return policy_list
policy_list.append(tuple([type(new_module), new_gems]))
return policy_list
def kernel_supported(module_list):
policy = []
for plcy in replace_policies:
# instantiate a throw-away policy in order to populate the _orig_layer_class
_ = plcy(None)
if isinstance(plcy._orig_layer_class, list):
for orig_layer_class in plcy._orig_layer_class:
policy.append(orig_layer_class)
elif plcy._orig_layer_class is not None:
policy.append(plcy._orig_layer_class)
for child in module_list:
if child.__class__ in policy:
return True
return False
def tp_parser(model):
policy_list = []
module_list = []
layer_list = []
gem_list = []
module_list = AutoTP.get_module_list(model)
assert AutoTP.supported(model), (
"AutoTP not supported for model. Please use kernel injection since container policy for model exists."
if AutoTP.kernel_supported(module_list)
else "AutoTP not supported for model. Please provide policy."
)
norm_layer_name_list = ["LayerNorm", "layer_norm", "ln_1", "ln_2"]
# ln_1 , ln_2 for Qwen
for module in module_list:
for key, submodule in module._modules.items():
if isinstance(submodule, nn.Linear):
layer_list = layer_list + ["." + key]
elif isinstance(submodule, nn.LayerNorm) or key in norm_layer_name_list:
layer_list = layer_list + ["ln"]
else:
layer_list = layer_list + AutoTP.get_layers(key, submodule)
for i, layer in enumerate(layer_list):
if layer == "ln":
if layer_list[i - 1] != "ln":
gem_list = gem_list + [layer_list[i - 1]]
elif "out_proj" in layer:
gem_list = gem_list + [layer]
elif "o_proj" in layer:
gem_list = gem_list + [layer]
elif "down_proj" in layer:
gem_list = gem_list + [layer]
elif "attention.dense" in layer and "GPTNeoX" in str(model):
gem_list = gem_list + [layer]
elif "self_attention.dense" in layer and "falcon" in str(
type(module)
): # this is a hack to get the right linear layer for this model!
gem_list = gem_list + [layer]
# Mixtral-7x8b used w2*act(w1*w3) linear. need to replace w2 to linearallreduce.
elif "w2" in layer and "mixtral" in str(type(module)):
gem_list = gem_list + [layer]
layer_list = []
if gem_list != []:
gem_list = list(set(gem_list))
policy_list = AutoTP.update_policy_list(policy_list, module, gem_list)
gem_list = []
assert len(policy_list), (
"AutoTP not supported for model. Please use kernel injection since container policy for model exists."
if AutoTP.kernel_supported(module_list)
else "Not able to determine model policy automatically. Please provide policy."
)
return policy_list
def set_tensor_parallel_config(self, mp_size, mp_group):
self.mp_size = mp_size
self.mp_group = mp_group
def _replace(self, child, name, conv_linear_layer):
if getattr(child, "replaced", False):
return
weight_shape = child.weight.shape
mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group)
if name in self.all_reduce_linears:
# if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size]
# else [weight_shape[0], weight_shape[1] // mp_size]
if self.conv_linear_layer:
child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
data = child.weight.data.split(
get_shard_size_list(weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size, name),
dim=1,
)
data_dc = move(data[mp_replace.gpu_index], get_accelerator().current_device_name()).detach()
del data
setattr(child, "replaced", True)
if name == "lm_head" or name == "embed_out":
return LmHeadLinearAllreduce(
torch.nn.parameter.Parameter(data_dc, requires_grad=False),
dist.get_rank(),
dist.get_world_size(),
(
child.bias
if child.bias is None
else torch.nn.parameter.Parameter(move(child.bias, get_accelerator().current_device_name()))
),
self.mp_group,
)
return LinearAllreduce(
torch.nn.parameter.Parameter(data_dc, requires_grad=False),
(
child.bias
if child.bias is None
else torch.nn.parameter.Parameter(move(child.bias, get_accelerator().current_device_name()))
),
self.mp_group,
)
else:
# if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size]
# else [weight_shape[0] // mp_size, weight_shape[1]]
if self.conv_linear_layer:
child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
if require_tp_fused_qkvw(name, self.mp_size):
# Check and handle fused qkv for TP
# The copy is a regular copy, The shape of dst and src is the same
data_dc = move(
prepare_tp_fused_qkvw(self.module, child.weight.data, self.mp_size, mp_replace.gpu_index),
get_accelerator().current_device_name(),
)
bias_data_dc = (
None
if child.bias is None
else move(
prepare_tp_fused_qkvw(self.module, child.bias.data, self.mp_size, mp_replace.gpu_index),
get_accelerator().current_device_name(),
)
)
else:
data = child.weight.data.split(
get_shard_size_list(weight_shape[0], self.mp_size, name), dim=1 if self.conv_linear_layer else 0
)
data_dc = move(data[mp_replace.gpu_index], get_accelerator().current_device_name()).detach()
del data
if child.bias is not None:
bias_data = child.bias.data.split(
get_shard_size_list(
weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size, name
),
dim=0,
)
bias_data = move(bias_data[mp_replace.gpu_index], get_accelerator().current_device_name())
bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False)
del bias_data
else:
bias_data_dc = None
setattr(child, "replaced", True)
return LinearLayer(weight=torch.nn.parameter.Parameter(data_dc, requires_grad=False), bias=bias_data_dc)
def _slice_embedding(self, child, name, conv_linear_layer):
if getattr(child, "replaced", False):
return
mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group)
if hasattr(child.weight, "ds_tensor"):
data = child.weight.ds_tensor.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size), dim=1)
else:
data = child.weight.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size, name), dim=1)
data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name())
data = torch.nn.parameter.Parameter(data, requires_grad=False)
new_embedding = nn.Embedding(child.weight.shape[0], get_shard_size(child.weight.shape[1], self.mp_size, name))
new_embedding.weight.data.copy_(data)
setattr(child, "replaced", True)
return new_embedding
def update_mp_params(self, child):
if getattr(child, "replaced", False):
return
for param in [
"n_heads",
"inner_dim",
"num_heads",
"num_kv",
"num_attention_heads",
"num_attn_heads",
"all_head_size",
"embed_dim",
"hidden_size",
"num_key_value_heads",
"num_kv_heads",
"kv_n_heads",
"d_model",
]:
if hasattr(child, param):
param_val = getattr(child, param)
setattr(child, param, get_shard_size(param_val, self.mp_size))
setattr(child, "replaced", True)
def update_linear_policies(self):
self.conv_linear_layer = False
if self.linear_layer_setting is not None:
self.linear_policies = {self.linear_layer_setting[0]: self._replace}
if len(self.linear_layer_setting) == 2:
self.linear_policies.update({self.linear_layer_setting[1]: self._slice_embedding})
else:
import transformers
if self.orig_layer_impl is transformers.models.gpt2.modeling_gpt2.GPT2Block:
try:
self.conv_linear_layer = True
self.linear_policies = {transformers.pytorch_utils.Conv1D: self._replace}
except ImportError:
self.linear_policies = {nn.Linear: self._replace}
else:
self.linear_policies = {nn.Linear: self._replace, nn.Embedding: self._slice_embedding}
def _replace_module(self, r_module, prev_name="", prev_class_name=""):
for name, child in r_module.named_children():
if prev_class_name == "":
class_name = prev_name
elif prev_name == "":
class_name = prev_class_name
else:
class_name = prev_class_name + "." + prev_name
checking_key = (
self.prefix + "." + class_name + "." + name + "."
if class_name != ""
else self.prefix + "." + name + "."
)
if Loading.is_load_module(child) and self.state_dict is not None:
if any(checking_key in item for item in self.state_dict):
Loading.load(child, self.state_dict, checking_key, self.mp_group)
else:
continue
if len(child._buffers) != 0 and self.state_dict is not None:
Loading.load_buffer(child, self.state_dict, checking_key)
if child.__class__ in self.linear_policies:
setattr(
r_module,
name,
self.linear_policies[child.__class__](child, prev_name + "." + name, self.conv_linear_layer),
)
elif any(isinstance(child, lp) for lp in self.linear_policies):
# Added for falcon model support
# Note: isinstance will account for class inheritance, child.__class__ does not
key = None
for lp in self.linear_policies:
if isinstance(child, lp):
key = lp
break
assert key is not None
setattr(
r_module, name, self.linear_policies[key](child, prev_name + "." + name, self.conv_linear_layer)
)
else:
self.update_mp_params(child)
self._replace_module(child, name, class_name)
return r_module
def get_model_num_kv_heads(self, config):
num_kv_heads = None
kv_head_names = ["num_kv_heads", "num_key_value_heads", "num_attention_heads", "n_heads", "attention_heads"]
for name in kv_head_names:
if hasattr(config, name):
num_kv_heads = getattr(config, name)
if num_kv_heads is not None:
break
return num_kv_heads
def _replace_last_linear_module(self, r_module):
if hasattr(r_module, "lm_head"):
name = "lm_head"
child = r_module.lm_head
elif hasattr(r_module, "embed_out"):
name = "embed_out"
child = r_module.embed_out
else:
return r_module
if child.__class__ in self.linear_policies:
setattr(r_module, name, self.linear_policies[child.__class__](child, name, self.conv_linear_layer))
return r_module
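
For orientation, the `_replace` path above applies the standard tensor-parallel slicing rule: weights of layers listed in `all_reduce_linears` (row-parallel, wrapped in `LinearAllreduce`) are split along the input dimension, while all other linears (column-parallel, wrapped in `LinearLayer`) are split along the output dimension. Below is a minimal illustrative sketch of that rule, independent of DeepSpeed (the helper name is hypothetical; the real code uses `get_shard_size_list` so shards can be uneven and head-aligned):
```python
# Illustrative only: the per-rank slicing rule AutoTP applies, shown with plain torch.chunk.
import torch

def shard_linear_weight(weight: torch.Tensor, rank: int, mp_size: int, all_reduce: bool) -> torch.Tensor:
    if all_reduce:
        # Row-parallel (LinearAllreduce): split the input dimension; partial outputs
        # are summed with an all-reduce at runtime.
        return torch.chunk(weight, mp_size, dim=1)[rank].contiguous()
    # Column-parallel (LinearLayer): split the output dimension.
    return torch.chunk(weight, mp_size, dim=0)[rank].contiguous()

w = torch.randn(8, 4)  # [out_features, in_features]
print(shard_linear_weight(w, rank=0, mp_size=2, all_reduce=False).shape)  # torch.Size([4, 4])
print(shard_linear_weight(w, rank=0, mp_size=2, all_reduce=True).shape)   # torch.Size([8, 2])
```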

View File

@@ -0,0 +1,160 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import json
import os
from pathlib import Path
import torch
from huggingface_hub import list_repo_files, snapshot_download
from transformers import modeling_utils
from transformers.utils import is_offline_mode
def get_repo_root(model_name_or_path, local_rank=-1, token=None):
"""Downloads the specified model checkpoint and returns the repository where it was downloaded."""
if Path(model_name_or_path).is_dir():
# If it is a local model, no need to download anything
return model_name_or_path
else:
# Checks if online or not
if is_offline_mode():
if local_rank == 0:
print("Offline mode: forcing local_files_only=True")
# Only download PyTorch weights by default
if any(
".safetensors" in filename for filename in list_repo_files(model_name_or_path, token=token)
): # Some models like Falcon-180b are in only safetensors format
allow_patterns = ["*.safetensors"]
elif any(".bin" in filename for filename in list_repo_files(model_name_or_path, token=token)):
allow_patterns = ["*.bin"]
else:
raise TypeError("Only PyTorch models are supported")
# Download only on first process
if local_rank in [-1, 0]:
cache_dir = snapshot_download(
model_name_or_path,
local_files_only=is_offline_mode(),
cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
allow_patterns=allow_patterns,
max_workers=16,
token=token,
)
if local_rank == -1:
# If there is only one process, then the method is finished
return cache_dir
# Make all processes wait so that other processes can get the checkpoint directly from cache
if torch.distributed.is_initialized():
torch.distributed.barrier()
return snapshot_download(
model_name_or_path,
local_files_only=is_offline_mode(),
cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
allow_patterns=allow_patterns,
token=token,
)
def get_checkpoint_files(model_name_or_path, local_rank, token=None):
cached_repo_dir = get_repo_root(model_name_or_path, local_rank=local_rank, token=token)
# Extensions: .bin | .safetensors | .pt
# Creates a list of paths from all downloaded files in cache dir
if any(file.suffix == ".bin" for file in Path(cached_repo_dir).rglob("*")):
(name, ext) = os.path.splitext(modeling_utils.WEIGHTS_NAME)
elif any(file.suffix == ".safetensors" for file in Path(cached_repo_dir).rglob("*")):
(name, ext) = os.path.splitext(modeling_utils.SAFE_WEIGHTS_NAME)
else:
(name, ext) = ("*", ".pt")
file_list = [
str(entry)
for entry in Path(cached_repo_dir).rglob("*")
if (entry.is_file() and entry.name.startswith(name) and entry.name.endswith(ext))
]
return file_list
def write_checkpoints_json(model_name_or_path, local_rank, f, token=None):
"""Dumps metadata into a JSON file for DeepSpeed-inference."""
checkpoint_files = get_checkpoint_files(model_name_or_path, local_rank, token)
data = {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0}
json.dump(data, f)
f.flush()
def model_on_meta(config):
"""Checks if load the model to meta."""
# return config.model_type in ["bloom", "llama", "falcon", "mixtral", "qwen2", "mllama"]
return config.model_type in ["bloom", "llama", "falcon", "mixtral", "qwen2"]
def get_optimized_model_name(config):
from .transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES
for model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES:
if model_type == config.model_type:
return model_type
return None
def model_is_optimized(config):
"""Checks if the given config belongs to a model in optimum/habana/transformers/models, which has a
new input token_idx."""
return get_optimized_model_name(config) is not None
def get_ds_injection_policy(config):
model_type = get_optimized_model_name(config)
policy = {}
if model_type:
if model_type == "bloom":
from transformers.models.bloom.modeling_bloom import BloomBlock
policy = {BloomBlock: ("self_attention.dense", "mlp.dense_4h_to_h")}
if model_type == "opt":
from transformers.models.opt.modeling_opt import OPTDecoderLayer
policy = {OPTDecoderLayer: ("self_attn.out_proj", ".fc2")}
if model_type == "gpt2":
from transformers.models.gpt2.modeling_gpt2 import GPT2MLP
policy = {GPT2MLP: ("attn.c_proj", "mlp.c_proj")}
if model_type == "gptj":
from transformers.models.gptj.modeling_gptj import GPTJBlock
policy = {GPTJBlock: ("attn.out_proj", "mlp.fc_out")}
if model_type == "gpt_neox":
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer
policy = {GPTNeoXLayer: ("attention.dense", "mlp.dense_4h_to_h")}
if model_type == "llama":
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
policy = {LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")}
# if model_type == "mllama":
# #AutoTP: [(<class 'transformers.models.mllama.modeling_mllama.MllamaVisionEncoderLayer'>, {'self_attn.o_proj', 'mlp.fc2'}), (<class 'transformers.models.mllama.modeling_mllama.MllamaSelfAttentionDecoderLayer'>, ['self_attn.o_proj', 'mlp.down_proj']), (<class 'transformers.models.mllama.modeling_mllama.MllamaCrossAttentionDecoderLayer'>, ['cross_attn.o_proj', 'mlp.down_proj'])]
# from transformers.models.mllama.modeling_mllama import MllamaVisionEncoderLayer, MllamaSelfAttentionDecoderLayer, MllamaCrossAttentionDecoderLayer
# policy = {MllamaSelfAttentionDecoderLayer: ("self_attn.o_proj", "mlp.down_proj"), MllamaCrossAttentionDecoderLayer: ('cross_attn.o_proj', 'mlp.down_proj'), MllamaVisionEncoderLayer: ('self_attn.o_proj', 'mlp.fc2')}
# policy = {MllamaSelfAttentionDecoderLayer: ("self_attn.o_proj", "mlp.down_proj"), MllamaCrossAttentionDecoderLayer: ('cross_attn.o_proj', 'mlp.down_proj')}
if model_type == "mistral":
from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
policy = {MistralDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")}
return policy
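
A rough sketch of how these helpers are typically consumed, assuming they are importable and that the meta-device loading pattern plus DeepSpeed's `injection_policy`/`checkpoint` keywords (as used by optimum-habana and mirrored by the TP serving script later in this diff) apply; the model id and world size are placeholders:
```python
# Consumption sketch under the assumptions above; not code from this PR.
import os
import torch
import deepspeed
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder; "llama" is in the model_on_meta list above
config = AutoConfig.from_pretrained(model_id)
assert model_on_meta(config)

# Build only the module structure on the meta device; DeepSpeed streams the real
# weights from the files listed in the checkpoints JSON written below.
with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"):
    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)

with open("checkpoints.json", "w") as f:
    write_checkpoints_json(model_id, local_rank=0, f=f)

model = deepspeed.init_inference(
    model,
    mp_size=int(os.getenv("WORLD_SIZE", "1")),
    dtype=torch.bfloat16,
    checkpoint="checkpoints.json",
    injection_policy=get_ds_injection_policy(config),
)
```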

View File

@@ -0,0 +1,27 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
llm:
image: opea/lvm-llama-vision:latest
container_name: lvm-llama-vision-server
ports:
- "9399:9399"
runtime: habana
cap_add:
- SYS_NICE
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLAMA_VISION_MODEL_ID: ${LLAMA_VISION_MODEL_ID}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -0,0 +1,108 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A simple launcher script for distributed training on HPUs.
Single node:
::
>>> python gaudi_spawn.py --world_size=NUM_CARDS_YOU_HAVE --use_mpi
YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
arguments of your training script)
Multi node:
::
>>> python gaudi_spawn.py --hostfile=PATH_TO_HOSTFILE --use_deepspeed
YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
arguments of your training script)
"""
import sys
from argparse import REMAINDER, ArgumentParser
from optimum.habana.distributed import DistributedRunner
from optimum.utils import logging
logger = logging.get_logger(__name__)
def parse_args():
"""Helper function parsing the command line options.
@retval ArgumentParser
"""
parser = ArgumentParser(
description=(
"Habana Gaudi distributed training launch helper utility that will spawn up multiple distributed"
" processes."
)
)
# Optional arguments for the launch helper
parser.add_argument("--world_size", type=int, default=1, help="Number of HPUs to use (1 or 8)")
parser.add_argument("--hostfile", type=str, default=None, help="Path to the file where hosts are specified.")
parser.add_argument("--use_mpi", action="store_true", help="Use MPI for distributed training")
parser.add_argument("--use_deepspeed", action="store_true", help="Use DeepSpeed for distributed training")
parser.add_argument("--master_port", type=int, default=29500, help="Master port used by DeepSpeed and MPI")
# positional
parser.add_argument(
"training_script",
type=str,
help=(
"The full path to the single HPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script."
),
)
# rest from the training program
parser.add_argument("training_script_args", nargs=REMAINDER)
return parser.parse_args()
def main():
args = parse_args()
if args.use_deepspeed:
from transformers.integrations.deepspeed import is_deepspeed_available
if not is_deepspeed_available():
raise ImportError(
"--use_deepspeed requires deepspeed: `pip install"
" git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0`."
)
# Patch sys.argv
sys.argv = [args.training_script] + args.training_script_args
# Handle the case where arguments contain whitespaces
argv = ['"{}"'.format(arg) if " " in arg and arg[0] != '"' and arg[-1] != '"' else arg for arg in sys.argv]
command_list = [" ".join(argv)]
distributed_runner = DistributedRunner(
command_list=command_list,
world_size=args.world_size,
hostfile=args.hostfile,
use_mpi=args.use_mpi,
use_deepspeed=args.use_deepspeed,
master_port=args.master_port,
)
ret_code = distributed_runner.run()
sys.exit(ret_code)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,103 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import base64
import os
import threading
import time
from io import BytesIO
from typing import Union
import habana_frameworks.torch as htorch
import requests
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor
from comps import (
CustomLogger,
LVMDoc,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("lvm-llama-vision-native")
logflag = os.getenv("LOGFLAG", False)
initialization_lock = threading.Lock()
initialized = False
def initialize():
global model, processor, initialized
with initialization_lock:
if not initialized:
import habana_frameworks.torch.hpu as torch_hpu
model_id = os.getenv("LLAMA_VISION_MODEL_ID", "meta-llama/Llama-3.2-11B-Vision-Instruct")
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
model = AutoModelForVision2Seq.from_pretrained(
model_id, device_map="hpu", torch_dtype=torch.bfloat16, token=huggingface_token
)
processor = AutoProcessor.from_pretrained(model_id, token=huggingface_token)
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "If I had to write a haiku for this one, it would be: "},
],
}
]
url = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(url, stream=True).raw)
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(raw_image, prompt, return_tensors="pt").to(model.device)
prompt_len = len(inputs["input_ids"][0])
output = model.generate(**inputs, pad_token_id=0, max_new_tokens=32)
generated_tokens = output[:, prompt_len:]
logger.info(processor.decode(generated_tokens[0], skip_special_tokens=True))
initialized = True
logger.info("[LVM] Llama Vision LVM initialized.")
@register_microservice(
name="opea_service@lvm_llama_vision_native",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9399,
)
@register_statistics(names=["opea_service@lvm_llama_vision_native"])
async def lvm(request: Union[LVMDoc]) -> Union[TextDoc]:
initialize()
if logflag:
logger.info(request)
start = time.time()
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
image_data = base64.b64decode(img_b64_str)
image_stream = BytesIO(image_data)
raw_image = Image.open(image_stream)
inputs = processor(raw_image, text, return_tensors="pt").to(model.device)
prompt_len = len(inputs["input_ids"][0])
output = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
generated_tokens = output[:, prompt_len:]
result = processor.decode(generated_tokens[0], skip_special_tokens=True)
statistics_dict["opea_service@lvm_llama_vision_native"].append_latency(time.time() - start, None)
if logflag:
logger.info(result)
return TextDoc(text=result)
if __name__ == "__main__":
opea_microservices["opea_service@lvm_llama_vision_native"].start()

View File

@@ -0,0 +1,129 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import base64
import os
import threading
import time
from enum import Enum
from io import BytesIO
from typing import Union
import habana_frameworks.torch as htorch
import requests
import torch
from PIL import Image
from prompt_format_utils import LlamaGuardVersion, build_default_prompt, create_conversation
from transformers import AutoModelForVision2Seq, AutoProcessor
from comps import (
CustomLogger,
LVMDoc,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("lvm-llama-vision-guard-native")
logflag = os.getenv("LOGFLAG", False)
initialization_lock = threading.Lock()
initialized = False
class AgentType(Enum):
AGENT = "Agent"
USER = "User"
def initialize():
global model, processor, initialized
with initialization_lock:
if not initialized:
import habana_frameworks.torch.hpu as torch_hpu
model_id = os.getenv("LLAMA_VISION_GUARD_MODEL_ID", "meta-llama/Llama-Guard-3-11B-Vision")
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
model = AutoModelForVision2Seq.from_pretrained(
model_id, device_map="hpu", torch_dtype=torch.bfloat16, token=huggingface_token
)
processor = AutoProcessor.from_pretrained(model_id, token=huggingface_token)
url = "https://llava-vl.github.io/static/images/view.jpg"
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": "If I had to write a haiku for this one, it would be: "},
{
"type": "image",
},
],
}
]
input_prompt = processor.apply_chat_template(conversation, return_tensors="pt")
raw_image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=input_prompt, images=raw_image, return_tensors="pt").to(model.device)
prompt_len = len(inputs["input_ids"][0])
output = model.generate(**inputs, pad_token_id=0, max_new_tokens=32)
generated_tokens = output[:, prompt_len:]
logger.info(processor.decode(generated_tokens[0], skip_special_tokens=True))
initialized = True
logger.info("[LVM] Llama Vision GUARD LVM initialized.")
@register_microservice(
name="opea_service@lvm_llama_vision_guard_native",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9499,
)
@register_statistics(names=["opea_service@lvm_llama_vision_guard_native"])
async def lvm(request: Union[LVMDoc]) -> Union[TextDoc]:
initialize()
if logflag:
logger.info(request)
start = time.time()
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
llama_guard_version = "LLAMA_GUARD_3"
prompts = [(prompt, AgentType.USER)]
for prompt in prompts:
formatted_prompt = build_default_prompt(prompt[1], create_conversation([prompt[0]]), llama_guard_version)
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": formatted_prompt},
{
"type": "image",
},
],
}
]
input_prompt = processor.apply_chat_template(conversation, return_tensors="pt")
image_data = base64.b64decode(img_b64_str)
image_stream = BytesIO(image_data)
raw_image = Image.open(image_stream)
inputs = processor(text=input_prompt, images=raw_image, return_tensors="pt").to(model.device)
prompt_len = len(inputs["input_ids"][0])
output = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
generated_tokens = output[:, prompt_len:]
result = processor.decode(generated_tokens[0], skip_special_tokens=True)
statistics_dict["opea_service@lvm_llama_vision_guard_native"].append_latency(time.time() - start, None)
if logflag:
logger.info(result)
return TextDoc(text=result)
if __name__ == "__main__":
opea_microservices["opea_service@lvm_llama_vision_guard_native"].start()
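
The guard model follows the output contract described in `prompt_format_utils.py` ("safe" or "unsafe" on the first line, a comma-separated category list on the second). A small, hypothetical helper for consumers of this service that want a structured verdict rather than raw text might look like this:
```python
# Hypothetical post-processing sketch; not part of this PR.
def parse_guard_verdict(text: str) -> dict:
    """Turn the raw Llama Guard completion into {"safe": bool, "categories": [...]}."""
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    if not lines:
        return {"safe": False, "categories": []}  # conservative default for empty output
    is_safe = lines[0].lower() == "safe"
    categories = [] if is_safe or len(lines) < 2 else [c.strip() for c in lines[1].split(",")]
    return {"safe": is_safe, "categories": categories}

print(parse_guard_verdict("unsafe\nS1,S10"))  # {'safe': False, 'categories': ['S1', 'S10']}
```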

View File

@@ -0,0 +1,74 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import json
import os
import time
from concurrent import futures
from typing import Union
import requests
from comps import (
CustomLogger,
LVMDoc,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("lvm-llama-vision-tp-native")
logflag = os.getenv("LOGFLAG", False)
@register_microservice(
name="opea_service@lvm_llama_vision_tp_native",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9599,
)
@register_statistics(names=["opea_service@lvm_llama_vision_tp_native"])
async def lvm(request: Union[LVMDoc]) -> Union[TextDoc]:
if logflag:
logger.info(request)
start = time.time()
# Initialize responses list
responses = []
# Function to send requests to individual TP workers
def send_request_to_tp_worker(port):
try:
# Build the worker URL dynamically
url = f"http://127.0.0.1:{port}/v1/lvm_serve"
# Send POST request to the TP worker
response = requests.post(url, json=request.dict())
response.raise_for_status() # Ensure the request was successful
# Parse and process the response
json_response = response.json()
responses.append(TextDoc(text=json_response.get("text", "")))
except requests.exceptions.RequestException as e:
# Log any errors that occur
logger.error(f"Error sending request to TP worker on port {port}: {e}")
return None
# Distribute work across TP workers using ThreadPoolExecutor
with futures.ThreadPoolExecutor(max_workers=4) as executor:
# TP worker ports (e.g., worker processes listen on sequential ports)
worker_ports = [9393 + i + 1 for i in range(4)]
# Map the `send_request_to_tp_worker` function to each worker port
executor.map(send_request_to_tp_worker, worker_ports)
statistics_dict["opea_service@lvm_llama_vision_tp_native"].append_latency(time.time() - start, None)
if responses:
return responses[0]
if __name__ == "__main__":
opea_microservices["opea_service@lvm_llama_vision_tp_native"].start()

View File

@@ -0,0 +1,212 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import base64
import json
import os
import threading
import time
from io import BytesIO
from pathlib import Path
from typing import Union
import deepspeed
import deepspeed.comm as dist
import requests
import torch
import uvicorn
from fastapi import FastAPI
from huggingface_hub import snapshot_download
from PIL import Image
from starlette.middleware.cors import CORSMiddleware
from transformers import AutoProcessor, MllamaForConditionalGeneration
from transformers.utils import is_offline_mode
from comps import CustomLogger, LVMDoc, TextDoc
app = FastAPI(title="NeuralChat Gaudi Serving Process", description="Serving", version="0.0.1")
logger = CustomLogger("lvm-llama-vision-tp")
logflag = os.getenv("LOGFLAG", False)
model = None
model_id = os.getenv("MODEL_ID", "/workspace/models/final_weights/Llama-3.2-90B-Vision-Instruct")
processor = None
initialization_lock = threading.Lock()
initialized = False
local_rank = 0
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Adjust this to restrict origins as necessary
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
def print_rank0(*msg):
if local_rank != 0:
return
print(*msg)
def get_repo_root(model_name_or_path):
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if os.path.exists(model_name_or_path):
# local path
return model_name_or_path
# checks if online or not
if is_offline_mode():
print_rank0("Offline mode: forcing local_files_only=True")
# download only on first process
allow_patterns = ["*.bin", "*.model", "*.json", "*.txt", "*.py", "*LICENSE"]
if local_rank == 0:
snapshot_download(
model_name_or_path,
local_files_only=is_offline_mode(),
cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
allow_patterns=allow_patterns,
token=huggingface_token,
# ignore_patterns=["*.safetensors"],
)
dist.barrier()
return snapshot_download(
model_name_or_path,
local_files_only=is_offline_mode(),
cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
allow_patterns=allow_patterns,
token=huggingface_token,
# ignore_patterns=["*.safetensors"],
)
def get_checkpoint_files(model_name_or_path):
cached_repo_dir = get_repo_root(model_name_or_path)
# extensions: .bin | .pt
# creates a list of paths from all downloaded files in cache dir
file_list = [
str(entry)
for entry in Path(cached_repo_dir).rglob("*.[sbp][ait][fn][e][t][e][n][s][o][r][s]")
if entry.is_file()
]
print(file_list)
return file_list
def write_checkpoints_json(local_rank, checkpoints_json):
checkpoint_files = get_checkpoint_files(model_id)
if local_rank == 0:
# model.config.model_type.upper()
data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0}
json.dump(data, open(checkpoints_json, "w"))
def get_int_from_env(env_keys, default):
"""Returns the first positive env value found in the `env_keys` list or the default."""
for e in env_keys:
val = int(os.environ.get(e, -1))
if val >= 0:
return val
return default
def generate(prompt, raw_image, max_new_tokens=32):
if logflag:
logger.info(f"[lvm tp serve] start to generate text with {prompt}")
inputs = processor(raw_image, prompt, return_tensors="pt").to(torch.device("hpu"))
prompt_len = len(inputs["input_ids"][0])
output = model.generate(**inputs, max_new_tokens=max_new_tokens)
generated_tokens = output[:, prompt_len:]
result = processor.decode(generated_tokens[0], skip_special_tokens=True)
if logflag:
logger.info(f"[lvm tp serve] text generated: {result}")
return result
def initialize():
global model, processor, initialized, local_rank
if logflag:
logger.info("[lvm tp serve] start to initialize model and processor")
initialized = True
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
model = MllamaForConditionalGeneration.from_pretrained(
model_id, torch_dtype=torch.bfloat16, device_map="auto", token=huggingface_token
)
processor = AutoProcessor.from_pretrained(model_id, token=huggingface_token)
deepspeed.init_distributed(dist_backend="hccl")
local_rank = get_int_from_env(["LOCAL_RANK", "MPI_LOCALRANKID"], "0")
world_size = get_int_from_env(["WORLD_SIZE", "PMI_SIZE"], "1")
if logflag:
logger.info(f"[lvm tp serve] local rank: {local_rank}, world size: {world_size}")
repo_root = get_repo_root(model_id)
checkpoints_json = "checkpoints.json"
write_checkpoints_json(local_rank, checkpoints_json)
if logflag:
logger.info("[lvm tp serve] checkpoint json written")
# sleep for 10 seconds to avoid multi-process conflict
time.sleep(10)
model = deepspeed.init_inference(
model,
mp_size=world_size,
base_dir=repo_root,
dtype=torch.bfloat16,
checkpoint=checkpoints_json,
)
if logflag:
logger.info(model)
logger.info("[lvm tp serve] model initialized")
# warm up model
if logflag:
logger.info("[lvm tp serve] start to warm up model")
warmup = 3
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "If I had to write a haiku for this one, it would be: "},
],
}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
url = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(url, stream=True).raw)
for i in range(warmup):
if logflag:
logger.info(f"[lvm tp serve] warming up iteration {i}")
generate(input_text, raw_image)
@app.post("/v1/lvm_serve")
async def lvm_tp_endpoint(input: Union[LVMDoc]) -> Union[TextDoc]:
if logflag:
logger.info(input)
img_b64_str = input.image
prompt = input.prompt
max_new_tokens = input.max_new_tokens
messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
image_data = base64.b64decode(img_b64_str)
image_stream = BytesIO(image_data)
raw_image = Image.open(image_stream)
result = generate(text, raw_image, max_new_tokens)
if logflag:
logger.info(f"res: {result}")
return TextDoc(text=result)
if __name__ == "__main__":
initialize()
process_port = 9393 + local_rank + 1
try:
uvicorn.run(app, host="localhost", port=process_port)
except Exception as e:
print(f"Error starting uvicorn: {str(e)}")

View File

@@ -0,0 +1,380 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
from dataclasses import dataclass
from enum import Enum
from string import Template
from typing import List
class LlamaGuardVersion(Enum):
LLAMA_GUARD_1 = "Llama Guard 1"
LLAMA_GUARD_2 = "Llama Guard 2"
LLAMA_GUARD_3 = "Llama Guard 3"
class AgentType(Enum):
AGENT = "Agent"
USER = "User"
@dataclass
class SafetyCategory:
name: str
description: str
@dataclass
class ConversationTurn:
message: str
agent_type: AgentType
LLAMA_GUARD_1_CATEGORY_SHORT_NAME_PREFIX = "O"
LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX = "S"
LLAMA_GUARD_3_CATEGORY_SHORT_NAME_PREFIX = "S"
PROMPT_TASK = "Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories."
SAFETY_CATEGORIES = """
<BEGIN UNSAFE CONTENT CATEGORIES>
$categories
<END UNSAFE CONTENT CATEGORIES>
"""
PROMPT_CONVERSATION = """
<BEGIN CONVERSATION>
$conversations
<END CONVERSATION>
"""
PROMPT_INSTRUCTIONS_1 = """
Provide your safety assessment for $agent_type in the above conversation:
- First line must read 'safe' or 'unsafe'.
- If unsafe, a second line must include a comma-separated list of violated categories."""
PROMPT_INSTRUCTIONS_2 = """
Provide your safety assessment for ONLY THE LAST $agent_type message in the above conversation:
- First line must read 'safe' or 'unsafe'.
- If unsafe, a second line must include a comma-separated list of violated categories."""
PROMPT_TEMPLATE_1 = Template(
f"[INST] {PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS_1} [/INST]"
)
PROMPT_TEMPLATE_2 = Template(
f"[INST] {PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS_2} [/INST]"
)
PROMPT_TEMPLATE_3 = Template(
f"<|begin_of_text|><|start_header_id|>user<|end_header_id|> {PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS_2} <|eot_id|><|start_header_id|>assistant<|end_header_id|>"
)
LLAMA_GUARD_1_CATEGORY = [
SafetyCategory(
"Violence and Hate.",
"""Should not
- Help people plan or engage in violence.
- Encourage people to engage in violence.
- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their race, color, religion, national origin, sexual orientation, gender, gender identity, or disability.
- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
Can
- Provide information on violence and discrimination.
- Discuss the topics of hate and violence, and can discuss historical events involving violence.""",
),
SafetyCategory(
"Sexual Content.",
"""Should not
- Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities.
Can
- Discuss topics related to sexuality and sexual education.
- Provide general information regarding sexual activities.
- Provide guidance on topics related to sexual health.""",
),
SafetyCategory(
"Criminal Planning.",
"""Should not
- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.
- Encourage people to engage in criminal activities or condone criminal behavior.
Can
- Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.
- Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.""",
),
SafetyCategory(
"Guns and Illegal Weapons.",
"""Should not
- Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them.
- Encourage people to engage in firearm or illegal weapons crimes or condone such behavior.
Can
- Discuss firearms and the arguments for and against firearm ownership.""",
),
SafetyCategory(
"Regulated or Controlled Substances.",
"""Should not
- Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis.
- Assist or encourage people to create such substances when it is illegal to do so.
Can
- Discuss the use of regulated or controlled substances, and can provide general information on those substances (ex: what their chemical composition is and what their effects are).
- Provide information on how those substances are manufactured, provided that the information is not so detailed as to provide step-by-step instructions for their creation.""",
),
SafetyCategory(
"Self-Harm.",
"""Should not
- Encourage people to harm themselves.
- Romanticize or condone self-harm or suicide.
- Provide information regarding the methods of suicide or self-harm.
- Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves.
Can
- Provide general information regarding self-harm and suicide (ex: statistics, the mental health challenges faced by those who take these actions).
Should
- Provide information regarding appropriate health resources to people who imply or express the desire to harm themselves.""",
),
]
LLAMA_GUARD_2_CATEGORY = [
SafetyCategory(
"Violent Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of violent crimes. Examples of violent crimes include, but are not limited to:
- Unlawful violence toward people (ex: terrorism, genocide, murder, hate crimes, child abuse, assault, battery, kidnapping)
- Unlawful violence toward animals (ex: animal abuse)""",
),
SafetyCategory(
"Non-Violent Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of non-violent crimes. Examples of non-violent crimes include, but are not limited to:
- Financial crimes (ex: fraud, scams, money laundering)
- Property crimes (ex: burglary, robbery, arson, vandalism)
- Drug crimes (ex: creating or trafficking narcotics)
- Weapons crimes (ex: producing unlicensed firearms)
- Cyber crimes (ex: hacking, spyware, malware)""",
),
SafetyCategory(
"Sex Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of sex crimes. Examples of sex crimes include, but are not limited to:
- Human trafficking
- Sexual assault (ex: rape)
- Sexual harassment (ex: groping)
- Lewd conduct""",
),
SafetyCategory(
"Child Exploitation. ",
"AI models should not create content that depicts child nudity or that enables, encourages, excuses, or depicts the sexual abuse of children.",
),
SafetyCategory(
"Specialized Advice. ",
"AI models should not create content that contains specialized financial, medical, or legal advice.",
),
SafetyCategory(
"Privacy. ",
"AI models should not create content that contains sensitive, personal information about private individuals.",
),
SafetyCategory(
"Intellectual Property. ",
"AI models should not create content that violates the intellectual property rights of third parties.",
),
SafetyCategory(
"Indiscriminate Weapons. ",
"""AI models should not create content that enables the creation of weapons of mass destruction. Examples of such weapons include, but are not limited to:
- Chemical weapons (ex: nerve gas)
- Biological weapons (ex: anthrax)
- Radiological weapons (ex: salted bombs)
- Nuclear weapons (ex: atomic warheads)
- High-yield explosive weapons (ex: cluster munitions)""",
),
SafetyCategory(
"Hate. ",
"AI models should not create content that is hateful toward people on the basis of their protected characteristics (race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity and serious disease). AI models should not create content that perpetuates negative stereotypes.",
),
SafetyCategory(
"Self-Harm. ",
"""AI models should not create content that enables, encourages, or excuses acts of intentional self-harm. Examples of such actions include, but are not limited to:
- Suicide
- Self-injury (ex: cutting)
- Disordered eating""",
),
SafetyCategory(
"Sexual Content. ",
"AI models should not create depictions of nude adults or content that contains erotic descriptions or explicit depictions of sex acts.",
),
]
LLAMA_GUARD_3_CATEGORY = [
SafetyCategory(
"Violent Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of violent crimes. Examples of violent crimes include, but are not limited to:
- Unlawful violence toward people (ex: terrorism, genocide, murder, hate crimes, child abuse, assault, battery, kidnapping)
- Unlawful violence toward animals (ex: animal abuse)""",
),
SafetyCategory(
"Non-Violent Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of non-violent crimes. Examples of non-violent crimes include, but are not limited to:
- Financial crimes (ex: fraud, scams, money laundering)
- Property crimes (ex: burglary, robbery, arson, vandalism)
- Drug crimes (ex: creating or trafficking narcotics)
- Weapons crimes (ex: producing unlicensed firearms)
- Cyber crimes (ex: hacking, spyware, malware)""",
),
SafetyCategory(
"Sex Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of sex crimes. Examples of sex crimes include, but are not limited to:
- Human trafficking
- Sexual assault (ex: rape)
- Sexual harassment (ex: groping)
- Lewd conduct""",
),
SafetyCategory(
"Child Exploitation. ",
"AI models should not create content that depicts child nudity or that enables, encourages, excuses, or depicts the sexual abuse of children.",
),
SafetyCategory(
"Defamation. ",
"AI models should not create content about a real, living person that is verifiably false and likely to injure the person's reputation.",
),
SafetyCategory(
"Specialized Advice. ",
"AI models should not create content that contains specialized financial, medical, or legal advice.",
),
SafetyCategory(
"Privacy. ",
"AI models should not create content that contains sensitive, personal information about private individuals.",
),
SafetyCategory(
"Intellectual Property. ",
"AI models should not create content that violates the intellectual property rights of third parties.",
),
SafetyCategory(
"Indiscriminate Weapons. ",
"""I models should not create content that enables the creation of weapons of mass destruction. Examples of such weapons include, but are not limited to:
- Chemical weapons (ex: nerve gas)
- Biological weapons (ex: anthrax)
- Radiological weapons (ex: salted bombs)
- Nuclear weapons (ex: atomic warheads)
- High-yield explosive weapons (ex: cluster munitions)""",
),
SafetyCategory(
"Hate. ",
"AI models should not create content that is hateful toward people on the basis of their protected characteristics (race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity and serious disease). AI models should not create content that perpetuates negative stereotypes.",
),
SafetyCategory(
"Self-Harm. ",
"""AI models should not create content that enables, encourages, or excuses acts of intentional self-harm. Examples of such actions include, but are not limited to:
- Suicide
- Self-injury (ex: cutting)
- Disordered eating""",
),
SafetyCategory(
"Sexual Content. ",
"AI models should not create depictions of nude adults or content that contains erotic descriptions or explicit depictions of sex acts.",
),
SafetyCategory(
"Elections. ",
"AI models should not create content that contain factually incorrect information about electoral systems and processes, including in the time, place, or manner of voting in civic elections.",
),
SafetyCategory(
"Code Interpreter Abuse. ",
"""AI models should not create content that attempts to abuse code interpreters. Examples of code interpreter abuse include, but are not limited to:
- Denial of service attacks
- Container escapes or privilege escalation.""",
),
]
def create_conversation(messages: List[str]) -> List[ConversationTurn]:
conversations = []
for i, message in enumerate(messages):
conversations.append(
ConversationTurn(message=message, agent_type=AgentType.USER if i % 2 == 0 else AgentType.AGENT)
)
return conversations
def build_default_prompt(
agent_type: AgentType,
conversations: List[ConversationTurn],
llama_guard_version: LlamaGuardVersion = LlamaGuardVersion.LLAMA_GUARD_3,
):
if llama_guard_version == LlamaGuardVersion.LLAMA_GUARD_3:
categories = LLAMA_GUARD_3_CATEGORY
category_short_name_prefix = LLAMA_GUARD_3_CATEGORY_SHORT_NAME_PREFIX
prompt_template = PROMPT_TEMPLATE_3
elif llama_guard_version == LlamaGuardVersion.LLAMA_GUARD_2:
categories = LLAMA_GUARD_2_CATEGORY
category_short_name_prefix = LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX
prompt_template = PROMPT_TEMPLATE_2
else:
categories = LLAMA_GUARD_1_CATEGORY
category_short_name_prefix = LLAMA_GUARD_1_CATEGORY_SHORT_NAME_PREFIX
prompt_template = PROMPT_TEMPLATE_1
return build_custom_prompt(agent_type, conversations, categories, category_short_name_prefix, prompt_template)
def build_custom_prompt(
agent_type: AgentType,
conversations: List[ConversationTurn],
categories: List[SafetyCategory],
category_short_name_prefix: str,
prompt_template: str,
with_policy: bool = False,
):
categories_str = "\n".join(
[
f"{category_short_name_prefix}{i+1}: {c.name}" + (f"\n{c.description}" if with_policy else "")
for i, c in enumerate(categories)
]
)
conversations_str = "\n\n".join([f"{t.agent_type.value}: {t.message}" for t in conversations])
return prompt_template.substitute(
agent_type=agent_type.value, categories=categories_str, conversations=conversations_str
)
def build_prompt_test():
print(
build_default_prompt(
AgentType.AGENT,
[
ConversationTurn("What's the color of the sky?", AgentType.USER),
ConversationTurn("The sky is blue.", AgentType.AGENT),
],
)
)
print("\n\n")
# use a customized safety category and create_conversation function.
print(
build_custom_prompt(
AgentType.AGENT,
create_conversation(["<User Prompt placeholder>", "<Agent Prompt placeholder>"]),
[
SafetyCategory(
"Violence and Hate.",
"""Should not
- Help people plan or engage in violence.
- Encourage people to engage in violence.
- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their race, color, religion, national origin, sexual orientation, gender, gender identity, or disability.
- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
Can
- Provide information on violence and discrimination.
- Discuss the topics of hate and violence, and can discuss historical events involving violence.""",
),
],
LLAMA_GUARD_3_CATEGORY_SHORT_NAME_PREFIX,
PROMPT_TEMPLATE_3,
True,
)
)
if __name__ == "__main__":
build_prompt_test()
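For context, a minimal sketch of how a prompt built by build_default_prompt could be scored with a text-only Llama Guard checkpoint; the model id, device placement, and generation settings are illustrative assumptions and not part of this module:

# Illustrative only: run a Llama Guard 3 style prompt through a causal LM and read back the verdict.
# Assumes access to the gated checkpoint via a Hugging Face token.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

guard_id = "meta-llama/Llama-Guard-3-8B"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(guard_id)
model = AutoModelForCausalLM.from_pretrained(guard_id, torch_dtype=torch.bfloat16)

prompt = build_default_prompt(
    AgentType.AGENT,
    create_conversation(["What's the color of the sky?", "The sky is blue."]),
)
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id)
verdict = tokenizer.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
print(verdict)  # expected to start with "safe" or "unsafe"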

View File

@@ -0,0 +1,711 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import gc
import os
import time
import deepspeed
import deepspeed.ops.transformer as transformer_inference
import torch
import tqdm
from deepspeed import comm as dist
from deepspeed.accelerator import get_accelerator
from deepspeed.module_inject.tp_shard import set_n_embd, set_num_kv_heads
from deepspeed.ops.transformer.inference.diffusers_2d_transformer import Diffusers2DTransformerConfig
from deepspeed.ops.transformer.inference.diffusers_attention import DeepSpeedDiffusersAttention
from deepspeed.ops.transformer.inference.diffusers_transformer_block import DeepSpeedDiffusersTransformerBlock
from .auto_tp import AutoTP, Loading, ReplaceWithTensorSlicing
from .load_checkpoint import load_model_with_checkpoint
from .replace_policy import generic_policies, replace_policies
from .utils import policy_to_ds_container
def get_transformer_name(replaced_module):
from torch.nn import ModuleList
from .containers import supported_models
transformer_name = ""
for n, c in replaced_module.named_children():
if c.__class__ in supported_models:
transformer_name += n + "."
for name, child in c.named_children():
if child.__class__ is ModuleList:
transformer_name += name
break
break
return transformer_name
class GroupQuantizer:
def __init__(self, q_int8=True, group_size=1, num_bits=8, num_groups=0):
self.group_size = group_size
self.num_bits = num_bits
self.q_int8 = q_int8
self.num_groups = num_groups
def quantize(self, inputs, qkv=True, count=1, parallel_dim=0):
if not self.q_int8 or not qkv:
inputs = torch.nn.Parameter(inputs, requires_grad=False)
inputs.scale = torch.empty(1)
return inputs
q_range = 2**self.num_bits
num_groups = self.num_groups if self.num_groups > 0 else inputs.shape[0] // self.group_size
inputs = inputs.to(get_accelerator().current_device_name())
input_flat = inputs.reshape(num_groups, -1).contiguous()
input_min = torch.min(input_flat, dim=1, keepdim=True)[0].float()
input_max = torch.max(input_flat, dim=1, keepdim=True)[0].float()
scale = torch.max(input_min.abs(), input_max.abs()) * 2.0 / (q_range)
input_flat = (input_flat / scale).round().clamp(-q_range // 2, q_range // 2 - 1)
inputs_q = input_flat.reshape(inputs.shape).to(torch.int8).contiguous()
out = torch.nn.Parameter(inputs_q, requires_grad=False)
inputs_split = inputs.split(inputs.shape[parallel_dim] // 2, dim=parallel_dim)
input_flat = [inputs_split[i].reshape(num_groups, -1).contiguous() for i in range(2)]
input_min = [torch.min(input_flat[i], dim=1, keepdim=True)[0].float() for i in range(2)]
input_max = [torch.max(input_flat[i], dim=1, keepdim=True)[0].float() for i in range(2)]
scale1 = [
(torch.max(input_min[i].abs(), input_max[i].abs()) * 2.0 / (q_range)).squeeze().unsqueeze(0)
for i in range(2)
]
out.scale = (
torch.cat([scale.squeeze().unsqueeze(0), scale1[0], scale1[1]], dim=0).reshape(num_groups, -1).contiguous()
)
return out
def _module_match(module):
for policy in generic_policies:
policy = policy()
if policy.match(module):
return policy
return None
def generic_injection(module, dtype=None, enable_cuda_graph=True):
def replace_attn(child, policy):
policy_attn = policy.attention(child)
if policy_attn is None:
return child
if len(policy_attn) == 5:
qkvw, attn_ow, attn_ob, hidden_size, heads = policy_attn
else:
qw, kw, vw, attn_ow, attn_ob, hidden_size, heads = policy_attn
config = transformer_inference.DeepSpeedInferenceConfig(
hidden_size=hidden_size,
heads=heads,
dtype=dtype,
triangular_masking=False,
max_out_tokens=4096,
)
attn_module = DeepSpeedDiffusersAttention(config)
def transpose(data):
data = data.contiguous()
data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1))
data = data.reshape(data.shape[-1], data.shape[-2])
data.to(get_accelerator().current_device_name())
return data
if len(policy_attn) == 5:
attn_module.attn_qkvw.data = transpose(qkvw.data)
else:
attn_module.attn_qkvw = None
attn_module.attn_qw.data = transpose(qw.data)
attn_module.attn_kw.data = transpose(kw.data)
attn_module.attn_vw.data = transpose(vw.data)
attn_module.attn_qkvb = None
attn_module.attn_ow.data = transpose(attn_ow.data)
attn_module.attn_ob.data.copy_(attn_ob.data.to(get_accelerator().current_device_name()))
return attn_module
def replace_attn_block(child, policy):
config = Diffusers2DTransformerConfig()
return DeepSpeedDiffusersTransformerBlock(child, config)
if isinstance(module, torch.nn.Module):
pass
else:
if dtype not in [torch.float16, torch.half]:
raise ValueError("Generic injection only supported with FP16")
try:
import diffusers
if hasattr(diffusers.models.attention, "CrossAttention"):
cross_attention = diffusers.models.attention.CrossAttention
else:
cross_attention = diffusers.models.attention_processor.Attention
attention_block = diffusers.models.attention.BasicTransformerBlock
new_policies = {
cross_attention: replace_attn,
attention_block: replace_attn_block,
}
except ImportError:
new_policies = {}
# replace_transformer_layer(None,
# module.text_encoder,
# training=False,
# replace_with_kernel_inject=True,
# triangular_masking=True,
# max_out_tokens=8192)
from ..model_implementations.transformers.clip_encoder import DSClipEncoder
cg_encoder = DSClipEncoder(module.text_encoder, enable_cuda_graph=enable_cuda_graph)
setattr(module, "text_encoder", cg_encoder)
for name in module.__dict__.keys():
sub_module = getattr(module, name)
policy = _module_match(sub_module)
if policy is not None:
def _replace_module(module, policy):
for name, child in module.named_children():
_replace_module(child, policy)
if child.__class__ in new_policies:
replaced_module = new_policies[child.__class__](child, policy)
setattr(module, name, replaced_module)
_replace_module(sub_module, policy)
new_module = policy.apply(sub_module, enable_cuda_graph=enable_cuda_graph)
print(f"**** found and replaced {name} w. {type(new_module)}")
setattr(module, name, new_module)
container_g = None
def replace_transformer_layer(orig_layer_impl, model, checkpoint_dict, config, model_config):
"""Replace bert-style transformer layers with DeepSpeed's transformer layer
Arguments:
orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for,
e.g., transformers.models.bert.modeling_bert.BertLayer or transformers.BertLayer
model (torch.nn.Module): user's nn.module representing their model
checkpoint_dict: Dictionary for checkpoint passed from the Inference Engine
config: top-level DS Inference config defined in inference/config.py
model_config: HuggingFace model config passed from the inference/engine.py
Returns:
Updated nn.module with replaced transformer layers
"""
# defining globals as internally defined functions inherit these everywhere
quantize = config.dtype == torch.int8
# todo: Refactor later. In future, let's minimize the style used above and use config.** instead
linear_layer_setting = None
"""linear_layer_setting (tuple of modules) [Optional]: shows which two classes are used for linear layers and embedding layers."""
micro_batch_size = -1
seed = -1
local_rank = -1
mp_replace = ReplaceWithTensorSlicing(
mp_group=config.tensor_parallel.tp_group, mp_size=config.tensor_parallel.tp_size
) # , out_dim=0, in_dim=1)
def replace_with_policy(child, policy_cls, triangular_masking, inference=False, layer_id=0):
policy = policy_cls(child, inference=inference)
if not policy.cuda_graph_supported:
# policy says cuda graph is not supported raise an error if set
assert not config.enable_cuda_graph, "cuda graph is not supported with this model, please disable"
from deepspeed.moe.layer import MoE
moe = False
if hasattr(child, "mlp") and isinstance(child.mlp, MoE):
num_experts = child.mlp.num_experts
moe = True
# 1. Create a model-specific container object using the policy object.
_container = policy_to_ds_container(
policy=policy, config=config, model_config=model_config, layer_id=layer_id, child=child
)
_container.set_moe(moe)
# 2. Set the tensor parallelism config
_container.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group)
# 3. Initialize tensors
_container.initialize_tensors()
# 4. deal with data types -- needs refactor to use dtype instead of fp16
if config.dtype in [torch.float16, torch.bfloat16, torch.int8]:
_container.convert_to_required_dtype()
# 5. Set the quantization config
quantizer = GroupQuantizer(q_int8=quantize)
_container.set_quantization_config(quantizer)
# 6. create a DS Inference config object
_container.create_ds_model_config()
# 7. use the config and create the module
_container.create_module()
# 8. transpose the weights and bias if needed
_container.transpose()
# 9. deal with tensor parallelism.
_container.apply_tensor_parallelism(mp_replace)
# 10. copy the tensors from the model-specific container to the new module
_container.copy_data_to_new_module()
# 11. set global for generic checkpoint loading
global container_g
if container_g is None:
container_g = _container
return _container.module
def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
# mp_replace = ReplaceWithTensorSlicing(mp_group=config.tensor_parallel.tp_group)
# 1. Create AutoTP object
_autotp = AutoTP(module, all_reduce_linears, prefix, state_dict, linear_layer_setting, orig_layer_impl)
# 2. Set the tensor parallelism config
_autotp.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group)
# 3. Try to get num_key_heads from model_config.num_key_value_heads
# num_kv_heads = _autotp.get_model_num_kv_heads(model_config)
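# For multimodal configs (e.g. Llama 3.2 Vision / Mllama), the kv-head count lives in a
# sub-config: blocks whose all-reduce linears match the vision pattern (self_attn.o_proj
# and mlp.fc2) read vision_config, while language blocks read text_config.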
if hasattr(model_config, "vision_config"):
if "self_attn.o_proj" in all_reduce_linears and "mlp.fc2" in all_reduce_linears:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config.vision_config)
else:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config.text_config)
else:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config)
# 4. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division
set_num_kv_heads(num_kv_heads)
# 4.1 Get n_embd
n_embd = None
multi_query_n_embd_names = ["n_embd"]
for name in multi_query_n_embd_names:
if hasattr(model_config, name):
n_embd = getattr(model_config, name)
if n_embd is not None:
break
# 4.2 set n_embd
set_n_embd(n_embd)
# 5. Set linear policies
_autotp.update_linear_policies()
# 6. Replace modules
if "lm_head" in all_reduce_linears or "embed_out" in all_reduce_linears:
return _autotp._replace_last_linear_module(module)
return _autotp._replace_module(module)
def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None):
training = False # todo: refactor this part to go in the config
if training:
# copy relevant state from child -> new module
new_module = replace_with_policy(child, _policy, config.triangular_masking)
else:
# copy relevant state from child -> new module
if config.replace_with_kernel_inject:
new_module = replace_with_policy(
child, _policy, config.triangular_masking, inference=True, layer_id=layer_id
)
else:
new_module = replace_wo_policy(child, _policy, prefix=prefix, state_dict=state_dict)
return new_module
def set_lm_head(module):
embedding_weight = None
for n, p in module.named_parameters():
if "word_embeddings." in n or "embed_tokens." in n or "wte." in n:
embedding_weight = p
if (
embedding_weight is not None
and hasattr(module, "lm_head")
and hasattr(module.lm_head, "weight")
and module.lm_head.weight.is_meta
):
module.lm_head.weight = embedding_weight
# enable tensor parallel for the last linear
if (
hasattr(module, "lm_head")
and hasattr(module.lm_head, "weight")
and not module.lm_head.weight.is_meta
and isinstance(module.lm_head, torch.nn.Linear)
):
module = replace_wo_policy(module, ("lm_head",), 0, "lm_head")
elif (
hasattr(module, "embed_out")
and hasattr(module.embed_out, "weight")
and not module.embed_out.weight.is_meta
and isinstance(module.embed_out, torch.nn.Linear)
):
module = replace_wo_policy(module, ("embed_out",), 0, "embed_out")
elif hasattr(module.language_model, "lm_head"):
module = replace_wo_policy(module.language_model, ("lm_head",), 0, "lm_head")
return module
if checkpoint_dict is not None and not config.replace_with_kernel_inject:
# AutoTP shard loading
checkpoint = checkpoint_dict["checkpoints"]
pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards")
for i in range(len(checkpoint)):
checkpoint_file = os.path.join(config.base_dir, checkpoint[i])
replaced_module = replace_module(
model=model,
orig_class=orig_layer_impl,
replace_fn=replace_fn,
_replace_policy=config.injection_policy_tuple,
checkpoint=checkpoint_file,
)
pbar.update(1)
gc.collect()
replaced_module = set_lm_head(replaced_module)
else:
replaced_module = replace_module(
model=model,
orig_class=orig_layer_impl,
replace_fn=replace_fn,
_replace_policy=config.injection_policy_tuple,
)
quantizer = GroupQuantizer(q_int8=quantize)
world_size = dist.get_world_size() if dist.is_initialized() else 1
rank = dist.get_rank() if dist.is_initialized() else 0
if checkpoint_dict is not None and config.replace_with_kernel_inject:
assert (
container_g.ckpt_load_enabled
), f"Meta Tensor checkpoint loading not supported in {container_g.__class__.__name__} container"
start_time = time.time()
checkpoint = checkpoint_dict["checkpoints"]
ckpt_list = checkpoint["tp"] if type(checkpoint) is dict else checkpoint
ckpt_type = checkpoint_dict.get("parallelization", "pp")
ckpt_mp_size = checkpoint_dict.get("tp_size", len(ckpt_list))
ckpt_mp_size = checkpoint_dict.get("mp_size", ckpt_mp_size)
base_dir1 = checkpoint_dict.get("base_dir", config.base_dir)
if ckpt_type == "pp" and type(checkpoint) is list:
pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards")
for i in range(len(checkpoint)):
sd = [torch.load(os.path.join(base_dir1, checkpoint[i]), map_location="cpu")]
load_model_with_checkpoint(
replaced_module, sd, mp_replace, ckpt_type, ckpt_mp_size, quantizer, container=container_g
)
pbar.update(1)
else:
num_checkpoints = len(ckpt_list) // ckpt_mp_size
tp_split_size = world_size / ckpt_mp_size
sd_offset = int(rank / tp_split_size)
sd_count = int((rank + max(1, tp_split_size)) / tp_split_size) - sd_offset
pbar = tqdm.tqdm(total=num_checkpoints, desc=f"Loading {num_checkpoints} checkpoint shards")
for i in range(num_checkpoints):
pbar.update(1)
ckpt_index = i * ckpt_mp_size + sd_offset
ckpt_files = [
os.path.join(base_dir1, ckpt_list[ckpt_index + j]) if base_dir1 else ckpt_list[ckpt_index + j]
for j in range(sd_count)
]
sds = [torch.load(ckpt_file, map_location="cpu") for ckpt_file in ckpt_files]
load_model_with_checkpoint(
replaced_module,
sds,
mp_replace,
ckpt_type,
ckpt_mp_size,
quantizer,
int(rank % tp_split_size),
container=container_g,
)
sds = [None for _ in sds]
gc.collect()
if "non_tp" in checkpoint:
pbar = tqdm.tqdm(
total=len(checkpoint["non_tp"]), desc=f"Loading {len(checkpoint['non_tp'])} checkpoint shards"
)
for i in range(len(checkpoint["non_tp"])):
pbar.update(1)
ckpt_file = (
os.path.join(base_dir1, checkpoint["non_tp"][i]) if base_dir1 else checkpoint["non_tp"][i]
)
sds = [torch.load(ckpt_file, map_location="cpu")]
load_model_with_checkpoint(
replaced_module,
sds,
mp_replace,
ckpt_type,
ckpt_mp_size,
quantizer,
int(rank % tp_split_size),
container=container_g,
)
sds = [None for _ in sds]
gc.collect()
set_lm_head(replaced_module)
print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec")
if config.save_mp_checkpoint_path is not None:
import json
from collections import OrderedDict
num_partitions = 8
if checkpoint_dict is None:
ckpt_name = "ds_model"
try:
from transformers.models.bloom.modeling_bloom import BloomForCausalLM
if isinstance(model, BloomForCausalLM):
ckpt_name = "bloom"
except ImportError:
ckpt_name = "ds_model"
else:
ckpt_name = checkpoint_dict["type"]
if dist.is_initialized():
dist.barrier()
transformer_name = get_transformer_name(replaced_module)
non_tp_ckpt_name = "non-tp.pt"
ckpt_files = [non_tp_ckpt_name]
os.makedirs(config.save_mp_checkpoint_path, exist_ok=True)
if not dist.is_initialized() or dist.get_rank() == 0:
print("Saving tp-sharded checkpoints")
torch.save(
OrderedDict({k: v for k, v in dict(replaced_module.state_dict()).items() if transformer_name not in k}),
f"{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}",
)
dtype_reprs = {
torch.float32: "float32",
torch.float16: "float16",
torch.int8: "int8",
torch.bfloat16: "bfloat16",
}
ckpt_config = json.dumps(
{
"type": ckpt_name,
"base_dir": f"{config.save_mp_checkpoint_path}",
"checkpoints": {
"non_tp": ckpt_files,
"tp": [f"tp_{r:0>2d}_{m:0>2d}.pt" for m in range(num_partitions) for r in range(world_size)],
},
"version": 1.0,
"parallelization": "tp",
"tp_size": world_size,
"dtype": dtype_reprs[config.dtype],
}
)
with open(f"{config.save_mp_checkpoint_path}/ds_inference_config.json", "w") as cfg:
cfg.write(ckpt_config)
rep_sd = replaced_module.state_dict()
for n, p in replaced_module.named_parameters():
if hasattr(p, "scale"):
rep_sd[n] = [p, p.scale]
keys = list(rep_sd.keys())
partition_size = len(keys) // num_partitions + 1
for m in range(num_partitions):
torch.save(
OrderedDict(
{
k: [rep_sd[k], rep_sd[k].scale] if hasattr(rep_sd[k], "scale") else rep_sd[k]
for k in keys[m * partition_size : (m + 1) * partition_size]
if transformer_name in k
}
),
f"{config.save_mp_checkpoint_path}/tp_{rank:0>2d}_{m:0>2d}.pt",
)
return replaced_module
def revert_transformer_layer(orig_layer_impl, model, config, preln=False):
"""Revert DeepSpeed's transformer layer back to original bert-style transformer layer
Arguments:
orig_layer_impl (torch.nn.Module): the original transformer layer implementation that was replaced,
e.g., transformers.models.bert.modeling_bert.BertLayer or transformers.BertLayer
model (torch.nn.Module): user's nn.module representing their model
config (dict): model config containing hidden size, attention heads, etc.
Returns:
Updated nn.module with original bert-style transformer layers
"""
def replace_fn(child, _replace_policy, layer_id):
# from turing.nvidia_modelingpreln import BertLayer
orig_module = orig_layer_impl(config)
# copy relevant state from child -> original module
qkvw = child.attn_qkvw.data
qkvb = child.attn_qkvb.data
qw, kw, vw = torch.chunk(qkvw, 3, axis=0)
qb, kb, vb = torch.chunk(qkvb, 3, axis=0)
orig_module.attention.self.query.weight.data = qw
orig_module.attention.self.query.bias.data = qb
orig_module.attention.self.key.weight.data = kw
orig_module.attention.self.key.bias.data = kb
orig_module.attention.self.value.weight.data = vw
orig_module.attention.self.value.bias.data = vb
orig_module.attention.output.dense.weight.data = child.attn_ow.data
orig_module.attention.output.dense.bias.data = child.attn_ob.data
attn_ln_w = child.attn_nw.data
attn_ln_b = child.attn_nb.data
if preln:
orig_module.PostAttentionLayerNorm.weight.data = attn_ln_w
orig_module.PostAttentionLayerNorm.bias.data = attn_ln_b
else:
orig_module.attention.output.LayerNorm.weight.data = attn_ln_w
orig_module.attention.output.LayerNorm.bias.data = attn_ln_b
inter_ff_w = child.inter_w.data
inter_ff_b = child.inter_b.data
if preln:
orig_module.intermediate.dense_act.weight.data = inter_ff_w
orig_module.intermediate.dense_act.bias.data = inter_ff_b
else:
orig_module.intermediate.dense.weight.data = inter_ff_w
orig_module.intermediate.dense.bias.data = inter_ff_b
orig_module.output.dense.weight.data = child.output_w.data
orig_module.output.dense.bias.data = child.output_b.data
transformer_ln_w = child.norm_w.data
transformer_ln_b = child.norm_b.data
if preln:
orig_module.PreAttentionLayerNorm.weight.data = transformer_ln_w
orig_module.PreAttentionLayerNorm.bias.data = transformer_ln_b
else:
orig_module.output.LayerNorm.weight.data = transformer_ln_w
orig_module.output.LayerNorm.bias.data = transformer_ln_b
return orig_module
return replace_module(
model=model, orig_class=deepspeed.DeepSpeedTransformerLayer, replace_fn=replace_fn, _replace_policy=None
)
def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=None):
"""Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``.
Arguments:
model (torch.nn.Module): the model to augment
orig_class (torch.nn.Module): the module to search for
replace_fn (method): a method to convert instances of ``orig_class`` to the
desired type and return a new instance.
Returns:
A modified ``model``.
"""
sd = None
if checkpoint is not None:
if checkpoint.endswith(".safetensors"):
from safetensors.torch import load_file
sd = load_file(checkpoint)
else:
sd = torch.load(checkpoint, map_location="cpu")
policy = {}
if orig_class is not None:
policy.update({orig_class: (replace_fn, _replace_policy)})
else:
for plcy in replace_policies:
# instantiate a throw-away policy in order to populate the _orig_layer_class
_ = plcy(None)
if isinstance(plcy._orig_layer_class, list):
for orig_layer_class in plcy._orig_layer_class:
policy.update({orig_layer_class: (replace_fn, plcy)})
elif plcy._orig_layer_class is not None:
policy.update({plcy._orig_layer_class: (replace_fn, plcy)})
assert len(policy.items()) > 0, (
"No default policy found! Please specify your policy injection_policy (like {BertLayer:HFBEertLayerPolicy})."
+ "You can find some samples here: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py"
)
replaced_module, _ = _replace_module(model, policy, state_dict=sd)
return replaced_module
import re
from ..pipe import PipelineModule
def skip_level_0_prefix(model, state_dict):
model = str(model)
key = re.search(r": (.*?)Model", model)
if key is None:
key = re.search(r": (.*?)Stack", model)
if key is None:
key = re.match(r"(.*?)Model", model)
# if keys start with 'model.', don't skip level 0 prefix
if state_dict is not None:
for item in state_dict.keys():
if re.match("^model[.]", item):
return False
if key is not None and key.group(1).lower() in ["bloom", "opt"]:
return True
return False
def _replace_module(model, policies, prefix="", layer_id=0, level_id=0, state_dict=None):
"""Traverse model's children recursively and apply any transformations in ``policies``.
Arguments:
model (torch.nn.Module): model to augment
policies (dict): Mapping of source class to replacement function.
Returns:
Modified ``model``.
"""
for name, child in model.named_children():
if child.__class__ in policies:
replaced_module = policies[child.__class__][0](
child, policies[child.__class__][-1], layer_id, prefix=prefix + name, state_dict=state_dict
)
setattr(model, name, replaced_module)
if isinstance(model, PipelineModule):
assert hasattr(model, "forward_funcs"), "we require pipe-module to have the list of fwd_functions"
model.forward_funcs[model.fwd_map[name]] = replaced_module
layer_id += 1
else:
checking_key = prefix + name + "."
if Loading.is_load_module(child) and state_dict is not None:
if any(checking_key in item for item in state_dict):
Loading.load(
child,
state_dict,
checking_key,
)
else:
continue
if len(child._buffers) != 0 and state_dict is not None:
Loading.load_buffer(child, state_dict, checking_key)
_, layer_id = _replace_module(
child,
policies,
prefix if level_id == 0 and skip_level_0_prefix(model, state_dict) else prefix + name + ".",
layer_id=layer_id,
level_id=level_id + 1,
state_dict=state_dict,
)
# Add the reset_cache func to the model, so that it can be called in the beginning of text-generation.
model.reset_cache = transformer_inference.DeepSpeedTransformerInference.reset_cache
return model, layer_id
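To make the vision/text branching in replace_wo_policy above concrete, here is a standalone mock of the sub-config selection; it reads num_key_value_heads directly instead of going through _autotp.get_model_num_kv_heads, and the head counts are assumptions for illustration only:

# Standalone sketch of the kv-head sub-config selection; values are assumed, not read from a real model.
from types import SimpleNamespace

model_config = SimpleNamespace(
    vision_config=SimpleNamespace(num_key_value_heads=16),  # assumed
    text_config=SimpleNamespace(num_key_value_heads=8),     # assumed
)
all_reduce_linears = ("self_attn.o_proj", "mlp.fc2")  # pattern seen in vision blocks

if hasattr(model_config, "vision_config"):
    if "self_attn.o_proj" in all_reduce_linears and "mlp.fc2" in all_reduce_linears:
        num_kv_heads = model_config.vision_config.num_key_value_heads
    else:
        num_kv_heads = model_config.text_config.num_key_value_heads
else:
    num_kv_heads = model_config.num_key_value_heads
print(num_kv_heads)  # 16: the vision sub-config is selected in this mock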

View File

@@ -0,0 +1,13 @@
datasets
docarray[full]
fastapi
numpy<2
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
pydantic==2.9.2
pydub
shortuuid
transformers==4.45.1
uvicorn

View File

@@ -0,0 +1,11 @@
docarray
fastapi
numpy<2
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
pydantic==2.9.2
shortuuid
transformers==4.45.1
uvicorn

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
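# Spawn the DeepSpeed tensor-parallel LVM workers (world size 4) on Gaudi in the background,
# then start the LVM microservice front end (lvm_tp.py), which forwards requests to those workers.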
python gaudi_spawn.py --use_deepspeed --world_size 4 lvm_tp_serve.py &
python lvm_tp.py

File diff suppressed because it is too large.

5
comps/lvms/llama-vision/update Executable file
View File

@@ -0,0 +1,5 @@
#!/bin/bash
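# Overwrite the installed optimum-habana and DeepSpeed modules with the patched copies
# shipped in this component (paths assume the container's Python 3.10 dist-packages layout).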
cp checkpoint_utils.py /usr/local/lib/python3.10/dist-packages/optimum/habana/
cp auto_tp.py /usr/local/lib/python3.10/dist-packages/deepspeed/module_inject/
cp replace_module.py /usr/local/lib/python3.10/dist-packages/deepspeed/module_inject/

View File

@@ -5,4 +5,4 @@
git clone https://github.com/huggingface/tgi-gaudi.git
cd ./tgi-gaudi/
docker build -t ghcr.io/huggingface/tgi-gaudi:1.2.1 . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
docker build -t ghcr.io/huggingface/tgi-gaudi:latest . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy

View File

@@ -311,7 +311,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.4
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:

View File

@@ -64,7 +64,7 @@ opea_micro_services:
requirements:
model_id: "Intel/neural-chat-7b-v3-3"
ghcr.io/huggingface/tgi-gaudi:
tag: 2.0.4
tag: 2.0.5
type: hpu
requirements:
model_id: "Intel/neural-chat-7b-v3-3"

View File

@@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
echo "Start building docker images for microservice"
cd $WORKPATH
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker build --no-cache -t opea/guardrails-tgi:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/llama_guard/langchain/Dockerfile .
if [ $? -ne 0 ]; then
echo "opea/guardrails-tgi built fail"
@@ -26,7 +26,7 @@ function start_service() {
export SAFETY_GUARD_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"
export SAFETY_GUARD_ENDPOINT=http://${ip_address}:5035/v1/chat/completions
docker run -d --name="test-comps-guardrails-langchain-tgi-server" -p 5035:80 --runtime=habana -e HF_TOKEN=$HF_TOKEN -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id $model_id --max-input-length 1024 --max-total-tokens 2048
docker run -d --name="test-comps-guardrails-langchain-tgi-server" -p 5035:80 --runtime=habana -e HF_TOKEN=$HF_TOKEN -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model_id --max-input-length 1024 --max-total-tokens 2048
sleep 4m
docker run -d --name="test-comps-guardrails-langchain-service" -p 5036:9090 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e SAFETY_GUARD_MODEL_ID=$SAFETY_GUARD_MODEL_ID -e SAFETY_GUARD_ENDPOINT=$SAFETY_GUARD_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/guardrails-tgi:comps
sleep 10s

View File

@@ -25,7 +25,7 @@ function start_service() {
unset http_proxy
model="llava-hf/llava-v1.6-mistral-7b-hf"
lvm_port=5050
docker run -d --name="test-comps-lvm-tgi-llava" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 5027:80 --runtime=habana -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e SKIP_TOKENIZER_IN_TGI=true -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.4 --model-id $model --max-input-tokens 4096 --max-total-tokens 8192
docker run -d --name="test-comps-lvm-tgi-llava" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 5027:80 --runtime=habana -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e SKIP_TOKENIZER_IN_TGI=true -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model --max-input-tokens 4096 --max-total-tokens 8192
docker run -d --name="test-comps-lvm-tgi" -e LVM_ENDPOINT=http://$ip_address:5027 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:9399 --ipc=host opea/lvm-tgi:comps
sleep 3m
}