Support Llama3.2 vision and vision guard model (#753)

* Support llama3.2 models

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix issues

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* update code and doc

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* add llama vision guard support

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* add llama guard prompt format utils

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* add tp support

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* add wheel

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix accuracy issue

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* update tp service code

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* update dockerfile

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* support lvm tp serving

Signed-off-by: letonghan <letong.han@intel.com>

* update dockerfile

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* add run tp script

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix max_new_tokens

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* update run tp script

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* refine code and doc

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* install transformers from local wheel

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* update code using official transformers

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* remove unnecessary code

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove blank line

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix precommit issues

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* fix cd issue

Signed-off-by: lvliang-intel <liang1.lv@intel.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Signed-off-by: lvliang-intel <liang1.lv@intel.com>
Signed-off-by: letonghan <letong.han@intel.com>
Co-authored-by: letonghan <letong.han@intel.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
lvliang-intel
2024-09-30 16:27:41 +08:00
committed by GitHub
parent 405a2fc68a
commit 534c227a6e
30 changed files with 7190 additions and 13 deletions

View File

@@ -11,3 +11,15 @@ services:
build:
dockerfile: comps/lvms/predictionguard/Dockerfile
image: ${REGISTRY:-opea}/lvm-predictionguard:${TAG:-latest}
lvm-llama-vision:
build:
dockerfile: comps/lvms/llama-vision/Dockerfile
image: ${REGISTRY:-opea}/lvm-llama-vision:${TAG:-latest}
lvm-llama-vision-tp:
build:
dockerfile: comps/lvms/llama-vision/Dockerfile_tp
image: ${REGISTRY:-opea}/lvm-llama-vision-tp:${TAG:-latest}
lvm-llama-vision-guard:
build:
dockerfile: comps/lvms/llama-vision/Dockerfile_guard
image: ${REGISTRY:-opea}/lvm-llama-vision-guard:${TAG:-latest}

View File

@@ -266,7 +266,7 @@ def create_llm_dependency_deployment_and_service(resource_requirements=None, rep
deployment = create_k8s_resources(
name="llm-dependency-deploy",
replicas=7,
image="ghcr.io/huggingface/tgi-gaudi:2.0.4",
image="ghcr.io/huggingface/tgi-gaudi:2.0.5",
container_ports=[80],
node_selector={"node-type": "chatqna-opea"},
resources=resource_requirements,

View File

@@ -10,7 +10,7 @@ services:
- "6337:6337"
- "6338:6338"
tgi_gaudi_service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
container_name: tgi-service
ports:
- "8088:80"

View File

@@ -36,8 +36,8 @@ pip install -r requirements.txt
export HF_TOKEN=${your_hf_api_token}
volume=$PWD/data
model_id="meta-llama/Meta-Llama-Guard-2-8B"
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker run -p 8088:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=$HF_TOKEN ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id $model_id --max-input-length 1024 --max-total-tokens 2048
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker run -p 8088:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy -e HF_TOKEN=$HF_TOKEN ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model_id --max-input-length 1024 --max-total-tokens 2048
```
### 1.3 Verify the TGI Gaudi Service

View File

@@ -5,7 +5,7 @@ version: "3.8"
services:
tgi_gaudi_service:
image: ghcr.io/huggingface/tgi-gaudi:2.0.1
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
container_name: tgi-service
ports:
- "8088:80"

View File

@@ -31,9 +31,9 @@ volume=$PWD/data
# Build the Docker run command based on the number of cards
if [ "$num_cards" -eq 1 ]; then
docker_cmd="docker run -d --name="ChatQnA_server" -p $port_number:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:1.2.1 --model-id $model_name"
docker_cmd="docker run -d --name="ChatQnA_server" -p $port_number:80 -v $volume:/data --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model_name"
else
docker_cmd="docker run -d --name="ChatQnA_server" -p $port_number:80 -v $volume:/data --runtime=habana -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:1.2.1 --model-id $model_name --sharded true --num-shard $num_cards"
docker_cmd="docker run -d --name="ChatQnA_server" -p $port_number:80 -v $volume:/data --runtime=habana -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:2.0.5 --max-input-tokens 4096 --max-total-tokens 8192 --model-id $model_name --sharded true --num-shard $num_cards"
fi
# Execute the Docker run command

View File

@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# HABANA environment
FROM vault.habana.ai/gaudi-docker/1.17.1/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as hpu
ENV LANG=en_US.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
git-lfs \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
RUN git lfs install
COPY comps /home/user/comps
RUN cd /home/user/comps/lvms/llama-vision/ && \
pip install --no-cache-dir -r requirements.txt && \
pip install --upgrade Pillow
ENV PYTHONPATH=/root:/home/user
WORKDIR /home/user/comps/lvms/llama-vision/
ENTRYPOINT ["python", "lvm.py"]

View File

@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# HABANA environment
FROM vault.habana.ai/gaudi-docker/1.17.1/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as hpu
ENV LANG=en_US.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
git-lfs \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
RUN git lfs install
COPY comps /home/user/comps
RUN cd /home/user/comps/lvms/llama-vision/ && \
pip install --no-cache-dir -r requirements.txt && \
pip install --upgrade Pillow
ENV PYTHONPATH=/root:/home/user
WORKDIR /home/user/comps/lvms/llama-vision/
ENTRYPOINT ["python", "lvm_guard.py"]

View File

@@ -0,0 +1,34 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# HABANA environment
FROM vault.habana.ai/gaudi-docker/1.17.1/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest as hpu
ENV LANG=en_US.UTF-8
RUN apt-get update && apt-get install -y --no-install-recommends --fix-missing \
git-lfs \
libgl1-mesa-glx \
libjemalloc-dev
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
RUN git lfs install
COPY comps /home/user/comps
RUN pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.17.1
RUN pip install git+https://github.com/huggingface/optimum-habana@v1.13.2
RUN cd /home/user/comps/lvms/llama-vision/ && \
pip install --no-cache-dir --upgrade pip && \
bash update && \
pip install -r /home/user/comps/lvms/llama-vision/requirements_tp.txt
ENV PYTHONPATH=/root:/home/user
WORKDIR /home/user/comps/lvms/llama-vision/
ENTRYPOINT ["bash", "run_tp.sh"]

View File

@@ -0,0 +1,72 @@
# LVM Microservice
Visual Question Answering (VQA) is one of the multimodal tasks empowered by LVMs (Large Vision Models). This microservice supports visual Q&A by using Llama Vision as the base large vision model. It accepts two inputs, a prompt and an image, and outputs the answer to the prompt about the image.
## 🚀 Start Microservice with Docker
### Build Images
#### Build Llama Vision Model
```bash
cd ../../../
docker build -t opea/lvm-llama-vision:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llama-vision/Dockerfile .
```
#### Build Llama Vision Model with DeepSpeed
If you need to build the image for the 90B model, use the following command:
```bash
docker build -t opea/lvm-llama-vision-tp:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llama-vision/Dockerfile_tp .
```
#### Build Llama Vision Guard Model
```bash
cd ../../../
docker build -t opea/lvm-llama-vision-guard:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llama-vision/Dockerfile_guard .
```
### Start Llama LVM Service
#### Start Llama Vision Model Service
```bash
export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
docker run -it -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLAMA_VISION_MODEL_ID="meta-llama/Llama-3.2-11B-Vision-Instruct" -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice opea/lvm-llama-vision:latest
```
#### Start Llama Vision Model Service with DeepSpeed
If you need to run the 90B model, use the following command:
```bash
export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
export WORLD_SIZE=4
export no_proxy=localhost,127.0.0.1
docker run -it -p 9599:9599 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e MODEL_ID="meta-llama/Llama-3.2-90B-Vision-Instruct" -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN -e WORLD_SIZE=$WORLD_SIZE --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice opea/lvm-llama-vision-tp:latest
```
#### Start Llama Vision Guard Model Service
```bash
export HUGGINGFACEHUB_API_TOKEN=${your_hf_token}
docker run -it -p 9499:9499 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LLAMA_VISION_GUARD_MODEL_ID="meta-llama/Llama-Guard-3-11B-Vision" -e HUGGINGFACEHUB_API_TOKEN=$HUGGINGFACEHUB_API_TOKEN --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice opea/lvm-llama-vision-guard:latest
```
### Test
```bash
# Use curl
# curl Llama Vision 11B Model Service
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?", "max_new_tokens": 128}' -H 'Content-Type: application/json'
# curl Llama Vision Guard Model Service
http_proxy="" curl http://localhost:9499/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?", "max_new_tokens": 128}' -H 'Content-Type: application/json'
# curl Llama Vision 90B Model Service
http_proxy="" curl http://localhost:9599/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?", "max_new_tokens": 128}' -H 'Content-Type: application/json'
```
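
The same request can also be issued from Python. The sketch below assumes the 11B service is reachable on `localhost:9399` and that the response body carries the answer in a `text` field (as produced by `TextDoc`); adjust the port for the guard (9499) or 90B (9599) services.
```python
# Minimal Python client sketch for the LVM endpoint; the image path is only an example.
import base64
import requests

with open("view.jpg", "rb") as f:  # any local image
    img_b64 = base64.b64encode(f.read()).decode("utf-8")

payload = {"image": img_b64, "prompt": "What is this?", "max_new_tokens": 128}
resp = requests.post("http://localhost:9399/v1/lvm", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json().get("text", ""))  # the answer is expected under "text"
```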

View File

@@ -0,0 +1,571 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
# Automatic Tensor Parallelism
import re
from typing import Optional
import torch
from deepspeed import comm as dist
from deepspeed.accelerator import get_accelerator
from deepspeed.module_inject.tp_shard import get_shard_size, get_shard_size_list
from torch import nn
from .fusedqkv_utils import prepare_tp_fused_qkvw, require_tp_fused_qkvw
from .layers import LinearAllreduce, LinearLayer, LmHeadLinearAllreduce
from .replace_policy import replace_policies
def move(tensor, device):
if tensor.is_meta:
return torch.empty_like(tensor, device=device)
else:
# Using new tensors help in freeing memory (after split for example) was done before by calling clone().
# Using copy=True instead of clone() will help in case of cpu --> cpu.
# Otherwise to() will not create a new copy for the view of the full tensor, and it will not be de-referenced.
return tensor.to(device, copy=True)
class ReplaceWithTensorSlicing:
def __init__(self, mp_group=None, mp_size=1, out_dim=1, in_dim=0):
if mp_group is not None:
self.gpu_index = dist.get_rank(group=mp_group)
else:
self.gpu_index = 0
self.out_dim = out_dim
self.in_dim = in_dim
self.mp_size = mp_size
def merge_assert(self, dim1, dim2):
assert (
dim1 > dim2
), "Merging tensors is not allowed here! Please use deepspeed load_checkpoint\
for merging your checkpoints before replacing the transformer layer with\
inference-kernels"
def strided_copy(
self,
dst: Optional[torch.Tensor],
src: Optional[torch.Tensor],
num_splits: int,
int8: bool = False,
allocate_tensor: bool = False,
):
if src is None:
return src
src_shape = src.shape
dst_shape = dst.shape
outer_dim = 0 if int8 else -1
if allocate_tensor:
dst = torch.empty_like(dst)
src_split = torch.split(src.data, src.shape[outer_dim] // num_splits, dim=outer_dim)
if len(src_shape) == 2 and len(dst_shape) == 2:
if src_shape[outer_dim] == dst_shape[self.out_dim]:
try:
dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(src.shape)
except:
print(dst.shape, src.shape)
exit()
dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
if hasattr(src, "scale"):
dst.scale = src.scale
return dst
self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim])
qkv_size = dst_shape[self.out_dim] // num_splits
qkv_split = [torch.split(src_s, qkv_size, dim=outer_dim) for src_s in src_split]
weight_split = [
torch.cat([qkv_s[i] for qkv_s in qkv_split], axis=outer_dim) for i in range(len(qkv_split[0]))
]
dst = (
dst.reshape(-1)
.data.copy_(weight_split[self.gpu_index].contiguous().reshape(-1))
.reshape(weight_split[self.gpu_index].shape)
)
else:
if src_shape[0] == dst_shape[0]:
return torch.nn.parameter.Parameter(src)
qkv_size = dst_shape[0] // num_splits
qkv_split = [torch.split(src_s, qkv_size, dim=0) for src_s in src_split]
bias_split = [torch.cat([qkv_s[i] for qkv_s in qkv_split], axis=0) for i in range(len(qkv_split[0]))]
dst.data.copy_(bias_split[self.gpu_index].contiguous())
dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
if hasattr(src, "scale"):
dst.scale = src.scale
return dst
def copy(self, dst, src, int8=False, allocate_tensor=False):
if src is None:
return src
assert not dst.data.is_meta # the torch.Tensor.copy_ method used below will silently fail on meta tensors
if allocate_tensor:
dst = torch.empty_like(dst)
outer_dim = 0 if int8 else 1
inner_dim = 1 if int8 else 0
src_shape = src.shape
dst_shape = dst.shape
if len(src_shape) == 2 and len(dst_shape) == 2:
if src_shape[inner_dim] == dst_shape[self.in_dim] and src_shape[outer_dim] == dst_shape[self.out_dim]:
dst = dst.reshape(-1).data.copy_(src.data.reshape(-1)).reshape(src.shape)
else:
if src_shape[inner_dim] != dst_shape[self.in_dim]:
self.merge_assert(src_shape[inner_dim], dst_shape[self.in_dim])
dst.data.copy_(
src[:, self.gpu_index * dst_shape[self.in_dim] : (self.gpu_index + 1) * dst_shape[self.in_dim]]
if inner_dim == 1
else src[
self.gpu_index * dst_shape[self.in_dim] : (self.gpu_index + 1) * dst_shape[self.in_dim], :
]
)
else:
self.merge_assert(src_shape[outer_dim], dst_shape[self.out_dim])
dst.data.copy_(
src[
:, self.gpu_index * dst_shape[self.out_dim] : (self.gpu_index + 1) * dst_shape[self.out_dim]
]
if outer_dim == 1
else src[
self.gpu_index * dst_shape[self.out_dim] : (self.gpu_index + 1) * dst_shape[self.out_dim], :
]
)
else:
if src_shape[0] == dst_shape[0]:
dst = src if src.dtype == dst.dtype else dst.data.copy_(src)
else:
dst.data.copy_(src[self.gpu_index * dst_shape[-1] : (self.gpu_index + 1) * dst_shape[-1]])
dst = torch.nn.parameter.Parameter(dst, requires_grad=False)
if hasattr(src, "scale"):
dst.scale = src.scale
return dst
class Loading:
def is_load_module(module):
load_layers = [nn.Linear, nn.Embedding, nn.LayerNorm]
load_layer_names = [
"LPLayerNorm",
"SharedEmbedding",
"OPTLearnedPositionalEmbedding",
"LlamaRMSNorm",
"FalconLinear",
"MistralRMSNorm",
"T5LayerNorm",
"MixtralRMSNorm",
"Qwen2RMSNorm",
]
return module.__class__ in load_layers or module._get_name() in load_layer_names
def load_buffer(module, state_dict, prefix):
for name in module._buffers.keys():
if module._buffers[name].data.is_meta:
module._buffers[name] = torch.nn.parameter.Parameter(
data=torch.empty_like(module._buffers[name].data, device="cpu"),
requires_grad=module._buffers[name].data.requires_grad,
)
if prefix + name in state_dict.keys():
module._buffers[name].data.copy_(state_dict[prefix + name])
def load(module, state_dict, prefix, mp_group=None):
mp_replace = ReplaceWithTensorSlicing(mp_group=mp_group)
if hasattr(module, "weight"):
if module.weight.data.is_meta:
# meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here
module.weight = torch.nn.parameter.Parameter(
data=torch.empty_like(module.weight.data, device="cpu"),
requires_grad=module.weight.data.requires_grad,
)
if "query_key_value" in prefix:
module.weight = mp_replace.strided_copy(
module.weight.data, state_dict[prefix + "weight"], num_splits=3
)
else:
module.weight = mp_replace.copy(module.weight.data, state_dict[prefix + "weight"])
else:
if hasattr(module, "norm") and hasattr(module.norm, "weight"):
if module.norm.weight.data.is_meta:
# meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here
module.norm.weight = torch.nn.parameter.Parameter(
data=torch.empty_like(module.norm.weight.data, device="cpu"),
requires_grad=module.norm.weight.data.requires_grad,
)
module.norm.weight = mp_replace.copy(module.norm.weight.data, state_dict[prefix + "weight"])
if prefix + "bias" in state_dict.keys():
if hasattr(module, "bias"):
if module.bias.data.is_meta:
# meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here
module.bias = torch.nn.parameter.Parameter(
data=torch.empty_like(module.bias.data, device="cpu"),
requires_grad=module.bias.data.requires_grad,
)
module.bias = mp_replace.copy(module.bias, state_dict[prefix + "bias"])
else:
if hasattr(module, "norm") and hasattr(module.norm, "bias"):
if module.norm.bias.data.is_meta:
# meta tensor cannot be casted or copied to, so we need to replace it with a normal tensor here
module.norm.bias = torch.nn.parameter.Parameter(
data=torch.empty_like(module.norm.bias.data, device="cpu"),
requires_grad=module.norm.bias.data.requires_grad,
)
module.norm.bias = mp_replace.copy(module.norm.bias, state_dict[prefix + "bias"])
class AutoTP:
def __init__(self, module, all_reduce_linears, prefix, state_dict, linear_layer_setting, orig_layer_impl):
self.module = module
self.all_reduce_linears = all_reduce_linears
self.prefix = prefix
self.state_dict = state_dict
self.mp_size = None
self.mp_group = None
self.linear_layer_setting = linear_layer_setting
self.orig_layer_impl = orig_layer_impl
self.linear_policies = None
self.conv_linear_layer = False
def in_module_list(module, module_list):
for item in module_list:
if type(item).__name__ == type(module).__name__:
return True
return False
def get_module_list(model):
mlist = []
for child in model.children():
if isinstance(child, nn.ModuleList):
for module in child.children():
if not mlist:
mlist = [module]
elif not AutoTP.in_module_list(module, mlist):
mlist = mlist + [module]
else:
mlist = mlist + AutoTP.get_module_list(child)
return mlist
def supported(model):
unsupported = ["deberta", "flaubert", "fsmt", "gpt2", "led", "longformer", "xlm", "xlnet"]
model = str(model)
key = re.search(r": (.*?)Model", model)
if key is None:
key = re.search(r": (.*?)Stack", model)
if key is None:
key = re.match(r"(.*?)Model", model)
assert key is not None, "Not able to determine model policy automatically. Please provide policy."
if key.group(1).lower() in unsupported:
return False
return True
def get_layers(parent, module):
layer_list = []
for key, submodule in module._modules.items():
if isinstance(submodule, nn.Linear):
layer_list = layer_list + [parent + "." + key]
elif isinstance(submodule, nn.LayerNorm) or key == "LayerNorm" or key == "layer_norm":
layer_list = layer_list + ["ln"]
else:
layer_list = layer_list + AutoTP.get_layers(key, submodule)
return layer_list
def update_policy_list(policy_list, new_module, new_gems):
if len(policy_list):
for i, policy in enumerate(policy_list):
# if module already exists in policy, combine gems and remove duplicates
if policy[0] == type(new_module):
new_gems = set(new_gems + policy[1])
policy_list[i] = tuple([type(new_module), new_gems])
return policy_list
policy_list.append(tuple([type(new_module), new_gems]))
return policy_list
def kernel_supported(module_list):
policy = []
for plcy in replace_policies:
# instantiate a throw-away policy in order to populate the _orig_layer_class
_ = plcy(None)
if isinstance(plcy._orig_layer_class, list):
for orig_layer_class in plcy._orig_layer_class:
policy.append(orig_layer_class)
elif plcy._orig_layer_class is not None:
policy.append(plcy._orig_layer_class)
for child in module_list:
if child.__class__ in policy:
return True
return False
def tp_parser(model):
policy_list = []
module_list = []
layer_list = []
gem_list = []
module_list = AutoTP.get_module_list(model)
assert AutoTP.supported(model), (
"AutoTP not supported for model. Please use kernel injection since container policy for model exists."
if AutoTP.kernel_supported(module_list)
else "AutoTP not supported for model. Please provide policy."
)
norm_layer_name_list = ["LayerNorm", "layer_norm", "ln_1", "ln_2"]
# ln_1 , ln_2 for Qwen
for module in module_list:
for key, submodule in module._modules.items():
if isinstance(submodule, nn.Linear):
layer_list = layer_list + ["." + key]
elif isinstance(submodule, nn.LayerNorm) or key in norm_layer_name_list:
layer_list = layer_list + ["ln"]
else:
layer_list = layer_list + AutoTP.get_layers(key, submodule)
for i, layer in enumerate(layer_list):
if layer == "ln":
if layer_list[i - 1] != "ln":
gem_list = gem_list + [layer_list[i - 1]]
elif "out_proj" in layer:
gem_list = gem_list + [layer]
elif "o_proj" in layer:
gem_list = gem_list + [layer]
elif "down_proj" in layer:
gem_list = gem_list + [layer]
elif "attention.dense" in layer and "GPTNeoX" in str(model):
gem_list = gem_list + [layer]
elif "self_attention.dense" in layer and "falcon" in str(
type(module)
): # this is a hack to get the right linear layer for this model!
gem_list = gem_list + [layer]
# Mixtral-7x8b used w2*act(w1*w3) linear. need to replace w2 to linearallreduce.
elif "w2" in layer and "mixtral" in str(type(module)):
gem_list = gem_list + [layer]
layer_list = []
if gem_list != []:
gem_list = list(set(gem_list))
policy_list = AutoTP.update_policy_list(policy_list, module, gem_list)
gem_list = []
assert len(policy_list), (
"AutoTP not supported for model. Please use kernel injection since container policy for model exists."
if AutoTP.kernel_supported(module_list)
else "Not able to determine model policy automatically. Please provide policy."
)
return policy_list
def set_tensor_parallel_config(self, mp_size, mp_group):
self.mp_size = mp_size
self.mp_group = mp_group
def _replace(self, child, name, conv_linear_layer):
if getattr(child, "replaced", False):
return
weight_shape = child.weight.shape
mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group)
if name in self.all_reduce_linears:
# if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size]
# else [weight_shape[0], weight_shape[1] // mp_size]
if self.conv_linear_layer:
child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
data = child.weight.data.split(
get_shard_size_list(weight_shape[0] if self.conv_linear_layer else weight_shape[1], self.mp_size, name),
dim=1,
)
data_dc = move(data[mp_replace.gpu_index], get_accelerator().current_device_name()).detach()
del data
setattr(child, "replaced", True)
if name == "lm_head" or name == "embed_out":
return LmHeadLinearAllreduce(
torch.nn.parameter.Parameter(data_dc, requires_grad=False),
dist.get_rank(),
dist.get_world_size(),
(
child.bias
if child.bias is None
else torch.nn.parameter.Parameter(move(child.bias, get_accelerator().current_device_name()))
),
self.mp_group,
)
return LinearAllreduce(
torch.nn.parameter.Parameter(data_dc, requires_grad=False),
(
child.bias
if child.bias is None
else torch.nn.parameter.Parameter(move(child.bias, get_accelerator().current_device_name()))
),
self.mp_group,
)
else:
# if conv_linear_layer [weight_shape[1], weight_shape[0] // mp_size]
# else [weight_shape[0] // mp_size, weight_shape[1]]
if self.conv_linear_layer:
child.weight.data = child.weight.data.transpose(-1, -2).contiguous()
if require_tp_fused_qkvw(name, self.mp_size):
# Check and handle fused qkv for TP
# The copy is a regular copy, The shape of dst and src is the same
data_dc = move(
prepare_tp_fused_qkvw(self.module, child.weight.data, self.mp_size, mp_replace.gpu_index),
get_accelerator().current_device_name(),
)
bias_data_dc = (
None
if child.bias is None
else move(
prepare_tp_fused_qkvw(self.module, child.bias.data, self.mp_size, mp_replace.gpu_index),
get_accelerator().current_device_name(),
)
)
else:
data = child.weight.data.split(
get_shard_size_list(weight_shape[0], self.mp_size, name), dim=1 if self.conv_linear_layer else 0
)
data_dc = move(data[mp_replace.gpu_index], get_accelerator().current_device_name()).detach()
del data
if child.bias is not None:
bias_data = child.bias.data.split(
get_shard_size_list(
weight_shape[1] if self.conv_linear_layer else weight_shape[0], self.mp_size, name
),
dim=0,
)
bias_data = move(bias_data[mp_replace.gpu_index], get_accelerator().current_device_name())
bias_data_dc = torch.nn.parameter.Parameter(bias_data, requires_grad=False)
del bias_data
else:
bias_data_dc = None
setattr(child, "replaced", True)
return LinearLayer(weight=torch.nn.parameter.Parameter(data_dc, requires_grad=False), bias=bias_data_dc)
def _slice_embedding(self, child, name, conv_linear_layer):
if getattr(child, "replaced", False):
return
mp_replace = ReplaceWithTensorSlicing(mp_group=self.mp_group)
if hasattr(child.weight, "ds_tensor"):
data = child.weight.ds_tensor.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size), dim=1)
else:
data = child.weight.data.split(get_shard_size_list(child.weight.shape[1], self.mp_size, name), dim=1)
data = data[mp_replace.gpu_index].to(get_accelerator().current_device_name())
data = torch.nn.parameter.Parameter(data, requires_grad=False)
new_embedding = nn.Embedding(child.weight.shape[0], get_shard_size(child.weight.shape[1], self.mp_size, name))
new_embedding.weight.data.copy_(data)
setattr(child, "replaced", True)
return new_embedding
def update_mp_params(self, child):
if getattr(child, "replaced", False):
return
for param in [
"n_heads",
"inner_dim",
"num_heads",
"num_kv",
"num_attention_heads",
"num_attn_heads",
"all_head_size",
"embed_dim",
"hidden_size",
"num_key_value_heads",
"num_kv_heads",
"kv_n_heads",
"d_model",
]:
if hasattr(child, param):
param_val = getattr(child, param)
setattr(child, param, get_shard_size(param_val, self.mp_size))
setattr(child, "replaced", True)
def update_linear_policies(self):
self.conv_linear_layer = False
if self.linear_layer_setting is not None:
self.linear_policies = {self.linear_layer_setting[0]: self._replace}
if len(self.linear_layer_setting) == 2:
self.linear_policies.update({self.linear_layer_setting[1]: self._slice_embedding})
else:
import transformers
if self.orig_layer_impl is transformers.models.gpt2.modeling_gpt2.GPT2Block:
try:
self.conv_linear_layer = True
self.linear_policies = {transformers.pytorch_utils.Conv1D: self._replace}
except ImportError:
self.linear_policies = {nn.Linear: self._replace}
else:
self.linear_policies = {nn.Linear: self._replace, nn.Embedding: self._slice_embedding}
def _replace_module(self, r_module, prev_name="", prev_class_name=""):
for name, child in r_module.named_children():
if prev_class_name == "":
class_name = prev_name
elif prev_name == "":
class_name = prev_class_name
else:
class_name = prev_class_name + "." + prev_name
checking_key = (
self.prefix + "." + class_name + "." + name + "."
if class_name != ""
else self.prefix + "." + name + "."
)
if Loading.is_load_module(child) and self.state_dict is not None:
if any(checking_key in item for item in self.state_dict):
Loading.load(child, self.state_dict, checking_key, self.mp_group)
else:
continue
if len(child._buffers) != 0 and self.state_dict is not None:
Loading.load_buffer(child, self.state_dict, checking_key)
if child.__class__ in self.linear_policies:
setattr(
r_module,
name,
self.linear_policies[child.__class__](child, prev_name + "." + name, self.conv_linear_layer),
)
elif any(isinstance(child, lp) for lp in self.linear_policies):
# Added for falcon model support
# Note: isinstance will account for class inheritance, child.__class__ does not
key = None
for lp in self.linear_policies:
if isinstance(child, lp):
key = lp
break
assert key is not None
setattr(
r_module, name, self.linear_policies[key](child, prev_name + "." + name, self.conv_linear_layer)
)
else:
self.update_mp_params(child)
self._replace_module(child, name, class_name)
return r_module
def get_model_num_kv_heads(self, config):
num_kv_heads = None
kv_head_names = ["num_kv_heads", "num_key_value_heads", "num_attention_heads", "n_heads", "attention_heads"]
for name in kv_head_names:
if hasattr(config, name):
num_kv_heads = getattr(config, name)
if num_kv_heads is not None:
break
return num_kv_heads
def _replace_last_linear_module(self, r_module):
if hasattr(r_module, "lm_head"):
name = "lm_head"
child = r_module.lm_head
elif hasattr(r_module, "embed_out"):
name = "embed_out"
child = r_module.embed_out
else:
return r_module
if child.__class__ in self.linear_policies:
setattr(r_module, name, self.linear_policies[child.__class__](child, name, self.conv_linear_layer))
return r_module
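
For orientation, the `_replace` path above applies the standard tensor-parallel slicing rule: weights of layers listed in `all_reduce_linears` (row-parallel, wrapped in `LinearAllreduce`) are split along the input dimension, while all other linears (column-parallel, wrapped in `LinearLayer`) are split along the output dimension. Below is a minimal illustrative sketch of that rule, independent of DeepSpeed (the helper name is hypothetical; the real code uses `get_shard_size_list` so shards can be uneven and head-aligned):
```python
# Illustrative only: the per-rank slicing rule AutoTP applies, shown with plain torch.chunk.
import torch

def shard_linear_weight(weight: torch.Tensor, rank: int, mp_size: int, all_reduce: bool) -> torch.Tensor:
    if all_reduce:
        # Row-parallel (LinearAllreduce): split the input dimension; partial outputs
        # are summed with an all-reduce at runtime.
        return torch.chunk(weight, mp_size, dim=1)[rank].contiguous()
    # Column-parallel (LinearLayer): split the output dimension.
    return torch.chunk(weight, mp_size, dim=0)[rank].contiguous()

w = torch.randn(8, 4)  # [out_features, in_features]
print(shard_linear_weight(w, rank=0, mp_size=2, all_reduce=False).shape)  # torch.Size([4, 4])
print(shard_linear_weight(w, rank=0, mp_size=2, all_reduce=True).shape)   # torch.Size([8, 2])
```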

View File

@@ -0,0 +1,160 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import json
import os
from pathlib import Path
import torch
from huggingface_hub import list_repo_files, snapshot_download
from transformers import modeling_utils
from transformers.utils import is_offline_mode
def get_repo_root(model_name_or_path, local_rank=-1, token=None):
"""Downloads the specified model checkpoint and returns the repository where it was downloaded."""
if Path(model_name_or_path).is_dir():
# If it is a local model, no need to download anything
return model_name_or_path
else:
# Checks if online or not
if is_offline_mode():
if local_rank == 0:
print("Offline mode: forcing local_files_only=True")
# Only download PyTorch weights by default
if any(
".safetensors" in filename for filename in list_repo_files(model_name_or_path, token=token)
): # Some models like Falcon-180b are in only safetensors format
allow_patterns = ["*.safetensors"]
elif any(".bin" in filename for filename in list_repo_files(model_name_or_path, token=token)):
allow_patterns = ["*.bin"]
else:
raise TypeError("Only PyTorch models are supported")
# Download only on first process
if local_rank in [-1, 0]:
cache_dir = snapshot_download(
model_name_or_path,
local_files_only=is_offline_mode(),
cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
allow_patterns=allow_patterns,
max_workers=16,
token=token,
)
if local_rank == -1:
# If there is only one process, then the method is finished
return cache_dir
# Make all processes wait so that other processes can get the checkpoint directly from cache
if torch.distributed.is_initialized():
torch.distributed.barrier()
return snapshot_download(
model_name_or_path,
local_files_only=is_offline_mode(),
cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
allow_patterns=allow_patterns,
token=token,
)
def get_checkpoint_files(model_name_or_path, local_rank, token=None):
cached_repo_dir = get_repo_root(model_name_or_path, local_rank=local_rank, token=token)
# Extensions: .bin | .safetensors | .pt
# Creates a list of paths from all downloaded files in cache dir
if any(file.suffix == ".bin" for file in Path(cached_repo_dir).rglob("*")):
(name, ext) = os.path.splitext(modeling_utils.WEIGHTS_NAME)
elif any(file.suffix == ".safetensors" for file in Path(cached_repo_dir).rglob("*")):
(name, ext) = os.path.splitext(modeling_utils.SAFE_WEIGHTS_NAME)
else:
(name, ext) = ("*", ".pt")
file_list = [
str(entry)
for entry in Path(cached_repo_dir).rglob("*")
if (entry.is_file() and entry.name.startswith(name) and entry.name.endswith(ext))
]
return file_list
def write_checkpoints_json(model_name_or_path, local_rank, f, token=None):
"""Dumps metadata into a JSON file for DeepSpeed-inference."""
checkpoint_files = get_checkpoint_files(model_name_or_path, local_rank, token)
data = {"type": "ds_model", "checkpoints": checkpoint_files, "version": 1.0}
json.dump(data, f)
f.flush()
def model_on_meta(config):
"""Checks if load the model to meta."""
# return config.model_type in ["bloom", "llama", "falcon", "mixtral", "qwen2", "mllama"]
return config.model_type in ["bloom", "llama", "falcon", "mixtral", "qwen2"]
def get_optimized_model_name(config):
from .transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES
for model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES:
if model_type == config.model_type:
return model_type
return None
def model_is_optimized(config):
"""Checks if the given config belongs to a model in optimum/habana/transformers/models, which has a
new input token_idx."""
return get_optimized_model_name(config) is not None
def get_ds_injection_policy(config):
model_type = get_optimized_model_name(config)
policy = {}
if model_type:
if model_type == "bloom":
from transformers.models.bloom.modeling_bloom import BloomBlock
policy = {BloomBlock: ("self_attention.dense", "mlp.dense_4h_to_h")}
if model_type == "opt":
from transformers.models.opt.modeling_opt import OPTDecoderLayer
policy = {OPTDecoderLayer: ("self_attn.out_proj", ".fc2")}
if model_type == "gpt2":
from transformers.models.gpt2.modeling_gpt2 import GPT2MLP
policy = {GPT2MLP: ("attn.c_proj", "mlp.c_proj")}
if model_type == "gptj":
from transformers.models.gptj.modeling_gptj import GPTJBlock
policy = {GPTJBlock: ("attn.out_proj", "mlp.fc_out")}
if model_type == "gpt_neox":
from transformers.models.gpt_neox.modeling_gpt_neox import GPTNeoXLayer
policy = {GPTNeoXLayer: ("attention.dense", "mlp.dense_4h_to_h")}
if model_type == "llama":
from transformers.models.llama.modeling_llama import LlamaDecoderLayer
policy = {LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")}
# if model_type == "mllama":
# #AutoTP: [(<class 'transformers.models.mllama.modeling_mllama.MllamaVisionEncoderLayer'>, {'self_attn.o_proj', 'mlp.fc2'}), (<class 'transformers.models.mllama.modeling_mllama.MllamaSelfAttentionDecoderLayer'>, ['self_attn.o_proj', 'mlp.down_proj']), (<class 'transformers.models.mllama.modeling_mllama.MllamaCrossAttentionDecoderLayer'>, ['cross_attn.o_proj', 'mlp.down_proj'])]
# from transformers.models.mllama.modeling_mllama import MllamaVisionEncoderLayer, MllamaSelfAttentionDecoderLayer, MllamaCrossAttentionDecoderLayer
# policy = {MllamaSelfAttentionDecoderLayer: ("self_attn.o_proj", "mlp.down_proj"), MllamaCrossAttentionDecoderLayer: ('cross_attn.o_proj', 'mlp.down_proj'), MllamaVisionEncoderLayer: ('self_attn.o_proj', 'mlp.fc2')}
# policy = {MllamaSelfAttentionDecoderLayer: ("self_attn.o_proj", "mlp.down_proj"), MllamaCrossAttentionDecoderLayer: ('cross_attn.o_proj', 'mlp.down_proj')}
if model_type == "mistral":
from transformers.models.mistral.modeling_mistral import MistralDecoderLayer
policy = {MistralDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")}
return policy
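
A rough sketch of how these helpers are typically consumed, assuming they are importable and that the meta-device loading pattern plus DeepSpeed's `injection_policy`/`checkpoint` keywords (as used by optimum-habana and mirrored by the TP serving script later in this diff) apply; the model id and world size are placeholders:
```python
# Consumption sketch under the assumptions above; not code from this PR.
import os
import torch
import deepspeed
from transformers import AutoConfig, AutoModelForCausalLM

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder; "llama" is in the model_on_meta list above
config = AutoConfig.from_pretrained(model_id)
assert model_on_meta(config)

# Build only the module structure on the meta device; DeepSpeed streams the real
# weights from the files listed in the checkpoints JSON written below.
with deepspeed.OnDevice(dtype=torch.bfloat16, device="meta"):
    model = AutoModelForCausalLM.from_config(config, torch_dtype=torch.bfloat16)

with open("checkpoints.json", "w") as f:
    write_checkpoints_json(model_id, local_rank=0, f=f)

model = deepspeed.init_inference(
    model,
    mp_size=int(os.getenv("WORLD_SIZE", "1")),
    dtype=torch.bfloat16,
    checkpoint="checkpoints.json",
    injection_policy=get_ds_injection_policy(config),
)
```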

View File

@@ -0,0 +1,27 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3.8"
services:
llm:
image: opea/lvm-llama-vision:latest
container_name: lvm-llama-vision-server
ports:
- "9399:9399"
runtime: habana
cap_add:
- SYS_NICE
ipc: host
environment:
no_proxy: ${no_proxy}
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
LLAMA_VISION_MODEL_ID: ${LLAMA_VISION_MODEL_ID}
HABANA_VISIBLE_DEVICES: all
OMPI_MCA_btl_vader_single_copy_mechanism: none
restart: unless-stopped
networks:
default:
driver: bridge

View File

@@ -0,0 +1,108 @@
# coding=utf-8
# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""A simple launcher script for distributed training on HPUs.
Single node:
::
>>> python gaudi_spawn.py --world_size=NUM_CARDS_YOU_HAVE --use_mpi
YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
arguments of your training script)
Multi node:
::
>>> python gaudi_spawn.py --hostfile=PATH_TO_HOSTFILE --use_deepspeed
YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
arguments of your training script)
"""
import sys
from argparse import REMAINDER, ArgumentParser
from optimum.habana.distributed import DistributedRunner
from optimum.utils import logging
logger = logging.get_logger(__name__)
def parse_args():
"""Helper function parsing the command line options.
@retval ArgumentParser
"""
parser = ArgumentParser(
description=(
"Habana Gaudi distributed training launch helper utility that will spawn up multiple distributed"
" processes."
)
)
# Optional arguments for the launch helper
parser.add_argument("--world_size", type=int, default=1, help="Number of HPUs to use (1 or 8)")
parser.add_argument("--hostfile", type=str, default=None, help="Path to the file where hosts are specified.")
parser.add_argument("--use_mpi", action="store_true", help="Use MPI for distributed training")
parser.add_argument("--use_deepspeed", action="store_true", help="Use DeepSpeed for distributed training")
parser.add_argument("--master_port", type=int, default=29500, help="Master port used by DeepSpeed and MPI")
# positional
parser.add_argument(
"training_script",
type=str,
help=(
"The full path to the single HPU training "
"program/script to be launched in parallel, "
"followed by all the arguments for the "
"training script."
),
)
# rest from the training program
parser.add_argument("training_script_args", nargs=REMAINDER)
return parser.parse_args()
def main():
args = parse_args()
if args.use_deepspeed:
from transformers.integrations.deepspeed import is_deepspeed_available
if not is_deepspeed_available():
raise ImportError(
"--use_deepspeed requires deepspeed: `pip install"
" git+https://github.com/HabanaAI/DeepSpeed.git@1.17.0`."
)
# Patch sys.argv
sys.argv = [args.training_script] + args.training_script_args
# Handle the case where arguments contain whitespaces
argv = ['"{}"'.format(arg) if " " in arg and arg[0] != '"' and arg[-1] != '"' else arg for arg in sys.argv]
command_list = [" ".join(argv)]
distributed_runner = DistributedRunner(
command_list=command_list,
world_size=args.world_size,
hostfile=args.hostfile,
use_mpi=args.use_mpi,
use_deepspeed=args.use_deepspeed,
master_port=args.master_port,
)
ret_code = distributed_runner.run()
sys.exit(ret_code)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,103 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import base64
import os
import threading
import time
from io import BytesIO
from typing import Union
import habana_frameworks.torch as htorch
import requests
import torch
from PIL import Image
from transformers import AutoModelForVision2Seq, AutoProcessor
from comps import (
CustomLogger,
LVMDoc,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("lvm-llama-vision-native")
logflag = os.getenv("LOGFLAG", False)
initialization_lock = threading.Lock()
initialized = False
def initialize():
global model, processor, initialized
with initialization_lock:
if not initialized:
import habana_frameworks.torch.hpu as torch_hpu
model_id = os.getenv("LLAMA_VISION_MODEL_ID", "meta-llama/Llama-3.2-11B-Vision-Instruct")
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
model = AutoModelForVision2Seq.from_pretrained(
model_id, device_map="hpu", torch_dtype=torch.bfloat16, token=huggingface_token
)
processor = AutoProcessor.from_pretrained(model_id, token=huggingface_token)
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "If I had to write a haiku for this one, it would be: "},
],
}
]
url = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(url, stream=True).raw)
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(raw_image, prompt, return_tensors="pt").to(model.device)
prompt_len = len(inputs["input_ids"][0])
output = model.generate(**inputs, pad_token_id=0, max_new_tokens=32)
generated_tokens = output[:, prompt_len:]
logger.info(processor.decode(generated_tokens[0], skip_special_tokens=True))
initialized = True
logger.info("[LVM] Llama Vision LVM initialized.")
@register_microservice(
name="opea_service@lvm_llama_vision_native",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9399,
)
@register_statistics(names=["opea_service@lvm_llama_vision_native"])
async def lvm(request: Union[LVMDoc]) -> Union[TextDoc]:
initialize()
if logflag:
logger.info(request)
start = time.time()
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
image_data = base64.b64decode(img_b64_str)
image_stream = BytesIO(image_data)
raw_image = Image.open(image_stream)
inputs = processor(raw_image, text, return_tensors="pt").to(model.device)
prompt_len = len(inputs["input_ids"][0])
output = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
generated_tokens = output[:, prompt_len:]
result = processor.decode(generated_tokens[0], skip_special_tokens=True)
statistics_dict["opea_service@lvm_llama_vision_native"].append_latency(time.time() - start, None)
if logflag:
logger.info(result)
return TextDoc(text=result)
if __name__ == "__main__":
opea_microservices["opea_service@lvm_llama_vision_native"].start()

View File

@@ -0,0 +1,129 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import base64
import os
import threading
import time
from enum import Enum
from io import BytesIO
from typing import Union
import habana_frameworks.torch as htorch
import requests
import torch
from PIL import Image
from prompt_format_utils import LlamaGuardVersion, build_default_prompt, create_conversation
from transformers import AutoModelForVision2Seq, AutoProcessor
from comps import (
CustomLogger,
LVMDoc,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("lvm-llama-vision-guard-native")
logflag = os.getenv("LOGFLAG", False)
initialization_lock = threading.Lock()
initialized = False
class AgentType(Enum):
AGENT = "Agent"
USER = "User"
def initialize():
global model, processor, initialized
with initialization_lock:
if not initialized:
import habana_frameworks.torch.hpu as torch_hpu
model_id = os.getenv("LLAMA_VISION_GUARD_MODEL_ID", "meta-llama/Llama-Guard-3-11B-Vision")
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
model = AutoModelForVision2Seq.from_pretrained(
model_id, device_map="hpu", torch_dtype=torch.bfloat16, token=huggingface_token
)
processor = AutoProcessor.from_pretrained(model_id, token=huggingface_token)
url = "https://llava-vl.github.io/static/images/view.jpg"
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": "If I had to write a haiku for this one, it would be: "},
{
"type": "image",
},
],
}
]
input_prompt = processor.apply_chat_template(conversation, return_tensors="pt")
raw_image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(text=input_prompt, images=raw_image, return_tensors="pt").to(model.device)
prompt_len = len(inputs["input_ids"][0])
output = model.generate(**inputs, pad_token_id=0, max_new_tokens=32)
generated_tokens = output[:, prompt_len:]
logger.info(processor.decode(generated_tokens[0], skip_special_tokens=True))
initialized = True
logger.info("[LVM] Llama Vision GUARD LVM initialized.")
@register_microservice(
name="opea_service@lvm_llama_vision_guard_native",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9499,
)
@register_statistics(names=["opea_service@lvm_llama_vision_guard_native"])
async def lvm(request: Union[LVMDoc]) -> Union[TextDoc]:
initialize()
if logflag:
logger.info(request)
start = time.time()
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
llama_guard_version = "LLAMA_GUARD_3"
prompts = [(prompt, AgentType.USER)]
for prompt in prompts:
formatted_prompt = build_default_prompt(prompt[1], create_conversation([prompt[0]]), llama_guard_version)
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": formatted_prompt},
{
"type": "image",
},
],
}
]
input_prompt = processor.apply_chat_template(conversation, return_tensors="pt")
image_data = base64.b64decode(img_b64_str)
image_stream = BytesIO(image_data)
raw_image = Image.open(image_stream)
inputs = processor(text=input_prompt, images=raw_image, return_tensors="pt").to(model.device)
prompt_len = len(inputs["input_ids"][0])
output = model.generate(**inputs, do_sample=False, max_new_tokens=max_new_tokens)
generated_tokens = output[:, prompt_len:]
result = processor.decode(generated_tokens[0], skip_special_tokens=True)
statistics_dict["opea_service@lvm_llama_vision_guard_native"].append_latency(time.time() - start, None)
if logflag:
logger.info(result)
return TextDoc(text=result)
if __name__ == "__main__":
opea_microservices["opea_service@lvm_llama_vision_guard_native"].start()
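
The guard model follows the output contract described in `prompt_format_utils.py` ("safe" or "unsafe" on the first line, a comma-separated category list on the second). A small, hypothetical helper for consumers of this service that want a structured verdict rather than raw text might look like this:
```python
# Hypothetical post-processing sketch; not part of this PR.
def parse_guard_verdict(text: str) -> dict:
    """Turn the raw Llama Guard completion into {"safe": bool, "categories": [...]}."""
    lines = [line.strip() for line in text.strip().splitlines() if line.strip()]
    if not lines:
        return {"safe": False, "categories": []}  # conservative default for empty output
    is_safe = lines[0].lower() == "safe"
    categories = [] if is_safe or len(lines) < 2 else [c.strip() for c in lines[1].split(",")]
    return {"safe": is_safe, "categories": categories}

print(parse_guard_verdict("unsafe\nS1,S10"))  # {'safe': False, 'categories': ['S1', 'S10']}
```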

View File

@@ -0,0 +1,74 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import json
import os
import time
from concurrent import futures
from typing import Union
import requests
from comps import (
CustomLogger,
LVMDoc,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("lvm-llama-vision-tp-native")
logflag = os.getenv("LOGFLAG", False)
@register_microservice(
name="opea_service@lvm_llama_vision_tp_native",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9599,
)
@register_statistics(names=["opea_service@lvm_llama_vision_tp_native"])
async def lvm(request: Union[LVMDoc]) -> Union[TextDoc]:
if logflag:
logger.info(request)
start = time.time()
# Initialize responses list
responses = []
# Function to send requests to individual TP workers
def send_request_to_tp_worker(port):
try:
# Build the worker URL dynamically
url = f"http://127.0.0.1:{port}/v1/lvm_serve"
# Send POST request to the TP worker
response = requests.post(url, json=request.dict())
response.raise_for_status() # Ensure the request was successful
# Parse and process the response
json_response = response.json()
responses.append(TextDoc(text=json_response.get("text", "")))
except requests.exceptions.RequestException as e:
# Log any errors that occur
logger.error(f"Error sending request to TP worker on port {port}: {e}")
return None
# Distribute work across TP workers using ThreadPoolExecutor
with futures.ThreadPoolExecutor(max_workers=4) as executor:
# TP worker ports (e.g., worker processes listen on sequential ports)
worker_ports = [9393 + i + 1 for i in range(4)]
# Map the `send_request_to_tp_worker` function to each worker port
executor.map(send_request_to_tp_worker, worker_ports)
statistics_dict["opea_service@lvm_llama_vision_tp_native"].append_latency(time.time() - start, None)
if responses:
return responses[0]
if __name__ == "__main__":
opea_microservices["opea_service@lvm_llama_vision_tp_native"].start()

View File

@@ -0,0 +1,212 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import base64
import json
import os
import threading
import time
from io import BytesIO
from pathlib import Path
from typing import Union
import deepspeed
import deepspeed.comm as dist
import requests
import torch
import uvicorn
from fastapi import FastAPI
from huggingface_hub import snapshot_download
from PIL import Image
from starlette.middleware.cors import CORSMiddleware
from transformers import AutoProcessor, MllamaForConditionalGeneration
from transformers.utils import is_offline_mode
from comps import CustomLogger, LVMDoc, TextDoc
app = FastAPI(title="NeuralChat Gaudi Serving Process", description="Serving", version="0.0.1")
logger = CustomLogger("lvm-llama-vision-tp")
logflag = os.getenv("LOGFLAG", False)
model = None
model_id = os.getenv("MODEL_ID", "/workspace/models/final_weights/Llama-3.2-90B-Vision-Instruct")
processor = None
initialization_lock = threading.Lock()
initialized = False
local_rank = 0
app.add_middleware(
CORSMiddleware,
allow_origins=["*"], # Adjust this to restrict origins as necessary
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
def print_rank0(*msg):
if local_rank != 0:
return
print(*msg)
def get_repo_root(model_name_or_path):
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if os.path.exists(model_name_or_path):
# local path
return model_name_or_path
# checks if online or not
if is_offline_mode():
print_rank0("Offline mode: forcing local_files_only=True")
# download only on first process
allow_patterns = ["*.bin", "*.model", "*.json", "*.txt", "*.py", "*LICENSE"]
if local_rank == 0:
snapshot_download(
model_name_or_path,
local_files_only=is_offline_mode(),
cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
allow_patterns=allow_patterns,
token=huggingface_token,
# ignore_patterns=["*.safetensors"],
)
dist.barrier()
return snapshot_download(
model_name_or_path,
local_files_only=is_offline_mode(),
cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
allow_patterns=allow_patterns,
token=huggingface_token,
# ignore_patterns=["*.safetensors"],
)
def get_checkpoint_files(model_name_or_path):
cached_repo_dir = get_repo_root(model_name_or_path)
# extensions: .bin | .pt
# creates a list of paths from all downloaded files in cache dir
file_list = [
str(entry)
for entry in Path(cached_repo_dir).rglob("*.[sbp][ait][fn][e][t][e][n][s][o][r][s]")
if entry.is_file()
]
print(file_list)
return file_list
def write_checkpoints_json(local_rank, checkpoints_json):
checkpoint_files = get_checkpoint_files(model_id)
if local_rank == 0:
# model.config.model_type.upper()
data = {"type": "BLOOM", "checkpoints": checkpoint_files, "version": 1.0}
json.dump(data, open(checkpoints_json, "w"))
def get_int_from_env(env_keys, default):
"""Returns the first positive env value found in the `env_keys` list or the default."""
for e in env_keys:
val = int(os.environ.get(e, -1))
if val >= 0:
return val
return default
def generate(prompt, raw_image, max_new_tokens=32):
if logflag:
logger.info(f"[lvm tp serve] start to generate text with {prompt}")
inputs = processor(raw_image, prompt, return_tensors="pt").to(torch.device("hpu"))
prompt_len = len(inputs["input_ids"][0])
output = model.generate(**inputs, max_new_tokens=max_new_tokens)
generated_tokens = output[:, prompt_len:]
result = processor.decode(generated_tokens[0], skip_special_tokens=True)
if logflag:
logger.info(f"[lvm tp serve] text generated: {result}")
return result
def initialize():
global model, processor, initialized, local_rank
if logflag:
logger.info("[lvm tp serve] start to initialize model and processor")
initialized = True
huggingface_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
model = MllamaForConditionalGeneration.from_pretrained(
model_id, torch_dtype=torch.bfloat16, device_map="auto", token=huggingface_token
)
processor = AutoProcessor.from_pretrained(model_id, token=huggingface_token)
deepspeed.init_distributed(dist_backend="hccl")
local_rank = get_int_from_env(["LOCAL_RANK", "MPI_LOCALRANKID"], "0")
world_size = get_int_from_env(["WORLD_SIZE", "PMI_SIZE"], "1")
if logflag:
logger.info(f"[lvm tp serve] local rank: {local_rank}, world size: {world_size}")
repo_root = get_repo_root(model_id)
checkpoints_json = "checkpoints.json"
write_checkpoints_json(local_rank, checkpoints_json)
if logflag:
logger.info("[lvm tp serve] checkpoint json written")
# sleep for 10 seconds to avoid multi-process conflict
time.sleep(10)
model = deepspeed.init_inference(
model,
mp_size=world_size,
base_dir=repo_root,
dtype=torch.bfloat16,
checkpoint=checkpoints_json,
)
if logflag:
logger.info(model)
logger.info("[lvm tp serve] model initialized")
# warm up model
if logflag:
logger.info("[lvm tp serve] start to warm up model")
warmup = 3
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "If I had to write a haiku for this one, it would be: "},
],
}
]
input_text = processor.apply_chat_template(messages, add_generation_prompt=True)
url = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(url, stream=True).raw)
for i in range(warmup):
if logflag:
logger.info(f"[lvm tp serve] warming up iteration {i}")
generate(input_text, raw_image)
@app.post("/v1/lvm_serve")
async def lvm_tp_endpoint(input: Union[LVMDoc]) -> Union[TextDoc]:
if logflag:
logger.info(input)
img_b64_str = input.image
prompt = input.prompt
max_new_tokens = input.max_new_tokens
messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
text = processor.apply_chat_template(messages, add_generation_prompt=True)
image_data = base64.b64decode(img_b64_str)
image_stream = BytesIO(image_data)
raw_image = Image.open(image_stream)
result = generate(text, raw_image, max_new_tokens)
if logflag:
logger.info(f"res: {result}")
return TextDoc(text=result)
if __name__ == "__main__":
initialize()
process_port = 9393 + local_rank + 1
try:
uvicorn.run(app, host="localhost", port=process_port)
except Exception as e:
print(f"Error starting uvicorn: {str(e)}")

View File

@@ -0,0 +1,380 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Meta Platforms, Inc. and affiliates.
# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
from dataclasses import dataclass
from enum import Enum
from string import Template
from typing import List
class LlamaGuardVersion(Enum):
LLAMA_GUARD_1 = "Llama Guard 1"
LLAMA_GUARD_2 = "Llama Guard 2"
LLAMA_GUARD_3 = "Llama Guard 3"
class AgentType(Enum):
AGENT = "Agent"
USER = "User"
@dataclass
class SafetyCategory:
name: str
description: str
@dataclass
class ConversationTurn:
message: str
agent_type: AgentType
LLAMA_GUARD_1_CATEGORY_SHORT_NAME_PREFIX = "O"
LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX = "S"
LLAMA_GUARD_3_CATEGORY_SHORT_NAME_PREFIX = "S"
PROMPT_TASK = "Task: Check if there is unsafe content in '$agent_type' messages in conversations according our safety policy with the below categories."
SAFETY_CATEGORIES = """
<BEGIN UNSAFE CONTENT CATEGORIES>
$categories
<END UNSAFE CONTENT CATEGORIES>
"""
PROMPT_CONVERSATION = """
<BEGIN CONVERSATION>
$conversations
<END CONVERSATION>
"""
PROMPT_INSTRUCTIONS_1 = """
Provide your safety assessment for $agent_type in the above conversation:
- First line must read 'safe' or 'unsafe'.
- If unsafe, a second line must include a comma-separated list of violated categories."""
PROMPT_INSTRUCTIONS_2 = """
Provide your safety assessment for ONLY THE LAST $agent_type message in the above conversation:
- First line must read 'safe' or 'unsafe'.
- If unsafe, a second line must include a comma-separated list of violated categories."""
PROMPT_TEMPLATE_1 = Template(
f"[INST] {PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS_1} [/INST]"
)
PROMPT_TEMPLATE_2 = Template(
f"[INST] {PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS_2} [/INST]"
)
PROMPT_TEMPLATE_3 = Template(
f"<|begin_of_text|><|start_header_id|>user<|end_header_id|> {PROMPT_TASK}{SAFETY_CATEGORIES}{PROMPT_CONVERSATION}{PROMPT_INSTRUCTIONS_2} <|eot_id|><|start_header_id|>assistant<|end_header_id|>"
)
LLAMA_GUARD_1_CATEGORY = [
SafetyCategory(
"Violence and Hate.",
"""Should not
- Help people plan or engage in violence.
- Encourage people to engage in violence.
- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their race, color, religion, national origin, sexual orientation, gender, gender identity, or disability.
- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
Can
- Provide information on violence and discrimination.
- Discuss the topics of hate and violence, and can discuss historical events involving violence.""",
),
SafetyCategory(
"Sexual Content.",
"""Should not
- Engage in sexually explicit (i.e., erotic) conversations and should not encourage people to engage in sexual activities.
Can
- Discuss topics related to sexuality and sexual education.
- Provide general information regarding sexual activities.
- Provide guidance on topics related to sexual health.""",
),
SafetyCategory(
"Criminal Planning.",
"""Should not
- Help people plan or engage in criminal activities like theft, kidnapping, or financial crimes, especially when the person indicates that they may act on the information provided to them.
- Encourage people to engage in criminal activities or condone criminal behavior.
Can
- Help people understand how crimes are committed, provided that the response is not so detailed that it could be used to provide a roadmap for future crimes.
- Help people understand how systems work that are used to prevent crimes, provided those descriptions do not provide detailed information on how to defeat those systems.""",
),
SafetyCategory(
"Guns and Illegal Weapons.",
"""Should not
- Assist in the planning or execution of firearm or illegal weapons crimes, especially when the person indicates that they may act on the information provided to them.
- Encourage people to engage in firearm or illegal weapons crimes or condone such behavior.
Can
- Discuss firearms and the arguments for and against firearm ownership.""",
),
SafetyCategory(
"Regulated or Controlled Substances.",
"""Should not
- Encourage or assist people in the trafficking, consumption, or use of regulated or controlled substances, including but not limited to illegal drugs, tobacco, alcohol, and cannabis.
- Assist or encourage people to create such substances when it is illegal to do so.
Can
- Discuss the use of regulated or controlled substances, and can provide general information on those substances (ex: what their chemical composition is and what their effects are).
- Provide information on how those substances are manufactured, provided that the information is not so detailed as to provide step-by-step instructions for their creation.""",
),
SafetyCategory(
"Self-Harm.",
"""Should not
- Encourage people to harm themselves.
- Romanticize or condone self-harm or suicide.
- Provide information regarding the methods of suicide or self-harm.
- Provide any information or advice, other than information regarding appropriate health resources, to people who express or imply that they wish to harm themselves.
Can
- Provide general information regarding self-harm and suicide (ex: statistics, the mental health challenges faced by those who take these actions).
Should
- Provide information regarding appropriate health resources to people who imply or express the desire to harm themselves.""",
),
]
LLAMA_GUARD_2_CATEGORY = [
SafetyCategory(
"Violent Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of violent crimes. Examples of violent crimes include, but are not limited to:
- Unlawful violence toward people (ex: terrorism, genocide, murder, hate crimes, child abuse, assault, battery, kidnapping)
- Unlawful violence toward animals (ex: animal abuse)""",
),
SafetyCategory(
"Non-Violent Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of non-violent crimes. Examples of non-violent crimes include, but are not limited to:
- Financial crimes (ex: fraud, scams, money laundering)
- Property crimes (ex: burglary, robbery, arson, vandalism)
- Drug crimes (ex: creating or trafficking narcotics)
- Weapons crimes (ex: producing unlicensed firearms)
- Cyber crimes (ex: hacking, spyware, malware)""",
),
SafetyCategory(
"Sex Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of sex crimes. Examples of sex crimes include, but are not limited to:
- Human trafficking
- Sexual assault (ex: rape)
- Sexual harassment (ex: groping)
- Lewd conduct""",
),
SafetyCategory(
"Child Exploitation. ",
"AI models should not create content that depicts child nudity or that enables, encourages, excuses, or depicts the sexual abuse of children.",
),
SafetyCategory(
"Specialized Advice. ",
"AI models should not create content that contains specialized financial, medical, or legal advice.",
),
SafetyCategory(
"Privacy. ",
"AI models should not create content that contains sensitive, personal information about private individuals.",
),
SafetyCategory(
"Intellectual Property. ",
"AI models should not create content that violates the intellectual property rights of third parties.",
),
SafetyCategory(
"Indiscriminate Weapons. ",
"""AI models should not create content that enables the creation of weapons of mass destruction. Examples of such weapons include, but are not limited to:
- Chemical weapons (ex: nerve gas)
- Biological weapons (ex: anthrax)
- Radiological weapons (ex: salted bombs)
- Nuclear weapons (ex: atomic warheads)
- High-yield explosive weapons (ex: cluster munitions)""",
),
SafetyCategory(
"Hate. ",
"AI models should not create content that is hateful toward people on the basis of their protected characteristics (race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity and serious disease). AI models should not create content that perpetuates negative stereotypes.",
),
SafetyCategory(
"Self-Harm. ",
"""AI models should not create content that enables, encourages, or excuses acts of intentional self-harm. Examples of such actions include, but are not limited to:
- Suicide
- Self-injury (ex: cutting)
- Disordered eating""",
),
SafetyCategory(
"Sexual Content. ",
"AI models should not create depictions of nude adults or content that contains erotic descriptions or explicit depictions of sex acts.",
),
]
LLAMA_GUARD_3_CATEGORY = [
SafetyCategory(
"Violent Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of violent crimes. Examples of violent crimes include, but are not limited to:
- Unlawful violence toward people (ex: terrorism, genocide, murder, hate crimes, child abuse, assault, battery, kidnapping)
- Unlawful violence toward animals (ex: animal abuse)""",
),
SafetyCategory(
"Non-Violent Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of non-violent crimes. Examples of non-violent crimes include, but are not limited to:
- Financial crimes (ex: fraud, scams, money laundering)
- Property crimes (ex: burglary, robbery, arson, vandalism)
- Drug crimes (ex: creating or trafficking narcotics)
- Weapons crimes (ex: producing unlicensed firearms)
- Cyber crimes (ex: hacking, spyware, malware)""",
),
SafetyCategory(
"Sex Crimes. ",
"""AI models should not create content that enables, encourages, or excuses the commission of sex crimes. Examples of sex crimes include, but are not limited to:
- Human trafficking
- Sexual assault (ex: rape)
- Sexual harassment (ex: groping)
- Lewd conduct""",
),
SafetyCategory(
"Child Exploitation. ",
"AI models should not create content that depicts child nudity or that enables, encourages, excuses, or depicts the sexual abuse of children.",
),
SafetyCategory(
"Defamation. ",
"AI models should not create content about a real, living person that is verifiably false and likely to injure the person's reputation.",
),
SafetyCategory(
"Specialized Advice. ",
"AI models should not create content that contains specialized financial, medical, or legal advice.",
),
SafetyCategory(
"Privacy. ",
"AI models should not create content that contains sensitive, personal information about private individuals.",
),
SafetyCategory(
"Intellectual Property. ",
"AI models should not create content that violates the intellectual property rights of third parties.",
),
SafetyCategory(
"Indiscriminate Weapons. ",
"""I models should not create content that enables the creation of weapons of mass destruction. Examples of such weapons include, but are not limited to:
- Chemical weapons (ex: nerve gas)
- Biological weapons (ex: anthrax)
- Radiological weapons (ex: salted bombs)
- Nuclear weapons (ex: atomic warheads)
- High-yield explosive weapons (ex: cluster munitions)""",
),
SafetyCategory(
"Hate. ",
"AI models should not create content that is hateful toward people on the basis of their protected characteristics (race, color, ethnicity, national origin, disability, religious affiliation, caste, sexual orientation, sex, gender identity and serious disease). AI models should not create content that perpetuates negative stereotypes.",
),
SafetyCategory(
"Self-Harm. ",
"""AI models should not create content that enables, encourages, or excuses acts of intentional self-harm. Examples of such actions include, but are not limited to:
- Suicide
- Self-injury (ex: cutting)
- Disordered eating""",
),
SafetyCategory(
"Sexual Content. ",
"AI models should not create depictions of nude adults or content that contains erotic descriptions or explicit depictions of sex acts.",
),
SafetyCategory(
"Elections. ",
"AI models should not create content that contain factually incorrect information about electoral systems and processes, including in the time, place, or manner of voting in civic elections.",
),
SafetyCategory(
"Code Interpreter Abuse. ",
"""AI models should not create content that attempts to abuse code interpreters. Examples of code interpreter abuse include, but are not limited to:
- Denial of service attacks
- Container escapes or privilege escalation.""",
),
]
def create_conversation(messages: List[str]) -> List[ConversationTurn]:
conversations = []
for i, message in enumerate(messages):
conversations.append(
ConversationTurn(message=message, agent_type=AgentType.USER if i % 2 == 0 else AgentType.AGENT)
)
return conversations
def build_default_prompt(
agent_type: AgentType,
conversations: List[ConversationTurn],
llama_guard_version: LlamaGuardVersion = LlamaGuardVersion.LLAMA_GUARD_3,
):
if llama_guard_version == LlamaGuardVersion.LLAMA_GUARD_3:
categories = LLAMA_GUARD_3_CATEGORY
category_short_name_prefix = LLAMA_GUARD_3_CATEGORY_SHORT_NAME_PREFIX
prompt_template = PROMPT_TEMPLATE_3
elif llama_guard_version == LlamaGuardVersion.LLAMA_GUARD_2:
categories = LLAMA_GUARD_2_CATEGORY
category_short_name_prefix = LLAMA_GUARD_2_CATEGORY_SHORT_NAME_PREFIX
prompt_template = PROMPT_TEMPLATE_2
else:
categories = LLAMA_GUARD_1_CATEGORY
category_short_name_prefix = LLAMA_GUARD_1_CATEGORY_SHORT_NAME_PREFIX
prompt_template = PROMPT_TEMPLATE_1
return build_custom_prompt(agent_type, conversations, categories, category_short_name_prefix, prompt_template)
def build_custom_prompt(
agent_type: AgentType,
conversations: List[ConversationTurn],
categories: List[SafetyCategory],
category_short_name_prefix: str,
prompt_template: str,
with_policy: bool = False,
):
categories_str = "\n".join(
[
f"{category_short_name_prefix}{i+1}: {c.name}" + (f"\n{c.description}" if with_policy else "")
for i, c in enumerate(categories)
]
)
conversations_str = "\n\n".join([f"{t.agent_type.value}: {t.message}" for t in conversations])
return prompt_template.substitute(
agent_type=agent_type.value, categories=categories_str, conversations=conversations_str
)
def build_prompt_test():
print(
build_default_prompt(
AgentType.AGENT,
[
ConversationTurn("What's the color of the sky?", AgentType.USER),
ConversationTurn("The sky is blue.", AgentType.AGENT),
],
)
)
print("\n\n")
# use a customized safety category and create_conversation function.
print(
build_custom_prompt(
AgentType.AGENT,
create_conversation(["<User Prompt placeholder>", "<Agent Prompt placeholder>"]),
[
SafetyCategory(
"Violence and Hate.",
"""Should not
- Help people plan or engage in violence.
- Encourage people to engage in violence.
- Express hateful, derogatory, or demeaning sentiments against people based on sensitive personal characteristics like their race, color, religion, national origin, sexual orientation, gender, gender identity, or disability.
- Encourage discrimination against people based on those characteristics and should not use slurs or create insults based on those characteristics.
Can
- Provide information on violence and discrimination.
- Discuss the topics of hate and violence, and can discuss historical events involving violence.""",
),
],
LLAMA_GUARD_3_CATEGORY_SHORT_NAME_PREFIX,
PROMPT_TEMPLATE_3,
True,
)
)
if __name__ == "__main__":
build_prompt_test()
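For context, a minimal sketch of how a prompt built by build_default_prompt could be scored with a text-only Llama Guard checkpoint; the model id, device placement, and generation settings are illustrative assumptions and not part of this module:

# Illustrative only: run a Llama Guard 3 style prompt through a causal LM and read back the verdict.
# Assumes access to the gated checkpoint via a Hugging Face token.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

guard_id = "meta-llama/Llama-Guard-3-8B"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(guard_id)
model = AutoModelForCausalLM.from_pretrained(guard_id, torch_dtype=torch.bfloat16)

prompt = build_default_prompt(
    AgentType.AGENT,
    create_conversation(["What's the color of the sky?", "The sky is blue."]),
)
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=20, pad_token_id=tokenizer.eos_token_id)
verdict = tokenizer.decode(output[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
print(verdict)  # expected to start with "safe" or "unsafe"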

View File

@@ -0,0 +1,711 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0
# DeepSpeed Team
import gc
import os
import time
import deepspeed
import deepspeed.ops.transformer as transformer_inference
import torch
import tqdm
from deepspeed import comm as dist
from deepspeed.accelerator import get_accelerator
from deepspeed.module_inject.tp_shard import set_n_embd, set_num_kv_heads
from deepspeed.ops.transformer.inference.diffusers_2d_transformer import Diffusers2DTransformerConfig
from deepspeed.ops.transformer.inference.diffusers_attention import DeepSpeedDiffusersAttention
from deepspeed.ops.transformer.inference.diffusers_transformer_block import DeepSpeedDiffusersTransformerBlock
from .auto_tp import AutoTP, Loading, ReplaceWithTensorSlicing
from .load_checkpoint import load_model_with_checkpoint
from .replace_policy import generic_policies, replace_policies
from .utils import policy_to_ds_container
def get_transformer_name(replaced_module):
from torch.nn import ModuleList
from .containers import supported_models
transformer_name = ""
for n, c in replaced_module.named_children():
if c.__class__ in supported_models:
transformer_name += n + "."
for name, child in c.named_children():
if child.__class__ is ModuleList:
transformer_name += name
break
break
return transformer_name
class GroupQuantizer:
def __init__(self, q_int8=True, group_size=1, num_bits=8, num_groups=0):
self.group_size = group_size
self.num_bits = num_bits
self.q_int8 = q_int8
self.num_groups = num_groups
def quantize(self, inputs, qkv=True, count=1, parallel_dim=0):
if not self.q_int8 or not qkv:
inputs = torch.nn.Parameter(inputs, requires_grad=False)
inputs.scale = torch.empty(1)
return inputs
q_range = 2**self.num_bits
num_groups = self.num_groups if self.num_groups > 0 else inputs.shape[0] // self.group_size
inputs = inputs.to(get_accelerator().current_device_name())
input_flat = inputs.reshape(num_groups, -1).contiguous()
input_min = torch.min(input_flat, dim=1, keepdim=True)[0].float()
input_max = torch.max(input_flat, dim=1, keepdim=True)[0].float()
scale = torch.max(input_min.abs(), input_max.abs()) * 2.0 / (q_range)
input_flat = (input_flat / scale).round().clamp(-q_range // 2, q_range // 2 - 1)
inputs_q = input_flat.reshape(inputs.shape).to(torch.int8).contiguous()
out = torch.nn.Parameter(inputs_q, requires_grad=False)
inputs_split = inputs.split(inputs.shape[parallel_dim] // 2, dim=parallel_dim)
input_flat = [inputs_split[i].reshape(num_groups, -1).contiguous() for i in range(2)]
input_min = [torch.min(input_flat[i], dim=1, keepdim=True)[0].float() for i in range(2)]
input_max = [torch.max(input_flat[i], dim=1, keepdim=True)[0].float() for i in range(2)]
scale1 = [
(torch.max(input_min[i].abs(), input_max[i].abs()) * 2.0 / (q_range)).squeeze().unsqueeze(0)
for i in range(2)
]
out.scale = (
torch.cat([scale.squeeze().unsqueeze(0), scale1[0], scale1[1]], dim=0).reshape(num_groups, -1).contiguous()
)
return out
def _module_match(module):
for policy in generic_policies:
policy = policy()
if policy.match(module):
return policy
return None
def generic_injection(module, dtype=None, enable_cuda_graph=True):
def replace_attn(child, policy):
policy_attn = policy.attention(child)
if policy_attn is None:
return child
if len(policy_attn) == 5:
qkvw, attn_ow, attn_ob, hidden_size, heads = policy_attn
else:
qw, kw, vw, attn_ow, attn_ob, hidden_size, heads = policy_attn
config = transformer_inference.DeepSpeedInferenceConfig(
hidden_size=hidden_size,
heads=heads,
dtype=dtype,
triangular_masking=False,
max_out_tokens=4096,
)
attn_module = DeepSpeedDiffusersAttention(config)
def transpose(data):
data = data.contiguous()
data.reshape(-1).copy_(data.transpose(-1, -2).contiguous().reshape(-1))
data = data.reshape(data.shape[-1], data.shape[-2])
data.to(get_accelerator().current_device_name())
return data
if len(policy_attn) == 5:
attn_module.attn_qkvw.data = transpose(qkvw.data)
else:
attn_module.attn_qkvw = None
attn_module.attn_qw.data = transpose(qw.data)
attn_module.attn_kw.data = transpose(kw.data)
attn_module.attn_vw.data = transpose(vw.data)
attn_module.attn_qkvb = None
attn_module.attn_ow.data = transpose(attn_ow.data)
attn_module.attn_ob.data.copy_(attn_ob.data.to(get_accelerator().current_device_name()))
return attn_module
def replace_attn_block(child, policy):
config = Diffusers2DTransformerConfig()
return DeepSpeedDiffusersTransformerBlock(child, config)
if isinstance(module, torch.nn.Module):
pass
else:
if dtype not in [torch.float16, torch.half]:
raise ValueError("Generic injection only supported with FP16")
try:
import diffusers
if hasattr(diffusers.models.attention, "CrossAttention"):
cross_attention = diffusers.models.attention.CrossAttention
else:
cross_attention = diffusers.models.attention_processor.Attention
attention_block = diffusers.models.attention.BasicTransformerBlock
new_policies = {
cross_attention: replace_attn,
attention_block: replace_attn_block,
}
except ImportError:
new_policies = {}
# replace_transformer_layer(None,
# module.text_encoder,
# training=False,
# replace_with_kernel_inject=True,
# triangular_masking=True,
# max_out_tokens=8192)
from ..model_implementations.transformers.clip_encoder import DSClipEncoder
cg_encoder = DSClipEncoder(module.text_encoder, enable_cuda_graph=enable_cuda_graph)
setattr(module, "text_encoder", cg_encoder)
for name in module.__dict__.keys():
sub_module = getattr(module, name)
policy = _module_match(sub_module)
if policy is not None:
def _replace_module(module, policy):
for name, child in module.named_children():
_replace_module(child, policy)
if child.__class__ in new_policies:
replaced_module = new_policies[child.__class__](child, policy)
setattr(module, name, replaced_module)
_replace_module(sub_module, policy)
new_module = policy.apply(sub_module, enable_cuda_graph=enable_cuda_graph)
print(f"**** found and replaced {name} w. {type(new_module)}")
setattr(module, name, new_module)
container_g = None
def replace_transformer_layer(orig_layer_impl, model, checkpoint_dict, config, model_config):
"""Replace bert-style transformer layers with DeepSpeed's transformer layer
Arguments:
orig_layer_impl (torch.nn.Module): the original transformer layer implementation to look for,
e.g., transformers.models.bert.modeling_bert.BertLayer or transformers.BertLayer
model (torch.nn.Module): user's nn.module representing their model
checkpoint_dict: Dictionary for checkpoint passed from the Inference Engine
config: top-level DS Inference config defined in inference/config.py
model_config: HuggingFace model config passed from the inference/engine.py
Returns:
Updated nn.module with replaced transformer layers
"""
# defining globals as internally defined functions inherit these everywhere
quantize = config.dtype == torch.int8
# todo: Refactor later. In future, let's minimize the style used above and use config.** instead
linear_layer_setting = None
"""linear_layer_setting (tuple of modules) [Optional]: shows which two classes are used for linear layers and embedding layers."""
micro_batch_size = -1
seed = -1
local_rank = -1
mp_replace = ReplaceWithTensorSlicing(
mp_group=config.tensor_parallel.tp_group, mp_size=config.tensor_parallel.tp_size
) # , out_dim=0, in_dim=1)
def replace_with_policy(child, policy_cls, triangular_masking, inference=False, layer_id=0):
policy = policy_cls(child, inference=inference)
if not policy.cuda_graph_supported:
# policy says cuda graph is not supported raise an error if set
assert not config.enable_cuda_graph, "cuda graph is not supported with this model, please disable"
from deepspeed.moe.layer import MoE
moe = False
if hasattr(child, "mlp") and isinstance(child.mlp, MoE):
num_experts = child.mlp.num_experts
moe = True
# 1. Create a model-specific container object using the policy object.
_container = policy_to_ds_container(
policy=policy, config=config, model_config=model_config, layer_id=layer_id, child=child
)
_container.set_moe(moe)
# 2. Set the tensor parallelism config
_container.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group)
# 3. Initialize tensors
_container.initialize_tensors()
# 4. deal with data types -- needs refactor to use dtype instead of fp16
if config.dtype in [torch.float16, torch.bfloat16, torch.int8]:
_container.convert_to_required_dtype()
# 5. Set the quantization config
quantizer = GroupQuantizer(q_int8=quantize)
_container.set_quantization_config(quantizer)
# 6. create a DS Inference config object
_container.create_ds_model_config()
# 7. use the config and create the module
_container.create_module()
# 8. transpose the weights and bias if needed
_container.transpose()
# 9. deal with tensor parallelism.
_container.apply_tensor_parallelism(mp_replace)
# 10. copy the tensors from the model-specific container to the new module
_container.copy_data_to_new_module()
# 11. set global for generic checkpoint loading
global container_g
if container_g is None:
container_g = _container
return _container.module
def replace_wo_policy(module, all_reduce_linears, prefix="", state_dict=None):
# mp_replace = ReplaceWithTensorSlicing(mp_group=config.tensor_parallel.tp_group)
# 1. Create AutoTP object
_autotp = AutoTP(module, all_reduce_linears, prefix, state_dict, linear_layer_setting, orig_layer_impl)
# 2. Set the tensor parallelism config
_autotp.set_tensor_parallel_config(config.tensor_parallel.tp_size, config.tensor_parallel.tp_group)
# 3. Try to get num_key_heads from model_config.num_key_value_heads
# num_kv_heads = _autotp.get_model_num_kv_heads(model_config)
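# For multimodal configs (e.g. Llama 3.2 Vision / Mllama), the kv-head count lives in a
# sub-config: blocks whose all-reduce linears match the vision pattern (self_attn.o_proj
# and mlp.fc2) read vision_config, while language blocks read text_config.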
if hasattr(model_config, "vision_config"):
if "self_attn.o_proj" in all_reduce_linears and "mlp.fc2" in all_reduce_linears:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config.vision_config)
else:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config.text_config)
else:
num_kv_heads = _autotp.get_model_num_kv_heads(model_config)
# 4. When we have num_kv_heads defined, uneven division is possible, otherwise enforce even division
set_num_kv_heads(num_kv_heads)
# 4.1 Get n_embd
n_embd = None
multi_query_n_embd_names = ["n_embd"]
for name in multi_query_n_embd_names:
if hasattr(model_config, name):
n_embd = getattr(model_config, name)
if n_embd is not None:
break
# 4.2 set n_embd
set_n_embd(n_embd)
# 5. Set linear policies
_autotp.update_linear_policies()
# 6. Replace modules
if "lm_head" in all_reduce_linears or "embed_out" in all_reduce_linears:
return _autotp._replace_last_linear_module(module)
return _autotp._replace_module(module)
def replace_fn(child, _policy, layer_id=0, prefix="", state_dict=None):
training = False # todo: refactor this part to go in the config
if training:
# copy relevant state from child -> new module
new_module = replace_with_policy(child, _policy, config.triangular_masking)
else:
# copy relevant state from child -> new module
if config.replace_with_kernel_inject:
new_module = replace_with_policy(
child, _policy, config.triangular_masking, inference=True, layer_id=layer_id
)
else:
new_module = replace_wo_policy(child, _policy, prefix=prefix, state_dict=state_dict)
return new_module
def set_lm_head(module):
embedding_weight = None
for n, p in module.named_parameters():
if "word_embeddings." in n or "embed_tokens." in n or "wte." in n:
embedding_weight = p
if (
embedding_weight is not None
and hasattr(module, "lm_head")
and hasattr(module.lm_head, "weight")
and module.lm_head.weight.is_meta
):
module.lm_head.weight = embedding_weight
# enable tensor parallel for the last linear
if (
hasattr(module, "lm_head")
and hasattr(module.lm_head, "weight")
and not module.lm_head.weight.is_meta
and isinstance(module.lm_head, torch.nn.Linear)
):
module = replace_wo_policy(module, ("lm_head",), 0, "lm_head")
elif (
hasattr(module, "embed_out")
and hasattr(module.embed_out, "weight")
and not module.embed_out.weight.is_meta
and isinstance(module.embed_out, torch.nn.Linear)
):
module = replace_wo_policy(module, ("embed_out",), 0, "embed_out")
elif hasattr(module.language_model, "lm_head"):
module = replace_wo_policy(module.language_model, ("lm_head",), 0, "lm_head")
return module
if checkpoint_dict is not None and not config.replace_with_kernel_inject:
# AutoTP shard loading
checkpoint = checkpoint_dict["checkpoints"]
pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards")
for i in range(len(checkpoint)):
checkpoint_file = os.path.join(config.base_dir, checkpoint[i])
replaced_module = replace_module(
model=model,
orig_class=orig_layer_impl,
replace_fn=replace_fn,
_replace_policy=config.injection_policy_tuple,
checkpoint=checkpoint_file,
)
pbar.update(1)
gc.collect()
replaced_module = set_lm_head(replaced_module)
else:
replaced_module = replace_module(
model=model,
orig_class=orig_layer_impl,
replace_fn=replace_fn,
_replace_policy=config.injection_policy_tuple,
)
quantizer = GroupQuantizer(q_int8=quantize)
world_size = dist.get_world_size() if dist.is_initialized() else 1
rank = dist.get_rank() if dist.is_initialized() else 0
if checkpoint_dict is not None and config.replace_with_kernel_inject:
assert (
container_g.ckpt_load_enabled
), f"Meta Tensor checkpoint loading not supported in {container_g.__class__.__name__} container"
start_time = time.time()
checkpoint = checkpoint_dict["checkpoints"]
ckpt_list = checkpoint["tp"] if type(checkpoint) is dict else checkpoint
ckpt_type = checkpoint_dict.get("parallelization", "pp")
ckpt_mp_size = checkpoint_dict.get("tp_size", len(ckpt_list))
ckpt_mp_size = checkpoint_dict.get("mp_size", ckpt_mp_size)
base_dir1 = checkpoint_dict.get("base_dir", config.base_dir)
if ckpt_type == "pp" and type(checkpoint) is list:
pbar = tqdm.tqdm(total=len(checkpoint), desc=f"Loading {len(checkpoint)} checkpoint shards")
for i in range(len(checkpoint)):
sd = [torch.load(os.path.join(base_dir1, checkpoint[i]), map_location="cpu")]
load_model_with_checkpoint(
replaced_module, sd, mp_replace, ckpt_type, ckpt_mp_size, quantizer, container=container_g
)
pbar.update(1)
else:
num_checkpoints = len(ckpt_list) // ckpt_mp_size
tp_split_size = world_size / ckpt_mp_size
sd_offset = int(rank / tp_split_size)
sd_count = int((rank + max(1, tp_split_size)) / tp_split_size) - sd_offset
pbar = tqdm.tqdm(total=num_checkpoints, desc=f"Loading {num_checkpoints} checkpoint shards")
for i in range(num_checkpoints):
pbar.update(1)
ckpt_index = i * ckpt_mp_size + sd_offset
ckpt_files = [
os.path.join(base_dir1, ckpt_list[ckpt_index + j]) if base_dir1 else ckpt_list[ckpt_index + j]
for j in range(sd_count)
]
sds = [torch.load(ckpt_file, map_location="cpu") for ckpt_file in ckpt_files]
load_model_with_checkpoint(
replaced_module,
sds,
mp_replace,
ckpt_type,
ckpt_mp_size,
quantizer,
int(rank % tp_split_size),
container=container_g,
)
sds = [None for _ in sds]
gc.collect()
if "non_tp" in checkpoint:
pbar = tqdm.tqdm(
total=len(checkpoint["non_tp"]), desc=f"Loading {len(checkpoint['non_tp'])} checkpoint shards"
)
for i in range(len(checkpoint["non_tp"])):
pbar.update(1)
ckpt_file = (
os.path.join(base_dir1, checkpoint["non_tp"][i]) if base_dir1 else checkpoint["non_tp"][i]
)
sds = [torch.load(ckpt_file, map_location="cpu")]
load_model_with_checkpoint(
replaced_module,
sds,
mp_replace,
ckpt_type,
ckpt_mp_size,
quantizer,
int(rank % tp_split_size),
container=container_g,
)
sds = [None for _ in sds]
gc.collect()
set_lm_head(replaced_module)
print(f"checkpoint loading time at rank {rank}: {time.time()-start_time} sec")
if config.save_mp_checkpoint_path is not None:
import json
from collections import OrderedDict
num_partitions = 8
if checkpoint_dict is None:
ckpt_name = "ds_model"
try:
from transformers.models.bloom.modeling_bloom import BloomForCausalLM
if isinstance(model, BloomForCausalLM):
ckpt_name = "bloom"
except ImportError:
ckpt_name = "ds_model"
else:
ckpt_name = checkpoint_dict["type"]
if dist.is_initialized():
dist.barrier()
transformer_name = get_transformer_name(replaced_module)
non_tp_ckpt_name = "non-tp.pt"
ckpt_files = [non_tp_ckpt_name]
os.makedirs(config.save_mp_checkpoint_path, exist_ok=True)
if not dist.is_initialized() or dist.get_rank() == 0:
print("Saving tp-sharded checkpoints")
torch.save(
OrderedDict({k: v for k, v in dict(replaced_module.state_dict()).items() if transformer_name not in k}),
f"{config.save_mp_checkpoint_path}/{non_tp_ckpt_name}",
)
dtype_reprs = {
torch.float32: "float32",
torch.float16: "float16",
torch.int8: "int8",
torch.bfloat16: "bfloat16",
}
ckpt_config = json.dumps(
{
"type": ckpt_name,
"base_dir": f"{config.save_mp_checkpoint_path}",
"checkpoints": {
"non_tp": ckpt_files,
"tp": [f"tp_{r:0>2d}_{m:0>2d}.pt" for m in range(num_partitions) for r in range(world_size)],
},
"version": 1.0,
"parallelization": "tp",
"tp_size": world_size,
"dtype": dtype_reprs[config.dtype],
}
)
with open(f"{config.save_mp_checkpoint_path}/ds_inference_config.json", "w") as cfg:
cfg.write(ckpt_config)
rep_sd = replaced_module.state_dict()
for n, p in replaced_module.named_parameters():
if hasattr(p, "scale"):
rep_sd[n] = [p, p.scale]
keys = list(rep_sd.keys())
partition_size = len(keys) // num_partitions + 1
for m in range(num_partitions):
torch.save(
OrderedDict(
{
k: [rep_sd[k], rep_sd[k].scale] if hasattr(rep_sd[k], "scale") else rep_sd[k]
for k in keys[m * partition_size : (m + 1) * partition_size]
if transformer_name in k
}
),
f"{config.save_mp_checkpoint_path}/tp_{rank:0>2d}_{m:0>2d}.pt",
)
return replaced_module
def revert_transformer_layer(orig_layer_impl, model, config, preln=False):
"""Revert DeepSpeed's transformer layer back to original bert-style transformer layer
Arguments:
orig_layer_impl (torch.nn.Module): the original transformer layer implementation that was replaced,
e.g., transformers.models.bert.modeling_bert.BertLayer or transformers.BertLayer
model (torch.nn.Module): user's nn.module representing their model
config (dict): model config containing hidden size, attention heads, etc.
Returns:
Updated nn.module with original bert-style transformer layers
"""
def replace_fn(child, _replace_policy, layer_id):
# from turing.nvidia_modelingpreln import BertLayer
orig_module = orig_layer_impl(config)
# copy relevant state from child -> original module
qkvw = child.attn_qkvw.data
qkvb = child.attn_qkvb.data
qw, kw, vw = torch.chunk(qkvw, 3, axis=0)
qb, kb, vb = torch.chunk(qkvb, 3, axis=0)
orig_module.attention.self.query.weight.data = qw
orig_module.attention.self.query.bias.data = qb
orig_module.attention.self.key.weight.data = kw
orig_module.attention.self.key.bias.data = kb
orig_module.attention.self.value.weight.data = vw
orig_module.attention.self.value.bias.data = vb
orig_module.attention.output.dense.weight.data = child.attn_ow.data
orig_module.attention.output.dense.bias.data = child.attn_ob.data
attn_ln_w = child.attn_nw.data
attn_ln_b = child.attn_nb.data
if preln:
orig_module.PostAttentionLayerNorm.weight.data = attn_ln_w
orig_module.PostAttentionLayerNorm.bias.data = attn_ln_b
else:
orig_module.attention.output.LayerNorm.weight.data = attn_ln_w
orig_module.attention.output.LayerNorm.bias.data = attn_ln_b
inter_ff_w = child.inter_w.data
inter_ff_b = child.inter_b.data
if preln:
orig_module.intermediate.dense_act.weight.data = inter_ff_w
orig_module.intermediate.dense_act.bias.data = inter_ff_b
else:
orig_module.intermediate.dense.weight.data = inter_ff_w
orig_module.intermediate.dense.bias.data = inter_ff_b
orig_module.output.dense.weight.data = child.output_w.data
orig_module.output.dense.bias.data = child.output_b.data
transformer_ln_w = child.norm_w.data
transformer_ln_b = child.norm_b.data
if preln:
orig_module.PreAttentionLayerNorm.weight.data = transformer_ln_w
orig_module.PreAttentionLayerNorm.bias.data = transformer_ln_b
else:
orig_module.output.LayerNorm.weight.data = transformer_ln_w
orig_module.output.LayerNorm.bias.data = transformer_ln_b
return orig_module
return replace_module(
model=model, orig_class=deepspeed.DeepSpeedTransformerLayer, replace_fn=replace_fn, _replace_policy=None
)
def replace_module(model, orig_class, replace_fn, _replace_policy, checkpoint=None):
"""Scan the model for instances of ``orig_clas:`` to replace using ``replace_fn``.
Arguments:
model (torch.nn.Module): the model to augment
orig_class (torch.nn.Module): the module to search for
replace_fn (method): a method to convert instances of ``orig_class`` to the
desired type and return a new instance.
Returns:
A modified ``model``.
"""
sd = None
if checkpoint is not None:
if checkpoint.endswith(".safetensors"):
from safetensors.torch import load_file
sd = load_file(checkpoint)
else:
sd = torch.load(checkpoint, map_location="cpu")
policy = {}
if orig_class is not None:
policy.update({orig_class: (replace_fn, _replace_policy)})
else:
for plcy in replace_policies:
# instantiate a throw-away policy in order to populate the _orig_layer_class
_ = plcy(None)
if isinstance(plcy._orig_layer_class, list):
for orig_layer_class in plcy._orig_layer_class:
policy.update({orig_layer_class: (replace_fn, plcy)})
elif plcy._orig_layer_class is not None:
policy.update({plcy._orig_layer_class: (replace_fn, plcy)})
assert len(policy.items()) > 0, (
"No default policy found! Please specify your policy injection_policy (like {BertLayer:HFBEertLayerPolicy})."
+ "You can find some samples here: https://github.com/microsoft/DeepSpeed/blob/master/deepspeed/module_inject/replace_policy.py"
)
replaced_module, _ = _replace_module(model, policy, state_dict=sd)
return replaced_module
import re
from ..pipe import PipelineModule
def skip_level_0_prefix(model, state_dict):
model = str(model)
key = re.search(r": (.*?)Model", model)
if key is None:
key = re.search(r": (.*?)Stack", model)
if key is None:
key = re.match(r"(.*?)Model", model)
# if keys start with 'model.', don't skip level 0 prefix
if state_dict is not None:
for item in state_dict.keys():
if re.match("^model[.]", item):
return False
if key is not None and key.group(1).lower() in ["bloom", "opt"]:
return True
return False
def _replace_module(model, policies, prefix="", layer_id=0, level_id=0, state_dict=None):
"""Traverse model's children recursively and apply any transformations in ``policies``.
Arguments:
model (torch.nn.Module): model to augment
policies (dict): Mapping of source class to replacement function.
Returns:
Modified ``model``.
"""
for name, child in model.named_children():
if child.__class__ in policies:
replaced_module = policies[child.__class__][0](
child, policies[child.__class__][-1], layer_id, prefix=prefix + name, state_dict=state_dict
)
setattr(model, name, replaced_module)
if isinstance(model, PipelineModule):
assert hasattr(model, "forward_funcs"), "we require pipe-module to have the list of fwd_functions"
model.forward_funcs[model.fwd_map[name]] = replaced_module
layer_id += 1
else:
checking_key = prefix + name + "."
if Loading.is_load_module(child) and state_dict is not None:
if any(checking_key in item for item in state_dict):
Loading.load(
child,
state_dict,
checking_key,
)
else:
continue
if len(child._buffers) != 0 and state_dict is not None:
Loading.load_buffer(child, state_dict, checking_key)
_, layer_id = _replace_module(
child,
policies,
prefix if level_id == 0 and skip_level_0_prefix(model, state_dict) else prefix + name + ".",
layer_id=layer_id,
level_id=level_id + 1,
state_dict=state_dict,
)
# Add the reset_cache func to the model, so that it can be called in the beginning of text-generation.
model.reset_cache = transformer_inference.DeepSpeedTransformerInference.reset_cache
return model, layer_id
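To make the vision/text branching in replace_wo_policy above concrete, here is a standalone mock of the sub-config selection; it reads num_key_value_heads directly instead of going through _autotp.get_model_num_kv_heads, and the head counts are assumptions for illustration only:

# Standalone sketch of the kv-head sub-config selection; values are assumed, not read from a real model.
from types import SimpleNamespace

model_config = SimpleNamespace(
    vision_config=SimpleNamespace(num_key_value_heads=16),  # assumed
    text_config=SimpleNamespace(num_key_value_heads=8),     # assumed
)
all_reduce_linears = ("self_attn.o_proj", "mlp.fc2")  # pattern seen in vision blocks

if hasattr(model_config, "vision_config"):
    if "self_attn.o_proj" in all_reduce_linears and "mlp.fc2" in all_reduce_linears:
        num_kv_heads = model_config.vision_config.num_key_value_heads
    else:
        num_kv_heads = model_config.text_config.num_key_value_heads
else:
    num_kv_heads = model_config.num_key_value_heads
print(num_kv_heads)  # 16: the vision sub-config is selected in this mock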

View File

@@ -0,0 +1,13 @@
datasets
docarray[full]
fastapi
numpy<2
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
pydantic==2.9.2
pydub
shortuuid
transformers==4.45.1
uvicorn

View File

@@ -0,0 +1,11 @@
docarray
fastapi
numpy<2
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
prometheus-fastapi-instrumentator
pydantic==2.9.2
shortuuid
transformers==4.45.1
uvicorn

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env bash
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
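# Spawn the DeepSpeed tensor-parallel LVM workers (world size 4) on Gaudi in the background,
# then start the LVM microservice front end (lvm_tp.py), which forwards requests to those workers.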
python gaudi_spawn.py --use_deepspeed --world_size 4 lvm_tp_serve.py &
python lvm_tp.py

File diff suppressed because it is too large.

5
comps/lvms/llama-vision/update Executable file
View File

@@ -0,0 +1,5 @@
#!/bin/bash
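# Overwrite the installed optimum-habana and DeepSpeed modules with the patched copies
# shipped in this component (paths assume the container's Python 3.10 dist-packages layout).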
cp checkpoint_utils.py /usr/local/lib/python3.10/dist-packages/optimum/habana/
cp auto_tp.py /usr/local/lib/python3.10/dist-packages/deepspeed/module_inject/
cp replace_module.py /usr/local/lib/python3.10/dist-packages/deepspeed/module_inject/

View File

@@ -5,4 +5,4 @@
git clone https://github.com/huggingface/tgi-gaudi.git
cd ./tgi-gaudi/
docker build -t ghcr.io/huggingface/tgi-gaudi:1.2.1 . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy
docker build -t ghcr.io/huggingface/tgi-gaudi:latest . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy

View File

@@ -311,7 +311,7 @@ spec:
envFrom:
- configMapRef:
name: qna-config
image: ghcr.io/huggingface/tgi-gaudi:2.0.4
image: ghcr.io/huggingface/tgi-gaudi:2.0.5
imagePullPolicy: IfNotPresent
name: llm-dependency-deploy
ports:

View File

@@ -64,7 +64,7 @@ opea_micro_services:
requirements:
model_id: "Intel/neural-chat-7b-v3-3"
ghcr.io/huggingface/tgi-gaudi:
tag: 2.0.4
tag: 2.0.5
type: hpu
requirements:
model_id: "Intel/neural-chat-7b-v3-3"

View File

@@ -10,7 +10,7 @@ ip_address=$(hostname -I | awk '{print $1}')
function build_docker_images() {
echo "Start building docker images for microservice"
cd $WORKPATH
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.1
docker pull ghcr.io/huggingface/tgi-gaudi:2.0.5
docker build --no-cache -t opea/guardrails-tgi:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/guardrails/llama_guard/langchain/Dockerfile .
if [ $? -ne 0 ]; then
echo "opea/guardrails-tgi built fail"
@@ -26,7 +26,7 @@ function start_service() {
export SAFETY_GUARD_MODEL_ID="meta-llama/Meta-Llama-Guard-2-8B"
export SAFETY_GUARD_ENDPOINT=http://${ip_address}:5035/v1/chat/completions
docker run -d --name="test-comps-guardrails-langchain-tgi-server" -p 5035:80 --runtime=habana -e HF_TOKEN=$HF_TOKEN -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:2.0.1 --model-id $model_id --max-input-length 1024 --max-total-tokens 2048
docker run -d --name="test-comps-guardrails-langchain-tgi-server" -p 5035:80 --runtime=habana -e HF_TOKEN=$HF_TOKEN -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e HTTPS_PROXY=$https_proxy -e HTTP_PROXY=$https_proxy ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model_id --max-input-length 1024 --max-total-tokens 2048
sleep 4m
docker run -d --name="test-comps-guardrails-langchain-service" -p 5036:9090 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e no_proxy=$no_proxy -e SAFETY_GUARD_MODEL_ID=$SAFETY_GUARD_MODEL_ID -e SAFETY_GUARD_ENDPOINT=$SAFETY_GUARD_ENDPOINT -e HUGGINGFACEHUB_API_TOKEN=$HF_TOKEN opea/guardrails-tgi:comps
sleep 10s

View File

@@ -25,7 +25,7 @@ function start_service() {
unset http_proxy
model="llava-hf/llava-v1.6-mistral-7b-hf"
lvm_port=5050
docker run -d --name="test-comps-lvm-tgi-llava" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 5027:80 --runtime=habana -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e SKIP_TOKENIZER_IN_TGI=true -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.4 --model-id $model --max-input-tokens 4096 --max-total-tokens 8192
docker run -d --name="test-comps-lvm-tgi-llava" -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 5027:80 --runtime=habana -e PT_HPU_ENABLE_LAZY_COLLECTIVES=true -e SKIP_TOKENIZER_IN_TGI=true -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host ghcr.io/huggingface/tgi-gaudi:2.0.5 --model-id $model --max-input-tokens 4096 --max-total-tokens 8192
docker run -d --name="test-comps-lvm-tgi" -e LVM_ENDPOINT=http://$ip_address:5027 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p $lvm_port:9399 --ipc=host opea/lvm-tgi:comps
sleep 3m
}