Refactor lvms (#1096)

Co-authored-by: ZePan110 <ze.pan@intel.com>
Sihan Chen
2025-01-13 13:06:59 +08:00
committed by GitHub
parent ea72c943bd
commit feef30b0ea
73 changed files with 764 additions and 706 deletions

View File

@@ -8,6 +8,7 @@ import re
from typing import Optional
from fastapi import FastAPI
from fastapi.responses import Response
from prometheus_fastapi_instrumentator import Instrumentator
from uvicorn import Config, Server
@@ -73,6 +74,11 @@ class HTTPService(BaseService):
"""Get the health status of this GenAI microservice."""
return {"Service Title": self.title, "Service Description": self.description}
@app.get("/health")
async def _health() -> Response:
"""Health check."""
return Response(status_code=200)
@app.get(
path="/v1/statistics",
summary="Get the statistics of GenAI services",

View File

@@ -11,7 +11,7 @@ apt-get install libreoffice
## Use LVM (Large Vision Model) for Summarizing Image Data
Occasionally, unstructured data will contain image data. To convert the image data to text, an LVM (Large Vision Model) can be used to summarize the image. To leverage LVM, please refer to this [readme](../lvms/llava/README.md) to start the LVM microservice first, then set the environment variable below before starting any dataprep microservice.
Occasionally, unstructured data will contain image data. To convert the image data to text, an LVM (Large Vision Model) can be used to summarize the image. To leverage LVM, please refer to this [readme](../lvms/src/README.md) to start the LVM microservice first, then set the environment variable below before starting any dataprep microservice.
```bash
export SUMMARIZE_IMAGE_VIA_LVM=1

View File

@@ -38,7 +38,7 @@ export PYTHONPATH=${path_to_comps}
This is required only if you are going to consume the _generate_captions_ API of this microservice as in [Section 4.3](#43-consume-generate_captions-api).
Please refer to this [readme](../../../../lvms/llava/README.md) to start the LVM microservice.
Please refer to this [readme](../../../../lvms/src/README.md) to start the LVM microservice.
After LVM is up, set up environment variables.
```bash
@@ -64,7 +64,7 @@ Please refer to this [readme](../../../../vectorstores/redis/README.md).
This is required only if you are going to consume the _generate_captions_ API of this microservice as described [here](#43-consume-generate_captions-api).
Please refer to this [readme](../../../../lvms/llava/README.md) to start the LVM microservice.
Please refer to this [readme](../../../../lvms/src/README.md) to start the LVM microservice.
After LVM is up, set up environment variables.
```bash

View File

@@ -11,7 +11,7 @@ apt-get install libreoffice
## Use LVM (Large Vision Model) for Summarizing Image Data
Occasionally, unstructured data will contain image data. To convert the image data to text, an LVM (Large Vision Model) can be used to summarize the image. To leverage LVM, please refer to this [readme](../../lvms/llava/README.md) to start the LVM microservice first, then set the environment variable below before starting any dataprep microservice.
Occasionally, unstructured data will contain image data. To convert the image data to text, an LVM (Large Vision Model) can be used to summarize the image. To leverage LVM, please refer to this [readme](../../lvms/src/README.md) to start the LVM microservice first, then set the environment variable below before starting any dataprep microservice.
```bash
export SUMMARIZE_IMAGE_VIA_LVM=1

View File

@@ -1,29 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
ARG ARCH="cpu" # Set this to "cpu" or "gpu"
# Set environment variables
ENV LANG=en_US.UTF-8
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ ${ARCH} = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/user/comps/lvms/llava/requirements.txt; \
else \
pip install --no-cache-dir -r /home/user/comps/lvms/llava/requirements.txt; \
fi
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/lvms/llava
ENTRYPOINT ["python", "lvm.py"]

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,22 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import base64
import json
from io import BytesIO
import PIL.Image
import requests
image_path = "https://avatars.githubusercontent.com/u/39623753?v=4"
image = PIL.Image.open(requests.get(image_path, stream=True, timeout=3000).raw)
buffered = BytesIO()
image.save(buffered, format="PNG")
img_b64_str = base64.b64encode(buffered.getvalue()).decode()
endpoint = "http://localhost:9399/v1/lvm"
inputs = {"image": img_b64_str, "prompt": "What is this?", "max_new_tokens": 32}
response = requests.post(url=endpoint, data=json.dumps(inputs), proxies={"http": None})
print(response.json())

View File

@@ -1,103 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import json
import os
import time
from typing import Union
import requests
from fastapi import HTTPException
from langchain_core.prompts import PromptTemplate
from template import ChatTemplate
from comps import (
CustomLogger,
LVMDoc,
LVMSearchedMultimodalDoc,
MetadataTextDoc,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("lvm")
logflag = os.getenv("LOGFLAG", False)
@register_microservice(
name="opea_service@lvm",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9399,
)
@register_statistics(names=["opea_service@lvm"])
async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc, MetadataTextDoc]:
if logflag:
logger.info(request)
start = time.time()
if isinstance(request, LVMSearchedMultimodalDoc):
if logflag:
logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
retrieved_metadatas = request.metadata
if retrieved_metadatas is None or len(retrieved_metadatas) == 0:
# No video segments were retrieved for the query.
# Raise an HTTPException because llava-tgi-gaudi should receive an image as input;
# otherwise, the generated text is bad.
raise HTTPException(status_code=500, detail="There is no video segments retrieved given the query!")
img_b64_str = retrieved_metadatas[0]["b64_img_str"]
has_image = img_b64_str != ""
initial_query = request.initial_query
context = retrieved_metadatas[0]["transcript_for_inference"]
prompt = initial_query
if request.chat_template is None:
prompt = ChatTemplate.generate_multimodal_rag_on_videos_prompt(initial_query, context, has_image)
else:
prompt_template = PromptTemplate.from_template(request.chat_template)
input_variables = prompt_template.input_variables
if sorted(input_variables) == ["context", "question"]:
prompt = prompt_template.format(question=initial_query, context=context)
else:
logger.info(
f"[ LVMSearchedMultimodalDoc ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
)
max_new_tokens = request.max_new_tokens
if logflag:
logger.info(f"prompt generated for [LVMSearchedMultimodalDoc ] input from retriever microservice: {prompt}")
else:
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
inputs = {"img_b64_str": img_b64_str, "prompt": prompt, "max_new_tokens": max_new_tokens}
# forward to the LLaVA server
response = requests.post(url=f"{lvm_endpoint}/generate", data=json.dumps(inputs), proxies={"http": None})
statistics_dict["opea_service@lvm"].append_latency(time.time() - start, None)
result = response.json()["text"]
if logflag:
logger.info(result)
if isinstance(request, LVMSearchedMultimodalDoc):
retrieved_metadata = request.metadata[0]
return_metadata = {} # this metadata will be used to construct proof for generated text
return_metadata["video_id"] = retrieved_metadata["video_id"]
return_metadata["source_video"] = retrieved_metadata["source_video"]
return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
return_metadata["transcript_for_inference"] = retrieved_metadata["transcript_for_inference"]
return MetadataTextDoc(text=result, metadata=return_metadata)
else:
return TextDoc(text=result)
if __name__ == "__main__":
lvm_endpoint = os.getenv("LVM_ENDPOINT", "http://localhost:8399")
logger.info("[LVM] LVM initialized.")
opea_microservices["opea_service@lvm"].start()

View File

@@ -1,17 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
class ChatTemplate:
@staticmethod
def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_image: bool = False):
if has_image:
template = """The transcript associated with the image is '{context}'. {question}"""
else:
template = (
"""Refer to the following results obtained from the local knowledge base: '{context}'. {question}"""
)
return template.format(context=context, question=question)

comps/lvms/src/Dockerfile Normal file
View File

@@ -0,0 +1,23 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/
USER user
ENV LANG=C.UTF-8
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/comps/lvms/src/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/lvms/src
ENTRYPOINT ["python", "opea_lvm_microservice.py"]

comps/lvms/src/README.md Normal file
View File

@@ -0,0 +1,31 @@
# LVM Microservice
Visual Question Answering is one of the multimodal tasks empowered by LVMs (Large Vision Models). This microservice supports visual Q&A by using LLaVA as the base large vision model. It accepts two inputs, a prompt and an image, and outputs the answer to the prompt about the image.
## Build Image & Run
Before building, start the [dependency](./integrations/dependency/) service that matches your needs.
```bash
docker build --no-cache -t opea/lvm:comps --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/src/Dockerfile .
# Change LVM_ENDPOINT to your dependency service endpoint
docker run -d --name="test-comps-lvm" -e LVM_ENDPOINT=http://localhost:8399 -e http_proxy=$http_proxy -e https_proxy=$https_proxy -p 9399:9399 --ipc=host opea/lvm:comps
```
## Test
- LLaVA & llama-vision & PredictionGuard & TGI LLaVA
```bash
# curl with an image and a prompt
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json'
# curl with only the prompt
http_proxy="" curl http://localhost:9399/v1/lvm --silent --write-out "HTTPSTATUS:%{http_code}" -XPOST -d '{"image": "", "prompt":"What is deep learning?"}' -H 'Content-Type: application/json'
```
- video-llama
```bash
http_proxy="" curl -X POST http://localhost:9399/v1/lvm -d '{"video_url":"https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4","chunk_start": 0,"chunk_duration": 9,"prompt":"What is the person doing?","max_new_tokens": 150}' -H 'Content-Type: application/json'
```
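The same request can also be issued from Python; a minimal sketch mirroring the first curl call above, assuming the microservice is reachable at `localhost:9399`:

```python
import requests

# The tiny base64-encoded PNG from the curl example above.
img_b64 = (
    "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8"
    "/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC"
)

payload = {"image": img_b64, "prompt": "What is this?", "max_new_tokens": 32}
resp = requests.post("http://localhost:9399/v1/lvm", json=payload, timeout=60)
print(resp.json())
```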

View File

@@ -21,12 +21,12 @@ RUN git lfs install
COPY comps /home/user/comps
RUN cd /home/user/comps/lvms/llama-vision/ && \
RUN cd /home/user/comps/lvms/src/integrations/dependency/llama-vision/ && \
pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir --upgrade Pillow
ENV PYTHONPATH=/root:/home/user
WORKDIR /home/user/comps/lvms/llama-vision/
WORKDIR /home/user/comps/lvms/src/integrations/dependency/llama-vision/
ENTRYPOINT ["python", "lvm.py"]

View File

@@ -21,12 +21,12 @@ RUN git lfs install
COPY comps /home/user/comps
RUN cd /home/user/comps/lvms/llama-vision/ && \
RUN cd /home/user/comps/lvms/src/integrations/dependency/llama-vision/ && \
pip install --no-cache-dir -r requirements.txt && \
pip install --no-cache-dir --upgrade Pillow
ENV PYTHONPATH=/root:/home/user
WORKDIR /home/user/comps/lvms/llama-vision/
WORKDIR /home/user/comps/lvms/src/integrations/dependency/llama-vision/
ENTRYPOINT ["python", "lvm_guard.py"]

View File

@@ -22,13 +22,13 @@ COPY comps /home/user/comps
RUN pip install --no-cache-dir git+https://github.com/HabanaAI/DeepSpeed.git@1.17.1
RUN pip install --no-cache-dir git+https://github.com/huggingface/optimum-habana@v1.13.2
RUN cd /home/user/comps/lvms/llama-vision/ \
RUN cd /home/user/comps/lvms/src/integrations/dependency/llama-vision/ \
pip install --no-cache-dir --upgrade pip && \
bash update && \
pip install --no-cache-dir -r /home/user/comps/lvms/llama-vision/requirements_tp.txt
pip install --no-cache-dir -r /home/user/comps/lvms/src/integrations/dependency/llama-vision/requirements_tp.txt
ENV PYTHONPATH=/root:/home/user
WORKDIR /home/user/comps/lvms/llama-vision/
WORKDIR /home/user/comps/lvms/src/integrations/dependency/llama-vision/
ENTRYPOINT ["bash", "run_tp.sh"]

View File

@@ -10,7 +10,7 @@ Visual Question and Answering is one of the multimodal tasks empowered by LVMs (
```bash
cd ../../../
docker build -t opea/lvm-llama-vision:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llama-vision/Dockerfile .
docker build -t opea/lvm-llama-vision:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/src/integrations/dependency/llama-vision/Dockerfile .
```
#### Build Llama Vision Model with deepspeed
@@ -18,14 +18,14 @@ docker build -t opea/lvm-llama-vision:latest --build-arg https_proxy=$https_prox
If you need to build the image for 90B models, use the following command:
```bash
docker build -t opea/lvm-llama-vision-tp:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llama-vision/Dockerfile_tp .
docker build -t opea/lvm-llama-vision-tp:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/src/integrations/dependency/llama-vision/Dockerfile_tp .
```
#### Build Llama Vision Guard Model
```bash
cd ../../../
docker build -t opea/lvm-llama-vision-guard:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llama-vision/Dockerfile_guard .
docker build -t opea/lvm-llama-vision-guard:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/src/integrations/dependency/llama-vision/Dockerfile_guard .
```
### Start Llama LVM Service

View File

@@ -13,10 +13,10 @@ ENV PYTHONPATH=/home/user:/usr/lib/habanalabs/:/optimum-habana
COPY comps /home/user/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/comps/lvms/llava/requirements.txt
pip install --no-cache-dir -r /home/user/comps/lvms/src/integrations/dependency/llava/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/lvms/llava/dependency
WORKDIR /home/user/comps/lvms/src/integrations/dependency/llava
ENTRYPOINT ["python", "llava_server.py", "--device", "cpu"]

View File

@@ -17,11 +17,11 @@ COPY comps /home/user/comps
# Install requirements and optimum habana
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/lvms/llava/requirements.txt && \
pip install --no-cache-dir -r /home/user/comps/lvms/src/integrations/dependency/llava/requirements.txt && \
pip install --no-cache-dir optimum[habana]
ENV PYTHONPATH=$PYTHONPATH:/home/user
WORKDIR /home/user/comps/lvms/llava/dependency
WORKDIR /home/user/comps/lvms/src/integrations/dependency/llava/
ENTRYPOINT ["python", "llava_server.py"]

View File

@@ -16,7 +16,6 @@ pip install -r requirements.txt
```bash
# Start LLaVA service
cd dependency/
nohup python llava_server.py --device=cpu &
# Wait until the server is up
# Test
@@ -30,23 +29,12 @@ pip install optimum[habana]
```
```bash
cd dependency/
# Start LLaVA service
nohup python llava_server.py &
# Test
python check_llava_server.py
```
### 1.3 Start Image To Text Service/Test
```bash
cd ..
# Start the OPEA Microservice
python lvm.py
# Test
python check_lvm.py
```
## 🚀2. Start Microservice with Docker (Option 2)
### 2.1 Build Images
@@ -57,21 +45,14 @@ python check_lvm.py
```bash
cd ../../../
docker build -t opea/lvm-llava:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llava/dependency/Dockerfile .
docker build -t opea/lvm-llava:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/src/integrations/dependency/llava/Dockerfile .
```
- Gaudi2 HPU
```bash
cd ../../../
docker build -t opea/lvm-llava:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llava/dependency/Dockerfile.intel_hpu .
```
#### 2.1.2 LVM Service Image
```bash
cd ../../../
docker build -t opea/lvm-llava-svc:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/llava/Dockerfile .
docker build -t opea/lvm-llava:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/src/integrations/dependency/llava/Dockerfile.intel_hpu .
```
### 2.2 Start LLaVA and LVM Service
@@ -90,25 +71,9 @@ docker run -p 8399:8399 -e http_proxy=$http_proxy --ipc=host -e https_proxy=$htt
docker run -p 8399:8399 --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy opea/lvm-llava:latest
```
#### 2.2.2 Start LVM service
#### 2.2.2 Test
```bash
ip_address=$(hostname -I | awk '{print $1}')
docker run -p 9399:9399 --ipc=host -e http_proxy=$http_proxy -e https_proxy=$https_proxy -e LVM_ENDPOINT=http://$ip_address:8399 opea/lvm-llava-svc:latest
```
#### 2.2.3 Test
```bash
# Use curl/python
# curl with an image and a prompt
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "iVBORw0KGgoAAAANSUhEUgAAAAoAAAAKCAYAAACNMs+9AAAAFUlEQVR42mP8/5+hnoEIwDiqkL4KAcT9GO0U4BxoAAAAAElFTkSuQmCC", "prompt":"What is this?"}' -H 'Content-Type: application/json'
# curl with a prompt only (no image)
http_proxy="" curl http://localhost:9399/v1/lvm -XPOST -d '{"image": "", "prompt":"What is deep learning?"}' -H 'Content-Type: application/json'
# python
python check_lvm.py
# Test
python check_llava_server.py
```

View File

@@ -9,10 +9,10 @@ ENV LANG=en_US.UTF-8
COPY comps /home/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/comps/lvms/predictionguard/requirements.txt
pip install --no-cache-dir -r /home/comps/lvms/src/integrations/dependency/predictionguard/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home
WORKDIR /home/comps/lvms/predictionguard
WORKDIR /home/comps/lvms/src/integrations/dependency/predictionguard
ENTRYPOINT ["python", "lvm.py"]

View File

@@ -32,7 +32,7 @@ export PREDICTIONGUARD_API_KEY=${your_predictionguard_api_key}
```bash
cd ../../..
docker build -t opea/lvm-predictionguard:latest -f comps/lvms/predictionguard/Dockerfile .
docker build -t opea/lvm-predictionguard:latest -f comps/lvms/src/integrations/dependency/predictionguard/Dockerfile .
```
### 2.2 Start Service

View File

@@ -17,10 +17,10 @@ RUN mkdir /home/user/model && chown user:user -R /home/user/model
USER user
COPY --chown=user:user comps /home/user/comps
WORKDIR /home/user/comps/lvms/video-llama/dependency
WORKDIR /home/user/comps/lvms/src/integrations/dependency/video-llama/
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/comps/lvms/video-llama/dependency/requirements.txt
pip install --no-cache-dir -r /home/user/comps/lvms/src/integrations/dependency/video-llama/requirements.txt
ARG VIDEO_LLAMA_REPO=https://github.com/DAMO-NLP-SG/Video-LLaMA.git
ARG VIDEO_LLAMA_COMMIT=0adb19e

View File

@@ -9,9 +9,9 @@ This is a Docker-based microservice that runs Video-Llama as a Large Vision Mode
```bash
cd GenAIComps
# Video-Llama Server Image
docker build --no-cache -t opea/video-llama-lvm-server:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/video-llama/dependency/Dockerfile .
docker build --no-cache -t opea/lvm-video-llama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/src/integrations/dependency/video-llama/Dockerfile .
# LVM Service Image
docker build --no-cache -t opea/lvm-video-llama:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/video-llama/Dockerfile .
docker build --no-cache -t opea/lvm:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/lvms/src/integrations/dependency/video-llama/Dockerfile .
```
### 1.2 Start Video-Llama and LVM Services
@@ -24,9 +24,9 @@ export ip_address=$(hostname -I | awk '{print $1}')
export no_proxy=$no_proxy,${ip_address}
export LVM_ENDPOINT=http://${ip_address}:9009
# Start service
docker compose -f comps/lvms/video-llama/docker_compose.yaml up -d
docker compose -f comps/lvms/src/integrations/dependency/video-llama/docker_compose.yaml up -d
# it should take about 1.5 hours for the model to download in the video-llama server, assuming a maximum download speed of 100 Mbps
until docker logs video-llama-lvm-server 2>&1 | grep -q "Uvicorn running on"; do
until docker logs lvm-video-llama 2>&1 | grep -q "Uvicorn running on"; do
sleep 5m
done
```
@@ -34,7 +34,7 @@ done
If you've run the microservice before, it's recommended to keep the downloaded model so it won't be redownloaded each time you run it. To achieve this, you need to modify the following configuration:
```yaml
# comps/lvms/video-llama/docker_compose.yaml
# comps/lvms/src/integrations/dependency/video-llama/docker_compose.yaml
services:
lvm-video-llama:
...
@@ -49,13 +49,6 @@ services:
export ip_address=$(hostname -I | awk '{print $1}')
## check video-llama
http_proxy="" curl -X POST "http://${ip_address}:9009/generate?video_url=https%3A%2F%2Fgithub.com%2FDAMO-NLP-SG%2FVideo-LLaMA%2Fraw%2Fmain%2Fexamples%2Fsilence_girl.mp4&start=0.0&duration=9&prompt=What%20is%20the%20person%20doing%3F&max_new_tokens=150" -H "accept: */*" -d ''
## check lvm
http_proxy="" curl -X POST http://${ip_address}:9000/v1/lvm -d '{"video_url":"https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4","chunk_start": 0,"chunk_duration": 9,"prompt":"What is the person doing?","max_new_tokens": 150}' -H 'Content-Type: application/json'
# or use python
export ip_address=$(hostname -I | awk '{print $1}')
python comps/lvms/video-llama/check_lvm.py
```
## ♻️ 3. Clean

View File

@@ -4,8 +4,8 @@
version: "3"
services:
lvm-video-llama:
image: opea/video-llama-lvm-server:latest
container_name: video-llama-lvm-server
image: opea/lvm-video-llama:latest
container_name: lvm-video-llama
ports:
- "9009:9009"
ipc: host

View File

@@ -29,7 +29,7 @@ timm
torch==1.13.1 --index-url https://download.pytorch.org/whl/cpu
torchaudio==0.13.1 --index-url https://download.pytorch.org/whl/cpu
torchvision==0.14.1 --index-url https://download.pytorch.org/whl/cpu
transformers
transformers==4.47.1
uvicorn
validators
webdataset

View File

@@ -37,7 +37,7 @@ context_db = None
streamer = None
chat = None
VIDEO_DIR = "/home/user/comps/lvms/video-llama/dependency/data"
VIDEO_DIR = "/home/user/comps/lvms/src/integrations/dependency/video-llama/data"
CFG_PATH = "video_llama_config/video_llama_eval_only_vl.yaml"
MODEL_TYPE = "llama_v2"

View File

@@ -0,0 +1,60 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import json
import os
from typing import Union
import requests
from comps import CustomLogger, LVMDoc, OpeaComponent, OpeaComponentRegistry, ServiceType, TextDoc
logger = CustomLogger("opea_llama_vision")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEA_LLAVA_VISION_LVM")
class OpeaLlamaVisionLvm(OpeaComponent):
"""A specialized LVM component derived from OpeaComponent for LLaMA-Vision services."""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LVM.name.lower(), description, config)
self.base_url = os.getenv("LVM_ENDPOINT", "http://localhost:9399")
health_status = self.check_health()
if not health_status:
logger.error("specialized health check failed.")
async def invoke(
self,
request: Union[LVMDoc],
) -> Union[TextDoc]:
"""Involve the LVM service to generate answer for the provided input."""
if logflag:
logger.info(request)
inputs = {"image": request.image, "prompt": request.prompt, "max_new_tokens": request.max_new_tokens}
# forward to the LLaMA Vision server
response = requests.post(url=f"{self.base_url}/v1/lvm", data=json.dumps(inputs), proxies={"http": None})
result = response.json()["text"]
if logflag:
logger.info(result)
return TextDoc(text=result)
def check_health(self) -> bool:
"""Checks the health of the embedding service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.base_url}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
# Handle connection errors, timeouts, etc.
logger.error(f"Health check failed: {e}")
return False
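As a usage illustration (not part of the diff), a registered component like the one above can be loaded and invoked through the loader introduced later in this commit; a minimal sketch, assuming `LVM_ENDPOINT` points at a running LLaMA-Vision dependency service:

```python
import asyncio

from comps import LVMDoc, OpeaComponentLoader

# Load the component registered above by its registry key and send it one request.
# The image string is a placeholder; pass real base64-encoded image data.
loader = OpeaComponentLoader(
    "OPEA_LLAVA_VISION_LVM", description="OPEA LVM Component: OPEA_LLAVA_VISION_LVM"
)
doc = LVMDoc(image="<base64-encoded image>", prompt="What is this?", max_new_tokens=32)
result = asyncio.run(loader.invoke(doc))
print(result.text)
```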

View File

@@ -0,0 +1,130 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import json
import os
from typing import Union
import requests
from fastapi import HTTPException
from langchain_core.prompts import PromptTemplate
from comps import (
CustomLogger,
LVMDoc,
LVMSearchedMultimodalDoc,
MetadataTextDoc,
OpeaComponent,
OpeaComponentRegistry,
ServiceType,
TextDoc,
)
logger = CustomLogger("opea_llava")
logflag = os.getenv("LOGFLAG", False)
class ChatTemplate:
@staticmethod
def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_image: bool = False):
if has_image:
template = """The transcript associated with the image is '{context}'. {question}"""
else:
template = (
"""Refer to the following results obtained from the local knowledge base: '{context}'. {question}"""
)
return template.format(context=context, question=question)
@OpeaComponentRegistry.register("OPEA_LLAVA_LVM")
class OpeaLlavaLvm(OpeaComponent):
"""A specialized LVM component derived from OpeaComponent for LLaVA LVM services."""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LVM.name.lower(), description, config)
self.base_url = os.getenv("LVM_ENDPOINT", "http://localhost:8399")
health_status = self.check_health()
if not health_status:
logger.error("OpeaLlavaLvm health check failed.")
async def invoke(
self,
request: Union[LVMDoc, LVMSearchedMultimodalDoc],
) -> Union[TextDoc, MetadataTextDoc]:
"""Involve the LVM service to generate answer for the provided input."""
if logflag:
logger.info(request)
if isinstance(request, LVMSearchedMultimodalDoc):
if logflag:
logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
retrieved_metadatas = request.metadata
if retrieved_metadatas is None or len(retrieved_metadatas) == 0:
# No video segments were retrieved for the query.
# Raise an HTTPException because llava-tgi-gaudi should receive an image as input;
# otherwise, the generated text is bad.
raise HTTPException(status_code=500, detail="There is no video segments retrieved given the query!")
img_b64_str = retrieved_metadatas[0]["b64_img_str"]
has_image = img_b64_str != ""
initial_query = request.initial_query
context = retrieved_metadatas[0]["transcript_for_inference"]
prompt = initial_query
if request.chat_template is None:
prompt = ChatTemplate.generate_multimodal_rag_on_videos_prompt(initial_query, context, has_image)
else:
prompt_template = PromptTemplate.from_template(request.chat_template)
input_variables = prompt_template.input_variables
if sorted(input_variables) == ["context", "question"]:
prompt = prompt_template.format(question=initial_query, context=context)
else:
logger.info(
f"[ LVMSearchedMultimodalDoc ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
)
max_new_tokens = request.max_new_tokens
if logflag:
logger.info(
f"prompt generated for [LVMSearchedMultimodalDoc ] input from retriever microservice: {prompt}"
)
else:
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
inputs = {"img_b64_str": img_b64_str, "prompt": prompt, "max_new_tokens": max_new_tokens}
# forward to the LLaVA server
response = requests.post(url=f"{self.base_url}/generate", data=json.dumps(inputs), proxies={"http": None})
result = response.json()["text"]
if logflag:
logger.info(result)
if isinstance(request, LVMSearchedMultimodalDoc):
retrieved_metadata = request.metadata[0]
return_metadata = {} # this metadata will be used to construct proof for generated text
return_metadata["video_id"] = retrieved_metadata["video_id"]
return_metadata["source_video"] = retrieved_metadata["source_video"]
return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
return_metadata["transcript_for_inference"] = retrieved_metadata["transcript_for_inference"]
return MetadataTextDoc(text=result, metadata=return_metadata)
else:
return TextDoc(text=result)
def check_health(self) -> bool:
"""Checks the health of the embedding service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.base_url}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
# Handle connection errors, timeouts, etc.
logger.error(f"Health check failed: {e}")
return False

View File

@@ -0,0 +1,60 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import json
import os
from typing import Union
import requests
from comps import CustomLogger, LVMDoc, OpeaComponent, OpeaComponentRegistry, ServiceType, TextDoc
logger = CustomLogger("opea_predictionguard")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEA_PREDICTION_GUARD_LVM")
class OpeaPredictionguardLvm(OpeaComponent):
"""A specialized LVM component derived from OpeaComponent for Predictionguard services."""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LVM.name.lower(), description, config)
self.base_url = os.getenv("LVM_ENDPOINT", "http://localhost:9399")
health_status = self.check_health()
if not health_status:
logger.error("OpeaPredictionguardLvm health check failed.")
async def invoke(
self,
request: Union[LVMDoc],
) -> Union[TextDoc]:
"""Involve the LVM service to generate answer for the provided input."""
if logflag:
logger.info(request)
inputs = {"image": request.image, "prompt": request.prompt, "max_new_tokens": request.max_new_tokens}
# forward to the PredictionGuard server
response = requests.post(url=f"{self.base_url}/v1/lvm", data=json.dumps(inputs), proxies={"http": None})
result = response.json()["text"]
if logflag:
logger.info(result)
return TextDoc(text=result)
def check_health(self) -> bool:
"""Checks the health of the embedding service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.base_url}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
# Handle connection errors, timeouts, etc.
logger.error(f"Health check failed: {e}")
return False

View File

@@ -0,0 +1,187 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from typing import Union
import requests
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from huggingface_hub import AsyncInferenceClient
from langchain_core.prompts import PromptTemplate
from comps import (
CustomLogger,
LVMDoc,
LVMSearchedMultimodalDoc,
MetadataTextDoc,
OpeaComponent,
OpeaComponentRegistry,
ServiceType,
TextDoc,
statistics_dict,
)
logger = CustomLogger("opea_tgi_llava")
logflag = os.getenv("LOGFLAG", False)
class ChatTemplate:
@staticmethod
def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_image: bool = False):
if has_image:
template = """The transcript associated with the image is '{context}'. {question}"""
else:
template = (
"""Refer to the following results obtained from the local knowledge base: '{context}'. {question}"""
)
return template.format(context=context, question=question)
@OpeaComponentRegistry.register("OPEA_TGI_LLAVA_LVM")
class OpeaTgiLlavaLvm(OpeaComponent):
"""A specialized TGI LVM component derived from OpeaComponent for LLaVA LVM services."""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LVM.name.lower(), description, config)
self.base_url = os.getenv("LVM_ENDPOINT", "http://localhost:8399")
self.lvm_client = AsyncInferenceClient(self.base_url)
health_status = self.check_health()
if not health_status:
logger.error("OpeaTgiLlavaLvm health check failed.")
async def invoke(
self,
request: Union[LVMDoc, LVMSearchedMultimodalDoc],
) -> Union[TextDoc, MetadataTextDoc]:
"""Involve the LVM service to generate answer for the provided input."""
if logflag:
logger.info(request)
if isinstance(request, LVMSearchedMultimodalDoc):
if logflag:
logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
retrieved_metadatas = request.metadata
if retrieved_metadatas is None or len(retrieved_metadatas) == 0:
# No video segments were retrieved for the query.
# Raise an HTTPException because llava-tgi-gaudi should receive an image as input;
# otherwise, the generated text is bad.
raise HTTPException(status_code=500, detail="There is no video segments retrieved given the query!")
img_b64_str = retrieved_metadatas[0]["b64_img_str"]
has_image = img_b64_str != ""
initial_query = request.initial_query
context = retrieved_metadatas[0]["transcript_for_inference"]
prompt = initial_query
if request.chat_template is None:
prompt = ChatTemplate.generate_multimodal_rag_on_videos_prompt(initial_query, context, has_image)
else:
prompt_template = PromptTemplate.from_template(request.chat_template)
input_variables = prompt_template.input_variables
if sorted(input_variables) == ["context", "question"]:
prompt = prompt_template.format(question=initial_query, context=context)
else:
logger.info(
f"[ LVMSearchedMultimodalDoc ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
)
max_new_tokens = request.max_new_tokens
stream = request.stream
repetition_penalty = request.repetition_penalty
temperature = request.temperature
top_k = request.top_k
top_p = request.top_p
if logflag:
logger.info(
f"prompt generated for [LVMSearchedMultimodalDoc ] input from retriever microservice: {prompt}"
)
else:
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
stream = request.stream
repetition_penalty = request.repetition_penalty
temperature = request.temperature
top_k = request.top_k
top_p = request.top_p
if not img_b64_str:
# Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image.
# Provide an image and then instruct the model to ignore the image. The base64 string below is the encoded png:
# https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png
img_b64_str = "iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"
prompt = f"Please disregard the image and answer the question. {prompt}"
image = f"data:image/png;base64,{img_b64_str}"
image_prompt = f"![]({image})\n{prompt}\nASSISTANT:"
if stream:
t_start = time.time()
async def stream_generator(time_start):
first_token_latency = None
chat_response = ""
text_generation = await self.lvm_client.text_generation(
prompt=image_prompt,
stream=stream,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
temperature=temperature,
top_k=top_k,
top_p=top_p,
)
async for text in text_generation:
if first_token_latency is None:
first_token_latency = time.time() - time_start
chat_response += text
chunk_repr = repr(text.encode("utf-8"))
if logflag:
logger.info(f"[llm - chat_stream] chunk:{chunk_repr}")
yield f"data: {chunk_repr}\n\n"
if logflag:
logger.info(f"[llm - chat_stream] stream response: {chat_response}")
statistics_dict["opea_service@lvm"].append_latency(time.time() - time_start, first_token_latency)
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(t_start), media_type="text/event-stream")
else:
generated_str = await self.lvm_client.text_generation(
image_prompt,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
temperature=temperature,
top_k=top_k,
top_p=top_p,
)
if logflag:
logger.info(generated_str)
if isinstance(request, LVMSearchedMultimodalDoc):
retrieved_metadata = request.metadata[0]
return_metadata = {} # this metadata will be used to construct proof for generated text
return_metadata["video_id"] = retrieved_metadata["video_id"]
return_metadata["source_video"] = retrieved_metadata["source_video"]
return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
return_metadata["transcript_for_inference"] = retrieved_metadata["transcript_for_inference"]
return MetadataTextDoc(text=generated_str, metadata=return_metadata)
else:
return TextDoc(text=generated_str)
def check_health(self) -> bool:
"""Checks the health of the embedding service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.base_url}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
# Handle connection errors, timeouts, etc.
logger.error(f"Health check failed: {e}")
return False
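For context, the `stream=True` path above yields Server-Sent-Events-style chunks (`data: ...` lines terminated by `data: [DONE]`); a minimal client-side sketch, assuming the unified microservice is exposed on `localhost:9399` and that the request schema exposes the `stream` field read above:

```python
import json

import requests

payload = {"image": "", "prompt": "What is deep learning?", "max_new_tokens": 64, "stream": True}
with requests.post(
    "http://localhost:9399/v1/lvm",
    data=json.dumps(payload),
    headers={"Content-Type": "application/json"},
    stream=True,
) as resp:
    for line in resp.iter_lines(decode_unicode=True):
        if not line:
            continue
        chunk = line.removeprefix("data: ")  # strip the SSE prefix
        if chunk == "[DONE]":
            break
        print(chunk, end="", flush=True)
```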

View File

@@ -0,0 +1,100 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from typing import Union
import requests
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from comps import CustomLogger, LVMVideoDoc, OpeaComponent, OpeaComponentRegistry, ServiceType, statistics_dict
logger = CustomLogger("opea_video_llama")
logflag = os.getenv("LOGFLAG", False)
@OpeaComponentRegistry.register("OPEA_VIDEO_LLAMA_LVM")
class OpeaVideoLlamaLvm(OpeaComponent):
"""A specialized LVM component derived from OpeaComponent for Video-LLaMA services."""
def __init__(self, name: str, description: str, config: dict = None):
super().__init__(name, ServiceType.LVM.name.lower(), description, config)
self.base_url = os.getenv("LVM_ENDPOINT", "http://localhost:9099")
health_status = self.check_health()
if not health_status:
logger.error("OpeaVideoLlamaLvm health check failed.")
async def invoke(
self,
request: Union[LVMVideoDoc],
) -> Union[StreamingResponse]:
"""Involve the LVM service to generate answer for the provided request.
Parameters:
request (LVMVideoDoc): The request containing the video URL, start time, duration, prompt, and maximum new tokens.
Returns:
StreamingResponse: A streaming response containing the generated text in text/event-stream format, or a JSON error response if the upstream API responds with an error.
"""
if logflag:
logger.info("[lvm] Received input")
logger.info(request)
video_url = request.video_url
chunk_start = request.chunk_start
chunk_duration = request.chunk_duration
prompt = request.prompt
max_new_tokens = request.max_new_tokens
params = {
"video_url": video_url,
"start": chunk_start,
"duration": chunk_duration,
"prompt": prompt,
"max_new_tokens": max_new_tokens,
}
logger.info(f"[lvm] Params: {params}")
t_start = time.time()
response = requests.post(url=f"{self.base_url}/generate", params=params, proxies={"http": None}, stream=True)
logger.info(f"[lvm] Response status code: {response.status_code}")
if response.status_code == 200:
def streamer(time_start):
first_token_latency = None
yield f"{{'video_url': '{video_url}', 'chunk_start': {chunk_start}, 'chunk_duration': {chunk_duration}}}\n".encode(
"utf-8"
)
for chunk in response.iter_content(chunk_size=8192):
if chunk:
if first_token_latency is None:
first_token_latency = time.time() - time_start
yield chunk
logger.info(f"[lvm - chat_stream] Streaming chunk of size {len(chunk)}")
logger.info("[lvm - chat_stream] stream response finished")
statistics_dict["opea_service@lvm"].append_latency(time.time() - time_start, first_token_latency)
return StreamingResponse(streamer(t_start), media_type="text/event-stream")
else:
logger.error(f"[lvm] Error: {response.text}")
raise HTTPException(status_code=500, detail="The upstream API responded with an error.")
def check_health(self) -> bool:
"""Checks the health of the embedding service.
Returns:
bool: True if the service is reachable and healthy, False otherwise.
"""
try:
response = requests.get(f"{self.base_url}/health")
if response.status_code == 200:
return True
else:
return False
except Exception as e:
# Handle connection errors, timeouts, etc.
logger.error(f"Health check failed: {e}")
return False

View File

@@ -0,0 +1,72 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from typing import Union
from integrations.opea_llama_vision import OpeaLlamaVisionLvm
from integrations.opea_llava import OpeaLlavaLvm
from integrations.opea_predictionguard import OpeaPredictionguardLvm
from integrations.opea_tgi_llava import OpeaTgiLlavaLvm
from integrations.opea_video_llama import OpeaVideoLlamaLvm
from comps import (
CustomLogger,
LVMDoc,
LVMSearchedMultimodalDoc,
LVMVideoDoc,
MetadataTextDoc,
OpeaComponentLoader,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("opea_lvm_microservice")
logflag = os.getenv("LOGFLAG", False)
lvm_component_name = os.getenv("LVM_COMPONENT_NAME", "OPEA_LLAVA_LVM")
# Initialize the OpeaComponentLoader with the selected LVM integration
loader = OpeaComponentLoader(lvm_component_name, description=f"OPEA LVM Component: {lvm_component_name}")
@register_microservice(
name="opea_service@lvm",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9399,
)
@register_statistics(names=["opea_service@lvm"])
async def lvm(
request: Union[LVMDoc, LVMSearchedMultimodalDoc, LVMVideoDoc]
) -> Union[TextDoc, MetadataTextDoc]: # can also return a StreamingResponse but omit it in annotation for FastAPI
start = time.time()
try:
# Use the controller to invoke the active component
lvm_response = await loader.invoke(request)
if logflag:
logger.info(lvm_response)
if loader.component.name in ["OpeaVideoLlamaLvm"] or (
loader.component.name in ["OpeaTgiLlavaLvm"] and request.streaming
):
# statistics for StreamingResponse are handled inside the integrations
# here directly return the response
return lvm_response
statistics_dict["opea_service@lvm"].append_latency(time.time() - start, None)
return lvm_response
except Exception as e:
logger.error(f"Error during lvm invocation: {e}")
raise
if __name__ == "__main__":
logger.info("OPEA LVM Microservice is starting....")
opea_microservices["opea_service@lvm"].start()

View File

@@ -1,25 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
# Set environment variables
ENV LANG=en_US.UTF-8
ARG ARCH="cpu"
COPY comps /home/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
if [ ${ARCH} = "cpu" ]; then \
pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /home/comps/lvms/tgi-llava/requirements.txt; \
else \
pip install --no-cache-dir -r /home/comps/lvms/tgi-llava/requirements.txt; \
fi;
ENV PYTHONPATH=$PYTHONPATH:/home
WORKDIR /home/comps/lvms/tgi-llava
ENTRYPOINT ["python", "lvm_tgi.py"]

View File

@@ -1,2 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

View File

@@ -1,155 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import os
import time
from typing import Union
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from huggingface_hub import AsyncInferenceClient
from langchain_core.prompts import PromptTemplate
from template import ChatTemplate
from comps import (
CustomLogger,
LVMDoc,
LVMSearchedMultimodalDoc,
MetadataTextDoc,
ServiceType,
TextDoc,
opea_microservices,
register_microservice,
register_statistics,
statistics_dict,
)
logger = CustomLogger("lvm_tgi")
logflag = os.getenv("LOGFLAG", False)
@register_microservice(
name="opea_service@lvm_tgi",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9399,
input_datatype=LVMDoc,
output_datatype=TextDoc,
)
@register_statistics(names=["opea_service@lvm_tgi"])
async def lvm(request: Union[LVMDoc, LVMSearchedMultimodalDoc]) -> Union[TextDoc, MetadataTextDoc]:
if logflag:
logger.info(request)
start = time.time()
stream_gen_time = []
if isinstance(request, LVMSearchedMultimodalDoc):
if logflag:
logger.info("[LVMSearchedMultimodalDoc ] input from retriever microservice")
retrieved_metadatas = request.metadata
if not retrieved_metadatas or len(retrieved_metadatas) == 0:
# No video segments were retrieved for the query.
# Raise an HTTPException because llava-tgi-gaudi should receive an image as input;
# otherwise, the generated text is bad.
raise HTTPException(status_code=500, detail="There is no video segments retrieved given the query!")
img_b64_str = retrieved_metadatas[0]["b64_img_str"]
has_image = img_b64_str != ""
initial_query = request.initial_query
context = retrieved_metadatas[0]["transcript_for_inference"]
prompt = initial_query
if request.chat_template is None:
prompt = ChatTemplate.generate_multimodal_rag_on_videos_prompt(initial_query, context, has_image)
else:
prompt_template = PromptTemplate.from_template(request.chat_template)
input_variables = prompt_template.input_variables
if sorted(input_variables) == ["context", "question"]:
prompt = prompt_template.format(question=initial_query, context=context)
else:
logger.info(
f"[ LVMSearchedMultimodalDoc ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
)
max_new_tokens = request.max_new_tokens
stream = request.stream
repetition_penalty = request.repetition_penalty
temperature = request.temperature
top_k = request.top_k
top_p = request.top_p
if logflag:
logger.info(f"prompt generated for [LVMSearchedMultimodalDoc ] input from retriever microservice: {prompt}")
else:
img_b64_str = request.image
prompt = request.prompt
max_new_tokens = request.max_new_tokens
stream = request.stream
repetition_penalty = request.repetition_penalty
temperature = request.temperature
top_k = request.top_k
top_p = request.top_p
if not img_b64_str:
# Work around an issue where LLaVA-NeXT is not providing good responses when prompted without an image.
# Provide an image and then instruct the model to ignore the image. The base64 string below is the encoded png:
# https://raw.githubusercontent.com/opea-project/GenAIExamples/refs/tags/v1.0/AudioQnA/ui/svelte/src/lib/assets/icons/png/audio1.png
img_b64_str = "iVBORw0KGgoAAAANSUhEUgAAADUAAAAlCAYAAADiMKHrAAAACXBIWXMAAAsTAAALEwEAmpwYAAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAKPSURBVHgB7Zl/btowFMefnUTqf+MAHYMTjN4gvcGOABpM+8E0doLSE4xpsE3rKuAG3KC5Ad0J6MYOkP07YnvvhR9y0lVzupTIVT5SwDjB9fd97WfsMkCef1rUXM8dY9HHK4hWUevzi/oVWAqnF8fzLmAtiPA3Aq0lFsVA1fRKxlgNLIbDPaQUZQuu6YO98aIipHOiFGtIqaYfn1UnUCDds6WPyeANlTFbv9WztbFTK+HNUVAPiz7nbPzq7HsPCoKWIBREGfsJXZit5xT07X0jp6iRdIbEHOnjyyD97OvzH00lVS2K5OS2ax11cBXxJgYxlEIE6XZclzdTX6n8XjkkcEIfbj2nMO0/SNd1vy4vsCNjYPyEovfyy88GZIQCSKOCMf6ORgStoboLJuSWKDYCfK2q4jjrMZ+GOh7Pib/gek5DHxVUJtcgA7mJ4kwZRbN7viQXFzQn0Nl52gXG4Fo7DKAYp0yI3VHQ16oaWV0wYa+iGE8nG+wAdx5DzpS/KGyhFGULpShbKEXZQinqLlBK/IKc2asoh4sZvoXJWhlAzuxV1KBVD3HrfYTFAK8ZHgu0hu36DHLG+Izinw250WUkXHJht02QUnxLP7fZxR7f1I6S7Ir2GgmYvIQM5OYUuYBdainATq2ZjTqPBlnbGXYeBrg9Od18DKmc1U0jpw4OIIwEJFxQSl2b4MN2lf74fw8nFNbHt/5N9xWKTZvJ2S6YZk6RC3j2cKpVhSIShZ0mea6caCOCAjyNHd5gPPxGncMBTvI6hunYdaJ6kf8VoSCP2odxX6RkR6NOtanfj13EswKVqEQrPzzFL1lK+YvCFraiEqs8TrwQLGYraqpX4kr/Hixml+63Z+CoM9DTo438AUmP+KyMWT+tAAAAAElFTkSuQmCC"
prompt = f"Please disregard the image and answer the question. {prompt}"
image = f"data:image/png;base64,{img_b64_str}"
image_prompt = f"![]({image})\n{prompt}\nASSISTANT:"
if stream:
async def stream_generator():
chat_response = ""
text_generation = await lvm_client.text_generation(
prompt=image_prompt,
stream=stream,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
temperature=temperature,
top_k=top_k,
top_p=top_p,
)
async for text in text_generation:
stream_gen_time.append(time.time() - start)
chat_response += text
chunk_repr = repr(text.encode("utf-8"))
if logflag:
logger.info(f"[llm - chat_stream] chunk:{chunk_repr}")
yield f"data: {chunk_repr}\n\n"
if logflag:
logger.info(f"[llm - chat_stream] stream response: {chat_response}")
statistics_dict["opea_service@lvm_tgi"].append_latency(stream_gen_time[-1], stream_gen_time[0])
yield "data: [DONE]\n\n"
return StreamingResponse(stream_generator(), media_type="text/event-stream")
else:
generated_str = await lvm_client.text_generation(
image_prompt,
max_new_tokens=max_new_tokens,
repetition_penalty=repetition_penalty,
temperature=temperature,
top_k=top_k,
top_p=top_p,
)
statistics_dict["opea_service@lvm_tgi"].append_latency(time.time() - start, None)
if logflag:
logger.info(generated_str)
if isinstance(request, LVMSearchedMultimodalDoc):
retrieved_metadata = request.metadata[0]
return_metadata = {} # this metadata will be used to construct proof for generated text
return_metadata["video_id"] = retrieved_metadata["video_id"]
return_metadata["source_video"] = retrieved_metadata["source_video"]
return_metadata["time_of_frame_ms"] = retrieved_metadata["time_of_frame_ms"]
return_metadata["transcript_for_inference"] = retrieved_metadata["transcript_for_inference"]
return MetadataTextDoc(text=generated_str, metadata=return_metadata)
else:
return TextDoc(text=generated_str)
if __name__ == "__main__":
lvm_endpoint = os.getenv("LVM_ENDPOINT", "http://localhost:8399")
lvm_client = AsyncInferenceClient(lvm_endpoint)
logger.info("[LVM] LVM initialized.")
opea_microservices["opea_service@lvm_tgi"].start()

View File

@@ -1,17 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
class ChatTemplate:
@staticmethod
def generate_multimodal_rag_on_videos_prompt(question: str, context: str, has_image: bool = False):
if has_image:
template = """The transcript associated with the image is '{context}'. {question}"""
else:
template = (
"""Refer to the following results obtained from the local knowledge base: '{context}'. {question}"""
)
return template.format(context=context, question=question)

View File

@@ -1,18 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
# Set environment variables
ENV LANG=en_US.UTF-8
COPY comps /home/comps
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/comps/lvms/video-llama/requirements.txt
ENV PYTHONPATH=$PYTHONPATH:/home
WORKDIR /home/comps/lvms/video-llama
ENTRYPOINT ["python", "lvm.py"]

View File

@@ -1,50 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import datetime
import json
import os
import requests
ip_address = os.getenv("ip_address")
####### video-llama request ########
print("video-llama request")
api_url = f"http://${ip_address}:9009/generate"
content = {
"video_url": "https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4",
"start": 0.0,
"duration": 9,
"prompt": "What is the person doing?",
"max_new_tokens": 150,
}
start = datetime.datetime.now()
with requests.post(api_url, params=content, stream=True) as response:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
print(chunk.decode("utf-8"), end="", flush=True) # Flush to ensure immediate output
end = datetime.datetime.now()
print(f"\nTotal time: {end - start}")
####### lvm request ########
print("lvm request")
api_url = f"http://${ip_address}:9000/v1/lvm"
headers = {"Content-Type": "application/json"}
data = {
"video_url": "https://github.com/DAMO-NLP-SG/Video-LLaMA/raw/main/examples/silence_girl.mp4",
"chunk_start": 0,
"chunk_duration": 9,
"prompt": "what is the person doing",
"max_new_tokens": 150,
}
start = datetime.datetime.now()
with requests.post(api_url, headers=headers, data=json.dumps(data), stream=True) as response:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
print(chunk.decode("utf-8"), end="", flush=True) # Flush to ensure immediate output
end = datetime.datetime.now()
print(f"\nTotal time: {end - start}")

View File

@@ -1,40 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
version: "3"
services:
lvm-video-llama:
image: opea/video-llama-lvm-server:latest
container_name: video-llama-lvm-server
ports:
- "9009:9009"
ipc: host
environment:
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
no_proxy: ${no_proxy}
llm_download: "True"
volumes:
- "/home/$USER/.cache:/home/user/.cache" # RECOMMENDED: use local cache to avoid download
- video-llama-model:/home/user/model
restart: unless-stopped
lvm:
image: opea/lvm-video-llama:latest
container_name: lvm-video-llama
ports:
- "9000:9000"
ipc: host
environment:
http_proxy: ${http_proxy}
https_proxy: ${https_proxy}
no_proxy: ${no_proxy}
LVM_ENDPOINT: ${LVM_ENDPOINT}
restart: unless-stopped
depends_on:
- lvm-video-llama
networks:
default:
driver: bridge
volumes:
video-llama-model:

View File

@@ -1,80 +0,0 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
# import json
import logging
import os
import requests
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from comps import LVMVideoDoc, ServiceType, opea_microservices, register_microservice, register_statistics
# import time
logging.basicConfig(level=logging.INFO)
@register_microservice(
name="opea_service@lvm",
service_type=ServiceType.LVM,
endpoint="/v1/lvm",
host="0.0.0.0",
port=9000,
input_datatype=LVMVideoDoc,
output_datatype=StreamingResponse,
)
@register_statistics(names=["opea_service@lvm"])
async def lvm(input: LVMVideoDoc):
"""This function handles the LVM microservice, which generates text based on a video URL, start time, duration, prompt, and maximum new tokens.
Parameters:
input (LVMVideoDoc): The input containing the video URL, start time, duration, prompt, and maximum new tokens.
Returns:
StreamingResponse: A streaming response containing the generated text in text/event-stream format, or a JSON error response if the upstream API responds with an error.
"""
logging.info("[lvm] Received input")
video_url = input.video_url
chunk_start = input.chunk_start
chunk_duration = input.chunk_duration
prompt = input.prompt
max_new_tokens = input.max_new_tokens
params = {
"video_url": video_url,
"start": chunk_start,
"duration": chunk_duration,
"prompt": prompt,
"max_new_tokens": max_new_tokens,
}
logging.info(f"[lvm] Params: {params}")
response = requests.post(url=f"{lvm_endpoint}/generate", params=params, proxies={"http": None}, stream=True)
logging.info(f"[lvm] Response status code: {response.status_code}")
if response.status_code == 200:
def streamer():
yield f"{{'video_url': '{video_url}', 'chunk_start': {chunk_start}, 'chunk_duration': {chunk_duration}}}\n".encode(
"utf-8"
)
for chunk in response.iter_content(chunk_size=8192):
if chunk:
yield chunk
logging.info(f"[llm - chat_stream] Streaming: {chunk}")
logging.info("[llm - chat_stream] stream response finished")
return StreamingResponse(streamer(), media_type="text/event-stream")
else:
logging.error(f"[lvm] Error: {response.text}")
raise HTTPException(status_code=500, detail="The upstream API responded with an error.")
if __name__ == "__main__":
lvm_endpoint = os.getenv("LVM_ENDPOINT")
opea_microservices["opea_service@lvm"].start()

View File

@@ -1,11 +0,0 @@
datasets
docarray
fastapi
opentelemetry-api
opentelemetry-exporter-otlp
opentelemetry-sdk
Pillow
prometheus-fastapi-instrumentator
pydub
shortuuid
uvicorn