Add MultimodalQnA as MMRAG use case in Example (#751)

Signed-off-by: Tiep Le <tiep.le@intel.com>
Signed-off-by: siddhivelankar23 <siddhi.velankar@intel.com>
Signed-off-by: sjagtap1803 <siddhant.jagtap@intel.com>
Author: Tiep Le
Date: 2024-09-14 01:55:29 -07:00
Committed by: GitHub
Parent: 06696c8e58
Commit: b6cce35a93
21 changed files with 2558 additions and 0 deletions

View File: Dockerfile

@@ -0,0 +1,35 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
FROM python:3.11-slim
ENV LANG=C.UTF-8
ARG ARCH="cpu"
RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
build-essential \
libgl1-mesa-glx \
libjemalloc-dev \
default-jre \
wget \
vim
# Install ffmpeg static build
WORKDIR /root
RUN wget https://johnvansickle.com/ffmpeg/builds/ffmpeg-git-amd64-static.tar.xz && \
mkdir ffmpeg-git-amd64-static && tar -xvf ffmpeg-git-amd64-static.tar.xz -C ffmpeg-git-amd64-static --strip-components 1 && \
export PATH=/root/ffmpeg-git-amd64-static:$PATH && \
cp /root/ffmpeg-git-amd64-static/ffmpeg /usr/local/bin/ && \
cp /root/ffmpeg-git-amd64-static/ffprobe /usr/local/bin/
RUN mkdir -p /home/user
COPY gradio /home/user/gradio
RUN pip install --no-cache-dir --upgrade pip setuptools && \
pip install --no-cache-dir -r /home/user/gradio/requirements.txt
WORKDIR /home/user/gradio
ENTRYPOINT ["python", "multimodalqna_ui_gradio.py"]
# ENTRYPOINT ["/usr/bin/sleep", "infinity"]
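# Example build/run (a sketch; the image tag and host port mapping are
# assumptions, not part of this commit; the UI listens on 5173 by default):
#   docker build -t opea/multimodalqna-ui:latest .
#   docker run -p 5173:5173 \
#     -e BACKEND_SERVICE_ENDPOINT=http://<gateway-host>:8888/v1/multimodalqna \
#     opea/multimodalqna-ui:latest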

View File: gradio/conversation.py

@@ -0,0 +1,155 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import dataclasses
from enum import Enum, auto
from typing import List
from utils import get_b64_frame_from_timestamp
class SeparatorStyle(Enum):
"""Different separator style."""
SINGLE = auto()
@dataclasses.dataclass
class Conversation:
"""A class that keeps all conversation history."""
system: str
roles: List[str]
messages: List[List[str]]
offset: int
sep_style: SeparatorStyle = SeparatorStyle.SINGLE
sep: str = "\n"
video_file: str = None
caption: str = None
time_of_frame_ms: str = None
base64_frame: str = None
skip_next: bool = False
split_video: str = None
def _template_caption(self):
out = ""
if self.caption is not None:
out = f"The caption associated with the image is '{self.caption}'. "
return out
def get_prompt(self):
messages = self.messages
        if len(messages) > 1 and messages[1][1] is None:
            # Need to do RAG; the prompt is the query text only
            ret = messages[0][1]
        else:
            # No need to do RAG; build the prompt in chat-completion format
conv_dict = []
if self.sep_style == SeparatorStyle.SINGLE:
for i, (role, message) in enumerate(messages):
if message:
if i != 0:
dic = {"role": role, "content": message}
else:
dic = {"role": role}
if self.time_of_frame_ms and self.video_file:
content = [{"type": "text", "text": message}]
if self.base64_frame:
base64_frame = self.base64_frame
else:
base64_frame = get_b64_frame_from_timestamp(self.video_file, self.time_of_frame_ms)
self.base64_frame = base64_frame
content.append({"type": "image_url", "image_url": {"url": base64_frame}})
else:
content = message
dic["content"] = content
conv_dict.append(dic)
else:
raise ValueError(f"Invalid style: {self.sep_style}")
ret = conv_dict
return ret
def append_message(self, role, message):
self.messages.append([role, message])
def get_b64_image(self):
b64_img = None
if self.time_of_frame_ms and self.video_file:
time_of_frame_ms = self.time_of_frame_ms
video_file = self.video_file
b64_img = get_b64_frame_from_timestamp(video_file, time_of_frame_ms)
return b64_img
def to_gradio_chatbot(self):
ret = []
for i, (role, msg) in enumerate(self.messages[self.offset :]):
if i % 2 == 0:
if type(msg) is tuple:
import base64
from io import BytesIO
msg, image, image_process_mode = msg
max_hw, min_hw = max(image.size), min(image.size)
aspect_ratio = max_hw / min_hw
max_len, min_len = 800, 400
shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
longest_edge = int(shortest_edge * aspect_ratio)
W, H = image.size
if H > W:
H, W = longest_edge, shortest_edge
else:
H, W = shortest_edge, longest_edge
image = image.resize((W, H))
buffered = BytesIO()
image.save(buffered, format="JPEG")
img_b64_str = base64.b64encode(buffered.getvalue()).decode()
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
msg = img_str + msg.replace("<image>", "").strip()
ret.append([msg, None])
else:
ret.append([msg, None])
else:
ret[-1][-1] = msg
return ret
def copy(self):
return Conversation(
system=self.system,
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
offset=self.offset,
sep_style=self.sep_style,
sep=self.sep,
video_file=self.video_file,
caption=self.caption,
base64_frame=self.base64_frame,
)
def dict(self):
return {
"system": self.system,
"roles": self.roles,
"messages": self.messages,
"offset": self.offset,
"sep": self.sep,
"time_of_frame_ms": self.time_of_frame_ms,
"video_file": self.video_file,
"caption": self.caption,
"base64_frame": self.base64_frame,
"split_video": self.split_video,
}
multimodalqna_conv = Conversation(
system="",
roles=("user", "assistant"),
messages=(),
offset=0,
sep_style=SeparatorStyle.SINGLE,
sep="\n",
video_file=None,
caption=None,
time_of_frame_ms=None,
base64_frame=None,
split_video=None,
)
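A minimal sketch (not shipped with this commit) of the two prompt shapes get_prompt() returns: a bare query string while the last assistant slot is still None (the RAG path), and a chat-completion-style list of role/content dicts once an answer is filled in:

from conversation import multimodalqna_conv

conv = multimodalqna_conv.copy()
conv.append_message(conv.roles[0], "What is shown on the whiteboard?")
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())  # -> "What is shown on the whiteboard?" (query only, RAG path)

conv.messages[-1][-1] = "An architecture diagram."  # fill in the assistant's answer
conv.append_message(conv.roles[0], "Summarize it in one sentence.")
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())  # -> [{"role": "user", ...}, {"role": "assistant", ...}, ...]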

View File: gradio/multimodalqna_ui_gradio.py

@@ -0,0 +1,337 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import argparse
import os
import shutil
import time
from pathlib import Path
import gradio as gr
import requests
import uvicorn
from conversation import multimodalqna_conv
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from utils import build_logger, moderation_msg, server_error_msg, split_video
logger = build_logger("gradio_web_server", "gradio_web_server.log")
headers = {"Content-Type": "application/json"}
css = """
h1 {
text-align: center;
display:block;
}
"""
# create a FastAPI app
app = FastAPI()
cur_dir = os.getcwd()
static_dir = Path(os.path.join(cur_dir, "static/"))
tmp_dir = Path(os.path.join(cur_dir, "split_tmp_videos/"))
Path(static_dir).mkdir(parents=True, exist_ok=True)
app.mount("/static", StaticFiles(directory=static_dir), name="static")
description = "This Space lets you engage with MultimodalQnA on a video through a chat box."
no_change_btn = gr.Button()
enable_btn = gr.Button(interactive=True)
disable_btn = gr.Button(interactive=False)
def clear_history(state, request: gr.Request):
logger.info(f"clear_history. ip: {request.client.host}")
if state.split_video and os.path.exists(state.split_video):
os.remove(state.split_video)
state = multimodalqna_conv.copy()
return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 1
def add_text(state, text, request: gr.Request):
logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
if len(text) <= 0:
state.skip_next = True
        # match the four outputs wired to submit_btn (state, chatbot, textbox, clear button)
        return (state, state.to_gradio_chatbot(), "") + (no_change_btn,) * 1
text = text[:2000] # Hard cut-off
state.append_message(state.roles[0], text)
state.append_message(state.roles[1], None)
state.skip_next = False
return (state, state.to_gradio_chatbot(), "") + (disable_btn,) * 1
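# http_bot is a generator: Gradio applies each yield as an incremental UI update,
# first an empty assistant slot with the Clear button disabled, then the final
# answer together with the clipped video segment.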
def http_bot(state, request: gr.Request):
global gateway_addr
logger.info(f"http_bot. ip: {request.client.host}")
url = gateway_addr
is_very_first_query = False
if state.skip_next:
# This generate call is skipped due to invalid inputs
        # Conversation defines no get_path_to_subvideos(); use the stored clip path
        path_to_sub_videos = state.split_video
        yield (state, state.to_gradio_chatbot(), path_to_sub_videos) + (no_change_btn,) * 1
return
if len(state.messages) == state.offset + 2:
# First round of conversation
is_very_first_query = True
new_state = multimodalqna_conv.copy()
new_state.append_message(new_state.roles[0], state.messages[-2][1])
new_state.append_message(new_state.roles[1], None)
state = new_state
# Construct prompt
prompt = state.get_prompt()
# Make requests
pload = {
"messages": prompt,
}
logger.info(f"==== request ====\n{pload}")
logger.info(f"==== url request ====\n{gateway_addr}")
state.messages[-1][-1] = ""
yield (state, state.to_gradio_chatbot(), state.split_video) + (disable_btn,) * 1
try:
response = requests.post(
url,
headers=headers,
json=pload,
timeout=100,
)
print(response.status_code)
print(response.json())
if response.status_code == 200:
response = response.json()
choice = response["choices"][-1]
metadata = choice["metadata"]
message = choice["message"]["content"]
if (
is_very_first_query
and not state.video_file
and "source_video" in metadata
and not state.time_of_frame_ms
and "time_of_frame_ms" in metadata
):
video_file = metadata["source_video"]
state.video_file = os.path.join(static_dir, metadata["source_video"])
state.time_of_frame_ms = metadata["time_of_frame_ms"]
                split_video_path = split_video(
                    state.video_file, state.time_of_frame_ms, tmp_dir, f"{state.time_of_frame_ms}__{video_file}"
                )
                state.split_video = split_video_path
                print(split_video_path)
else:
raise requests.exceptions.RequestException
except requests.exceptions.RequestException as e:
state.messages[-1][-1] = server_error_msg
yield (state, state.to_gradio_chatbot(), None) + (enable_btn,)
return
state.messages[-1][-1] = message
yield (state, state.to_gradio_chatbot(), state.split_video) + (enable_btn,) * 1
logger.info(f"{state.messages[-1][-1]}")
return
def ingest_video_gen_transcript(filepath, request: gr.Request):
    yield (gr.Textbox(visible=True, value="Please wait while your uploaded video is ingested into the database..."))
    basename = os.path.basename(filepath)
    dest = os.path.join(static_dir, basename)
    shutil.copy(filepath, dest)
    print("Done copying uploaded file to the static folder!")
    # let requests set the multipart Content-Type header; close the file when done
    with open(dest, "rb") as f:
        files = {"files": f}
        response = requests.post(dataprep_gen_transcript_addr, files=files)
print(response.status_code)
if response.status_code == 200:
response = response.json()
print(response)
yield (gr.Textbox(visible=True, value="Video ingestion is done. Saving your uploaded video..."))
time.sleep(2)
fn_no_ext = Path(dest).stem
if "video_id_maps" in response and fn_no_ext in response["video_id_maps"]:
new_dst = os.path.join(static_dir, response["video_id_maps"][fn_no_ext])
print(response["video_id_maps"][fn_no_ext])
os.rename(dest, new_dst)
yield (
gr.Textbox(
visible=True,
value="Congratulation! Your upload is done!\nClick the X button on the top right of the video upload box to upload another video.",
)
)
return
else:
yield (
gr.Textbox(
visible=True,
value="Something wrong!\nPlease click the X button on the top right of the video upload boxreupload your video!",
)
)
time.sleep(2)
return
def ingest_video_gen_caption(filepath, request: gr.Request):
    yield (gr.Textbox(visible=True, value="Please wait while your uploaded video is ingested into the database..."))
    basename = os.path.basename(filepath)
    dest = os.path.join(static_dir, basename)
    shutil.copy(filepath, dest)
    print("Done copying uploaded file to the static folder!")
    # let requests set the multipart Content-Type header; close the file when done
    with open(dest, "rb") as f:
        files = {"files": f}
        response = requests.post(dataprep_gen_caption_addr, files=files)
print(response.status_code)
if response.status_code == 200:
response = response.json()
print(response)
yield (gr.Textbox(visible=True, value="Video ingestion is done. Saving your uploaded video..."))
time.sleep(2)
fn_no_ext = Path(dest).stem
if "video_id_maps" in response and fn_no_ext in response["video_id_maps"]:
new_dst = os.path.join(static_dir, response["video_id_maps"][fn_no_ext])
print(response["video_id_maps"][fn_no_ext])
os.rename(dest, new_dst)
yield (
gr.Textbox(
visible=True,
value="Congratulation! Your upload is done!\nClick the X button on the top right of the video upload box to upload another video.",
)
)
return
else:
yield (
gr.Textbox(
visible=True,
value="Something wrong!\nPlease click the X button on the top right of the video upload boxreupload your video!",
)
)
time.sleep(2)
return
def clear_uploaded_video(request: gr.Request):
return gr.Textbox(visible=False)
with gr.Blocks() as upload_gen_trans:
gr.Markdown("# Ingest Your Own Video - Utilizing Generated Transcripts")
gr.Markdown(
"Please use this interface to ingest your own video if the video has meaningful audio (e.g., announcements, discussions, etc...)"
)
with gr.Row():
with gr.Column(scale=6):
video_upload = gr.Video(sources="upload", height=512, width=512, elem_id="video_upload")
with gr.Column(scale=3):
text_upload_result = gr.Textbox(visible=False, interactive=False, label="Upload Status")
video_upload.upload(ingest_video_gen_transcript, [video_upload], [text_upload_result])
video_upload.clear(clear_uploaded_video, [], [text_upload_result])
with gr.Blocks() as upload_gen_captions:
gr.Markdown("# Ingest Your Own Video - Utilizing Generated Captions")
gr.Markdown(
"Please use this interface to ingest your own video if the video has meaningless audio (e.g., background musics, etc...)"
)
with gr.Row():
with gr.Column(scale=6):
video_upload_cap = gr.Video(sources="upload", height=512, width=512, elem_id="video_upload_cap")
with gr.Column(scale=3):
text_upload_result_cap = gr.Textbox(visible=False, interactive=False, label="Upload Status")
    video_upload_cap.upload(ingest_video_gen_caption, [video_upload_cap], [text_upload_result_cap])
video_upload_cap.clear(clear_uploaded_video, [], [text_upload_result_cap])
with gr.Blocks() as qna:
state = gr.State(multimodalqna_conv.copy())
with gr.Row():
with gr.Column(scale=4):
video = gr.Video(height=512, width=512, elem_id="video")
with gr.Column(scale=7):
chatbot = gr.Chatbot(elem_id="chatbot", label="MultimodalQnA Chatbot", height=390)
with gr.Row():
with gr.Column(scale=6):
# textbox.render()
textbox = gr.Textbox(
# show_label=False,
# container=False,
label="Query",
info="Enter your query here!",
)
with gr.Column(scale=1, min_width=100):
with gr.Row():
submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
with gr.Row(elem_id="buttons") as button_row:
clear_btn = gr.Button(value="🗑️ Clear", interactive=False)
clear_btn.click(
clear_history,
[
state,
],
[state, chatbot, textbox, video, clear_btn],
)
submit_btn.click(
add_text,
[state, textbox],
[state, chatbot, textbox, clear_btn],
).then(
http_bot,
[
state,
],
[state, chatbot, video, clear_btn],
)
with gr.Blocks(css=css) as demo:
gr.Markdown("# MultimodalQnA")
with gr.Tabs():
with gr.TabItem("MultimodalQnA With Your Videos"):
qna.render()
with gr.TabItem("Upload Your Own Videos"):
upload_gen_trans.render()
with gr.TabItem("Upload Your Own Videos"):
upload_gen_captions.render()
demo.queue()
app = gr.mount_gradio_app(app, demo, path="/")
share = False
enable_queue = True
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--host", type=str, default="0.0.0.0")
parser.add_argument("--port", type=int, default=5173)
parser.add_argument("--concurrency-count", type=int, default=20)
parser.add_argument("--share", action="store_true")
backend_service_endpoint = os.getenv("BACKEND_SERVICE_ENDPOINT", "http://localhost:8888/v1/multimodalqna")
dataprep_gen_transcript_endpoint = os.getenv(
"DATAPREP_GEN_TRANSCRIPT_SERVICE_ENDPOINT", "http://localhost:6007/v1/generate_transcripts"
)
dataprep_gen_caption_endpoint = os.getenv(
"DATAPREP_GEN_CAPTION_SERVICE_ENDPOINT", "http://localhost:6007/v1/generate_captions"
)
args = parser.parse_args()
logger.info(f"args: {args}")
    # module-level names read by the handlers above; "global" is a no-op
    # outside a function, so plain assignment is sufficient here
    gateway_addr = backend_service_endpoint
    dataprep_gen_transcript_addr = dataprep_gen_transcript_endpoint
    dataprep_gen_caption_addr = dataprep_gen_caption_endpoint
uvicorn.run(app, host=args.host, port=args.port)
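For reference, a minimal sketch (assuming the gateway runs at the default BACKEND_SERVICE_ENDPOINT and answers in the ChatCompletion-style shape that http_bot() parses) of querying the MultimodalQnA gateway directly, without the UI:

import requests

url = "http://localhost:8888/v1/multimodalqna"  # default BACKEND_SERVICE_ENDPOINT
payload = {"messages": "What did the speaker say about latency?"}  # example query text is an assumption
response = requests.post(url, headers={"Content-Type": "application/json"}, json=payload, timeout=100)
response.raise_for_status()
choice = response.json()["choices"][-1]
print(choice["message"]["content"])  # answer text
print(choice.get("metadata", {}))  # may carry source_video and time_of_frame_ms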

View File: gradio/requirements.txt

@@ -0,0 +1,5 @@
gradio==4.44.0
moviepy==1.0.3
numpy==1.26.4
opencv-python==4.10.0.82
Pillow==10.3.0

View File: gradio/utils.py

@@ -0,0 +1,169 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
import base64
import logging
import logging.handlers
import os
import sys
from pathlib import Path
import cv2
from moviepy.video.io.VideoFileClip import VideoFileClip
LOGDIR = "."
server_error_msg = "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
moderation_msg = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE TRY AGAIN."
handler = None
save_log = False
def build_logger(logger_name, logger_filename):
global handler
formatter = logging.Formatter(
fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
# Set the format of root handlers
if not logging.getLogger().handlers:
logging.basicConfig(level=logging.INFO)
logging.getLogger().handlers[0].setFormatter(formatter)
# Redirect stdout and stderr to loggers
stdout_logger = logging.getLogger("stdout")
stdout_logger.setLevel(logging.INFO)
sl = StreamToLogger(stdout_logger, logging.INFO)
sys.stdout = sl
stderr_logger = logging.getLogger("stderr")
stderr_logger.setLevel(logging.ERROR)
sl = StreamToLogger(stderr_logger, logging.ERROR)
sys.stderr = sl
# Get logger
logger = logging.getLogger(logger_name)
logger.setLevel(logging.INFO)
# Add a file handler for all loggers
if save_log and handler is None:
os.makedirs(LOGDIR, exist_ok=True)
filename = os.path.join(LOGDIR, logger_filename)
handler = logging.handlers.TimedRotatingFileHandler(filename, when="D", utc=True)
handler.setFormatter(formatter)
for name, item in logging.root.manager.loggerDict.items():
if isinstance(item, logging.Logger):
item.addHandler(handler)
return logger
class StreamToLogger(object):
"""Fake file-like stream object that redirects writes to a logger instance."""
def __init__(self, logger, log_level=logging.INFO):
self.terminal = sys.stdout
self.logger = logger
self.log_level = log_level
self.linebuf = ""
def __getattr__(self, attr):
return getattr(self.terminal, attr)
def write(self, buf):
temp_linebuf = self.linebuf + buf
self.linebuf = ""
for line in temp_linebuf.splitlines(True):
# From the io.TextIOWrapper docs:
# On output, if newline is None, any '\n' characters written
# are translated to the system default line separator.
# By default sys.stdout.write() expects '\n' newlines and then
# translates them so this is still cross platform.
if line[-1] == "\n":
self.logger.log(self.log_level, line.rstrip())
else:
self.linebuf += line
def flush(self):
if self.linebuf != "":
self.logger.log(self.log_level, self.linebuf.rstrip())
self.linebuf = ""
def maintain_aspect_ratio_resize(image, width=None, height=None, inter=cv2.INTER_AREA):
# Grab the image size and initialize dimensions
dim = None
(h, w) = image.shape[:2]
# Return original image if no need to resize
if width is None and height is None:
return image
# We are resizing height if width is none
if width is None:
# Calculate the ratio of the height and construct the dimensions
r = height / float(h)
dim = (int(w * r), height)
# We are resizing width if height is none
else:
# Calculate the ratio of the width and construct the dimensions
r = width / float(w)
dim = (width, int(h * r))
# Return the resized image
return cv2.resize(image, dim, interpolation=inter)
# function to split video at a timestamp
def split_video(
video_path,
timestamp_in_ms,
output_video_path: str = "./public/splitted_videos",
output_video_name: str = "video_tmp.mp4",
play_before_sec: int = 5,
play_after_sec: int = 5,
):
timestamp_in_sec = int(timestamp_in_ms) / 1000
    # create the output folder if it does not exist
    Path(output_video_path).mkdir(parents=True, exist_ok=True)
output_video = os.path.join(output_video_path, output_video_name)
with VideoFileClip(video_path) as video:
duration = video.duration
start_time = max(timestamp_in_sec - play_before_sec, 0)
end_time = min(timestamp_in_sec + play_after_sec, duration)
new = video.subclip(start_time, end_time)
new.write_videofile(output_video, audio_codec="aac")
return output_video
def delete_split_video(video_path):
if os.path.exists(video_path):
os.remove(video_path)
return True
else:
print("The file does not exist")
return False
def convert_img_to_base64(image):
"Convert image to base64 string"
_, buffer = cv2.imencode(".png", image)
encoded_string = base64.b64encode(buffer)
return encoded_string.decode("utf-8")
def get_b64_frame_from_timestamp(video_path, timestamp_in_ms, maintain_aspect_ratio: bool = False):
print(f"video path: {video_path}")
    vidcap = cv2.VideoCapture(video_path)
    vidcap.set(cv2.CAP_PROP_POS_MSEC, int(timestamp_in_ms))
    success, frame = vidcap.read()
    vidcap.release()
    if success:
        if maintain_aspect_ratio:
            frame = maintain_aspect_ratio_resize(frame, height=350)
        b64_img_str = convert_img_to_base64(frame)
        return b64_img_str
    return None
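A minimal usage sketch for the two helpers the UI relies on (sample.mp4 and the 12 s timestamp are assumptions, not files shipped with this commit):

# clip roughly 5 s on either side of a retrieved timestamp, then grab the
# matching frame as a base64 string
clip_path = split_video(
    "sample.mp4",
    12000,  # timestamp in milliseconds
    output_video_path="./split_tmp_videos",
    output_video_name="sample_clip.mp4",
)
b64_frame = get_b64_frame_from_timestamp("sample.mp4", 12000)
print(clip_path, (b64_frame or "")[:32])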