# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import asyncio
import base64
import io
import os
import shutil
import subprocess
import time

import aiohttp
import docker
import ffmpeg
import gradio as gr
import numpy as np
import soundfile as sf
from PIL import Image


# %% Docker Management
def update_env_var_in_container(container_name, env_var, new_value):
    # TODO: not yet implemented. Docker does not allow changing the environment
    # of a running container, so this currently does nothing.
    return


# %% AudioQnA functions
def preprocess_audio(audio):
    """Encode mic audio as a base64 WAV string.

    The input audio data is a 16-bit integer array with values ranging from
    -32768 to 32767, with shape (samples,).
    """
    sr, y = audio
    # Convert to normalized float32 audio
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:  # avoid division by zero on silent input
        y /= peak
    # Write the WAV data to an in-memory buffer
    buf = io.BytesIO()
    sf.write(buf, y, sr, format="WAV")
    buf.seek(0)  # Reset the buffer position to the beginning
    # Encode the WAV bytes as a base64 string
    base64_bytes = base64.b64encode(buf.read())
    base64_string = base64_bytes.decode("utf-8")
    return base64_string


def base64_to_int16(base64_string):
    """Decode a base64 WAV string back into (sampling_rate, int16 samples)."""
    wav_bytes = base64.b64decode(base64_string)
    buf = io.BytesIO(wav_bytes)
    y, sr = sf.read(buf, dtype="int16")
    return sr, y


async def transcribe(audio_input, face_input, model_choice):
    """Input: mic audio; Output: path to the AI-generated video, or an error dict."""
    global ai_chatbot_url, chat_history, count
    chat_history = ""
    # Preprocess the audio
    base64bytestr = preprocess_audio(audio_input)
    # Send the audio to the AvatarChatbot backend server endpoint
    initial_inputs = {"audio": base64bytestr, "max_tokens": 64}
    # TODO: update wav2lip-service with the chosen face_input
    # update_env_var_in_container("wav2lip-service", "DEVICE", "new_device_value")
    async with aiohttp.ClientSession() as session:
        async with session.post(ai_chatbot_url, json=initial_inputs) as response:
            # Check the response status code
            if response.status == 200:
                # response_json = await response.json()
                # # Decode the base64 string
                # sampling_rate, audio_int16 = base64_to_int16(response_json["byte_str"])
                # chat_history += f"User: {response_json['query']}\n\n"
                # chat_ai = response_json["text"]
                # hitted_ends = [",", ".", "?", "!", "。", ";"]
                # last_punc_idx = max([chat_ai.rfind(punc) for punc in hitted_ends])
                # if last_punc_idx != -1:
                #     chat_ai = chat_ai[: last_punc_idx + 1]
                # chat_history += f"AI: {chat_ai}"
                # chat_history = chat_history.replace("OPEX", "OPEA")
                # return (sampling_rate, audio_int16)
                # Drain the response body; the backend writes the result video to disk
                result = await response.text()
                return "docker_compose/intel/hpu/gaudi/result.mp4"
            else:
                return {"error": "Failed to transcribe audio", "status_code": response.status}


def resize_image(image_pil, size=(720, 720)):
    """Resize the image to the specified size."""
    return image_pil.resize(size, Image.LANCZOS)


def resize_video(video_path, save_path, size=(720, 1280)):
    """Resize the video to the specified size, and save to the save path."""
    ffmpeg.input(video_path).output(save_path, vf=f"scale={size[0]}:{size[1]}").overwrite_output().run()


# %% AI Avatar demo function
async def aiavatar_demo(audio_input, face_input, model_choice):
    """Input: mic/preloaded audio, avatar file path; Output: AI video path, or None on error."""
    # Wait for response from the AvatarChatbot backend
    output_video = await transcribe(audio_input, face_input, model_choice)  # output video path
    if isinstance(output_video, dict):  # in case of an error
        return None
    return output_video
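
# A minimal sketch of what update_env_var_in_container could look like, using
# the docker SDK already imported above. This helper is hypothetical and is
# not wired into the demo: Docker cannot change the environment of a running
# container, so the usual workaround is to recreate it with a new environment.
def recreate_container_with_env(container_name, env_var, new_value):
    client = docker.from_env()
    container = client.containers.get(container_name)
    image = container.attrs["Config"]["Image"]
    # Keep every existing "KEY=VALUE" entry except the one being replaced
    env = [e for e in container.attrs["Config"]["Env"] if not e.startswith(f"{env_var}=")]
    env.append(f"{env_var}={new_value}")
    container.stop()
    container.remove()
    # NOTE: volumes, networks, and port mappings are not carried over here; a
    # production version would copy those from container.attrs as well.
    return client.containers.run(image, name=container_name, environment=env, detach=True)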
f"http://{HOST_IP}:3009/v1/avatarchatbot" # Collect chat history to print in the interface chat_history = "" # Prepare 3 image paths and 3 video paths # image_pils = [ # Image.open(os.path.join("assets/img/woman1.png")), # Image.open(os.path.join("assets/img/man1.png")), # Image.open(os.path.join("assets/img/woman2.png")), # ] # video_paths = [ # os.path.join("assets/video/man1.mp4"), # os.path.join("assets/video/woman2.mp4"), # os.path.join("assets/video/man4.mp4"), # ] def image_to_base64(image_path): with open(image_path, "rb") as image_file: return base64.b64encode(image_file.read()).decode("utf-8") # Convert your images to Base64 xeon_base64 = image_to_base64("assets/img/xeon.jpg") gaudi_base64 = image_to_base64("assets/img/gaudi.png") # List of prerecorded WAV files containing audio questions # audio_filepaths = [ # "assets/audio/intel2.wav", # "assets/audio/intel4.wav", # ] # audio_questions = [ # "1. What's the objective of the Open Platform for Enterprise AI? How is it helpful to enterprises building AI solutions?", # "2. What kinds of Intel AI tools are available to accelerate AI workloads?", # ] # Demo frontend demo = gr.Blocks() with demo: # Define processing functions count = 0 # Make necessary folders: if not os.path.exists("inputs"): os.makedirs("inputs") if not os.path.exists("outputs"): os.makedirs("outputs") def initial_process(audio_input, face_input, model_choice): global count start_time = time.time() loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) video_file = loop.run_until_complete(aiavatar_demo(audio_input, face_input, model_choice)) count += 1 end_time = time.time() return video_file, f"The entire application took {(end_time - start_time):.1f} seconds" # def update_selected_image_state(image_index): # image_index = int(image_index) # selected_image_state.value = image_index # # change image_input here # if image_index < len(image_pils): # return f"inputs/face_{image_index}.png" # else: # return f"inputs/video_{image_index - len(image_pils)}.mp4" # def update_audio_input(audio_choice): # if audio_choice: # audio_index = int(audio_choice.split(".")[0]) - 1 # audio_filepath_gradio = f"inputs/audio_{audio_index:d}.wav" # shutil.copyfile(audio_filepaths[audio_index], audio_filepath_gradio) # return audio_filepath_gradio # UI Components # Title & Introduction gr.Markdown("

        # UI Components
        # Title & Introduction
        gr.Markdown(
            """
Welcome to our AI Avatar Audio Chatbot! This application leverages PyTorch and OPEA (Open Platform for Enterprise AI) v0.8 to provide you with a human-like conversational experience. It runs on the Intel® Gaudi® AI Accelerator and the Intel® Xeon® Processor, with hardware and software optimizations.

Feel free to interact with the AI avatar: choose an avatar and talk into the mic.
**OPEA megaservice deployed:**

**OPEA microservices deployed:**

The AI Avatar Audio Chatbot is powered by the following Intel® AI software:
Intel is committed to respecting human rights and avoiding complicity in human rights abuses. See Intel's Global Human Rights Principles. Intel's products and software are intended only to be used in applications that do not cause or contribute to a violation of an internationally recognized human right.

© Intel Corporation. Intel, the Intel logo, and other Intel marks are trademarks of Intel Corporation or its subsidiaries. Other names and brands may be claimed as the property of others.

You may not use or facilitate the use of this document in connection with any infringement or other legal analysis concerning Intel products described herein. You agree to grant Intel a non-exclusive, royalty-free license to any patent claim thereafter drafted which includes subject matter disclosed herein.
"""
        )
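
    # The original file's remaining UI wiring and launch call fall outside
    # this section; a minimal sketch, assuming the demo is served directly
    # (host and port are assumptions):
    demo.queue().launch(server_name="0.0.0.0", server_port=7861)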