import {
  Configuration,
  NewSessionData,
  StreamingAvatarApi,
} from "@heygen/streaming-avatar";
import {
  Button,
  Card,
  CardBody,
  CardFooter,
  Divider,
  Input,
  Spinner,
  Tooltip,
} from "@nextui-org/react";
import { Microphone, MicrophoneStage } from "@phosphor-icons/react";
import { useChat } from "ai/react";
import clsx from "clsx";
import OpenAI from "openai";
import { useEffect, useRef, useState } from "react";
import StreamingAvatarTextInput from "./StreamingAvatarTextInput";
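
// The OpenAI client runs Whisper transcription in the browser, so the
// NEXT_PUBLIC_ key ships in the client bundle (hence `dangerouslyAllowBrowser`).
// Acceptable for a demo; route this through an API endpoint in production.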
const openai = new OpenAI({
  apiKey: process.env.NEXT_PUBLIC_OPENAI_API_KEY,
  dangerouslyAllowBrowser: true,
});
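
// Demo component: drives a HeyGen streaming avatar that can either repeat
// typed text or speak ChatGPT responses, with optional voice input via
// MediaRecorder + Whisper.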
export default function StreamingAvatar() {
  const [isLoadingSession, setIsLoadingSession] = useState(false);
  const [isLoadingRepeat, setIsLoadingRepeat] = useState(false);
  const [isLoadingChat, setIsLoadingChat] = useState(false);
  const [stream, setStream] = useState<MediaStream>();
  const [debug, setDebug] = useState<string>();
  const [avatarId, setAvatarId] = useState<string>("");
  const [voiceId, setVoiceId] = useState<string>("");
  const [data, setData] = useState<NewSessionData>();
  const [text, setText] = useState<string>("");
  const [initialized, setInitialized] = useState(false); // Track initialization
  const [recording, setRecording] = useState(false); // Track recording state

  const mediaStream = useRef<HTMLVideoElement>(null);
  const avatar = useRef<StreamingAvatarApi | null>(null);
  const mediaRecorder = useRef<MediaRecorder | null>(null);
  const audioChunks = useRef<Blob[]>([]);
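
  // Vercel AI SDK chat hook (POSTs to /api/chat by default). When a completion
  // finishes, the assistant's reply is forwarded to the avatar to be spoken.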
  const { input, setInput, handleSubmit } = useChat({
    onFinish: async (message) => {
      console.log("ChatGPT Response:", message);

      if (!initialized || !avatar.current) {
        setDebug("Avatar API not initialized");
        setIsLoadingChat(false);
        return;
      }

      // Send the ChatGPT response to the streaming avatar
      await avatar.current
        .speak({
          taskRequest: { text: message.content, sessionId: data?.sessionId },
        })
        .catch((e) => {
          setDebug(e.message);
        });
      setIsLoadingChat(false);
    },
    initialMessages: [
      {
        id: "1",
        role: "system",
        content: "You are a helpful assistant.",
      },
    ],
  });
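
  // Fetch a short-lived HeyGen access token from our Next.js API route.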
  async function fetchAccessToken() {
    try {
      const response = await fetch("/api/get-access-token", {
        method: "POST",
      });
      const token = await response.text();
      console.log("Access Token:", token); // Log the token to verify
      return token;
    } catch (error) {
      console.error("Error fetching access token:", error);
      return "";
    }
  }
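
  // Start a new streaming session. Avatar and voice IDs are optional
  // (see the placeholder inputs below).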
  async function startSession() {
    setIsLoadingSession(true);
    await updateToken();
    if (!avatar.current) {
      setDebug("Avatar API is not initialized");
      setIsLoadingSession(false);
      return;
    }
    try {
      const res = await avatar.current.createStartAvatar(
        {
          newSessionRequest: {
            quality: "low",
            avatarName: avatarId,
            voice: { voiceId: voiceId },
          },
        },
        setDebug
      );
      setData(res);
      setStream(avatar.current.mediaStream);
    } catch (error) {
      console.error("Error starting avatar session:", error);
    } finally {
      setIsLoadingSession(false);
    }
  }
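
  // Recreate the API client with a fresh token and register talking-state
  // event handlers before starting a session.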
  async function updateToken() {
    const newToken = await fetchAccessToken();
    console.log("Updating Access Token:", newToken); // Log token for debugging
    avatar.current = new StreamingAvatarApi(
      new Configuration({ accessToken: newToken })
    );

    const startTalkCallback = (e: any) => {
      console.log("Avatar started talking", e);
    };

    const stopTalkCallback = (e: any) => {
      console.log("Avatar stopped talking", e);
    };

    console.log("Adding event handlers:", avatar.current);
    avatar.current.addEventHandler("avatar_start_talking", startTalkCallback);
    avatar.current.addEventHandler("avatar_stop_talking", stopTalkCallback);

    setInitialized(true);
  }
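
  // Cut the avatar off mid-utterance without ending the session.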
  async function handleInterrupt() {
    if (!initialized || !avatar.current) {
      setDebug("Avatar API not initialized");
      return;
    }
    await avatar.current
      .interrupt({ interruptRequest: { sessionId: data?.sessionId } })
      .catch((e) => {
        setDebug(e.message);
      });
  }
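
  // Tear down the current session and clear the video stream.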
  async function endSession() {
    if (!initialized || !avatar.current) {
      setDebug("Avatar API not initialized");
      return;
    }
    await avatar.current.stopAvatar(
      { stopSessionRequest: { sessionId: data?.sessionId } },
      setDebug
    );
    setStream(undefined);
  }
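
  // "Repeat" mode: the avatar speaks the typed text verbatim.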
  async function handleSpeak() {
    if (!initialized || !avatar.current) {
      setDebug("Avatar API not initialized");
      return;
    }
    setIsLoadingRepeat(true);
    await avatar.current
      .speak({ taskRequest: { text: text, sessionId: data?.sessionId } })
      .catch((e) => {
        setDebug(e.message);
      });
    setIsLoadingRepeat(false);
  }
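
  // Initialize the API client once on mount; end any live session on unmount.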
  useEffect(() => {
    async function init() {
      const newToken = await fetchAccessToken();
      console.log("Initializing with Access Token:", newToken); // Log token for debugging
      avatar.current = new StreamingAvatarApi(
        new Configuration({ accessToken: newToken, jitterBuffer: 200 })
      );
      setInitialized(true); // Set initialized to true
    }
    init();

    return () => {
      endSession();
    };
  }, []);
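
  // Attach the avatar's MediaStream to the <video> element once both exist.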
  useEffect(() => {
    if (stream && mediaStream.current) {
      mediaStream.current.srcObject = stream;
      mediaStream.current.onloadedmetadata = () => {
        mediaStream.current!.play();
        setDebug("Playing");
      };
    }
  }, [mediaStream, stream]);
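
  // Capture microphone audio with MediaRecorder. Note that browsers typically
  // record WebM/Ogg rather than true WAV despite the "audio/wav" label below;
  // Whisper accepts those containers as well.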
  function startRecording() {
    navigator.mediaDevices
      .getUserMedia({ audio: true })
      .then((stream) => {
        mediaRecorder.current = new MediaRecorder(stream);
        mediaRecorder.current.ondataavailable = (event) => {
          audioChunks.current.push(event.data);
        };
        mediaRecorder.current.onstop = () => {
          const audioBlob = new Blob(audioChunks.current, {
            type: "audio/wav",
          });
          audioChunks.current = [];
          transcribeAudio(audioBlob);
        };
        mediaRecorder.current.start();
        setRecording(true);
      })
      .catch((error) => {
        console.error("Error accessing microphone:", error);
      });
  }
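
  // Stopping the recorder fires onstop above, which kicks off transcription.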
  function stopRecording() {
    if (mediaRecorder.current) {
      mediaRecorder.current.stop();
      setRecording(false);
    }
  }
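
  // Send the recording to Whisper and drop the transcript into the chat input.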
  async function transcribeAudio(audioBlob: Blob) {
    try {
      // Convert Blob to File
      const audioFile = new File([audioBlob], "recording.wav", {
        type: "audio/wav",
      });
      const response = await openai.audio.transcriptions.create({
        model: "whisper-1",
        file: audioFile,
      });
      const transcription = response.text;
      console.log("Transcription: ", transcription);
      setInput(transcription);
    } catch (error) {
      console.error("Error transcribing audio:", error);
    }
  }
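
  // UI: video stage with Interrupt / End session controls, plus "Repeat" and
  // "Chat" inputs and a microphone toggle for voice input.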
  return (
    <div className="w-full flex flex-col gap-4">
      <Card>
        <CardBody className="h-[500px] flex flex-col justify-center items-center">
          {stream ? (
            <div className="h-[500px] w-[900px] justify-center items-center flex rounded-lg overflow-hidden">
              <video
                ref={mediaStream}
                autoPlay
                playsInline
                style={{
                  width: "100%",
                  height: "100%",
                  objectFit: "contain",
                }}
              >
                <track kind="captions" />
              </video>
              <Button
                size="md"
                onClick={handleInterrupt}
                className="bg-gradient-to-tr from-indigo-500 to-indigo-300 absolute bottom-20 right-3 text-white rounded-lg"
                variant="shadow"
              >
                Interrupt
              </Button>
              <Button
                size="md"
                onClick={endSession}
                className="bg-gradient-to-tr from-indigo-500 to-indigo-300 absolute bottom-3 right-3 text-white rounded-lg"
                variant="shadow"
              >
                End session
              </Button>
            </div>
          ) : !isLoadingSession ? (
            <div className="h-full justify-center items-center flex flex-col gap-4 w-96 self-center">
              <Input
                value={avatarId}
                onChange={(e) => setAvatarId(e.target.value)}
                placeholder="Custom Avatar ID (optional)"
              />
              <Input
                value={voiceId}
                onChange={(e) => setVoiceId(e.target.value)}
                placeholder="Custom Voice ID (optional)"
              />
              <Button
                size="md"
                onClick={startSession}
                className="bg-gradient-to-tr from-indigo-500 to-indigo-300 w-full text-white"
                variant="shadow"
              >
                Start session
              </Button>
            </div>
          ) : (
            <Spinner size="lg" color="default" />
          )}
        </CardBody>
        <Divider />
        <CardFooter className="flex flex-col gap-3">
          <StreamingAvatarTextInput
            label="Repeat"
            placeholder="Type something for the avatar to repeat"
            input={text}
            onSubmit={handleSpeak}
            setInput={setText}
            disabled={!stream}
            loading={isLoadingRepeat}
          />
          <StreamingAvatarTextInput
            label="Chat"
            placeholder="Chat with the avatar (uses ChatGPT)"
            input={input}
            onSubmit={() => {
              if (!input) {
                setDebug("Please enter text to send to ChatGPT");
                return;
              }
              setIsLoadingChat(true);
              handleSubmit();
            }}
            setInput={setInput}
            loading={isLoadingChat}
            endContent={
              <Tooltip
                content={!recording ? "Start recording" : "Stop recording"}
              >
                <Button
                  onClick={!recording ? startRecording : stopRecording}
                  isDisabled={!stream}
                  isIconOnly
                  className={clsx(
                    "mr-4 text-white",
                    !recording
                      ? "bg-gradient-to-tr from-indigo-500 to-indigo-300"
                      : ""
                  )}
                  size="sm"
                  variant="shadow"
                >
                  {!recording ? (
                    <Microphone size={20} />
                  ) : (
                    <>
                      <div className="absolute h-full w-full bg-gradient-to-tr from-indigo-500 to-indigo-300 animate-pulse -z-10"></div>
                      <MicrophoneStage size={20} />
                    </>
                  )}
                </Button>
              </Tooltip>
            }
            disabled={!stream}
          />
        </CardFooter>
      </Card>
      <p className="font-mono text-right">
        <span className="font-bold">Console:</span>
        <br />
        {debug}
      </p>
    </div>
  );
}