conversational-ai-avatar/src/app/api/conversation/speech/route.ts

import { PassThrough } from "stream";
import { AIMessage, HumanMessage } from "@langchain/core/messages";
import { Messages } from "@langchain/langgraph";
import { ElevenLabsClient } from "elevenlabs";
import OpenAI from "openai";
import { graph } from "@/lib/graph";
import logger from "@/lib/logger";
import { generateApproximateVisemes } from "@/utils/visemes";
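
/**
 * Speech conversation flow (summary of what this route does):
 *  1. Transcribe the uploaded audio with OpenAI Whisper.
 *  2. Run the conversation through the LangGraph `graph` to produce the assistant reply.
 *  3. Synthesize the reply with ElevenLabs and return the MP3 audio,
 *     with approximate visemes and message metadata in response headers.
 */
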
const client = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

const elevenlabs = new ElevenLabsClient({
  apiKey: process.env.ELEVENLABS_API_KEY,
});

// Configure voice settings
const VOICE_ID = process.env.ELEVENLABS_VOICE_ID || "21m00Tcm4TlvDq8ikWAM";
const VOICE_SETTINGS = {
  stability: 0.5,
  similarity_boost: 0.75,
  style: 0.0,
  use_speaker_boost: true,
};

// Estimate audio duration from text (rough approximation)
function estimateAudioDuration(text: string): number {
  // Average speaking rate is about 150-160 words per minute
  const wordsPerMinute = 150;
  const words = text.split(/\s+/).length;
  const minutes = words / wordsPerMinute;
  return minutes * 60 * 1000; // Convert to milliseconds
}

// POST handler for the speech conversation route
export async function POST(req: Request) {
  try {
    const formData = await req.formData();
    const audio = formData.get("audio") as File;
    const messages = JSON.parse(formData.get("messages") as string);
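    // `messages` is the prior conversation history the client sends as JSON alongside the audio file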
    logger.info(JSON.stringify(messages, null, 2));
    //* Speech to text (OpenAI Whisper)
    const transcription = await client.audio.transcriptions.create({
      file: audio,
      model: "whisper-1",
    });
    logger.info(JSON.stringify(transcription, null, 2));
    // Create new user message from the transcription
    const userMessage = {
      role: "user",
      content: transcription.text,
      id: Date.now().toString(),
    };
    const updatedMessages = [...messages, userMessage];
    //* Text to text (LangGraph)
    const allMessages: Messages = updatedMessages.map((message) =>
      message.role === "user" ? new HumanMessage(message.content) : new AIMessage(message.content)
    );
    // Invoke the graph with the full history and take the final assistant message
    const result = await graph.invoke({ messages: allMessages });
    const lastMessage = result.messages[result.messages.length - 1];
    const messageText = lastMessage.content.toString();
    //* Text to speech with ElevenLabs
    const audioStream = await elevenlabs.generate({
      voice: VOICE_ID,
      text: messageText,
      model_id: "eleven_multilingual_v2",
      voice_settings: VOICE_SETTINGS,
    });
    // ElevenLabs returns an async iterator, so collect the chunks into a single buffer
    const chunks: Uint8Array[] = [];
    for await (const chunk of audioStream) {
      chunks.push(chunk);
    }
    const audioBuffer = Buffer.concat(chunks);
    // Wrap the buffer in a PassThrough stream to use as the response body
    const bufferStream = new PassThrough();
    bufferStream.end(audioBuffer);
    // Generate approximate visemes
    const estimatedDuration = estimateAudioDuration(messageText);
    const visemes = generateApproximateVisemes(messageText, estimatedDuration);
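    // The visemes are derived only from the text and the estimated duration, so their timing is approximate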
    //* Return processed response
    logger.info(`Response: ${lastMessage.content}`);
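    // Normalize curly quotes and em dashes to plain ASCII so the Result header value stays header-safe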
    const safeLastMessageContent = messageText
      .replace(/[\u2018\u2019]/g, "'")
      .replace(/\u2014/g, "-");
    return new Response(bufferStream, {
      headers: {
        "Content-Type": "audio/mpeg",
        "Content-Disposition": `inline; filename=tts.mp3`,
        Visemes: JSON.stringify(visemes),
        Result: JSON.stringify({
          id: lastMessage.id,
          role: "assistant",
          content: safeLastMessageContent,
        }),
        UserMessage: JSON.stringify(userMessage),
      },
    });
  } catch (error) {
    logger.error("Error in speech route:", error);
    return new Response(JSON.stringify({ error: "Internal server error" }), {
      status: 500,
      headers: { "Content-Type": "application/json" },
    });
  }
}