feat:add tts-streaming config and future (#5492)

2026-01-08 07:14:14 +00:00 · 2024-07-09 11:33:58 +08:00
parent b29a36f461
commit 6ef401a9f0
44 changed files with 1280 additions and 358 deletions
--- a/web/app/components/app/configuration/config-voice/param-config-content.tsx
+++ b/web/app/components/app/configuration/config-voice/param-config-content.tsx
@@ -11,11 +11,13 @@ import { usePathname } from 'next/navigation'
 import { useTranslation } from 'react-i18next'
 import { Listbox, Transition } from '@headlessui/react'
 import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid'
+import RadioGroup from '@/app/components/app/configuration/config-vision/radio-group'
 import type { Item } from '@/app/components/base/select'
 import ConfigContext from '@/context/debug-configuration'
 import { fetchAppVoices } from '@/service/apps'
 import Tooltip from '@/app/components/base/tooltip'
 import { languages } from '@/i18n/language'
+import { TtsAutoPlay } from '@/types/app'
 const VoiceParamConfig: FC = () => {
  const { t } = useTranslation()
  const pathname = usePathname()
@@ -27,12 +29,16 @@ const VoiceParamConfig: FC = () => {
    setTextToSpeechConfig,
  } = useContext(ConfigContext)

-  const languageItem = languages.find(item => item.value === textToSpeechConfig.language)
+  let languageItem = languages.find(item => item.value === textToSpeechConfig.language)
  const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')
-
+  if (languages && !languageItem)
+    languageItem = languages[0]
  const language = languageItem?.value
  const voiceItems = useSWR({ appId, language }, fetchAppVoices).data
-  const voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice)
+  let voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice)
+  if (voiceItems && !voiceItem)
+    voiceItem = voiceItems[0]
+
  const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')

  return (
@@ -42,8 +48,9 @@ const VoiceParamConfig: FC = () => {
        <div className='pt-3 space-y-6'>
          <div>
            <div className='mb-2 flex items-center  space-x-1'>
-              <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
-              <Tooltip htmlContent={<div className='w-[180px]' >
+              <div
+                className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
+              <Tooltip htmlContent={<div className='w-[180px]'>
                {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
                  <div key={item}>{item}</div>
                ))}
@@ -61,7 +68,8 @@ const VoiceParamConfig: FC = () => {
              }}
            >
              <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
                  <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
                    {languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder}
                  </span>
@@ -79,7 +87,8 @@ const VoiceParamConfig: FC = () => {
                  leaveTo="opacity-0"
                >

-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                    {languages.map((item: Item) => (
                      <Listbox.Option
                        key={item.value}
@@ -100,7 +109,7 @@ const VoiceParamConfig: FC = () => {
                                  'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                )}
                              >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                              </span>
                            )}
                          </>
@@ -112,9 +121,9 @@ const VoiceParamConfig: FC = () => {
              </div>
            </Listbox>
          </div>
-
          <div>
-            <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
            <Listbox
              value={voiceItem}
              disabled={!languageItem}
@@ -126,8 +135,10 @@ const VoiceParamConfig: FC = () => {
              }}
            >
              <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
-                  <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                  <span
+                    className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
                  <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
                    <ChevronDownIcon
                      className="h-5 w-5 text-gray-400"
@@ -142,7 +153,8 @@ const VoiceParamConfig: FC = () => {
                  leaveTo="opacity-0"
                >

-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                    {voiceItems?.map((item: Item) => (
                      <Listbox.Option
                        key={item.value}
@@ -162,7 +174,7 @@ const VoiceParamConfig: FC = () => {
                                  'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                )}
                              >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                              </span>
                            )}
                          </>
@@ -174,6 +186,30 @@ const VoiceParamConfig: FC = () => {
              </div>
            </Listbox>
          </div>
+          <div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.autoPlay')}</div>
+            <RadioGroup
+              className='space-x-3'
+              options={[
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayEnabled'),
+                  value: TtsAutoPlay.enabled,
+                },
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayDisabled'),
+                  value: TtsAutoPlay.disabled,
+                },
+              ]}
+              value={textToSpeechConfig.autoPlay ? textToSpeechConfig.autoPlay : TtsAutoPlay.disabled}
+              onChange={(value: TtsAutoPlay) => {
+                setTextToSpeechConfig({
+                  ...textToSpeechConfig,
+                  autoPlay: value,
+                })
+              }}
+            />
+          </div>
        </div>
      </div>
    </div>
--- a/web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx
+++ b/web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx
@@ -40,7 +40,6 @@ const TextToSpeech: FC = () => {
          { languageInfo?.example && (
            <AudioBtn
              value={languageInfo?.example}
-              voice={voiceItem?.value}
              isAudition
              noCache
            />
--- a/web/app/components/app/text-generate/item/index.tsx
+++ b/web/app/components/app/text-generate/item/index.tsx
@@ -428,8 +428,7 @@ const GenerationItem: FC<IGenerationItemProps> = ({
                  <>
                    <div className='ml-2 mr-2 h-[14px] w-[1px] bg-gray-200'></div>
                    <AudioBtn
-                      value={content}
-                      noCache={false}
+                      id={messageId!}
                      className={'mr-1'}
                    />
                  </>
--- a/web/app/components/base/audio-btn/audio.player.manager.ts
+++ b/web/app/components/base/audio-btn/audio.player.manager.ts
@@ -0,0 +1,89 @@
+import AudioPlayer from '@/app/components/base/audio-btn/audio'
+declare global {
+  // eslint-disable-next-line @typescript-eslint/consistent-type-definitions
+  interface AudioPlayerManager {
+    instance: AudioPlayerManager
+  }
+
+}
+
+/**
+ * Singleton that holds the single active `AudioPlayer`.
+ *
+ * Only one player is kept at a time: asking for a player with a different id
+ * stops the current one and replaces it.
+ */
+export class AudioPlayerManager {
+  private static instance: AudioPlayerManager
+  private audioPlayers: AudioPlayer | null = null
+  private msgId: string | undefined
+
+  private constructor() {
+  }
+
+  /** Returns the shared manager, creating it on first use. */
+  public static getInstance(): AudioPlayerManager {
+    // Always go through the class name rather than `this`, so the method also
+    // works when it is called detached from the class.
+    if (!AudioPlayerManager.instance)
+      AudioPlayerManager.instance = new AudioPlayerManager()
+
+    return AudioPlayerManager.instance
+  }
+
+  /**
+   * Returns the player for `id`. The current player is reused, with its
+   * callback replaced, when `id` is non-empty and equals the id it was created
+   * or last reset with; otherwise the current player is stopped and a new one
+   * is created.
+   *
+   * @param url Passed to the `AudioPlayer` constructor.
+   * @param isPublic Passed to the `AudioPlayer` constructor.
+   * @param id Identifies the player; an empty or undefined id never matches.
+   * @param msgContent Passed to the `AudioPlayer` constructor.
+   * @param voice Accepted for callers' convenience but currently not forwarded to the player.
+   * @param callback Playback event receiver handed to the player.
+   */
+  public getAudioPlayer(url: string, isPublic: boolean, id: string | undefined, msgContent: string | null | undefined, voice: string | undefined, callback: ((event: string) => {}) | null): AudioPlayer {
+    if (this.msgId && this.msgId === id && this.audioPlayers) {
+      this.audioPlayers.setCallback(callback)
+      return this.audioPlayers
+    }
+
+    this.stopCurrentPlayer()
+    this.msgId = id
+    this.audioPlayers = new AudioPlayer(url, isPublic, id, msgContent, callback)
+    return this.audioPlayers
+  }
+
+  /** Points the manager and the current player at a new message id. */
+  public resetMsgId(msgId: string) {
+    this.msgId = msgId
+    this.audioPlayers?.resetMsgId(msgId)
+  }
+
+  /**
+   * Best-effort shutdown of the player that is about to be replaced. Each step
+   * is isolated so a failure in one does not skip the others.
+   */
+  private stopCurrentPlayer() {
+    const current = this.audioPlayers
+    if (!current)
+      return
+
+    // Drop queued chunks so nothing that is still queued gets appended later.
+    current.cacheBuffers = []
+    try {
+      current.pauseAudio()
+    }
+    catch (e) {
+      // Ignored on purpose: the player is being discarded anyway.
+    }
+    try {
+      current.sourceBuffer?.abort()
+    }
+    catch (e) {
+      // Ignored on purpose: abort() can throw, e.g. when the media source is no longer open.
+    }
+  }
+}
--- a/web/app/components/base/audio-btn/audio.ts
+++ b/web/app/components/base/audio-btn/audio.ts
@@ -0,0 +1,290 @@
+import Toast from '@/app/components/base/toast'
+import { textToAudioStream } from '@/service/share'
+
+declare global {
+  // eslint-disable-next-line @typescript-eslint/consistent-type-definitions
+  interface Window {
+    ManagedMediaSource: any
+  }
+}
+
+/**
+ * Plays text-to-speech audio through an `HTMLAudioElement` fed by a
+ * `MediaSource`, so playback can begin while `audio/mpeg` data is still
+ * arriving.
+ *
+ * Data comes either from `playAudio()`, which requests the stream through
+ * `textToAudioStream`, or from `playAudioWithAudio()`, which takes base64
+ * chunks pushed by the caller.
+ *
+ * Chunks wait in `cacheBuffers` and are appended one at a time: a chunk is
+ * only handed to the source buffer once it exists and is not updating.
+ */
+export default class AudioPlayer {
+  mediaSource: MediaSource | null
+  audio: HTMLAudioElement
+  audioContext: AudioContext
+  sourceBuffer?: SourceBuffer
+  cacheBuffers: ArrayBuffer[] = []
+  pauseTimer: number | null = null
+  msgId: string | undefined
+  msgContent: string | null | undefined = null
+  voice: string | undefined = undefined
+  isLoadData = false
+  url: string
+  isPublic: boolean
+  callback: ((event: string) => {}) | null
+  // True once the producer has signalled that no further audio will arrive.
+  private streamFinished = false
+
+  /**
+   * @param streamUrl Endpoint handed to `textToAudioStream` by `playAudio()`.
+   * @param isPublic Forwarded to `textToAudioStream` unchanged.
+   * @param msgId Sent as `message_id` when audio is requested.
+   * @param msgContent Sent as `text` when audio is requested.
+   * @param callback Receives playback event names; replaceable via `setCallback()`.
+   */
+  constructor(streamUrl: string, isPublic: boolean, msgId: string | undefined, msgContent: string | null | undefined, callback: ((event: string) => {}) | null) {
+    this.audioContext = new AudioContext()
+    this.msgId = msgId
+    this.msgContent = msgContent
+    this.url = streamUrl
+    this.isPublic = isPublic
+    this.callback = callback
+
+    // Compatible with iphone ios17 ManagedMediaSource
+    const MediaSource = window.MediaSource || window.ManagedMediaSource
+    if (!MediaSource) {
+      Toast.notify({
+        message: 'Your browser does not support audio streaming, if you are using an iPhone, please update to iOS 17.1 or later.',
+        type: 'error',
+      })
+    }
+    this.mediaSource = MediaSource ? new MediaSource() : null
+    this.audio = new Audio()
+    this.bindAudioEvents()
+    this.audio.src = this.mediaSource ? URL.createObjectURL(this.mediaSource) : ''
+    this.audio.autoplay = true
+
+    const source = this.audioContext.createMediaElementSource(this.audio)
+    source.connect(this.audioContext.destination)
+    this.listenMediaSource('audio/mpeg')
+  }
+
+  /** Replaces the message id sent with later audio requests. */
+  public resetMsgId(msgId: string) {
+    this.msgId = msgId
+  }
+
+  /**
+   * Creates the source buffer when the media source opens and keeps the queue
+   * moving: every finished append pulls the next queued chunk.
+   */
+  private listenMediaSource(contentType: string) {
+    this.mediaSource?.addEventListener('sourceopen', () => {
+      if (!this.sourceBuffer) {
+        this.sourceBuffer = this.mediaSource?.addSourceBuffer(contentType)
+        this.sourceBuffer?.addEventListener('updateend', () => this.onBufferReady())
+      }
+      // Chunks may have been queued before the media source opened.
+      this.onBufferReady()
+    })
+  }
+
+  /**
+   * Sets the receiver of playback events. The audio element listeners are
+   * registered once by the constructor and read `this.callback` when they
+   * fire, so repeated calls never stack duplicate listeners.
+   */
+  public setCallback(callback: ((event: string) => {}) | null) {
+    this.callback = callback
+  }
+
+  /** Wires audio element events to `this.callback`; called once per player. */
+  private bindAudioEvents() {
+    // DOM event name -> name reported to the callback.
+    const reportedNames: Record<string, string> = {
+      ended: 'ended',
+      // The element fires `pause`; 'paused' is the name `pauseAudio()` reports.
+      pause: 'paused',
+      // NOTE(review): `loaded` is not a standard media element event, so this
+      // entry probably never fires — confirm which event was intended.
+      loaded: 'loaded',
+      play: 'play',
+      timeupdate: 'timeupdate',
+      loadeddata: 'loadeddata',
+      canplay: 'canplay',
+      error: 'error',
+    }
+    Object.entries(reportedNames).forEach(([domEvent, reported]) => {
+      this.audio.addEventListener(domEvent, () => {
+        this.callback?.(reported)
+      })
+    })
+  }
+
+  /**
+   * Requests the audio stream and queues every received chunk. Any failure
+   * resets `isLoadData` and reports 'error'.
+   */
+  private async loadAudio() {
+    try {
+      const audioResponse: any = await textToAudioStream(this.url, this.isPublic, { content_type: 'audio/mpeg' }, {
+        message_id: this.msgId,
+        streaming: true,
+        voice: this.voice,
+        text: this.msgContent,
+      })
+
+      if (audioResponse.status !== 200) {
+        this.isLoadData = false
+        this.callback?.('error')
+        // Stop here so an error response body is never queued as audio.
+        return
+      }
+
+      const reader = audioResponse.body.getReader()
+      while (true) {
+        const { value, done } = await reader.read()
+        // A missing `value` makes receiveAudioData() finish the stream.
+        this.receiveAudioData(value)
+        if (done)
+          break
+      }
+    }
+    catch (error) {
+      this.isLoadData = false
+      this.callback?.('error')
+    }
+  }
+
+  /**
+   * Starts loading on first use. Afterwards it resumes playback when the audio
+   * context is suspended or the element has ended, and otherwise only reports
+   * 'play'.
+   */
+  public playAudio() {
+    if (!this.isLoadData) {
+      this.isLoadData = true
+      this.loadAudio()
+      return
+    }
+
+    if (this.audioContext.state === 'suspended' || this.audio.ended)
+      this.startPlayback()
+    else
+      this.callback?.('play')
+  }
+
+  /**
+   * Resumes the audio context, then plays the element. Reports 'play' on
+   * success and 'error' when either step is rejected.
+   */
+  private startPlayback() {
+    this.audioContext.resume()
+      .then(() => this.audio.play())
+      .then(() => this.callback?.('play'))
+      .catch(() => this.callback?.('error'))
+  }
+
+  /**
+   * Marks the stream complete; the media source is ended as soon as every
+   * queued chunk has been appended.
+   */
+  private finishStream() {
+    this.streamFinished = true
+    this.drainCache()
+  }
+
+  /** Event-handler entry point: drains the queue and reports a failure as 'error'. */
+  private onBufferReady() {
+    try {
+      this.drainCache()
+    }
+    catch (error) {
+      this.isLoadData = false
+      this.callback?.('error')
+    }
+  }
+
+  /**
+   * Appends the next queued chunk when the source buffer is idle. With an empty
+   * queue and a finished stream it ends the media source, but only while that
+   * is open, because `endOfStream()` throws in any other state.
+   */
+  private drainCache() {
+    const buffer = this.sourceBuffer
+    if (!buffer || buffer.updating)
+      return
+
+    const chunk = this.cacheBuffers.shift()
+    if (chunk) {
+      buffer.appendBuffer(chunk)
+      return
+    }
+
+    const source = this.mediaSource
+    if (this.streamFinished && source && source.readyState === 'open')
+      source.endOfStream()
+  }
+
+  /**
+   * Queues one base64-encoded chunk and, when `play` is true, makes sure
+   * playback is running. An empty `audio` marks the end of the stream.
+   */
+  public async playAudioWithAudio(audio: string, play = true) {
+    if (!audio || !audio.length) {
+      this.finishStream()
+      return
+    }
+
+    const audioContent = Buffer.from(audio, 'base64')
+    this.receiveAudioData(new Uint8Array(audioContent))
+    if (!play)
+      return
+
+    this.isLoadData = true
+    // Only a paused or ended element needs an explicit play() call.
+    if (this.audio.paused || this.audio.ended)
+      this.startPlayback()
+  }
+
+  /** Reports 'paused', then pauses the element and suspends the audio context. */
+  public pauseAudio() {
+    this.callback?.('paused')
+    this.audio.pause()
+    this.audioContext.suspend()
+  }
+
+  /**
+   * Queues a chunk for appending. A missing chunk, or an empty one while the
+   * media source is open, is treated as the end of the stream.
+   */
+  private receiveAudioData(unit8Array: Uint8Array) {
+    if (!unit8Array) {
+      this.finishStream()
+      return
+    }
+    const audioData = this.byteArrayToArrayBuffer(unit8Array)
+    if (!audioData.byteLength) {
+      if (this.mediaSource?.readyState === 'open')
+        this.finishStream()
+      return
+    }
+    // Without MediaSource support nothing could ever consume the queue.
+    if (!this.mediaSource)
+      return
+
+    // New data means the stream is no longer finished.
+    this.streamFinished = false
+    this.cacheBuffers.push(audioData)
+    this.drainCache()
+  }
+
+  /** Copies the bytes into a standalone `ArrayBuffer` of the same length. */
+  private byteArrayToArrayBuffer(byteArray: Uint8Array): ArrayBuffer {
+    const arrayBuffer = new ArrayBuffer(byteArray.length)
+    new Uint8Array(arrayBuffer).set(byteArray)
+    return arrayBuffer
+  }
+}
--- a/web/app/components/base/audio-btn/index.tsx
+++ b/web/app/components/base/audio-btn/index.tsx
@@ -1,124 +1,78 @@
 'use client'
-import { useEffect, useRef, useState } from 'react'
+import { useRef, useState } from 'react'
 import { t } from 'i18next'
 import { useParams, usePathname } from 'next/navigation'
 import s from './style.module.css'
 import Tooltip from '@/app/components/base/tooltip'
 import { randomString } from '@/utils'
-import { textToAudio } from '@/service/share'
 import Loading from '@/app/components/base/loading'
+import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

 type AudioBtnProps = {
-  value: string
+  id?: string
  voice?: string
+  value?: string
  className?: string
  isAudition?: boolean
-  noCache: boolean
+  noCache?: boolean
 }

 type AudioState = 'initial' | 'loading' | 'playing' | 'paused' | 'ended'

 const AudioBtn = ({
-  value,
+  id,
  voice,
+  value,
  className,
  isAudition,
-  noCache,
 }: AudioBtnProps) => {
-  const audioRef = useRef<HTMLAudioElement | null>(null)
  const [audioState, setAudioState] = useState<AudioState>('initial')

  const selector = useRef(`play-tooltip-${randomString(4)}`)
  const params = useParams()
  const pathname = usePathname()
-  const removeCodeBlocks = (inputText: any) => {
-    const codeBlockRegex = /```[\s\S]*?```/g
-    if (inputText)
-      return inputText.replace(codeBlockRegex, '')
-    return ''
-  }
-
-  const loadAudio = async () => {
-    const formData = new FormData()
-    formData.append('text', removeCodeBlocks(value))
-    formData.append('voice', removeCodeBlocks(voice))
-
-    if (value !== '') {
-      setAudioState('loading')
-
-      let url = ''
-      let isPublic = false
-
-      if (params.token) {
-        url = '/text-to-audio'
-        isPublic = true
-      }
-      else if (params.appId) {
-        if (pathname.search('explore/installed') > -1)
-          url = `/installed-apps/${params.appId}/text-to-audio`
-        else
-          url = `/apps/${params.appId}/text-to-audio`
-      }
-
-      try {
-        const audioResponse = await textToAudio(url, isPublic, formData)
-        const blob_bytes = Buffer.from(audioResponse.data, 'latin1')
-        const blob = new Blob([blob_bytes], { type: 'audio/wav' })
-        const audioUrl = URL.createObjectURL(blob)
-        audioRef.current!.src = audioUrl
-      }
-      catch (error) {
-        setAudioState('initial')
-        console.error('Error playing audio:', error)
-      }
-    }
-  }
-
-  const handleToggle = async () => {
-    if (audioState === 'initial' || noCache) {
-      await loadAudio()
-    }
-    else if (audioRef.current) {
-      if (audioState === 'playing') {
-        audioRef.current.pause()
-        setAudioState('paused')
-      }
-      else {
-        audioRef.current.play()
+  const audio_finished_call = (event: string): any => {
+    switch (event) {
+      case 'ended':
+        setAudioState('ended')
+        break
+      case 'paused':
+        setAudioState('ended')
+        break
+      case 'loaded':
+        setAudioState('loading')
+        break
+      case 'play':
        setAudioState('playing')
-      }
+        break
+      case 'error':
+        setAudioState('ended')
+        break
    }
  }
+  let url = ''
+  let isPublic = false

-  useEffect(() => {
-    const currentAudio = audioRef.current
-
-    const handleLoading = () => {
+  if (params.token) {
+    url = '/text-to-audio'
+    isPublic = true
+  }
+  else if (params.appId) {
+    if (pathname.search('explore/installed') > -1)
+      url = `/installed-apps/${params.appId}/text-to-audio`
+    else
+      url = `/apps/${params.appId}/text-to-audio`
+  }
+  const handleToggle = async () => {
+    if (audioState === 'playing' || audioState === 'loading') {
+      setAudioState('paused')
+      AudioPlayerManager.getInstance().getAudioPlayer(url, isPublic, id, value, voice, audio_finished_call).pauseAudio()
+    }
+    else {
      setAudioState('loading')
+      AudioPlayerManager.getInstance().getAudioPlayer(url, isPublic, id, value, voice, audio_finished_call).playAudio()
    }
-
-    const handlePlay = () => {
-      currentAudio?.play()
-      setAudioState('playing')
-    }
-
-    const handleEnded = () => {
-      setAudioState('ended')
-    }
-
-    currentAudio?.addEventListener('progress', handleLoading)
-    currentAudio?.addEventListener('canplaythrough', handlePlay)
-    currentAudio?.addEventListener('ended', handleEnded)
-
-    return () => {
-      currentAudio?.removeEventListener('progress', handleLoading)
-      currentAudio?.removeEventListener('canplaythrough', handlePlay)
-      currentAudio?.removeEventListener('ended', handleEnded)
-      URL.revokeObjectURL(currentAudio?.src || '')
-      currentAudio?.pause()
-      currentAudio?.setAttribute('src', '')
-    }
-  }, [])
+  }

  const tooltipContent = {
    initial: t('appApi.play'),
@@ -151,7 +105,6 @@ const AudioBtn = ({
            )}
        </button>
      </Tooltip>
-      <audio ref={audioRef} src='' className='hidden' />
    </div>
  )
 }
--- a/web/app/components/base/chat/chat/answer/index.tsx
+++ b/web/app/components/base/chat/chat/answer/index.tsx
@@ -8,6 +8,7 @@ import type {
  ChatConfig,
  ChatItem,
 } from '../../types'
+import { useChatContext } from '../context'
 import Operation from './operation'
 import AgentContent from './agent-content'
 import BasicContent from './basic-content'
@@ -59,23 +60,25 @@ const Answer: FC<AnswerProps> = ({
  } = item
  const hasAgentThoughts = !!agent_thoughts?.length

-  const [containerWidth, setContainerWidth] = useState(0)
+  const [containerWidth] = useState(0)
  const [contentWidth, setContentWidth] = useState(0)
  const containerRef = useRef<HTMLDivElement>(null)
  const contentRef = useRef<HTMLDivElement>(null)

-  const getContainerWidth = () => {
-    if (containerRef.current)
-      setContainerWidth(containerRef.current?.clientWidth + 16)
-  }
+  const {
+    config: chatContextConfig,
+  } = useChatContext()
+
+  const voiceRef = useRef(chatContextConfig?.text_to_speech?.voice)
  const getContentWidth = () => {
    if (contentRef.current)
      setContentWidth(contentRef.current?.clientWidth)
  }

  useEffect(() => {
-    getContainerWidth()
-  }, [])
+    voiceRef.current = chatContextConfig?.text_to_speech?.voice
+  }
+  , [chatContextConfig?.text_to_speech?.voice])

  useEffect(() => {
    if (!responding)
--- a/web/app/components/base/chat/chat/answer/operation.tsx
+++ b/web/app/components/base/chat/chat/answer/operation.tsx
@@ -119,9 +119,9 @@ const Operation: FC<OperationProps> = ({
              <>
                <div className='mx-1 w-[1px] h-[14px] bg-gray-200'/>
                <AudioBtn
+                  id={id}
                  value={content}
                  noCache={false}
-                  voice={config?.text_to_speech?.voice}
                  className='hidden group-hover:block'
                />
              </>
--- a/web/app/components/base/chat/chat/hooks.ts
+++ b/web/app/components/base/chat/chat/hooks.ts
@@ -6,6 +6,8 @@ import {
 } from 'react'
 import { useTranslation } from 'react-i18next'
 import { produce, setAutoFreeze } from 'immer'
+import { useParams, usePathname } from 'next/navigation'
+import { v4 as uuidV4 } from 'uuid'
 import type {
  ChatConfig,
  ChatItem,
@@ -20,6 +22,7 @@ import { replaceStringWithValues } from '@/app/components/app/configuration/prom
 import type { Annotation } from '@/models/log'
 import { WorkflowRunningStatus } from '@/app/components/workflow/types'
 import useTimestamp from '@/hooks/use-timestamp'
+import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

 type GetAbortController = (abortController: AbortController) => void
 type SendCallback = {
@@ -91,7 +94,8 @@ export const useChat = (
  const conversationMessagesAbortControllerRef = useRef<AbortController | null>(null)
  const suggestedQuestionsAbortControllerRef = useRef<AbortController | null>(null)
  const checkPromptVariables = useCheckPromptVariables()
-
+  const params = useParams()
+  const pathname = usePathname()
  useEffect(() => {
    setAutoFreeze(false)
    return () => {
@@ -262,6 +266,19 @@ export const useChat = (
    let isAgentMode = false
    let hasSetResponseId = false

+    let ttsUrl = ''
+    let ttsIsPublic = false
+    if (params.token) {
+      ttsUrl = '/text-to-audio'
+      ttsIsPublic = true
+    }
+    else if (params.appId) {
+      if (pathname.search('explore/installed') > -1)
+        ttsUrl = `/installed-apps/${params.appId}/text-to-audio`
+      else
+        ttsUrl = `/apps/${params.appId}/text-to-audio`
+    }
+    const player = AudioPlayerManager.getInstance().getAudioPlayer(ttsUrl, ttsIsPublic, uuidV4(), 'none', 'none', (_: any): any => {})
    ssePost(
      url,
      {
@@ -530,6 +547,15 @@ export const useChat = (
            }
          }))
        },
+        onTTSChunk: (messageId: string, audio: string) => {
+          if (!audio || audio === '')
+            return
+          player.playAudioWithAudio(audio, true)
+          AudioPlayerManager.getInstance().resetMsgId(messageId)
+        },
+        onTTSEnd: (messageId: string, audio: string) => {
+          player.playAudioWithAudio(audio, false)
+        },
      })
    return true
  }, [
--- a/web/app/components/base/features/feature-panel/text-to-speech/param-config-content.tsx
+++ b/web/app/components/base/features/feature-panel/text-to-speech/param-config-content.tsx
@@ -19,6 +19,8 @@ import type { Item } from '@/app/components/base/select'
 import { fetchAppVoices } from '@/service/apps'
 import Tooltip from '@/app/components/base/tooltip'
 import { languages } from '@/i18n/language'
+import RadioGroup from '@/app/components/app/configuration/config-vision/radio-group'
+import { TtsAutoPlay } from '@/types/app'

 type VoiceParamConfigProps = {
  onChange?: OnFeaturesChange
@@ -33,12 +35,16 @@ const VoiceParamConfig = ({
  const text2speech = useFeatures(state => state.features.text2speech)
  const featuresStore = useFeaturesStore()

-  const languageItem = languages.find(item => item.value === text2speech.language)
+  let languageItem = languages.find(item => item.value === text2speech?.language)
+  if (languages && !languageItem)
+    languageItem = languages[0]
  const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')

  const language = languageItem?.value
  const voiceItems = useSWR({ appId, language }, fetchAppVoices).data
-  const voiceItem = voiceItems?.find(item => item.value === text2speech.voice)
+  let voiceItem = voiceItems?.find(item => item.value === text2speech?.voice)
+  if (voiceItems && !voiceItem)
+    voiceItem = voiceItems[0]
  const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')

  const handleChange = (value: Record<string, string>) => {
@@ -66,13 +72,14 @@ const VoiceParamConfig = ({
        <div className='pt-3 space-y-6'>
          <div>
            <div className='mb-2 flex items-center  space-x-1'>
-              <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
-              <Tooltip htmlContent={<div className='w-[180px]' >
+              <div
+                className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
+              <Tooltip htmlContent={<div className='w-[180px]'>
                {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
                  <div key={item}>{item}</div>
                ))}
              </div>} selector='config-resolution-tooltip'>
-                <RiQuestionLine className='w-[14px] h-[14px] text-gray-400' />
+                <RiQuestionLine className='w-[14px] h-[14px] text-gray-400'/>
              </Tooltip>
            </div>
            <Listbox
@@ -84,7 +91,8 @@ const VoiceParamConfig = ({
              }}
            >
              <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
                  <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
                    {languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder}
                  </span>
@@ -102,7 +110,8 @@ const VoiceParamConfig = ({
                  leaveTo="opacity-0"
                >

-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                    {languages.map((item: Item) => (
                      <Listbox.Option
                        key={item.value}
@@ -117,13 +126,13 @@ const VoiceParamConfig = ({
                          <>
                            <span
                              className={classNames('block', selected && 'font-normal')}>{t(`common.voice.language.${(item.value).toString().replace('-', '')}`)}</span>
-                            {(selected || item.value === text2speech.language) && (
+                            {(selected || item.value === text2speech?.language) && (
                              <span
                                className={classNames(
                                  'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                )}
                              >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                              </span>
                            )}
                          </>
@@ -137,7 +146,8 @@ const VoiceParamConfig = ({
          </div>

          <div>
-            <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
            <Listbox
              value={voiceItem}
              disabled={!languageItem}
@@ -148,8 +158,10 @@ const VoiceParamConfig = ({
              }}
            >
              <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
-                  <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                  <span
+                    className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
                  <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
                    <ChevronDownIcon
                      className="h-5 w-5 text-gray-400"
@@ -164,7 +176,8 @@ const VoiceParamConfig = ({
                  leaveTo="opacity-0"
                >

-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                    {voiceItems?.map((item: Item) => (
                      <Listbox.Option
                        key={item.value}
@@ -178,13 +191,13 @@ const VoiceParamConfig = ({
                        {({ /* active, */ selected }) => (
                          <>
                            <span className={classNames('block', selected && 'font-normal')}>{item.name}</span>
-                            {(selected || item.value === text2speech.voice) && (
+                            {(selected || item.value === text2speech?.voice) && (
                              <span
                                className={classNames(
                                  'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                )}
                              >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                              </span>
                            )}
                          </>
@@ -196,6 +209,29 @@ const VoiceParamConfig = ({
              </div>
            </Listbox>
          </div>
+          <div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.autoPlay')}</div>
+            <RadioGroup
+              className='space-x-3'
+              options={[
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayEnabled'),
+                  value: TtsAutoPlay.enabled,
+                },
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayDisabled'),
+                  value: TtsAutoPlay.disabled,
+                },
+              ]}
+              value={text2speech?.autoPlay ? text2speech?.autoPlay : TtsAutoPlay.disabled}
+              onChange={(value: TtsAutoPlay) => {
+                handleChange({
+                  autoPlay: value,
+                })
+              }}
+            />
+          </div>
        </div>
      </div>
    </div>
--- a/web/app/components/base/features/types.ts
+++ b/web/app/components/base/features/types.ts
@@ -1,4 +1,4 @@
-import type { TransferMethod } from '@/types/app'
+import type { TransferMethod, TtsAutoPlay } from '@/types/app'

 export type EnabledOrDisabled = {
  enabled?: boolean
@@ -14,6 +14,7 @@ export type SuggestedQuestionsAfterAnswer = EnabledOrDisabled
 export type TextToSpeech = EnabledOrDisabled & {
  language?: string
  voice?: string
+  autoPlay?: TtsAutoPlay
 }

 export type SpeechToText = EnabledOrDisabled
--- a/web/app/components/workflow/hooks/use-workflow-run.ts
+++ b/web/app/components/workflow/hooks/use-workflow-run.ts
@@ -4,6 +4,8 @@ import {
  useStoreApi,
 } from 'reactflow'
 import produce from 'immer'
+import { v4 as uuidV4 } from 'uuid'
+import { usePathname } from 'next/navigation'
 import { useWorkflowStore } from '../store'
 import { useNodesSyncDraft } from '../hooks'
 import {
@@ -19,6 +21,7 @@ import {
  stopWorkflowRun,
 } from '@/service/workflow'
 import { useFeaturesStore } from '@/app/components/base/features/hooks'
+import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

 export const useWorkflowRun = () => {
  const store = useStoreApi()
@@ -27,6 +30,7 @@ export const useWorkflowRun = () => {
  const featuresStore = useFeaturesStore()
  const { doSyncWorkflowDraft } = useNodesSyncDraft()
  const { handleUpdateWorkflowCanvas } = useWorkflowUpdate()
+  const pathname = usePathname()

  const handleBackupDraft = useCallback(() => {
    const {
@@ -134,6 +138,20 @@ export const useWorkflowRun = () => {
    let isInIteration = false
    let iterationLength = 0

+    let ttsUrl = ''
+    let ttsIsPublic = false
+    if (params.token) {
+      ttsUrl = '/text-to-audio'
+      ttsIsPublic = true
+    }
+    else if (params.appId) {
+      if (pathname.search('explore/installed') > -1)
+        ttsUrl = `/installed-apps/${params.appId}/text-to-audio`
+      else
+        ttsUrl = `/apps/${params.appId}/text-to-audio`
+    }
+    const player = AudioPlayerManager.getInstance().getAudioPlayer(ttsUrl, ttsIsPublic, uuidV4(), 'none', 'none', (_: any): any => {})
+
    ssePost(
      url,
      {
@@ -468,6 +486,15 @@ export const useWorkflowRun = () => {
            draft.resultText = text
          }))
        },
+        onTTSChunk: (messageId: string, audio: string, audioType?: string) => {
+          if (!audio) // empty or missing payload: nothing to hand to the player
+            return
+          player.playAudioWithAudio(audio, true)
+          AudioPlayerManager.getInstance().resetMsgId(messageId) // runs after every non-empty chunk
+        },
+        onTTSEnd: (messageId: string, audio: string, audioType?: string) => {
+          player.playAudioWithAudio(audio, false) // same call as for chunks, but with the flag set to false
+        },
        ...restCallback,
      },
    )
--- a/web/i18n/en-US/app-debug.ts
+++ b/web/i18n/en-US/app-debug.ts
@@ -323,6 +323,9 @@ const translation = {
      language: 'Language',
      resolutionTooltip: 'Text-to-speech voice support language。',
      voice: 'Voice',
+      autoPlay: 'Auto Play',
+      autoPlayEnabled: 'Turn On',
+      autoPlayDisabled: 'Turn Off',
    },
  },
  openingStatement: {
--- a/web/i18n/ja-JP/app-debug.ts
+++ b/web/i18n/ja-JP/app-debug.ts
@@ -319,6 +319,9 @@ const translation = {
      language: '言語',
      resolutionTooltip: 'テキスト読み上げの音声言語をサポートします。',
      voice: '音声',
+      autoPlay: '自動再生',
+      autoPlayEnabled: 'オン',
+      autoPlayDisabled: 'オフ',
    },
  },
  openingStatement: {
--- a/web/i18n/zh-Hans/app-debug.ts
+++ b/web/i18n/zh-Hans/app-debug.ts
@@ -319,6 +319,9 @@ const translation = {
      language: '语言',
      resolutionTooltip: '文本转语音音色支持语言。',
      voice: '音色',
+      autoPlay: '自动播放',
+      autoPlayEnabled: '开启',
+      autoPlayDisabled: '关闭',
    },
  },
  openingStatement: {
--- a/web/i18n/zh-Hant/app-debug.ts
+++ b/web/i18n/zh-Hant/app-debug.ts
@@ -318,6 +318,9 @@ const translation = {
      language: '語言',
      resolutionTooltip: '文字轉語音音色支援語言。',
      voice: '音色',
+      autoPlay: '自動播放',
+      autoPlayEnabled: '開啟',
+      autoPlayDisabled: '關閉',
    },
  },
  openingStatement: {
--- a/web/models/debug.ts
+++ b/web/models/debug.ts
@@ -1,4 +1,4 @@
-import type { AgentStrategy, ModelModeType, RETRIEVE_TYPE, ToolItem } from '@/types/app'
+import type { AgentStrategy, ModelModeType, RETRIEVE_TYPE, ToolItem, TtsAutoPlay } from '@/types/app'
 export type Inputs = Record<string, string | number | object>

 export enum PromptMode {
@@ -79,6 +79,7 @@ export type TextToSpeechConfig = {
  enabled: boolean
  voice?: string
  language?: string
+  autoPlay?: TtsAutoPlay
 }

 export type CitationConfig = MoreLikeThisConfig
--- a/web/next.config.js
+++ b/web/next.config.js
@@ -34,6 +34,7 @@ const nextConfig = {
    // https://nextjs.org/docs/api-reference/next.config.js/ignoring-typescript-errors
    ignoreBuildErrors: true,
  },
+  reactStrictMode: true,
  async redirects() {
    return [
      {
--- a/web/service/apps.ts
+++ b/web/service/apps.ts
@@ -120,6 +120,7 @@ export const generationIntroduction: Fetcher<GenerationIntroductionResponse, { u
 }

 export const fetchAppVoices: Fetcher<AppVoicesListResponse, { appId: string; language?: string }> = ({ appId, language }) => {
+  language = language || 'en-US'
  return get<AppVoicesListResponse>(`apps/${appId}/text-to-audio/voices?language=${language}`)
 }

--- a/web/service/base.ts
+++ b/web/service/base.ts
@@ -19,6 +19,7 @@ const TIME_OUT = 100000
 const ContentType = {
  json: 'application/json',
  stream: 'text/event-stream',
+  audio: 'audio/mpeg',
  form: 'application/x-www-form-urlencoded; charset=UTF-8',
  download: 'application/octet-stream', // for download
  upload: 'multipart/form-data', // for upload
@@ -59,6 +60,8 @@ export type IOnIterationStarted = (workflowStarted: IterationStartedResponse) =>
 export type IOnIterationNexted = (workflowStarted: IterationNextedResponse) => void
 export type IOnIterationFinished = (workflowFinished: IterationFinishedResponse) => void
 export type IOnTextChunk = (textChunk: TextChunkResponse) => void
+export type IOnTTSChunk = (messageId: string, audioStr: string, audioType?: string) => void // invoked by handleStream for each `tts_message` event
+export type IOnTTSEnd = (messageId: string, audioStr: string, audioType?: string) => void // invoked by handleStream for the `tts_message_end` event (called there without audioType)
 export type IOnTextReplace = (textReplace: TextReplaceResponse) => void

 export type IOtherOptions = {
@@ -84,6 +87,8 @@ export type IOtherOptions = {
  onIterationNext?: IOnIterationNexted
  onIterationFinish?: IOnIterationFinished
  onTextChunk?: IOnTextChunk
+  onTTSChunk?: IOnTTSChunk
+  onTTSEnd?: IOnTTSEnd
  onTextReplace?: IOnTextReplace
 }

@@ -135,6 +140,8 @@ const handleStream = (
  onIterationNext?: IOnIterationNexted,
  onIterationFinish?: IOnIterationFinished,
  onTextChunk?: IOnTextChunk,
+  onTTSChunk?: IOnTTSChunk,
+  onTTSEnd?: IOnTTSEnd,
  onTextReplace?: IOnTextReplace,
 ) => {
  if (!response.ok)
@@ -227,6 +234,12 @@ const handleStream = (
            else if (bufferObj.event === 'text_replace') {
              onTextReplace?.(bufferObj as TextReplaceResponse)
            }
+            else if (bufferObj.event === 'tts_message') {
+              onTTSChunk?.(bufferObj.message_id, bufferObj.audio, bufferObj.audio_type)
+            }
+            else if (bufferObj.event === 'tts_message_end') {
+              onTTSEnd?.(bufferObj.message_id, bufferObj.audio)
+            }
          }
        })
        buffer = lines[lines.length - 1]
@@ -390,9 +403,10 @@ const baseFetch = <T>(
          }

          // return data
-          const data: Promise<T> = options.headers.get('Content-type') === ContentType.download ? res.blob() : res.json()
+          if (options.headers.get('Content-type') === ContentType.download || options.headers.get('Content-type') === ContentType.audio)
+            resolve(needAllResponseContent ? resClone : res.blob())

-          resolve(needAllResponseContent ? resClone : data)
+          else resolve(needAllResponseContent ? resClone : res.json())
        })
        .catch((err) => {
          if (!silent)
@@ -475,6 +489,8 @@ export const ssePost = (
    onIterationNext,
    onIterationFinish,
    onTextChunk,
+    onTTSChunk,
+    onTTSEnd,
    onTextReplace,
    onError,
    getAbortController,
@@ -527,7 +543,7 @@ export const ssePost = (
          return
        }
        onData?.(str, isFirstMessage, moreInfo)
-      }, onCompleted, onThought, onMessageEnd, onMessageReplace, onFile, onWorkflowStarted, onWorkflowFinished, onNodeStarted, onNodeFinished, onIterationStart, onIterationNext, onIterationFinish, onTextChunk, onTextReplace)
+      }, onCompleted, onThought, onMessageEnd, onMessageReplace, onFile, onWorkflowStarted, onWorkflowFinished, onNodeStarted, onNodeFinished, onIterationStart, onIterationNext, onIterationFinish, onTextChunk, onTTSChunk, onTTSEnd, onTextReplace)
    }).catch((e) => {
      if (e.toString() !== 'AbortError: The user aborted a request.')
        Toast.notify({ type: 'error', message: e })
--- a/web/service/share.ts
+++ b/web/service/share.ts
@@ -1,4 +1,4 @@
-import type { IOnCompleted, IOnData, IOnError, IOnFile, IOnIterationFinished, IOnIterationNexted, IOnIterationStarted, IOnMessageEnd, IOnMessageReplace, IOnNodeFinished, IOnNodeStarted, IOnTextChunk, IOnTextReplace, IOnThought, IOnWorkflowFinished, IOnWorkflowStarted } from './base'
+import type { IOnCompleted, IOnData, IOnError, IOnFile, IOnIterationFinished, IOnIterationNexted, IOnIterationStarted, IOnMessageEnd, IOnMessageReplace, IOnNodeFinished, IOnNodeStarted, IOnTTSChunk, IOnTTSEnd, IOnTextChunk, IOnTextReplace, IOnThought, IOnWorkflowFinished, IOnWorkflowStarted } from './base'
 import {
  del as consoleDel, get as consoleGet, patch as consolePatch, post as consolePost,
  delPublic as del, getPublic as get, patchPublic as patch, postPublic as post, ssePost,
@@ -30,7 +30,7 @@ export function getUrl(url: string, isInstalledApp: boolean, installedAppId: str
  return isInstalledApp ? `installed-apps/${installedAppId}/${url.startsWith('/') ? url.slice(1) : url}` : url
 }

-export const sendChatMessage = async (body: Record<string, any>, { onData, onCompleted, onThought, onFile, onError, getAbortController, onMessageEnd, onMessageReplace }: {
+export const sendChatMessage = async (body: Record<string, any>, { onData, onCompleted, onThought, onFile, onError, getAbortController, onMessageEnd, onMessageReplace, onTTSChunk, onTTSEnd }: {
  onData: IOnData
  onCompleted: IOnCompleted
  onFile: IOnFile
@@ -39,13 +39,15 @@ export const sendChatMessage = async (body: Record<string, any>, { onData, onCom
  onMessageEnd?: IOnMessageEnd
  onMessageReplace?: IOnMessageReplace
  getAbortController?: (abortController: AbortController) => void
+  onTTSChunk?: IOnTTSChunk
+  onTTSEnd?: IOnTTSEnd
 }, isInstalledApp: boolean, installedAppId = '') => {
  return ssePost(getUrl('chat-messages', isInstalledApp, installedAppId), {
    body: {
      ...body,
      response_mode: 'streaming',
    },
-  }, { onData, onCompleted, onThought, onFile, isPublicAPI: !isInstalledApp, onError, getAbortController, onMessageEnd, onMessageReplace })
+  }, { onData, onCompleted, onThought, onFile, isPublicAPI: !isInstalledApp, onError, getAbortController, onMessageEnd, onMessageReplace, onTTSChunk, onTTSEnd })
 }

 export const stopChatMessageResponding = async (appId: string, taskId: string, isInstalledApp: boolean, installedAppId = '') => {
@@ -214,6 +216,10 @@ export const textToAudio = (url: string, isPublicAPI: boolean, body: FormData) =
  return (getAction('post', !isPublicAPI))(url, { body }, { bodyStringify: false, deleteContentType: true }) as Promise<{ data: string }>
 }

+export const textToAudioStream = (url: string, isPublicAPI: boolean, header: { content_type: string }, body: { streaming: boolean; voice?: string; message_id?: string; text?: string | null | undefined }) => { // POSTs `body` to `url` with needAllResponseContent set
+  return (getAction('post', !isPublicAPI))(url, { body, header }, { needAllResponseContent: true }) // NOTE(review): options carry the key `header`, while baseFetch reads `options.headers` — confirm this value is actually sent
+}
+
 export const fetchAccessToken = async (appCode: string) => {
  const headers = new Headers()
  headers.append('X-App-Code', appCode)
--- a/web/types/app.ts
+++ b/web/types/app.ts
@@ -160,6 +160,7 @@ export type ModelConfig = {
    enabled: boolean
    voice?: string
    language?: string
+    autoPlay?: TtsAutoPlay
  }
  retriever_resource: {
    enabled: boolean
@@ -349,6 +350,11 @@ export enum TransferMethod {
  remote_url = 'remote_url',
 }

+export enum TtsAutoPlay { // values accepted by the text-to-speech `autoPlay` setting
+  enabled = 'enabled',
+  disabled = 'disabled',
+}
+
 export const ALLOW_FILE_EXTENSIONS = ['png', 'jpg', 'jpeg', 'webp', 'gif']

 export type VisionSettings = {