push2talk

This commit is contained in:
gcw_4spBpAfv
2026-03-02 12:00:33 +08:00
parent 2f6166ab6c
commit 1701ecfb7f
19 changed files with 802 additions and 160 deletions

View File

@@ -37,6 +37,9 @@ class Live2DChatActivity : AppCompatActivity() {
@Volatile
private var isRecording: Boolean = false
private val holdToSpeakAudioBuffer = mutableListOf<Float>()
private val HOLD_TO_SPEAK_MIN_SAMPLES = 16000 // 1秒的音频数据
private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
private var recordingJob: Job? = null
@@ -77,13 +80,30 @@ class Live2DChatActivity : AppCompatActivity() {
scrollViewId = R.id.scroll_view,
startButtonId = R.id.start_button,
stopButtonId = R.id.stop_button,
recordButtonId = R.id.record_button,
traditionalButtonsId = R.id.traditional_buttons,
silentPlayerViewId = 0,
speakingPlayerViewId = 0,
live2dViewId = R.id.live2d_view
)
uiManager.setStartButtonListener { onStartClicked() }
uiManager.setStopButtonListener { onStopClicked(userInitiated = true) }
// 根据配置选择交互方式
uiManager.setUseHoldToSpeak(AppConfig.USE_HOLD_TO_SPEAK)
if (AppConfig.USE_HOLD_TO_SPEAK) {
uiManager.setRecordButtonTouchListener { isDown ->
if (isDown) {
// 按住按钮,开始录音
onRecordButtonDown()
} else {
// 松开按钮,停止录音
onRecordButtonUp()
}
}
} else {
uiManager.setStartButtonListener { onStartClicked() }
uiManager.setStopButtonListener { onStopClicked(userInitiated = true) }
}
ActivityCompat.requestPermissions(this, permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)
@@ -99,10 +119,16 @@ class Live2DChatActivity : AppCompatActivity() {
Log.w(AppConfig.TAG, "Streaming switch not found in layout: ${e.message}")
}
uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
if (AppConfig.USE_HOLD_TO_SPEAK) {
uiManager.setButtonsEnabled(recordEnabled = false)
} else {
uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
}
uiManager.setText("初始化中…")
audioProcessor = AudioProcessor(this)
ttsManager = TtsManager(this)
ttsManager.setCallback(createTtsCallback())
asrManager = AsrManager(this)
asrManager.setAudioProcessor(audioProcessor)
@@ -127,23 +153,28 @@ class Live2DChatActivity : AppCompatActivity() {
)
}
uiManager.setText(getString(R.string.hint))
uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
if (AppConfig.USE_HOLD_TO_SPEAK) {
uiManager.setButtonsEnabled(recordEnabled = true)
} else {
uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
}
}
} catch (t: Throwable) {
Log.e(AppConfig.TAG, "Initialization failed: ${t.message}", t)
withContext(Dispatchers.Main) {
uiManager.setText("初始化失败:${t.javaClass.simpleName}: ${t.message}")
uiManager.showToast("初始化失败(请看 Logcat: ${t.javaClass.simpleName}", Toast.LENGTH_LONG)
uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
if (AppConfig.USE_HOLD_TO_SPEAK) {
uiManager.setButtonsEnabled(recordEnabled = false)
} else {
uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
}
}
}
}
cloudApiManager = CloudApiManager(createCloudApiListener(), applicationContext)
cloudApiManager.setEnableStreaming(enableStreaming)
ttsManager = TtsManager(this)
ttsManager.setCallback(createTtsCallback())
Log.d(AppConfig.TAG, "Pre-starting ASR worker")
ioScope.launch {
@@ -205,10 +236,18 @@ class Live2DChatActivity : AppCompatActivity() {
}
ttsManager.enqueueEnd()
} else {
runOnUiThread {
uiManager.appendToUi("${response}\n")
val previousMood = com.digitalperson.mood.MoodManager.getCurrentMood()
val (filteredText, mood) = com.digitalperson.mood.MoodManager.extractAndFilterMood(response)
android.util.Log.d(com.digitalperson.config.AppConfig.TAG, "Final mood: $mood, filtered text: $filteredText")
if (mood != previousMood) {
uiManager.setMood(mood)
}
ttsManager.enqueueSegment(response)
runOnUiThread {
uiManager.appendToUi("${filteredText}\n")
}
ttsManager.enqueueSegment(filteredText)
ttsManager.enqueueEnd()
}
}
@@ -219,9 +258,18 @@ class Live2DChatActivity : AppCompatActivity() {
llmFirstChunkMarked = true
currentTrace?.markLlmFirstChunk()
}
uiManager.appendToUi(chunk)
val segments = segmenter.processChunk(chunk)
val previousMood = com.digitalperson.mood.MoodManager.getCurrentMood()
val (filteredText, mood) = com.digitalperson.mood.MoodManager.extractAndFilterMood(chunk)
if (mood != previousMood) {
android.util.Log.d(com.digitalperson.config.AppConfig.TAG, "Mood changed to: $mood")
// 设置Live2D人物的心情
uiManager.setMood(mood)
}
uiManager.appendToUi(filteredText)
val segments = segmenter.processChunk(filteredText)
for (seg in segments) {
ttsManager.enqueueSegment(seg)
}
@@ -353,6 +401,77 @@ class Live2DChatActivity : AppCompatActivity() {
Log.d(AppConfig.TAG, "onStartClicked completed")
}
/**
 * Hold-to-speak: press handler.
 *
 * Interrupts any in-flight TTS playback, starts a new conversation turn
 * (trace, UI, segmenter), and begins capturing microphone audio by launching
 * [processSamplesLoop] on the IO scope. No-op if a recording is already active.
 */
private fun onRecordButtonDown() {
    Log.d(AppConfig.TAG, "onRecordButtonDown called")
    if (isRecording) {
        Log.d(AppConfig.TAG, "Already recording, returning")
        return
    }
    // If TTS is currently playing, interrupt it so the new turn starts cleanly.
    val interrupted = ttsManager.interruptForNewTurn()
    if (interrupted) {
        uiManager.appendToUi("\n[LOG] 已打断TTS播放\n")
    }
    // Bail out early if the microphone cannot be initialized (e.g. missing permission).
    if (!audioProcessor.initMicrophone(permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)) {
        uiManager.showToast("麦克风初始化失败/无权限")
        return
    }
    currentTrace = TraceManager.getInstance().startNewTurn()
    currentTrace?.mark("turn_start")
    llmInFlight = false
    uiManager.clearText()
    // interruptForNewTurn() already prepared TTS state for next turn.
    // Keep reset() only for non-interrupt entry points.
    // NOTE(review): only segmenter.reset() is called below; confirm ttsManager
    // truly needs no reset on the non-interrupt path through this handler.
    ttsManager.setCurrentTrace(currentTrace)
    segmenter.reset()
    // Trigger the Live2D "hold to speak" motion while the button is held.
    uiManager.startSpecificMotion("hold_to_speak")
    holdToSpeakAudioBuffer.clear()
    audioProcessor.startRecording()
    isRecording = true
    Log.d(AppConfig.TAG, "Starting processSamplesLoop coroutine")
    // Cancel any stale capture job before launching a fresh one.
    recordingJob?.cancel()
    recordingJob = ioScope.launch {
        processSamplesLoop()
    }
    Log.d(AppConfig.TAG, "onRecordButtonDown completed")
}
/**
 * Hold-to-speak: release handler.
 *
 * Stops the microphone, drains any remaining recorded samples, and submits the
 * accumulated audio to ASR if at least HOLD_TO_SPEAK_MIN_SAMPLES samples were
 * captured (~1 second at 16 kHz); otherwise shows a "too short" toast.
 * No-op if no recording is active.
 */
private fun onRecordButtonUp() {
    Log.d(AppConfig.TAG, "onRecordButtonUp called")
    if (!isRecording) {
        Log.d(AppConfig.TAG, "Not recording, returning")
        return
    }
    isRecording = false
    audioProcessor.stopRecording()
    // BUG FIX: the previous code called recordingJob?.cancel() and immediately
    // launched a coroutine that read/mutated holdToSpeakAudioBuffer. cancel()
    // does not wait for the job to terminate, so the capture loop could still
    // be appending to the (non-thread-safe) MutableList concurrently, risking
    // ConcurrentModificationException or lost/corrupted samples. Cancel AND
    // join the old job inside the new coroutine before touching the buffer.
    val captureJob = recordingJob
    recordingJob = ioScope.launch {
        captureJob?.cancel()
        captureJob?.join()
        // Drain whatever the recorder still holds after stopRecording().
        val audioData = audioProcessor.getRecordedData()
        holdToSpeakAudioBuffer.addAll(audioData.toList())
        if (holdToSpeakAudioBuffer.size >= HOLD_TO_SPEAK_MIN_SAMPLES) {
            val finalAudio = holdToSpeakAudioBuffer.toFloatArray()
            // NOTE(review): both arguments are the same array here — confirm
            // enqueueAudioSegment's contract (raw vs gain-processed audio).
            asrManager.enqueueAudioSegment(finalAudio, finalAudio)
        } else {
            uiManager.showToast("录音时间太短请长按至少1秒")
        }
        holdToSpeakAudioBuffer.clear()
    }
    Log.d(AppConfig.TAG, "onRecordButtonUp completed")
}
private fun onStopClicked(userInitiated: Boolean) {
isRecording = false
audioProcessor.stopRecording()
@@ -362,7 +481,11 @@ class Live2DChatActivity : AppCompatActivity() {
ttsManager.stop()
uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
if (AppConfig.USE_HOLD_TO_SPEAK) {
uiManager.setButtonsEnabled(recordEnabled = true)
} else {
uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
}
if (userInitiated) {
TraceManager.getInstance().endTurn()
@@ -372,47 +495,62 @@ class Live2DChatActivity : AppCompatActivity() {
private suspend fun processSamplesLoop() {
Log.d(AppConfig.TAG, "processSamplesLoop started")
val windowSize = AppConfig.WINDOW_SIZE
val buffer = ShortArray(windowSize)
var loopCount = 0
while (isRecording && ioScope.coroutineContext.isActive) {
loopCount++
if (loopCount % 100 == 0) {
Log.d(AppConfig.TAG, "processSamplesLoop running, loopCount=$loopCount, ttsPlaying=${ttsManager.isPlaying()}")
}
if (ttsManager.isPlaying()) {
if (vadManager.isInSpeech()) {
Log.d(AppConfig.TAG, "TTS playing, resetting VAD state")
vadManager.clearState()
if (AppConfig.USE_HOLD_TO_SPEAK) {
// 按住说话模式累积音频数据到一定长度后再发送给ASR
while (isRecording && ioScope.coroutineContext.isActive) {
val audioData = audioProcessor.getAudioData()
if (audioData.isNotEmpty()) {
holdToSpeakAudioBuffer.addAll(audioData.toList())
}
// 避免CPU占用过高
kotlinx.coroutines.delay(10)
}
} else {
// 传统模式使用VAD
val windowSize = AppConfig.WINDOW_SIZE
val buffer = ShortArray(windowSize)
var loopCount = 0
while (isRecording && ioScope.coroutineContext.isActive) {
loopCount++
if (loopCount % 100 == 0) {
Log.d(AppConfig.TAG, "processSamplesLoop running, loopCount=$loopCount, ttsPlaying=${ttsManager.isPlaying()}")
}
if (ttsManager.isPlaying()) {
if (vadManager.isInSpeech()) {
Log.d(AppConfig.TAG, "TTS playing, resetting VAD state")
vadManager.clearState()
}
val ret = audioProcessor.readAudio(buffer)
if (ret <= 0) continue
continue
}
val ret = audioProcessor.readAudio(buffer)
if (ret <= 0) continue
continue
}
val ret = audioProcessor.readAudio(buffer)
if (ret <= 0) continue
if (ret != windowSize) continue
val chunk = audioProcessor.convertShortToFloat(buffer)
val processedChunk = audioProcessor.applyGain(chunk)
if (ret != windowSize) continue
val chunk = audioProcessor.convertShortToFloat(buffer)
val processedChunk = audioProcessor.applyGain(chunk)
val result = vadManager.processAudioChunk(chunk, processedChunk)
if (vadManager.vadComputeCount % 100 == 0) {
Log.d(AppConfig.TAG, "VAD result: $result, inSpeech=${vadManager.isInSpeech()}")
val result = vadManager.processAudioChunk(chunk, processedChunk)
if (vadManager.vadComputeCount % 100 == 0) {
Log.d(AppConfig.TAG, "VAD result: $result, inSpeech=${vadManager.isInSpeech()}")
}
if (loopCount % 1000 == 0) {
Log.d(AppConfig.TAG, "VAD status: inSpeech=${vadManager.isInSpeech()}, speechLen=${vadManager.getSpeechLength()}")
}
val forced = segmenter.maybeForceByTime()
for (seg in forced) ttsManager.enqueueSegment(seg)
}
if (loopCount % 1000 == 0) {
Log.d(AppConfig.TAG, "VAD status: inSpeech=${vadManager.isInSpeech()}, speechLen=${vadManager.getSpeechLength()}")
}
val forced = segmenter.maybeForceByTime()
for (seg in forced) ttsManager.enqueueSegment(seg)
vadManager.forceFinalize()
}
vadManager.forceFinalize()
Log.d(AppConfig.TAG, "processSamplesLoop stopped")
}
}