live2d model

This commit is contained in:
gcw_4spBpAfv
2026-03-02 09:25:50 +08:00
parent d63d4b03cf
commit 2f6166ab6c
179 changed files with 100625 additions and 2018 deletions

View File

@@ -1,957 +0,0 @@
package com.digitalperson
import android.Manifest
import android.content.pm.PackageManager
import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioManager
import android.media.AudioRecord
import android.media.AudioTrack
import android.media.MediaRecorder
import android.media.audiofx.AcousticEchoCanceler
import android.media.audiofx.NoiseSuppressor
import android.os.Bundle
import android.os.SystemClock
import android.text.method.ScrollingMovementMethod
import android.util.Log
import android.widget.Button
import android.widget.TextView
import android.widget.Toast
import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat
import com.digitalperson.cloud.CloudApiManager
import com.digitalperson.player.VideoPlayerManager
import com.google.android.exoplayer2.ui.PlayerView
import com.digitalperson.engine.SenseVoiceEngineRKNN
import com.digitalperson.metrics.TraceManager
import com.digitalperson.metrics.TraceSession
import com.k2fsa.sherpa.onnx.OfflineTts
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.VadModelConfig
import com.k2fsa.sherpa.onnx.getOfflineTtsConfig
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.isActive
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import java.io.File
import java.io.FileOutputStream
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.atomic.AtomicBoolean
import kotlin.math.max
private const val TAG = "DigitalPerson"
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200
class MainActivity : AppCompatActivity() {
private lateinit var startButton: Button
private lateinit var stopButton: Button
private lateinit var textView: TextView
private lateinit var vad: Vad
private var senseVoice: SenseVoiceEngineRKNN? = null
private var tts: OfflineTts? = null
private var track: AudioTrack? = null
private var aec: AcousticEchoCanceler? = null
private var ns: NoiseSuppressor? = null
private var audioRecord: AudioRecord? = null
private val audioSource = MediaRecorder.AudioSource.MIC
private val sampleRateInHz = 16000
private val channelConfig = AudioFormat.CHANNEL_IN_MONO
private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
@Volatile
private var isRecording: Boolean = false
private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
private var recordingJob: Job? = null
private val nativeLock = Any()
private lateinit var cloudApiManager: CloudApiManager
private var videoPlayerManager: VideoPlayerManager? = null
private val segmenter = StreamingTextSegmenter(
maxLen = 30,
maxWaitMs = 600
)
private sealed class TtsQueueItem {
data class Segment(val text: String) : TtsQueueItem()
data object End : TtsQueueItem()
}
private val ttsQueue = LinkedBlockingQueue<TtsQueueItem>()
private val ttsStopped = AtomicBoolean(false)
private val ttsWorkerRunning = AtomicBoolean(false)
private val ttsPlaying = AtomicBoolean(false)
@Volatile private var ttsTotalSamplesWritten: Long = 0
private var currentTrace: TraceSession? = null
private var lastUiText: String = ""
@Volatile private var llmInFlight: Boolean = false
private var enableStreaming = true // 默认启用流式输出
// ASR 队列和工作器
private val asrQueue = Channel<Pair<FloatArray, TraceSession?>>()
private val asrWorkerRunning = AtomicBoolean(false)
override fun onRequestPermissionsResult(
    requestCode: Int,
    permissions: Array<String>,
    grantResults: IntArray
) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults)
    // The whole app is useless without microphone access: close the activity
    // when RECORD_AUDIO is refused.
    val granted = requestCode == REQUEST_RECORD_AUDIO_PERMISSION &&
        grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED
    if (!granted) {
        Log.e(TAG, "Audio record is disallowed")
        finish()
    }
}
// Entry point: wires the UI, kicks off heavy model initialization on a
// background coroutine, and installs the cloud-LLM listener that drives the
// streaming-text -> TTS pipeline.
override fun onCreate(savedInstanceState: Bundle?) {
super.onCreate(savedInstanceState)
setContentView(R.layout.activity_main)
// Initialize the dual-player manager: two stacked PlayerViews (silent / speaking).
try {
val silentPv = findViewById<PlayerView>(R.id.player_view_silent)
val speakingPv = findViewById<PlayerView>(R.id.player_view_speaking)
videoPlayerManager = VideoPlayerManager(this, silentPv, speakingPv)
// The avatar starts in the "not speaking" state.
videoPlayerManager?.setSpeaking(false)
} catch (e: Exception) {
Log.w(TAG, "PlayerViews not found or init failed: ${e.message}")
}
ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
startButton = findViewById(R.id.start_button)
stopButton = findViewById(R.id.stop_button)
textView = findViewById(R.id.my_text)
textView.movementMethod = ScrollingMovementMethod()
startButton.setOnClickListener { onStartClicked() }
stopButton.setOnClickListener { onStopClicked(userInitiated = true) }
// Wire up the streaming-output toggle.
// NOTE(review): the listener dereferences cloudApiManager, which is only
// assigned near the end of onCreate; a toggle cannot normally happen before
// onCreate returns, but confirm there is no earlier path.
try {
val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
streamingSwitch.isChecked = enableStreaming
streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
enableStreaming = isChecked
cloudApiManager.setEnableStreaming(isChecked)
Toast.makeText(this, "流式输出已${if (isChecked) "启用" else "禁用"}", Toast.LENGTH_SHORT).show()
}
} catch (e: Exception) {
Log.w(TAG, "Streaming switch not found in layout: ${e.message}")
}
// Initialize models and the AudioTrack off the UI thread to avoid ANR.
startButton.isEnabled = false
stopButton.isEnabled = false
textView.text = "初始化中…"
ioScope.launch {
try {
Log.i(TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)")
// Native VAD/ASR init shares the same lock as compute/release calls.
synchronized(nativeLock) {
initVadModel()
initSenseVoiceModel()
}
// TTS + AudioTrack are created on the main thread; buttons unlock on success.
withContext(Dispatchers.Main) {
initTtsAndAudioTrack()
textView.text = getString(R.string.hint)
startButton.isEnabled = true
stopButton.isEnabled = false
}
} catch (t: Throwable) {
Log.e(TAG, "Initialization failed: ${t.message}", t)
withContext(Dispatchers.Main) {
textView.text = "初始化失败:${t.javaClass.simpleName}: ${t.message}"
Toast.makeText(
this@MainActivity,
"初始化失败(请看 Logcat: ${t.javaClass.simpleName}",
Toast.LENGTH_LONG
).show()
startButton.isEnabled = false
stopButton.isEnabled = false
}
}
}
cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener {
private var llmFirstChunkMarked = false
override fun onLLMResponseReceived(response: String) {
currentTrace?.markLlmDone()
llmInFlight = false
// Route the finished response according to the streaming mode.
if (enableStreaming) {
// Streaming: flush whatever remains in the segmenter buffer.
for (seg in segmenter.flush()) {
enqueueTtsSegment(seg)
}
// Then signal end-of-queue so the TTS worker can finish the turn.
ttsQueue.offer(TtsQueueItem.End)
} else {
runOnUiThread {
appendToUi("${response}\n")
}
// Non-streaming: synthesize the whole response in one go.
enqueueTtsSegment(response)
// Then signal end-of-queue.
ttsQueue.offer(TtsQueueItem.End)
}
}
override fun onLLMStreamingChunkReceived(chunk: String) {
// Incremental chunks only matter in streaming mode.
if (enableStreaming) {
if (!llmFirstChunkMarked) {
llmFirstChunkMarked = true
currentTrace?.markLlmFirstChunk()
}
// NOTE(review): appendToUi is invoked here without a runOnUiThread
// wrapper; verify it is safe off the main thread.
appendToUi(chunk)
val segments = segmenter.processChunk(chunk)
for (seg in segments) {
enqueueTtsSegment(seg)
}
}
}
override fun onTTSAudioReceived(audioFilePath: String) {
// unused
}
override fun onError(errorMessage: String) {
llmInFlight = false
// NOTE(review): confirm this callback fires on the main thread; Toast and
// onStopClicked both touch UI state.
Toast.makeText(this@MainActivity, errorMessage, Toast.LENGTH_LONG).show()
onStopClicked(userInitiated = false)
}
}, applicationContext)
// Propagate the initial streaming mode.
cloudApiManager.setEnableStreaming(enableStreaming)
}
// Tears everything down: stops capture/playback, cancels coroutines, then
// releases native engines under the same lock that guards their use.
override fun onDestroy() {
super.onDestroy()
onStopClicked(userInitiated = false)
ioScope.cancel()
synchronized(nativeLock) {
// vad is lateinit; release() may throw if init never completed — swallowed.
try {
vad.release()
} catch (_: Throwable) {
}
try {
senseVoice?.deinitialize()
} catch (_: Throwable) {
}
}
try {
tts?.release()
} catch (_: Throwable) {
}
try {
videoPlayerManager?.release()
} catch (_: Throwable) {
}
}
// Starts a conversation turn: opens the microphone, resets per-turn state,
// launches the ASR worker and the capture/VAD loop.
private fun onStartClicked() {
    if (isRecording) return
    if (!initMicrophone()) {
        Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show()
        return
    }
    // Start a new trace turn
    currentTrace = TraceManager.getInstance().startNewTurn()
    currentTrace?.mark("turn_start")
    llmInFlight = false
    lastUiText = ""
    textView.text = ""
    // Reset TTS pipeline state for the new turn.
    ttsStopped.set(false)
    ttsPlaying.set(false)
    ttsTotalSamplesWritten = 0
    ttsQueue.clear()
    segmenter.reset()
    vad.reset()
    // FIX: ensureAsrWorker() was never called anywhere, so segments pushed into
    // asrQueue by processSamplesLoop suspended forever and no recognition ever
    // ran. Start (or restart) the ASR worker for this turn.
    ensureAsrWorker()
    audioRecord!!.startRecording() // non-null: initMicrophone() just succeeded
    isRecording = true
    startButton.isEnabled = false
    stopButton.isEnabled = true
    recordingJob?.cancel()
    recordingJob = ioScope.launch {
        processSamplesLoop()
    }
}
// Stops the current turn. Safe to call repeatedly and from onDestroy/error
// paths; `userInitiated` additionally closes the metrics turn.
private fun onStopClicked(userInitiated: Boolean) {
// Drop the flag first so the capture loop exits its read() promptly.
isRecording = false
try {
audioRecord?.stop()
} catch (_: Throwable) {
}
try {
audioRecord?.release()
} catch (_: Throwable) {
}
audioRecord = null
recordingJob?.cancel()
recordingJob = null
// Tell the TTS worker to stop and discard anything still queued.
ttsStopped.set(true)
ttsPlaying.set(false)
ttsTotalSamplesWritten = 0
ttsQueue.clear()
// wake worker if waiting
ttsQueue.offer(TtsQueueItem.End)
try {
track?.pause()
track?.flush()
} catch (_: Throwable) {
}
// Release per-session audio effects; initMicrophone() recreates them.
try { aec?.release() } catch (_: Throwable) {}
try { ns?.release() } catch (_: Throwable) {}
aec = null
ns = null
startButton.isEnabled = true
stopButton.isEnabled = false
// Programmatic stops keep the trace open so the TTS worker can finish it.
if (userInitiated) {
TraceManager.getInstance().endTurn()
currentTrace = null
}
}
// Loads the Silero VAD model shipped under assets/vad_model/.
private fun initVadModel() {
val config = VadModelConfig(
sileroVadModelConfig = SileroVadModelConfig(
model = "vad_model/silero_vad.onnx",
threshold = 0.5F,
minSilenceDuration = 0.25F,
minSpeechDuration = 0.25F,
windowSize = 512,
),
sampleRate = sampleRateInHz,
numThreads = 1,
provider = "cpu",
)
// NOTE(review): processSamplesLoop only calls vad.compute() and applies its
// own dual-threshold logic, so the threshold/duration values above may be
// unused at runtime — confirm against the sherpa-onnx Vad API.
vad = Vad(assetManager = application.assets, config = config)
}
// Initializes the SenseVoice RKNN ASR engine: copies model assets to internal
// storage, logs diagnostics, then loads the model. Throws IllegalStateException
// on any failure so the caller's init error path fires.
private fun initSenseVoiceModel() {
Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")
// Copy assets/sensevoice_models/* -> filesDir/sensevoice_models/*
val modelDir = copySenseVoiceAssetsToInternal()
val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath
val embeddingPath = File(modelDir, "embedding.npy").absolutePath
val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath
// Print quick diagnostics for native libs + model files
try {
val libDir = applicationInfo.nativeLibraryDir
Log.i(TAG, "nativeLibraryDir=$libDir")
try {
val names = File(libDir).list()?.joinToString(", ") ?: "(empty)"
Log.i(TAG, "nativeLibraryDir files: $names")
} catch (t: Throwable) {
Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}")
}
} catch (_: Throwable) {
}
Log.i(TAG, "SenseVoice model paths:")
Log.i(TAG, " model=$modelPath exists=${File(modelPath).exists()} size=${File(modelPath).length()}")
Log.i(TAG, " embedding=$embeddingPath exists=${File(embeddingPath).exists()} size=${File(embeddingPath).length()}")
Log.i(TAG, " bpe=$bpePath exists=${File(bpePath).exists()} size=${File(bpePath).length()}")
val t0 = SystemClock.elapsedRealtime()
val engine = try {
SenseVoiceEngineRKNN(this)
} catch (e: UnsatisfiedLinkError) {
// Most common: libsensevoiceEngine.so not packaged/built, or dependent libs missing
throw IllegalStateException("Load native libraries failed: ${e.message}", e)
}
val ok = try {
engine.loadModelDirectly(modelPath, embeddingPath, bpePath)
} catch (t: Throwable) {
throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t)
}
val dt = SystemClock.elapsedRealtime() - t0
Log.i(TAG, "SenseVoice loadModelDirectly ok=$ok costMs=$dt")
if (!ok) throw IllegalStateException("SenseVoiceEngineRKNN loadModelDirectly returned false")
senseVoice = engine
}
// Creates the sherpa-onnx VITS TTS engine and a float-PCM streaming AudioTrack
// at the model's sample rate. On TTS failure, `tts` stays null and no track is
// created (runTtsWorker then returns early).
private fun initTtsAndAudioTrack() {
try {
// sherpa-onnx VITS Chinese model directory:
// assets/tts_model/sherpa-onnx-vits-zh-ll/{model.onnx,tokens.txt,lexicon.txt,...}
val modelDir = "tts_model/sherpa-onnx-vits-zh-ll"
val modelName = "model.onnx"
val lexicon = "lexicon.txt"
val dataDir = ""
val ttsConfig = getOfflineTtsConfig(
modelDir = modelDir,
modelName = modelName,
acousticModelName = "",
vocoder = "",
voices = "",
lexicon = lexicon,
dataDir = dataDir,
dictDir = "",
// Chinese text-normalization rules (these fst files ship in the model dir)
ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst,$modelDir/new_heteronym.fst",
ruleFars = "",
numThreads = null,
isKitten = false
)
tts = OfflineTts(assetManager = application.assets, config = ttsConfig)
} catch (t: Throwable) {
Log.e(TAG, "Init TTS failed: ${t.message}", t)
tts = null
runOnUiThread {
Toast.makeText(
this,
"TTS 初始化失败:请确认 assets/tts_model/sherpa-onnx-vits-zh-ll/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst",
Toast.LENGTH_LONG
).show()
}
}
val t = tts ?: return
// The AudioTrack must match the model's output sample rate.
val sr = t.sampleRate()
val bufLength = AudioTrack.getMinBufferSize(
sr,
AudioFormat.CHANNEL_OUT_MONO,
AudioFormat.ENCODING_PCM_FLOAT
)
val attr = AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_MEDIA)
.build()
val format = AudioFormat.Builder()
.setEncoding(AudioFormat.ENCODING_PCM_FLOAT)
.setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
.setSampleRate(sr)
.build()
track = AudioTrack(
attr,
format,
bufLength,
AudioTrack.MODE_STREAM,
AudioManager.AUDIO_SESSION_ID_GENERATE
)
track?.play()
}
// True when an asset can be opened at [path]; the probe stream is closed
// immediately and any failure (missing asset, I/O error) yields false.
private fun assetExists(path: String): Boolean =
    runCatching { application.assets.open(path).use { } }.isSuccess
// Copies the SenseVoice model files from assets to internal storage (the RKNN
// loader needs real file paths). Files already present and non-empty are kept.
// Returns the destination directory.
private fun copySenseVoiceAssetsToInternal(): File {
    val outDir = File(filesDir, "sensevoice_models")
    if (!outDir.exists()) outDir.mkdirs()
    val files = arrayOf(
        "am.mvn",
        "chn_jpn_yue_eng_ko_spectok.bpe.model",
        "embedding.npy",
        "sense-voice-encoder.rknn"
    )
    for (name in files) {
        val outFile = File(outDir, name)
        // Already copied on a previous launch.
        if (outFile.exists() && outFile.length() > 0) continue
        // FIX: copy via a temp file, then rename. The original wrote straight to
        // the target, so an interrupted copy left a truncated non-empty file
        // that the length() > 0 check above would accept on every later launch.
        val tmpFile = File(outDir, "$name.tmp")
        application.assets.open("sensevoice_models/$name").use { input ->
            FileOutputStream(tmpFile).use { output ->
                input.copyTo(output)
            }
        }
        if (!tmpFile.renameTo(outFile)) {
            tmpFile.delete()
            throw IllegalStateException("Failed to move $tmpFile to $outFile")
        }
    }
    return outDir
}
// Opens the microphone at 16 kHz mono PCM16 and attaches hardware AEC/NS when
// available. Returns false (after requesting permission or releasing a broken
// recorder) when capture cannot start.
private fun initMicrophone(): Boolean {
    if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO)
        != PackageManager.PERMISSION_GRANTED
    ) {
        ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
        return false
    }
    val numBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
    val record = AudioRecord(
        audioSource,
        sampleRateInHz,
        channelConfig,
        audioFormat,
        numBytes * 2 // double the minimum buffer to reduce overrun risk
    )
    // FIX: the original never verified that the native recorder initialized;
    // calling startRecording() on an uninitialized AudioRecord throws
    // IllegalStateException. Fail gracefully instead.
    if (record.state != AudioRecord.STATE_INITIALIZED) {
        Log.e(TAG, "AudioRecord failed to initialize")
        try { record.release() } catch (_: Throwable) {}
        return false
    }
    audioRecord = record
    val sessionId = record.audioSessionId
    if (sessionId != 0) {
        // Hardware echo cancellation / noise suppression, when the device has it
        // (classes are imported at the top of the file; no need for FQNs).
        if (AcousticEchoCanceler.isAvailable()) {
            aec = AcousticEchoCanceler.create(sessionId)?.apply { enabled = true }
            Log.i(TAG, "AEC enabled=${aec?.enabled}")
        } else {
            Log.w(TAG, "AEC not available on this device")
        }
        if (NoiseSuppressor.isAvailable()) {
            ns = NoiseSuppressor.create(sessionId)?.apply { enabled = true }
            Log.i(TAG, "NS enabled=${ns?.enabled}")
        } else {
            Log.w(TAG, "NS not available on this device")
        }
    }
    return true
}
// Capture loop: reads 512-sample windows, applies RMS-based automatic gain,
// runs VAD on the processed audio, and segments speech with a dual-threshold
// hysteresis state machine. Finished segments are queued for ASR.
private suspend fun processSamplesLoop() {
// Avoid calling vad.front()/vad.pop() (native queue APIs) since it crashes on some builds.
// Use vad.compute() and implement a simple VAD segmenter in Kotlin instead.
val windowSize = 512
val buffer = ShortArray(windowSize)
// Dual-threshold (hysteresis) settings
val startThreshold = 0.2f // threshold to enter the speech state
val endThreshold = 0.15f // threshold to leave the speech state
val minSilenceSamples = (0.5f * sampleRateInHz).toInt()
val minSpeechSamples = (0.1f * sampleRateInHz).toInt()
val maxSpeechSamples = (5.0f * sampleRateInHz).toInt()
// VAD probability logging for offline analysis.
// NOTE(review): these lists grow unbounded for the whole recording session;
// fine for short sessions, confirm for long-running use.
val vadProbabilities = mutableListOf<Float>()
val vadTimestamps = mutableListOf<Long>()
val vadRMSValues = mutableListOf<Float>()
val vadSmoothedRMSValues = mutableListOf<Float>()
// Exponential-smoothing state
var smoothedRms = 0f
val alpha = 0.8f // smoothing coefficient
var inSpeech = false
var silenceSamples = 0
var speechBuf = FloatArray(0)
var speechLen = 0
var processedSpeechBuf = FloatArray(0)
var processedSpeechLen = 0
// Appends one window (raw + gain-adjusted) to the growable segment buffers,
// capping both at maxSpeechSamples.
fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) {
// append the raw audio
val needed = speechLen + chunk.size
if (speechBuf.size < needed) {
var newCap = maxOf(needed, maxOf(1024, speechBuf.size * 2))
if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
val n = FloatArray(newCap)
if (speechLen > 0) System.arraycopy(speechBuf, 0, n, 0, speechLen)
speechBuf = n
}
val copyN = minOf(chunk.size, max(0, maxSpeechSamples - speechLen))
if (copyN > 0) {
System.arraycopy(chunk, 0, speechBuf, speechLen, copyN)
speechLen += copyN
}
// append the gain-adjusted audio
val processedNeeded = processedSpeechLen + processedChunk.size
if (processedSpeechBuf.size < processedNeeded) {
var newCap = maxOf(processedNeeded, maxOf(1024, processedSpeechBuf.size * 2))
if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
val n = FloatArray(newCap)
if (processedSpeechLen > 0) System.arraycopy(processedSpeechBuf, 0, n, 0, processedSpeechLen)
processedSpeechBuf = n
}
val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen))
if (processedCopyN > 0) {
System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN)
processedSpeechLen += processedCopyN
}
}
// Ships the accumulated segment to ASR (or drops it) and resets the state machine.
suspend fun finalizeSegmentIfAny() {
if (speechLen < minSpeechSamples) {
speechLen = 0
processedSpeechLen = 0
inSpeech = false
silenceSamples = 0
return
}
// Drop the segment while TTS is playing or an LLM call is in flight (echo guard).
if (ttsPlaying.get() || llmInFlight) {
speechLen = 0
processedSpeechLen = 0
inSpeech = false
silenceSamples = 0
return
}
val originalSeg = speechBuf.copyOf(speechLen)
val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen)
speechLen = 0
processedSpeechLen = 0
inSpeech = false
silenceSamples = 0
// Queue the segment for asynchronous ASR.
// NOTE(review): asrQueue is declared Channel<Pair<FloatArray, TraceSession?>>
// but this sends Pair(FloatArray, FloatArray) — the second element's type does
// not match the declaration; confirm the intended channel element type (the
// Live2D variant passes both audio buffers).
asrQueue.send(Pair(originalSeg, processedSeg))
}
while (isRecording && ioScope.coroutineContext.isActive) {
val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break
if (ret <= 0) continue
if (ret != windowSize) continue
// Convert PCM16 to floats in [-1, 1).
val chunk = FloatArray(ret) { buffer[it] / 32768.0f }
// RMS of the current chunk
val rms = calculateRMS(chunk)
// Exponential smoothing (seeded with the first non-zero reading).
smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms
// Dynamic gain toward a target RMS of 0.1 (~ -20 dBFS).
val targetRMS = 0.1f
var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f
// Clamp the gain to avoid clipping from over-amplification.
gainFactor = gainFactor.coerceIn(0.1f, 10.0f)
// Apply the gain with a hard limiter.
val processedChunk = FloatArray(chunk.size) {
val value = chunk[it] * gainFactor
// hard-limit to [-1, 1] to avoid clipping
if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value
}
// VAD runs on the gain-adjusted audio, under the shared native lock.
val prob = synchronized(nativeLock) { vad.compute(processedChunk) }
// Log probability, timestamp, raw and smoothed RMS for later analysis.
vadProbabilities.add(prob)
vadTimestamps.add(System.currentTimeMillis())
vadRMSValues.add(rms)
vadSmoothedRMSValues.add(smoothedRms)
// Dual-threshold state machine
if (!inSpeech && prob >= startThreshold) {
// enter speech
inSpeech = true
silenceSamples = 0
appendSpeech(chunk, processedChunk)
} else if (inSpeech && prob <= endThreshold) {
// count silence samples
silenceSamples += ret
if (silenceSamples >= minSilenceSamples) {
// leave speech
finalizeSegmentIfAny()
} else {
// keep the trailing audio
appendSpeech(chunk, processedChunk)
}
} else if (inSpeech) {
// still in speech: keep appending
appendSpeech(chunk, processedChunk)
silenceSamples = 0 // reset silence counter
if (speechLen >= maxSpeechSamples) {
finalizeSegmentIfAny()
}
}
// Not in speech and below the start threshold: nothing to do.
// Time-based forced segmentation (keeps TTS latency low when the LLM emits
// long runs without punctuation).
val forced = segmenter.maybeForceByTime()
for (seg in forced) enqueueTtsSegment(seg)
}
// flush last partial segment
finalizeSegmentIfAny()
// Persist the VAD log for offline analysis/plotting.
saveVadData(vadTimestamps, vadProbabilities, vadRMSValues, vadSmoothedRMSValues)
}
/**
 * Persists the per-window VAD log as CSV under filesDir/vad_data/ for offline
 * analysis and plotting. Best-effort: any I/O error is logged and swallowed.
 */
private fun saveVadData(timestamps: List<Long>, probabilities: List<Float>, rmsValues: List<Float>, smoothedRmsValues: List<Float>) {
    try {
        val vadDataDir = File(filesDir, "vad_data")
        if (!vadDataDir.exists()) {
            vadDataDir.mkdirs()
        }
        // Unique file name per recording session.
        val outputFile = File(vadDataDir, "vad_data_${System.currentTimeMillis()}.csv")
        // Robustness: iterate only over indices present in ALL four lists, so a
        // size mismatch can never throw IndexOutOfBoundsException mid-write.
        val rows = minOf(timestamps.size, probabilities.size, rmsValues.size, smoothedRmsValues.size)
        // Buffered character writer instead of raw FileOutputStream byte writes.
        outputFile.bufferedWriter().use { writer ->
            writer.write("timestamp,probability,rms,smoothed_rms\n")
            for (i in 0 until rows) {
                writer.write("${timestamps[i]},${probabilities[i]},${rmsValues[i]},${smoothedRmsValues[i]}\n")
            }
        }
        Log.d(TAG, "Saved VAD data to: ${outputFile.absolutePath}")
    } catch (e: Exception) {
        Log.e(TAG, "Error saving VAD data: ${e.message}")
    }
}
// Strips SenseVoice control tokens (e.g. <|zh|>, <|NEUTRAL|>, <|Speech|>,
// <|woitn|>) plus stray '>'-like characters, then collapses whitespace.
private fun removeTokens(text: String): String =
    text
        .replace(Regex("<\\|[^>]+\\|>"), "")
        .replace(Regex("[>>≥≫]"), "")
        .trim()
        .replace(Regex("\\s+"), " ")
// Queues one text segment for synthesis, stripping trailing punctuation so the
// TTS engine does not vocalize it, and makes sure the worker is running.
private fun enqueueTtsSegment(seg: String) {
// NOTE(review): some char literals below render as empty here — presumably
// full-width punctuation lost in transit; verify against the original encoding.
val cleanedSeg = seg.trimEnd('.', '。', '!', '', '?', '', ',', '', ';', '', ':', '')
currentTrace?.markTtsRequestEnqueued()
ttsQueue.offer(TtsQueueItem.Segment(cleanedSeg))
ensureTtsWorker()
}
// Launches the TTS worker coroutine at most once; the guard flag is cleared
// when the worker exits so a later enqueue can spin up a fresh one.
private fun ensureTtsWorker() {
    val acquired = ttsWorkerRunning.compareAndSet(false, true)
    if (acquired) {
        ioScope.launch {
            try {
                runTtsWorker()
            } finally {
                ttsWorkerRunning.set(false)
            }
        }
    }
}
// Launches the ASR worker coroutine at most once; the guard flag is cleared
// when the worker exits so it can be restarted.
private fun ensureAsrWorker() {
    val acquired = asrWorkerRunning.compareAndSet(false, true)
    if (acquired) {
        ioScope.launch {
            try {
                runAsrWorker()
            } finally {
                asrWorkerRunning.set(false)
            }
        }
    }
}
// TTS worker: blocks on the segment queue, synthesizes each segment and writes
// float PCM to the AudioTrack. Exits on an End item or when ttsStopped is set.
// NOTE(review): LinkedBlockingQueue.take() blocks a Dispatchers.IO thread for
// the worker's whole lifetime — acceptable for IO, but confirm intended.
private fun runTtsWorker() {
val t = tts ?: return
val audioTrack = track ?: return
var firstAudioMarked = false
var isFirstSegment = true
while (true) {
val item = ttsQueue.take()
if (ttsStopped.get()) break
when (item) {
is TtsQueueItem.Segment -> {
// Mark "speaking" so VAD/ASR suppress the echo of our own audio.
ttsPlaying.set(true)
runOnUiThread { videoPlayerManager?.setSpeaking(true) }
val trace = currentTrace
trace?.markTtsSynthesisStart()
Log.d(TAG, "TTS started: processing segment '${item.text}'")
runOnUiThread {
appendToUi("\n[TTS] 开始合成...\n")
}
val startMs = System.currentTimeMillis()
var firstPcmMarked = false
// Reset the track once per turn so playbackHeadPosition starts at 0.
if (isFirstSegment) {
try {
audioTrack.pause()
audioTrack.flush()
audioTrack.play()
} catch (_: Throwable) {
}
isFirstSegment = false
}
t.generateWithCallback(
text = item.text,
sid = 2, // speaker id; change to pick a different voice
speed = 1.0f
) { samples ->
// Returning 0 aborts synthesis; 1 continues.
if (ttsStopped.get()) return@generateWithCallback 0
if (!firstPcmMarked && samples.isNotEmpty()) {
firstPcmMarked = true
trace?.markTtsFirstPcmReady()
}
if (!firstAudioMarked && samples.isNotEmpty()) {
firstAudioMarked = true
trace?.markTtsFirstAudioPlay()
}
// Blocking write paces synthesis to playback.
audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING)
ttsTotalSamplesWritten += samples.size
1
}
val ttsMs = System.currentTimeMillis() - startMs
trace?.addDuration("tts_segment_ms_total", ttsMs)
}
TtsQueueItem.End -> {
// Drain the ASR queue: pending segments are likely echo captured
// during playback.
while (asrQueue.tryReceive().isSuccess) { }
waitForPlaybackComplete(audioTrack)
val ttsCompleteTime = System.currentTimeMillis()
// Update the UI from the main thread.
runOnUiThread {
appendToUi("\n[LOG] TTS completed at: ${ttsCompleteTime}\n")
}
ttsPlaying.set(false)
runOnUiThread { videoPlayerManager?.setSpeaking(false) }
ttsTotalSamplesWritten = 0
isFirstSegment = true
// Close the metrics turn now that audio has fully drained.
currentTrace?.markTtsDone()
TraceManager.getInstance().endTurn()
currentTrace = null
break
}
}
}
}
// Busy-waits (20 ms polls) until the AudioTrack's playback head reaches the
// total number of samples written this turn, with a duration-based timeout.
// NOTE(review): playbackHeadPosition is a signed 32-bit frame counter that can
// wrap on very long sessions — confirm acceptable for expected turn lengths.
private fun waitForPlaybackComplete(audioTrack: AudioTrack) {
val totalSamples = ttsTotalSamplesWritten
if (totalSamples <= 0) return
val sampleRate = audioTrack.sampleRate
// Expected playback duration plus a 2 s safety margin.
val timeoutMs = (totalSamples * 1000 / sampleRate) + 2000
val startTime = System.currentTimeMillis()
while (true) {
if (ttsStopped.get()) break
val playbackPos = audioTrack.playbackHeadPosition.toLong()
if (playbackPos >= totalSamples) {
break
}
if (System.currentTimeMillis() - startTime > timeoutMs) {
Log.w(TAG, "waitForPlaybackComplete timeout, pos=$playbackPos, total=$totalSamples")
break
}
Thread.sleep(20)
}
// Fixed extra 1000 ms wait to let the audio sink drain completely.
Thread.sleep(1000)
}
// ASR worker: consumes speech segments from asrQueue, transcribes them with
// SenseVoice, filters junk results, and forwards the text to the LLM.
private suspend fun runAsrWorker() {
while (ioScope.coroutineContext.isActive) {
// NOTE(review): the declared channel type makes `trace` a TraceSession?, but
// processSamplesLoop sends a Pair of two FloatArrays — confirm which is
// intended; only `seg` (the first element) is ever transcribed here.
val (seg, trace) = try {
asrQueue.receive()
} catch (_: Throwable) {
break
}
// Allow only one in-flight LLM request at a time to avoid pile-ups/races.
// Skip ASR while TTS is playing to avoid transcribing our own audio.
if (llmInFlight || ttsPlaying.get()) continue
trace?.markASRStart()
Log.d(TAG, "ASR started: processing audio segment")
withContext(Dispatchers.Main) {
appendToUi("\n[ASR] 开始识别...\n")
}
// Native transcription under the shared lock (same lock as VAD/init/release).
val raw = synchronized(nativeLock) {
val e = senseVoice
if (e == null || !e.isInitialized) "" else e.transcribeBuffer(seg)
}
val text = removeTokens(raw)
// Result filtering
if (text.isBlank()) continue
// Drop a lone 'i' (common ASR artifact).
if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
Log.d(TAG, "ASR segment skipped: single 'i'")
continue
}
// Drop overly long (>50 chars) results.
if (text.length > 50) {
Log.d(TAG, "ASR segment skipped: too long (${text.length} chars)")
continue
}
trace?.markASREnd()
withContext(Dispatchers.Main) {
appendToUi("\n\n[ASR] ${text}\n")
}
trace?.markRecordingDone()
// NOTE(review): markLlmResponseReceived() is called BEFORE the LLM request is
// issued below — the mark looks misnamed or misplaced; confirm against
// TraceSession's intended event ordering.
trace?.markLlmResponseReceived()
if (BuildConfig.LLM_API_KEY.isBlank()) {
withContext(Dispatchers.Main) {
Toast.makeText(
this@MainActivity,
"未配置 LLM_API_KEY在 local.properties 或 gradle.properties 里设置)",
Toast.LENGTH_LONG
).show()
}
continue
}
llmInFlight = true
cloudApiManager.callLLM(text)
}
}
// Appends text to the transcript TextView.
// FIX: this method is called both from the main thread and directly from
// background callbacks (e.g. onLLMStreamingChunkReceived); mutating a TextView
// off the main thread is illegal. Hop to the UI thread here — runOnUiThread
// executes inline when already on the main thread, so existing pre-wrapped
// callers keep their ordering.
private fun appendToUi(s: String) {
    runOnUiThread {
        lastUiText += s
        textView.text = lastUiText
    }
}
}

View File

@@ -0,0 +1,20 @@
package com.digitalperson
import android.content.Intent
import android.os.Bundle
import androidx.appcompat.app.AppCompatActivity
import com.digitalperson.config.AppConfig
// Trampoline activity: routes launch to the Live2D chat UI or the legacy video
// UI based on the build-time avatar configuration, then removes itself from the
// back stack.
class EntryActivity : AppCompatActivity() {
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        val target =
            if (AppConfig.Avatar.USE_LIVE2D) Live2DChatActivity::class.java
            else MainActivity::class.java
        startActivity(Intent(this, target))
        finish()
    }
}

View File

@@ -0,0 +1,418 @@
package com.digitalperson
import android.Manifest
import android.content.pm.PackageManager
import android.os.Bundle
import android.util.Log
import android.widget.Toast
import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat
import com.digitalperson.cloud.CloudApiManager
import com.digitalperson.audio.AudioProcessor
import com.digitalperson.vad.VadManager
import com.digitalperson.asr.AsrManager
import com.digitalperson.tts.TtsManager
import com.digitalperson.ui.Live2DUiManager
import com.digitalperson.config.AppConfig
import com.digitalperson.metrics.TraceManager
import com.digitalperson.metrics.TraceSession
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.isActive
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
class Live2DChatActivity : AppCompatActivity() {
private lateinit var uiManager: Live2DUiManager
private lateinit var vadManager: VadManager
private lateinit var asrManager: AsrManager
private lateinit var ttsManager: TtsManager
private lateinit var audioProcessor: AudioProcessor
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
@Volatile
private var isRecording: Boolean = false
private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
private var recordingJob: Job? = null
private val nativeLock = Any()
private lateinit var cloudApiManager: CloudApiManager
private val segmenter = StreamingTextSegmenter(
maxLen = AppConfig.Tts.MAX_LEN,
maxWaitMs = AppConfig.Tts.MAX_WAIT_MS
)
private var currentTrace: TraceSession? = null
@Volatile private var llmInFlight: Boolean = false
private var enableStreaming = false
override fun onRequestPermissionsResult(
    requestCode: Int,
    permissions: Array<String>,
    grantResults: IntArray
) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults)
    // Shut the activity down when microphone access is refused.
    val granted = requestCode == AppConfig.REQUEST_RECORD_AUDIO_PERMISSION &&
        grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED
    if (!granted) {
        Log.e(AppConfig.TAG, "Audio record is disallowed")
        finish()
    }
}
// Entry point for the Live2D chat UI: wires views and managers, then runs the
// heavy model initialization on a background coroutine.
override fun onCreate(savedInstanceState: Bundle?) {
    super.onCreate(savedInstanceState)
    setContentView(R.layout.activity_live2d_chat)
    uiManager = Live2DUiManager(this)
    uiManager.initViews(
        textViewId = R.id.my_text,
        scrollViewId = R.id.scroll_view,
        startButtonId = R.id.start_button,
        stopButtonId = R.id.stop_button,
        silentPlayerViewId = 0,
        speakingPlayerViewId = 0,
        live2dViewId = R.id.live2d_view
    )
    uiManager.setStartButtonListener { onStartClicked() }
    uiManager.setStopButtonListener { onStopClicked(userInitiated = true) }
    ActivityCompat.requestPermissions(this, permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)
    // Streaming-output toggle (optional in the layout).
    try {
        val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
        streamingSwitch.isChecked = enableStreaming
        streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
            enableStreaming = isChecked
            cloudApiManager.setEnableStreaming(isChecked)
            uiManager.showToast("流式输出已${if (isChecked) "启用" else "禁用"}")
        }
    } catch (e: Exception) {
        Log.w(AppConfig.TAG, "Streaming switch not found in layout: ${e.message}")
    }
    uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
    uiManager.setText("初始化中…")
    audioProcessor = AudioProcessor(this)
    asrManager = AsrManager(this)
    asrManager.setAudioProcessor(audioProcessor)
    asrManager.setCallback(createAsrCallback())
    vadManager = VadManager(this)
    vadManager.setCallback(createVadCallback())
    // FIX: ttsManager and cloudApiManager were previously assigned AFTER the
    // init coroutine below was launched, but the coroutine dereferences
    // ttsManager — a fast dispatch could hit an
    // UninitializedPropertyAccessException. Construct every collaborator before
    // any background work starts.
    ttsManager = TtsManager(this)
    ttsManager.setCallback(createTtsCallback())
    cloudApiManager = CloudApiManager(createCloudApiListener(), applicationContext)
    cloudApiManager.setEnableStreaming(enableStreaming)
    ioScope.launch {
        try {
            Log.i(AppConfig.TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)")
            // Native VAD/ASR init shares the lock used for compute/release.
            synchronized(nativeLock) {
                vadManager.initVadModel()
                asrManager.initSenseVoiceModel()
            }
            val ttsOk = ttsManager.initTtsAndAudioTrack()
            withContext(Dispatchers.Main) {
                if (!ttsOk) {
                    uiManager.showToast(
                        "TTS 初始化失败:请确认 assets/${AppConfig.Tts.MODEL_DIR}/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst",
                        Toast.LENGTH_LONG
                    )
                }
                uiManager.setText(getString(R.string.hint))
                uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
            }
        } catch (t: Throwable) {
            Log.e(AppConfig.TAG, "Initialization failed: ${t.message}", t)
            withContext(Dispatchers.Main) {
                uiManager.setText("初始化失败:${t.javaClass.simpleName}: ${t.message}")
                uiManager.showToast("初始化失败(请看 Logcat: ${t.javaClass.simpleName}", Toast.LENGTH_LONG)
                uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
            }
        }
    }
    Log.d(AppConfig.TAG, "Pre-starting ASR worker")
    ioScope.launch {
        asrManager.runAsrWorker()
    }
}
// Builds the AsrManager callback: trace marking, UI echo of results, skip
// conditions (TTS playing / LLM in flight), and the LLM hand-off.
private fun createAsrCallback() = object : AsrManager.AsrCallback {
override fun onAsrStarted() {
currentTrace?.markASRStart()
runOnUiThread {
uiManager.appendToUi("\n[ASR] 开始识别...\n")
}
}
override fun onAsrResult(text: String) {
currentTrace?.markASREnd()
runOnUiThread {
uiManager.appendToUi("\n\n[ASR] ${text}\n")
}
currentTrace?.markRecordingDone()
// NOTE(review): marked before the LLM call is actually made (see
// onLlmCalled below) — confirm against TraceSession's event ordering.
currentTrace?.markLlmResponseReceived()
}
override fun onAsrSkipped(reason: String) {
Log.d(AppConfig.TAG, "ASR segment skipped: $reason")
}
// Suppress ASR while our own TTS audio is playing (echo guard).
override fun shouldSkipAsr(): Boolean = ttsManager.isPlaying()
override fun isLlmInFlight(): Boolean = llmInFlight
override fun onLlmCalled(text: String) {
llmInFlight = true
Log.d(AppConfig.TAG, "Calling LLM with text: $text")
cloudApiManager.callLLM(text)
}
}
// Builds the VadManager callback: forwards finished speech segments to the ASR
// queue and suppresses capture while TTS is speaking or an LLM call is pending.
private fun createVadCallback() = object : VadManager.VadCallback {
    override fun onSpeechSegmentReady(originalAudio: FloatArray, processedAudio: FloatArray) {
        Log.d(AppConfig.TAG, "Sending audio segment to ASR queue, size: ${processedAudio.size}")
        asrManager.enqueueAudioSegment(originalAudio, processedAudio)
    }
    override fun shouldSkipProcessing(): Boolean {
        val speaking = ttsManager.isPlaying()
        return speaking || llmInFlight
    }
}
// Builds the cloud-LLM listener: routes streaming chunks / complete responses
// into the text segmenter and the TTS queue, and stops the turn on errors.
private fun createCloudApiListener() = object : CloudApiManager.CloudApiListener {
private var llmFirstChunkMarked = false
override fun onLLMResponseReceived(response: String) {
currentTrace?.markLlmDone()
llmInFlight = false
if (enableStreaming) {
// Streaming: flush whatever remains in the segmenter buffer.
for (seg in segmenter.flush()) {
ttsManager.enqueueSegment(seg)
}
ttsManager.enqueueEnd()
} else {
runOnUiThread {
uiManager.appendToUi("${response}\n")
}
// Non-streaming: synthesize the whole response in one go.
ttsManager.enqueueSegment(response)
ttsManager.enqueueEnd()
}
}
override fun onLLMStreamingChunkReceived(chunk: String) {
if (enableStreaming) {
if (!llmFirstChunkMarked) {
llmFirstChunkMarked = true
currentTrace?.markLlmFirstChunk()
}
// NOTE(review): appendToUi is invoked here without a runOnUiThread
// wrapper — verify Live2DUiManager.appendToUi is main-thread-safe.
uiManager.appendToUi(chunk)
val segments = segmenter.processChunk(chunk)
for (seg in segments) {
ttsManager.enqueueSegment(seg)
}
}
}
override fun onTTSAudioReceived(audioFilePath: String) {}
override fun onError(errorMessage: String) {
llmInFlight = false
uiManager.showToast(errorMessage, Toast.LENGTH_LONG)
onStopClicked(userInitiated = false)
}
}
// Bridges TtsManager worker events back into the activity: UI updates are posted
// to the main thread, trace marks are forwarded to the current TraceSession.
private fun createTtsCallback() = object : TtsManager.TtsCallback {
    override fun onTtsStarted(text: String) {
        runOnUiThread {
            uiManager.appendToUi("\n[TTS] 开始合成...\n")
        }
    }
    override fun onTtsCompleted() {
        runOnUiThread {
            uiManager.appendToUi("\n[LOG] TTS completed at: ${System.currentTimeMillis()}\n")
        }
    }
    override fun onTtsSegmentCompleted(durationMs: Long) {}
    // The TTS worker treats "recording stopped" as its stop signal.
    override fun isTtsStopped(): Boolean = !isRecording
    // Drop any ASR segments captured while TTS was speaking (self-echo).
    override fun onClearAsrQueue() {
        asrManager.clearQueue()
    }
    // Drives the avatar's talking animation.
    override fun onSetSpeaking(speaking: Boolean) {
        uiManager.setSpeaking(speaking)
    }
    override fun getCurrentTrace(): TraceSession? = currentTrace
    // Trace forwarding shims — each delegates one latency mark to the active trace.
    override fun onTraceMarkTtsRequestEnqueued() {
        currentTrace?.markTtsRequestEnqueued()
    }
    override fun onTraceMarkTtsSynthesisStart() {
        currentTrace?.markTtsSynthesisStart()
    }
    override fun onTraceMarkTtsFirstPcmReady() {
        currentTrace?.markTtsFirstPcmReady()
    }
    override fun onTraceMarkTtsFirstAudioPlay() {
        currentTrace?.markTtsFirstAudioPlay()
    }
    override fun onTraceMarkTtsDone() {
        currentTrace?.markTtsDone()
    }
    override fun onTraceAddDuration(name: String, value: Long) {
        currentTrace?.addDuration(name, value)
    }
    // End of a full conversational turn: close out the trace.
    override fun onEndTurn() {
        TraceManager.getInstance().endTurn()
        currentTrace = null
    }
}
override fun onDestroy() {
    super.onDestroy()
    // Stop the pipeline first so no worker touches native engines during release.
    onStopClicked(userInitiated = false)
    ioScope.cancel()
    // VAD and ASR share native resources; release them under the same lock used
    // by the workers to avoid a use-after-free in native code.
    synchronized(nativeLock) {
        try { vadManager.release() } catch (_: Throwable) {}
        try { asrManager.release() } catch (_: Throwable) {}
    }
    // Best-effort teardown: ignore failures so one broken manager cannot block the rest.
    try { ttsManager.release() } catch (_: Throwable) {}
    try { uiManager.release() } catch (_: Throwable) {}
    try { audioProcessor.release() } catch (_: Throwable) {}
}
override fun onResume() {
    super.onResume()
    // Resume the avatar's GLSurfaceView rendering.
    uiManager.onResume()
}
override fun onPause() {
    // Pause GL rendering before the superclass tears the window down.
    uiManager.onPause()
    super.onPause()
}
// Starts a new conversational turn: mic capture, VAD loop, and a fresh trace.
// The reset order matters: trace -> TTS -> segmenter -> VAD -> recording.
private fun onStartClicked() {
    Log.d(AppConfig.TAG, "onStartClicked called")
    if (isRecording) {
        Log.d(AppConfig.TAG, "Already recording, returning")
        return
    }
    // initMicrophone also triggers the runtime permission request when missing.
    if (!audioProcessor.initMicrophone(permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)) {
        uiManager.showToast("麦克风初始化失败/无权限")
        return
    }
    currentTrace = TraceManager.getInstance().startNewTurn()
    currentTrace?.mark("turn_start")
    llmInFlight = false
    uiManager.clearText()
    ttsManager.reset()
    ttsManager.setCurrentTrace(currentTrace)
    segmenter.reset()
    vadManager.reset()
    audioProcessor.startRecording()
    // Flip the flag before launching so the loop's while-condition sees it set.
    isRecording = true
    uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = true)
    Log.d(AppConfig.TAG, "Starting processSamplesLoop coroutine")
    recordingJob?.cancel()
    recordingJob = ioScope.launch {
        processSamplesLoop()
    }
    Log.d(AppConfig.TAG, "onStartClicked completed")
}
// Stops capture and playback. When the user pressed Stop (userInitiated), the
// current trace is also closed; programmatic stops (errors, onDestroy) leave the
// trace to be closed by its normal end-of-turn path.
private fun onStopClicked(userInitiated: Boolean) {
    // Clear the flag first: processSamplesLoop and isTtsStopped() both poll it.
    isRecording = false
    audioProcessor.stopRecording()
    recordingJob?.cancel()
    recordingJob = null
    ttsManager.stop()
    uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
    if (userInitiated) {
        TraceManager.getInstance().endTurn()
        currentTrace = null
    }
}
/**
 * Main capture loop: reads fixed-size windows from the microphone and feeds
 * them through gain normalization into VAD. While TTS is playing, audio is
 * still drained from the mic (so the hardware buffer does not back up) but is
 * discarded and VAD state is cleared to avoid self-capture.
 */
private suspend fun processSamplesLoop() {
    Log.d(AppConfig.TAG, "processSamplesLoop started")
    val windowSize = AppConfig.WINDOW_SIZE
    val buffer = ShortArray(windowSize)
    var loopCount = 0
    // NOTE(review): ioScope.coroutineContext.isActive checks the SCOPE's job, not
    // this coroutine's — cancelling recordingJob alone is only observed via
    // isRecording. That flag is always cleared alongside the cancel, so the loop
    // still exits; confirm before relying on cancellation alone.
    while (isRecording && ioScope.coroutineContext.isActive) {
        loopCount++
        if (loopCount % 100 == 0) {
            Log.d(AppConfig.TAG, "processSamplesLoop running, loopCount=$loopCount, ttsPlaying=${ttsManager.isPlaying()}")
        }
        if (ttsManager.isPlaying()) {
            if (vadManager.isInSpeech()) {
                Log.d(AppConfig.TAG, "TTS playing, resetting VAD state")
                vadManager.clearState()
            }
            // Drain and discard mic input during playback. (Removed a dead
            // `if (ret <= 0) continue` that was immediately followed by an
            // unconditional `continue` — the read result is irrelevant here.)
            audioProcessor.readAudio(buffer)
            continue
        }
        val ret = audioProcessor.readAudio(buffer)
        if (ret <= 0) continue
        // Only full windows are valid VAD input; drop short reads.
        if (ret != windowSize) continue
        val chunk = audioProcessor.convertShortToFloat(buffer)
        val processedChunk = audioProcessor.applyGain(chunk)
        val result = vadManager.processAudioChunk(chunk, processedChunk)
        if (vadManager.vadComputeCount % 100 == 0) {
            Log.d(AppConfig.TAG, "VAD result: $result, inSpeech=${vadManager.isInSpeech()}")
        }
        if (loopCount % 1000 == 0) {
            Log.d(AppConfig.TAG, "VAD status: inSpeech=${vadManager.isInSpeech()}, speechLen=${vadManager.getSpeechLength()}")
        }
        // Time-based forced segmentation keeps TTS latency bounded on long utterances.
        val forced = segmenter.maybeForceByTime()
        for (seg in forced) ttsManager.enqueueSegment(seg)
    }
    // Flush any partial speech segment held by VAD before exiting.
    vadManager.forceFinalize()
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,223 @@
package com.digitalperson.asr
import android.content.Context
import android.os.SystemClock
import android.util.Log
import com.digitalperson.BuildConfig
import com.digitalperson.audio.AudioProcessor
import com.digitalperson.config.AppConfig
import com.digitalperson.engine.SenseVoiceEngineRKNN
import com.digitalperson.util.FileHelper
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.currentCoroutineContext
import kotlinx.coroutines.isActive
import kotlinx.coroutines.withContext
import java.io.File
/**
 * Owns the SenseVoice RKNN speech-recognition engine and a queue-driven ASR worker.
 *
 * VAD-produced audio segments are enqueued via [enqueueAudioSegment]; [runAsrWorker]
 * consumes them, transcribes, strips control tokens, filters unusable text, and
 * forwards accepted transcripts to the registered [AsrCallback] (which may then
 * trigger an LLM call).
 */
class AsrManager(private val context: Context) {
    companion object {
        private const val TAG = "AsrManager"
    }
    // Native engine; null until initSenseVoiceModel() succeeds.
    private var senseVoice: SenseVoiceEngineRKNN? = null
    // Serializes calls into native code against release() (avoids use-after-free).
    private val nativeLock = Any()
    // Unbounded queue of (original, gain-processed) segments awaiting transcription.
    private val asrQueue = Channel<Pair<FloatArray, FloatArray>>(capacity = Channel.UNLIMITED)
    private var audioProcessor: AudioProcessor? = null

    /** Host hooks for observing ASR progress and gating downstream LLM calls. */
    interface AsrCallback {
        fun onAsrStarted()
        fun onAsrResult(text: String)
        fun onAsrSkipped(reason: String)
        fun shouldSkipAsr(): Boolean
        fun isLlmInFlight(): Boolean
        fun onLlmCalled(text: String)
    }
    private var callback: AsrCallback? = null

    fun setCallback(callback: AsrCallback) {
        this.callback = callback
    }

    /** Optional processor used only for debug WAV dumps in [saveAsrAudio]. */
    fun setAudioProcessor(audioProcessor: AudioProcessor) {
        this.audioProcessor = audioProcessor
    }

    /**
     * Copies model assets to local storage and loads the SenseVoice RKNN engine.
     * Heavy work (file copy + native model load) — call off the main thread.
     *
     * @return true when the engine is ready for [runAsrWorker]; false on any failure.
     */
    fun initSenseVoiceModel(): Boolean {
        return try {
            Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")
            val modelDir = FileHelper.copySenseVoiceAssets(context)
            val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath
            val embeddingPath = File(modelDir, "embedding.npy").absolutePath
            val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath
            // Diagnostics only: log the native library dir contents so a later
            // UnsatisfiedLinkError is debuggable from logcat.
            try {
                val libDir = context.applicationInfo.nativeLibraryDir
                Log.i(TAG, "nativeLibraryDir=$libDir")
                try {
                    val names = File(libDir).list()?.joinToString(", ") ?: "(empty)"
                    Log.i(TAG, "nativeLibraryDir files: $names")
                } catch (t: Throwable) {
                    Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}")
                }
            } catch (_: Throwable) {
            }
            Log.i(TAG, "SenseVoice model paths:")
            Log.i(TAG, " model=$modelPath exists=${File(modelPath).exists()} size=${File(modelPath).length()}")
            Log.i(TAG, " embedding=$embeddingPath exists=${File(embeddingPath).exists()} size=${File(embeddingPath).length()}")
            Log.i(TAG, " bpe=$bpePath exists=${File(bpePath).exists()} size=${File(bpePath).length()}")
            val t0 = SystemClock.elapsedRealtime()
            val engine = try {
                SenseVoiceEngineRKNN(context)
            } catch (e: UnsatisfiedLinkError) {
                // Re-wrap so the outer catch logs a single, uniform failure path.
                throw IllegalStateException("Load native libraries failed: ${e.message}", e)
            }
            val ok = try {
                engine.loadModelDirectly(modelPath, embeddingPath, bpePath)
            } catch (t: Throwable) {
                throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t)
            }
            val dt = SystemClock.elapsedRealtime() - t0
            Log.i(TAG, "SenseVoice loadModelDirectly ok=$ok costMs=$dt")
            if (!ok) throw IllegalStateException("SenseVoiceEngineRKNN loadModelDirectly returned false")
            senseVoice = engine
            true
        } catch (e: Exception) {
            Log.e(TAG, "Failed to initialize SenseVoice model: ${e.message}", e)
            false
        }
    }

    /** Queues one speech segment for transcription; never blocks (unbounded channel). */
    fun enqueueAudioSegment(originalAudio: FloatArray, processedAudio: FloatArray) {
        // trySend never throws; on an UNLIMITED channel it only fails when the
        // channel is closed. The previous try/catch could never fire.
        val result = asrQueue.trySend(Pair(originalAudio, processedAudio))
        if (result.isFailure) {
            Log.e(TAG, "Failed to enqueue audio segment: ${result.exceptionOrNull()?.message}")
        }
    }

    /** Drops all pending segments (e.g. when TTS playback begins). */
    fun clearQueue() {
        while (asrQueue.tryReceive().isSuccess) { }
    }

    /**
     * Long-running worker loop: take a segment, transcribe it under [nativeLock],
     * clean and filter the text, then dispatch result / LLM call via the callback.
     * Runs until the channel fails or the calling coroutine is cancelled.
     */
    suspend fun runAsrWorker() {
        Log.d(TAG, "ASR worker started")
        try {
            while (currentCoroutineContext().isActive) {
                val (originalSeg, processedSeg) = try {
                    Log.d(TAG, "ASR worker waiting for audio segment")
                    asrQueue.receive()
                } catch (e: Throwable) {
                    Log.e(TAG, "ASR worker receive failed: ${e.message}")
                    break
                }
                Log.d(TAG, "ASR worker received audio segment, size=${processedSeg.size}")
                // Drop segments captured while TTS is speaking or an LLM call is pending.
                if (callback?.shouldSkipAsr() == true || callback?.isLlmInFlight() == true) {
                    Log.d(TAG, "ASR worker skipping segment: shouldSkip=${callback?.shouldSkipAsr()}, llmInFlight=${callback?.isLlmInFlight()}")
                    continue
                }
                callback?.onAsrStarted()
                Log.d(TAG, "ASR started: processing audio segment")
                saveAsrAudio(originalSeg, processedSeg)
                val raw = synchronized(nativeLock) {
                    // Renamed locals: the old `val e = senseVoice` was shadowed by the
                    // inner `catch (e: Throwable)`, which compiled but was confusing.
                    val engine = senseVoice
                    if (engine == null || !engine.isInitialized) {
                        Log.e(TAG, "ASR failed: SenseVoice engine not initialized")
                        ""
                    } else {
                        try {
                            engine.transcribeBuffer(processedSeg)
                        } catch (t: Throwable) {
                            Log.e(TAG, "ASR transcribe failed: ${t.message}")
                            ""
                        }
                    }
                }
                Log.d(TAG, "ASR raw result: $raw")
                val text = removeTokens(raw)
                val filterResult = filterText(text)
                if (filterResult != null) {
                    callback?.onAsrSkipped(filterResult)
                    continue
                }
                callback?.onAsrResult(text)
                if (BuildConfig.LLM_API_KEY.isBlank()) {
                    Log.w(TAG, "LLM API Key is not configured")
                    continue
                }
                callback?.onLlmCalled(text)
            }
        } catch (e: Throwable) {
            Log.e(TAG, "ASR worker error: ${e.message}", e)
        } finally {
            Log.d(TAG, "ASR worker exiting")
        }
    }

    /** Releases the native engine and drops any queued audio. Safe to call repeatedly. */
    fun release() {
        try {
            senseVoice?.deinitialize()
        } catch (e: Exception) {
            Log.e(TAG, "Error deinitializing SenseVoice: ${e.message}")
        }
        senseVoice = null
        clearQueue()
    }

    fun isInitialized(): Boolean = senseVoice?.isInitialized ?: false

    /** Best-effort debug dump of raw and gain-processed audio as WAV files. */
    private fun saveAsrAudio(originalAudio: FloatArray, processedAudio: FloatArray) {
        try {
            val timestamp = System.currentTimeMillis()
            val asrAudioDir = FileHelper.getAsrAudioDir(context)
            audioProcessor?.let { processor ->
                val originalFile = File(asrAudioDir, "asr_${timestamp}_original.wav")
                processor.saveAudioAsWav(originalFile, originalAudio, AppConfig.SAMPLE_RATE)
                val processedFile = File(asrAudioDir, "asr_${timestamp}_processed.wav")
                processor.saveAudioAsWav(processedFile, processedAudio, AppConfig.SAMPLE_RATE)
            }
        } catch (e: Exception) {
            Log.e(TAG, "Error saving ASR audio: ${e.message}")
        }
    }

    /** Strips SenseVoice control tokens (<|...|>) and stray arrow glyphs, collapses whitespace. */
    private fun removeTokens(text: String): String {
        var cleaned = text.replace(Regex("<\\|[^>]+\\|>"), "")
        cleaned = cleaned.replace(Regex("[>>≥≫]"), "")
        cleaned = cleaned.trim().replace(Regex("\\s+"), " ")
        return cleaned
    }

    /**
     * Returns a human-readable rejection reason for unusable transcripts, or null to accept.
     * Rejects blanks, a lone "i"/"I" (presumably a recurring recognizer artifact —
     * TODO confirm), and text exceeding [AppConfig.Asr.MAX_TEXT_LENGTH].
     */
    private fun filterText(text: String): String? {
        if (text.isBlank()) {
            return "blank text"
        }
        if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
            return "single 'i'"
        }
        if (text.length > AppConfig.Asr.MAX_TEXT_LENGTH) {
            return "too long (${text.length} chars)"
        }
        return null
    }
}

View File

@@ -0,0 +1,218 @@
package com.digitalperson.audio
import android.Manifest
import android.content.pm.PackageManager
import android.media.AudioFormat
import android.media.AudioRecord
import android.media.MediaRecorder
import android.media.audiofx.AcousticEchoCanceler
import android.media.audiofx.NoiseSuppressor
import android.util.Log
import androidx.core.app.ActivityCompat
import java.io.File
import java.io.FileOutputStream
private const val TAG = "AudioProcessor"
/**
 * Microphone capture plus small audio utilities: RMS measurement, AGC-style
 * gain normalization, short->float conversion, and 16-bit mono WAV export.
 * Enables hardware AEC/NS on the record session when the device supports them.
 */
class AudioProcessor(
    private val context: android.content.Context,
    private val sampleRateInHz: Int = 16000,
    private val channelConfig: Int = AudioFormat.CHANNEL_IN_MONO,
    private val audioFormat: Int = AudioFormat.ENCODING_PCM_16BIT
) {
    private val audioSource = MediaRecorder.AudioSource.MIC
    private var audioRecord: AudioRecord? = null
    private var aec: AcousticEchoCanceler? = null
    private var ns: NoiseSuppressor? = null
    // Exponentially smoothed RMS used by applyGain; alpha weights the NEW sample.
    private var smoothedRms = 0f
    private val alpha = 0.8f

    /**
     * Creates the AudioRecord and attaches AEC/NS effects. If RECORD_AUDIO is not
     * granted, requests the permission and returns false (caller retries later).
     * NOTE(review): assumes [context] is the hosting AppCompatActivity — the cast
     * below throws otherwise; confirm call sites.
     */
    fun initMicrophone(permissions: Array<String>, requestCode: Int): Boolean {
        if (ActivityCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO)
            != PackageManager.PERMISSION_GRANTED
        ) {
            ActivityCompat.requestPermissions(
                context as androidx.appcompat.app.AppCompatActivity,
                permissions,
                requestCode
            )
            return false
        }
        val numBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
        // Double the minimum buffer to reduce overrun risk on slow consumers.
        audioRecord = AudioRecord(
            audioSource,
            sampleRateInHz,
            channelConfig,
            audioFormat,
            numBytes * 2
        )
        val sessionId = audioRecord?.audioSessionId ?: 0
        if (sessionId != 0) {
            // Hardware echo cancellation, if the device provides it.
            if (AcousticEchoCanceler.isAvailable()) {
                aec = AcousticEchoCanceler.create(sessionId)?.apply {
                    enabled = true
                }
                Log.i(TAG, "AEC enabled=${aec?.enabled}")
            } else {
                Log.w(TAG, "AEC not available on this device")
            }
            // Hardware noise suppression, if available.
            if (NoiseSuppressor.isAvailable()) {
                ns = NoiseSuppressor.create(sessionId)?.apply {
                    enabled = true
                }
                Log.i(TAG, "NS enabled=${ns?.enabled}")
            } else {
                Log.w(TAG, "NS not available on this device")
            }
        }
        return true
    }

    fun startRecording() {
        audioRecord?.startRecording()
        Log.d(TAG, "Audio recording started")
    }

    fun stopRecording() {
        try {
            audioRecord?.stop()
        } catch (_: Throwable) {
        }
        Log.d(TAG, "Audio recording stopped")
    }

    /** Stops and releases the recorder and audio effects. Safe to call repeatedly. */
    fun release() {
        try {
            audioRecord?.stop()
        } catch (_: Throwable) {
        }
        try {
            audioRecord?.release()
        } catch (_: Throwable) {
        }
        audioRecord = null
        try {
            aec?.release()
        } catch (_: Throwable) {
        }
        try {
            ns?.release()
        } catch (_: Throwable) {
        }
        aec = null
        ns = null
        Log.d(TAG, "AudioProcessor released")
    }

    /** Blocking read into [buffer]; returns samples read, or -1 when uninitialized. */
    fun readAudio(buffer: ShortArray): Int {
        return audioRecord?.read(buffer, 0, buffer.size) ?: -1
    }

    /** Root-mean-square amplitude of [samples]; 0 for empty input. */
    fun calculateRMS(samples: FloatArray): Float {
        if (samples.isEmpty()) return 0.0f
        var sumSquared = 0.0f
        for (sample in samples) {
            sumSquared += sample * sample
        }
        val meanSquared = sumSquared / samples.size
        return kotlin.math.sqrt(meanSquared)
    }

    /**
     * Normalizes [chunk] toward [targetRMS] using a smoothed RMS estimate.
     * Gain is clamped to [0.1, 10] and output samples are clipped to [-1, 1].
     * Stateful: uses and updates [smoothedRms] across calls.
     */
    fun applyGain(chunk: FloatArray, targetRMS: Float = 0.1f): FloatArray {
        val rms = calculateRMS(chunk)
        smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms
        var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f
        gainFactor = gainFactor.coerceIn(0.1f, 10.0f)
        val processedChunk = FloatArray(chunk.size) {
            val value = chunk[it] * gainFactor
            if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value
        }
        return processedChunk
    }

    /** Converts 16-bit PCM shorts to floats in [-1, 1). */
    fun convertShortToFloat(buffer: ShortArray): FloatArray {
        return FloatArray(buffer.size) { buffer[it] / 32768.0f }
    }

    /**
     * Writes [samples] (floats in [-1, 1]) to [file] as a 16-bit mono
     * little-endian PCM WAV at [sampleRate].
     */
    fun saveAudioAsWav(file: File, samples: FloatArray, sampleRate: Int) {
        FileOutputStream(file).use { fos ->
            // 44-byte canonical RIFF/WAVE header.
            val header = ByteArray(44)
            header[0] = 'R'.code.toByte()
            header[1] = 'I'.code.toByte()
            header[2] = 'F'.code.toByte()
            header[3] = 'F'.code.toByte()
            val fileSize = 36 + samples.size * 2
            intToByteArray(fileSize, header, 4)
            header[8] = 'W'.code.toByte()
            header[9] = 'A'.code.toByte()
            header[10] = 'V'.code.toByte()
            header[11] = 'E'.code.toByte()
            header[12] = 'f'.code.toByte()
            header[13] = 'm'.code.toByte()
            header[14] = 't'.code.toByte()
            header[15] = ' '.code.toByte()
            intToByteArray(16, header, 16)          // fmt chunk size
            shortToByteArray(1, header, 20)         // PCM format
            shortToByteArray(1, header, 22)         // mono
            intToByteArray(sampleRate, header, 24)
            val byteRate = sampleRate * 1 * 16 / 8
            intToByteArray(byteRate, header, 28)
            val blockAlign = 1 * 16 / 8
            shortToByteArray(blockAlign.toShort(), header, 32)
            shortToByteArray(16, header, 34)        // bits per sample
            header[36] = 'd'.code.toByte()
            header[37] = 'a'.code.toByte()
            header[38] = 't'.code.toByte()
            header[39] = 'a'.code.toByte()
            val dataSize = samples.size * 2
            intToByteArray(dataSize, header, 40)
            fos.write(header)
            // PERF: convert all samples into one buffer and write once instead of
            // allocating and writing a 2-byte array per sample. Output bytes are
            // identical (little-endian signed 16-bit).
            val data = ByteArray(dataSize)
            for (i in samples.indices) {
                val clamped = samples[i].coerceIn(-1.0f, 1.0f)
                val s = (clamped * 32767.0f).toInt()
                data[2 * i] = (s and 0xFF).toByte()
                data[2 * i + 1] = (s shr 8 and 0xFF).toByte()
            }
            fos.write(data)
        }
    }

    /** Little-endian 32-bit write into [dest] at [offset]. */
    private fun intToByteArray(value: Int, dest: ByteArray, offset: Int) {
        dest[offset] = (value and 0xFF).toByte()
        dest[offset + 1] = (value shr 8 and 0xFF).toByte()
        dest[offset + 2] = (value shr 16 and 0xFF).toByte()
        dest[offset + 3] = (value shr 24 and 0xFF).toByte()
    }

    /** Little-endian 16-bit write into [dest] at [offset]. */
    private fun shortToByteArray(value: Short, dest: ByteArray, offset: Int) {
        dest[offset] = (value.toInt() and 0xFF).toByte()
        dest[offset + 1] = (value.toInt() shr 8 and 0xFF).toByte()
    }
}

View File

@@ -0,0 +1,50 @@
package com.digitalperson.config
import com.digitalperson.BuildConfig
/** Central compile-time configuration for the voice-assistant pipeline. */
object AppConfig {
    const val TAG = "DigitalPerson"
    const val REQUEST_RECORD_AUDIO_PERMISSION = 200
    // Capture sample rate in Hz; shared by VAD, ASR, and debug WAV dumps.
    const val SAMPLE_RATE = 16000
    // Samples per VAD analysis window (512 @ 16 kHz = 32 ms).
    const val WINDOW_SIZE = 512
    const val SHOW_DEBUG_TEXT = true
    /** Local sherpa-onnx VITS TTS settings. */
    object Tts {
        const val MODEL_DIR = "tts_model/sherpa-onnx-vits-zh-ll"
        const val MODEL_NAME = "model.onnx"
        const val LEXICON = "lexicon.txt"
        const val SPEAKER_ID = 2
        const val SPEED = 1.0f
        // Segmenter limits: max characters per segment / max wait before forcing one.
        const val MAX_LEN = 30
        const val MAX_WAIT_MS: Long = 600
    }
    /** Voice-activity-detection thresholds (probabilities) and durations (seconds). */
    object Vad {
        const val START_THRESHOLD = 0.2f
        const val END_THRESHOLD = 0.15f
        const val MIN_SILENCE_DURATION = 0.5f
        const val MIN_SPEECH_DURATION = 0.1f
        const val MAX_SPEECH_DURATION = 5.0f
    }
    object Asr {
        // Transcripts longer than this are rejected as recognition noise.
        const val MAX_TEXT_LENGTH = 50
        const val MODEL_DIR = "sensevoice_models"
    }
    object Audio {
        const val GAIN_SMOOTHING_FACTOR = 0.1f
        const val TARGET_RMS = 0.1f
    }
    /** Avatar rendering: Live2D model selection and feature switch. */
    object Avatar {
        // Compile-time switch in gradle.properties/local.properties: USE_LIVE2D=true|false
        const val USE_LIVE2D = BuildConfig.USE_LIVE2D
        // const val MODEL_DIR = "live2d_model/mao_pro_zh"
        // const val MODEL_JSON = "mao_pro.model3.json"
        const val MODEL_DIR = "live2d_model/Haru_pro_jp"
        const val MODEL_JSON = "haru_greeter_t05.model3.json"
    }
}

View File

@@ -0,0 +1,29 @@
package com.digitalperson.live2d
import android.opengl.GLSurfaceView
/**
 * Thin facade wiring a [Live2DRenderer] onto the host's GLSurfaceView and
 * forwarding lifecycle/speaking state. The GL view must be configured
 * (context version, renderer, render mode — in that order) before first use.
 */
class Live2DAvatarManager(private val glSurfaceView: GLSurfaceView) {
    private val renderer = Live2DRenderer(glSurfaceView.context)
    init {
        glSurfaceView.setEGLContextClientVersion(2)
        glSurfaceView.setRenderer(renderer)
        // Continuous rendering keeps idle/breathing animation running.
        glSurfaceView.renderMode = GLSurfaceView.RENDERMODE_CONTINUOUSLY
    }
    /** Toggles the lip-sync "talking" animation. */
    fun setSpeaking(speaking: Boolean) {
        renderer.setSpeaking(speaking)
    }
    fun onResume() {
        glSurfaceView.onResume()
    }
    fun onPause() {
        glSurfaceView.onPause()
    }
    fun release() {
        renderer.release()
    }
}

View File

@@ -0,0 +1,182 @@
package com.digitalperson.live2d
import android.content.res.AssetManager
import android.graphics.BitmapFactory
import android.opengl.GLES20
import android.opengl.GLUtils
import com.live2d.sdk.cubism.framework.CubismFramework
import com.live2d.sdk.cubism.framework.CubismModelSettingJson
import com.live2d.sdk.cubism.framework.id.CubismId
import com.live2d.sdk.cubism.framework.math.CubismMatrix44
import com.live2d.sdk.cubism.framework.model.CubismUserModel
import com.live2d.sdk.cubism.framework.motion.CubismMotion
import com.live2d.sdk.cubism.framework.rendering.android.CubismRendererAndroid
import kotlin.math.sin
/**
 * A single Live2D Cubism model: loads model/physics/pose/motions from assets,
 * drives per-frame animation (idle motions, procedural breathing/sway, simple
 * sine-based lip-sync), and renders via CubismRendererAndroid.
 * All methods must be called on the GL thread.
 */
class Live2DCharacter : CubismUserModel() {
    private lateinit var setting: CubismModelSettingJson
    // Parameter IDs the model declares for lip-sync (falls back to ParamA/ParamMouthOpenY).
    private val lipSyncParams = mutableListOf<CubismId>()
    private val idleMotions = mutableListOf<CubismMotion>()
    private var idleMotionIndex = 0
    // Timestamp of the previous frame, used to derive a clamped delta time.
    private var lastElapsedSec = 0f
    // GL texture names owned by this character; freed in release()/bindTextures().
    private val textureIds = mutableListOf<Int>()

    /** Parses the model3.json and loads model, physics, pose, and idle motions. */
    fun loadFromAssets(assets: AssetManager, modelDir: String, modelJsonName: String) {
        val settingBytes = readAssetBytes(assets, "$modelDir/$modelJsonName")
        setting = CubismModelSettingJson(settingBytes)
        loadModel(readAssetBytes(assets, "$modelDir/${setting.modelFileName}"))
        setupRenderer(CubismRendererAndroid.create())
        getModelMatrix().setWidth(2.0f)
        getModelMatrix().setCenterPosition(0f, 0f)
        val physicsFile = setting.physicsFileName
        if (physicsFile.isNotEmpty()) {
            loadPhysics(readAssetBytes(assets, "$modelDir/$physicsFile"))
        }
        val poseFile = setting.poseFileName
        if (poseFile.isNotEmpty()) {
            loadPose(readAssetBytes(assets, "$modelDir/$poseFile"))
        }
        initLipSyncParams()
        loadIdleMotions(assets, modelDir)
        startNextIdleMotion()
    }

    /** Uploads all model textures to GL and binds them to the renderer slots. */
    fun bindTextures(assets: AssetManager, modelDir: String) {
        val renderer = getRenderer<CubismRendererAndroid>()
        renderer.isPremultipliedAlpha(true)
        renderer.isUsingHighPrecisionMask(true)
        // Free any textures from a previous surface before re-uploading.
        textureIds.forEach { id ->
            GLES20.glDeleteTextures(1, intArrayOf(id), 0)
        }
        textureIds.clear()
        for (i in 0 until setting.textureCount) {
            val texturePath = "$modelDir/${setting.getTextureFileName(i)}"
            val texId = loadTexture(assets, texturePath)
            renderer.bindTexture(i, texId)
            textureIds.add(texId)
        }
    }

    /**
     * Advances one animation frame. Pipeline order matters:
     * motion -> lip-sync params -> procedural sway/breath -> physics -> pose -> update.
     */
    fun updateFrame(elapsedSec: Float, speaking: Boolean) {
        val model = getModel() ?: return
        // Clamp delta to 100 ms so a long GL stall does not jump the animation.
        val dt = (elapsedSec - lastElapsedSec).coerceAtLeast(0f).coerceAtMost(0.1f)
        lastElapsedSec = elapsedSec
        // Keep motions running. If finished, continue idle loop.
        motionManager.updateMotion(model, dt)
        if (motionManager.isFinished()) {
            startNextIdleMotion()
        }
        // Procedural mouth value: ~14 rad/s sine in [0.2, 0.55] while speaking.
        val mouth = if (speaking) {
            0.2f + 0.35f * ((sin(elapsedSec * 14.0f) + 1.0f) * 0.5f)
        } else {
            0.0f
        }
        // Apply lip-sync to model-defined LipSync params (this model uses ParamA).
        for (id in lipSyncParams) {
            model.setParameterValue(id, mouth, 0.8f)
        }
        // Add small idle breathing/sway on top, so character is never "frozen".
        val sway = sin(elapsedSec * 0.8f)
        val breathe = sin(elapsedSec * 1.2f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamAngleX"), sway * 8f, 0.2f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamAngleY"), sin(elapsedSec * 0.6f) * 4f, 0.2f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamBodyAngleX"), sway * 6f, 0.15f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamBreath"), (breathe + 1f) * 0.5f, 0.1f)
        physics?.evaluate(model, dt)
        pose?.updateParameters(model, dt)
        model.update()
    }

    /** Draws the model with the given model-view-projection matrix. */
    fun draw(mvp: CubismMatrix44) {
        val renderer = getRenderer<CubismRendererAndroid>()
        renderer.setMvpMatrix(mvp)
        renderer.drawModel()
    }

    /** Frees GL textures and the underlying Cubism model. */
    fun release() {
        textureIds.forEach { id ->
            GLES20.glDeleteTextures(1, intArrayOf(id), 0)
        }
        textureIds.clear()
        delete()
    }

    /** Decodes an asset bitmap into a new GL texture; returns the texture name. */
    private fun loadTexture(assets: AssetManager, path: String): Int {
        val bitmap = assets.open(path).use { stream ->
            BitmapFactory.decodeStream(stream)
        } ?: error("Decode texture failed: $path")
        val ids = IntArray(1)
        GLES20.glGenTextures(1, ids, 0)
        val textureId = ids[0]
        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, textureId)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE)
        GLUtils.texImage2D(GLES20.GL_TEXTURE_2D, 0, bitmap, 0)
        bitmap.recycle()
        return textureId
    }

    private fun readAssetBytes(assets: AssetManager, path: String): ByteArray {
        return assets.open(path).use { input ->
            input.readBytes()
        }
    }

    /** Reads lip-sync parameter IDs from the model setting, with a common fallback. */
    private fun initLipSyncParams() {
        lipSyncParams.clear()
        for (i in 0 until setting.lipSyncParameterCount) {
            lipSyncParams.add(setting.getLipSyncParameterId(i))
        }
        if (lipSyncParams.isEmpty()) {
            lipSyncParams.add(CubismFramework.getIdManager().getId("ParamA"))
            lipSyncParams.add(CubismFramework.getIdManager().getId("ParamMouthOpenY"))
        }
    }

    /** Loads every motion in the idle group as a looping CubismMotion. */
    private fun loadIdleMotions(assets: AssetManager, modelDir: String) {
        idleMotions.clear()
        val groupName = findIdleGroupName()
        if (groupName.isEmpty()) return
        for (i in 0 until setting.getMotionCount(groupName)) {
            val fileName = setting.getMotionFileName(groupName, i)
            if (fileName.isBlank()) continue
            // runCatching: a single corrupt motion file should not abort loading.
            runCatching {
                val motion = loadMotion(readAssetBytes(assets, "$modelDir/$fileName"))
                motion?.setLoop(true)
                motion?.setLoopFadeIn(true)
                if (motion != null) idleMotions.add(motion)
            }
        }
    }

    /** Starts the next idle motion in round-robin order (no-op when none loaded). */
    private fun startNextIdleMotion() {
        if (idleMotions.isEmpty()) return
        val index = idleMotionIndex % idleMotions.size
        idleMotionIndex++
        motionManager.startMotionPriority(idleMotions[index], 1)
    }

    /** Prefers a group literally named "Idle"; otherwise falls back to the first group. */
    private fun findIdleGroupName(): String {
        for (i in 0 until setting.motionGroupCount) {
            val name = setting.getMotionGroupName(i)
            if (name.equals("Idle", ignoreCase = true)) return name
        }
        if (setting.motionGroupCount > 0) {
            return setting.getMotionGroupName(0) ?: ""
        }
        return ""
    }
}

View File

@@ -0,0 +1,78 @@
package com.digitalperson.live2d
import android.content.Context
import android.opengl.GLES20
import android.opengl.GLSurfaceView
import android.os.SystemClock
import android.util.Log
import com.digitalperson.config.AppConfig
import com.live2d.sdk.cubism.framework.CubismFramework
import com.live2d.sdk.cubism.framework.math.CubismMatrix44
import javax.microedition.khronos.egl.EGLConfig
import javax.microedition.khronos.opengles.GL10
/**
 * GLSurfaceView.Renderer hosting a single [Live2DCharacter].
 * All onSurface*/onDrawFrame callbacks run on the GL thread; [setSpeaking] may
 * be called from any thread (the flag is @Volatile).
 */
class Live2DRenderer(
    private val context: Context
) : GLSurfaceView.Renderer {
    @Volatile
    private var speaking = false
    private var character: Live2DCharacter? = null
    private val mvp = CubismMatrix44.create()
    // Animation time origin; reset on every surface (re)creation.
    private var startTimeMs: Long = 0L

    override fun onSurfaceCreated(gl: GL10?, config: EGLConfig?) {
        // Transparent clear color so the avatar composites over the layout behind it.
        GLES20.glClearColor(0f, 0f, 0f, 0f)
        ensureFrameworkInitialized()
        startTimeMs = SystemClock.elapsedRealtime()
        // Model load failure is non-fatal: the app runs without an avatar.
        runCatching {
            val model = Live2DCharacter()
            model.loadFromAssets(
                assets = context.assets,
                modelDir = AppConfig.Avatar.MODEL_DIR,
                modelJsonName = AppConfig.Avatar.MODEL_JSON
            )
            model.bindTextures(context.assets, AppConfig.Avatar.MODEL_DIR)
            character = model
        }.onFailure {
            Log.e(AppConfig.TAG, "Load Live2D model failed: ${it.message}", it)
            character = null
        }
    }

    override fun onSurfaceChanged(gl: GL10?, width: Int, height: Int) {
        GLES20.glViewport(0, 0, width, height)
        // Aspect-ratio correction: scale the shorter axis so the model is not stretched.
        mvp.loadIdentity()
        if (width > height) {
            mvp.scale(1f, width.toFloat() / height.toFloat())
        } else {
            mvp.scale(height.toFloat() / width.toFloat(), 1f)
        }
    }

    override fun onDrawFrame(gl: GL10?) {
        GLES20.glClear(GLES20.GL_COLOR_BUFFER_BIT)
        val elapsedSec = (SystemClock.elapsedRealtime() - startTimeMs) / 1000f
        character?.updateFrame(elapsedSec = elapsedSec, speaking = speaking)
        character?.draw(mvp)
    }

    /** Thread-safe toggle for the lip-sync animation. */
    fun setSpeaking(speaking: Boolean) {
        this.speaking = speaking
    }

    fun release() {
        character?.release()
        character = null
    }

    /** Starts/initializes the Cubism framework exactly once per process. */
    private fun ensureFrameworkInitialized() {
        if (!CubismFramework.isStarted()) {
            CubismFramework.startUp(CubismFramework.Option())
        }
        if (!CubismFramework.isInitialized()) {
            CubismFramework.initialize()
        }
    }
}

View File

@@ -17,7 +17,7 @@ class VideoPlayerManager(
private var playerSilent: ExoPlayer? = null
private var playerSpeaking: ExoPlayer? = null
private var currentState: Boolean = false
private var transitionDuration = 300L // 淡入淡出时长
private var transitionDuration = 100L // 淡入淡出时长
init {
// 确保初始 alpha

View File

@@ -0,0 +1,293 @@
package com.digitalperson.tts
import android.content.Context
import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioManager
import android.media.AudioTrack
import android.util.Log
import android.widget.Toast
import com.digitalperson.config.AppConfig
import com.digitalperson.metrics.TraceManager
import com.digitalperson.metrics.TraceSession
import com.k2fsa.sherpa.onnx.OfflineTts
import com.k2fsa.sherpa.onnx.getOfflineTtsConfig
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.launch
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.atomic.AtomicBoolean
class TtsManager(private val context: Context) {
companion object {
private const val TAG = "TtsManager"
}
private var tts: OfflineTts? = null
private var track: AudioTrack? = null
private sealed class TtsQueueItem {
data class Segment(val text: String) : TtsQueueItem()
data object End : TtsQueueItem()
}
private val ttsQueue = LinkedBlockingQueue<TtsQueueItem>()
private val ttsStopped = AtomicBoolean(false)
private val ttsWorkerRunning = AtomicBoolean(false)
private val ttsPlaying = AtomicBoolean(false)
@Volatile private var ttsTotalSamplesWritten: Long = 0
private var currentTrace: TraceSession? = null
private val ioScope = CoroutineScope(Dispatchers.IO)
interface TtsCallback {
fun onTtsStarted(text: String)
fun onTtsCompleted()
fun onTtsSegmentCompleted(durationMs: Long)
fun isTtsStopped(): Boolean
fun onClearAsrQueue()
fun onSetSpeaking(speaking: Boolean)
fun getCurrentTrace(): TraceSession?
fun onTraceMarkTtsRequestEnqueued()
fun onTraceMarkTtsSynthesisStart()
fun onTraceMarkTtsFirstPcmReady()
fun onTraceMarkTtsFirstAudioPlay()
fun onTraceMarkTtsDone()
fun onTraceAddDuration(name: String, value: Long)
fun onEndTurn()
}
private var callback: TtsCallback? = null
fun setCallback(callback: TtsCallback) {
this.callback = callback
}
fun initTtsAndAudioTrack(): Boolean {
return try {
val modelDir = AppConfig.Tts.MODEL_DIR
val modelName = AppConfig.Tts.MODEL_NAME
val lexicon = AppConfig.Tts.LEXICON
val dataDir = ""
val ttsConfig = getOfflineTtsConfig(
modelDir = modelDir,
modelName = modelName,
acousticModelName = "",
vocoder = "",
voices = "",
lexicon = lexicon,
dataDir = dataDir,
dictDir = "",
ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst,$modelDir/new_heteronym.fst",
ruleFars = "",
numThreads = null,
isKitten = false
)
tts = OfflineTts(assetManager = context.assets, config = ttsConfig)
initAudioTrack()
true
} catch (t: Throwable) {
Log.e(TAG, "Init TTS failed: ${t.message}", t)
tts = null
false
}
}
private fun initAudioTrack() {
val t = tts ?: return
val sr = t.sampleRate()
val bufLength = AudioTrack.getMinBufferSize(
sr,
AudioFormat.CHANNEL_OUT_MONO,
AudioFormat.ENCODING_PCM_FLOAT
)
val attr = AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_MEDIA)
.build()
val format = AudioFormat.Builder()
.setEncoding(AudioFormat.ENCODING_PCM_FLOAT)
.setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
.setSampleRate(sr)
.build()
track = AudioTrack(
attr,
format,
bufLength,
AudioTrack.MODE_STREAM,
AudioManager.AUDIO_SESSION_ID_GENERATE
)
track?.play()
}
fun enqueueSegment(seg: String) {
val cleanedSeg = seg.trimEnd('.', '。', '!', '', '?', '', ',', '', ';', '', ':', '')
callback?.onTraceMarkTtsRequestEnqueued()
ttsQueue.offer(TtsQueueItem.Segment(cleanedSeg))
ensureTtsWorker()
}
fun enqueueEnd() {
ttsQueue.offer(TtsQueueItem.End)
}
fun isPlaying(): Boolean = ttsPlaying.get()
fun reset() {
ttsStopped.set(false)
ttsPlaying.set(false)
ttsTotalSamplesWritten = 0
ttsQueue.clear()
}
fun stop() {
ttsStopped.set(true)
ttsPlaying.set(false)
ttsTotalSamplesWritten = 0
ttsQueue.clear()
ttsQueue.offer(TtsQueueItem.End)
try {
track?.pause()
track?.flush()
} catch (_: Throwable) {
}
}
fun release() {
try {
tts?.release()
} catch (_: Throwable) {
}
try {
track?.release()
} catch (_: Throwable) {
}
tts = null
track = null
}
fun setCurrentTrace(trace: TraceSession?) {
currentTrace = trace
}
private fun ensureTtsWorker() {
if (!ttsWorkerRunning.compareAndSet(false, true)) return
ioScope.launch {
try {
runTtsWorker()
} finally {
ttsWorkerRunning.set(false)
}
}
}
private fun runTtsWorker() {
val t = tts ?: return
val audioTrack = track ?: return
var firstAudioMarked = false
var isFirstSegment = true
while (true) {
val item = ttsQueue.take()
if (ttsStopped.get()) break
when (item) {
is TtsQueueItem.Segment -> {
ttsPlaying.set(true)
callback?.onSetSpeaking(true)
val trace = currentTrace
trace?.markTtsSynthesisStart()
callback?.onTraceMarkTtsSynthesisStart()
Log.d(TAG, "TTS started: processing segment '${item.text}'")
callback?.onTtsStarted(item.text)
val startMs = System.currentTimeMillis()
var firstPcmMarked = false
if (isFirstSegment) {
try {
audioTrack.pause()
audioTrack.flush()
audioTrack.play()
} catch (_: Throwable) {
}
isFirstSegment = false
}
t.generateWithCallback(
text = item.text,
sid = AppConfig.Tts.SPEAKER_ID,
speed = AppConfig.Tts.SPEED
) { samples ->
if (ttsStopped.get()) return@generateWithCallback 0
if (!firstPcmMarked && samples.isNotEmpty()) {
firstPcmMarked = true
trace?.markTtsFirstPcmReady()
callback?.onTraceMarkTtsFirstPcmReady()
}
if (!firstAudioMarked && samples.isNotEmpty()) {
firstAudioMarked = true
trace?.markTtsFirstAudioPlay()
callback?.onTraceMarkTtsFirstAudioPlay()
}
audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING)
ttsTotalSamplesWritten += samples.size
1
}
val ttsMs = System.currentTimeMillis() - startMs
trace?.addDuration("tts_segment_ms_total", ttsMs)
callback?.onTraceAddDuration("tts_segment_ms_total", ttsMs)
callback?.onTtsSegmentCompleted(ttsMs)
}
TtsQueueItem.End -> {
callback?.onClearAsrQueue()
waitForPlaybackComplete(audioTrack)
callback?.onTtsCompleted()
ttsPlaying.set(false)
callback?.onSetSpeaking(false)
ttsTotalSamplesWritten = 0
currentTrace?.markTtsDone()
callback?.onTraceMarkTtsDone()
callback?.onEndTurn()
break
}
}
}
}
// Blocks until the AudioTrack's playback head has consumed every sample
// written during this TTS turn, with a timeout derived from the audio
// length, then sleeps briefly to let the sink drain its latency buffer.
private fun waitForPlaybackComplete(audioTrack: AudioTrack) {
    val totalSamples = ttsTotalSamplesWritten
    if (totalSamples <= 0) return
    val sampleRate = audioTrack.sampleRate
    // Compute in Long: totalSamples * 1000 overflows Int after ~2.1M samples
    // (~134 s at 16 kHz), which would yield a negative timeout and an
    // immediate spurious timeout break.
    val timeoutMs = totalSamples.toLong() * 1000L / sampleRate + 2000L
    val startTime = System.currentTimeMillis()
    while (true) {
        if (ttsStopped.get()) break
        // NOTE: playbackHeadPosition is a wrapping 32-bit frame counter —
        // fine for per-turn durations, would need wrap handling for
        // multi-hour continuous playback.
        val playbackPos = audioTrack.playbackHeadPosition.toLong()
        if (playbackPos >= totalSamples) {
            break
        }
        if (System.currentTimeMillis() - startTime > timeoutMs) {
            Log.w(TAG, "waitForPlaybackComplete timeout, pos=$playbackPos, total=$totalSamples")
            break
        }
        Thread.sleep(20)
    }
    // Grace period for the sink's internal latency; skip it when the turn
    // was stopped, since nothing is waiting for the tail audio.
    if (!ttsStopped.get()) {
        Thread.sleep(1000)
    }
}
}

View File

@@ -0,0 +1,95 @@
package com.digitalperson.ui
import android.app.Activity
import android.opengl.GLSurfaceView
import android.text.method.ScrollingMovementMethod
import android.widget.Button
import android.widget.ScrollView
import android.widget.TextView
import android.widget.Toast
import com.digitalperson.live2d.Live2DAvatarManager
/**
 * Binds the Live2D conversation screen's widgets (transcript text, scroll
 * container, start/stop buttons) and owns the [Live2DAvatarManager] that
 * drives the avatar's speaking animation.
 */
class Live2DUiManager(private val activity: Activity) {
    private var textView: TextView? = null
    private var scrollView: ScrollView? = null
    private var startButton: Button? = null
    private var stopButton: Button? = null
    private var avatarManager: Live2DAvatarManager? = null

    // Full transcript shown so far; re-assigned to the TextView on append.
    private var displayedText: String = ""

    /**
     * Looks up all views by id and creates the avatar manager on the GL
     * surface. The two player-view ids are accepted for signature parity
     * with the video-based UI manager and are not used here.
     */
    fun initViews(
        textViewId: Int,
        scrollViewId: Int,
        startButtonId: Int,
        stopButtonId: Int,
        silentPlayerViewId: Int,
        speakingPlayerViewId: Int,
        live2dViewId: Int
    ) {
        with(activity) {
            textView = findViewById(textViewId)
            scrollView = findViewById(scrollViewId)
            startButton = findViewById(startButtonId)
            stopButton = findViewById(stopButtonId)
        }
        textView?.movementMethod = ScrollingMovementMethod()
        val glView = activity.findViewById<GLSurfaceView>(live2dViewId)
        avatarManager = Live2DAvatarManager(glView).also { it.setSpeaking(false) }
    }

    /** Installs the click handler for the start button. */
    fun setStartButtonListener(listener: () -> Unit) {
        startButton?.setOnClickListener { listener() }
    }

    /** Installs the click handler for the stop button. */
    fun setStopButtonListener(listener: () -> Unit) {
        stopButton?.setOnClickListener { listener() }
    }

    /** Appends [s] to the transcript and scrolls to the bottom. */
    fun appendToUi(s: String) {
        displayedText += s
        textView?.text = displayedText
        scrollView?.post { scrollView?.fullScroll(ScrollView.FOCUS_DOWN) }
    }

    /** Empties both the transcript buffer and the on-screen text. */
    fun clearText() {
        displayedText = ""
        textView?.text = ""
    }

    /** Replaces the transcript buffer and on-screen text with [text]. */
    fun setText(text: String) {
        displayedText = text
        textView?.text = text
    }

    /** Enables/disables the two control buttons. */
    fun setButtonsEnabled(startEnabled: Boolean, stopEnabled: Boolean) {
        startButton?.isEnabled = startEnabled
        stopButton?.isEnabled = stopEnabled
    }

    /** Toggles the avatar's speaking animation; safe to call off the UI thread. */
    fun setSpeaking(speaking: Boolean) {
        activity.runOnUiThread { avatarManager?.setSpeaking(speaking) }
    }

    /** Shows a toast; safe to call off the UI thread. */
    fun showToast(message: String, duration: Int = Toast.LENGTH_SHORT) {
        activity.runOnUiThread { Toast.makeText(activity, message, duration).show() }
    }

    /** Forwards the activity's onResume to the avatar's GL surface. */
    fun onResume() {
        avatarManager?.onResume()
    }

    /** Forwards the activity's onPause to the avatar's GL surface. */
    fun onPause() {
        avatarManager?.onPause()
    }

    /** Releases the avatar manager; the instance must be re-initialized after this. */
    fun release() {
        avatarManager?.release()
        avatarManager = null
    }
}

View File

@@ -0,0 +1,106 @@
package com.digitalperson.ui
import android.app.Activity
import android.text.method.ScrollingMovementMethod
import android.util.Log
import android.widget.Button
import android.widget.ScrollView
import android.widget.TextView
import android.widget.Toast
import com.digitalperson.config.AppConfig
import com.digitalperson.player.VideoPlayerManager
import com.google.android.exoplayer2.ui.PlayerView
/**
 * Binds the video-avatar conversation screen's widgets (transcript text,
 * scroll container, start/stop buttons) and owns the [VideoPlayerManager]
 * that switches between the silent and speaking video loops.
 */
class UiManager(private val activity: Activity) {
    private var textView: TextView? = null
    private var scrollView: ScrollView? = null
    private var startButton: Button? = null
    private var stopButton: Button? = null
    private var videoPlayerManager: VideoPlayerManager? = null

    // Full transcript shown so far; re-assigned to the TextView on append.
    private var displayedText: String = ""

    /**
     * Looks up all views by id and, when both player views are present in
     * the layout, creates the video player manager in the non-speaking state.
     */
    fun initViews(
        textViewId: Int,
        scrollViewId: Int,
        startButtonId: Int,
        stopButtonId: Int,
        silentPlayerViewId: Int,
        speakingPlayerViewId: Int
    ) {
        with(activity) {
            textView = findViewById(textViewId)
            scrollView = findViewById(scrollViewId)
            startButton = findViewById(startButtonId)
            stopButton = findViewById(stopButtonId)
        }
        textView?.movementMethod = ScrollingMovementMethod()
        // The player views are optional in some layouts; degrade gracefully.
        try {
            val silentPv = activity.findViewById<PlayerView>(silentPlayerViewId)
            val speakingPv = activity.findViewById<PlayerView>(speakingPlayerViewId)
            videoPlayerManager = VideoPlayerManager(activity, silentPv, speakingPv).also {
                it.setSpeaking(false)
            }
        } catch (e: Exception) {
            Log.w(AppConfig.TAG, "PlayerViews not found or init failed: ${e.message}")
        }
    }

    /** Installs the click handler for the start button. */
    fun setStartButtonListener(listener: () -> Unit) {
        startButton?.setOnClickListener { listener() }
    }

    /** Installs the click handler for the stop button. */
    fun setStopButtonListener(listener: () -> Unit) {
        stopButton?.setOnClickListener { listener() }
    }

    /**
     * Appends [s] to the transcript and scrolls to the bottom.
     * No-op unless [AppConfig.SHOW_DEBUG_TEXT] is enabled.
     */
    fun appendToUi(s: String) {
        if (!AppConfig.SHOW_DEBUG_TEXT) return
        displayedText += s
        textView?.text = displayedText
        scrollView?.post { scrollView?.fullScroll(ScrollView.FOCUS_DOWN) }
    }

    /** Empties both the transcript buffer and the on-screen text. */
    fun clearText() {
        displayedText = ""
        textView?.text = ""
    }

    /** Replaces the transcript buffer and on-screen text with [text]. */
    fun setText(text: String) {
        displayedText = text
        textView?.text = text
    }

    /** Enables/disables the two control buttons. */
    fun setButtonsEnabled(startEnabled: Boolean, stopEnabled: Boolean) {
        startButton?.isEnabled = startEnabled
        stopButton?.isEnabled = stopEnabled
    }

    /** Switches the avatar video loop; safe to call off the UI thread. */
    fun setSpeaking(speaking: Boolean) {
        activity.runOnUiThread { videoPlayerManager?.setSpeaking(speaking) }
    }

    /** Shows a toast; safe to call off the UI thread. */
    fun showToast(message: String, duration: Int = Toast.LENGTH_SHORT) {
        activity.runOnUiThread { Toast.makeText(activity, message, duration).show() }
    }

    /** Shows a toast; caller must already be on the UI thread. */
    fun showToastOnUi(message: String, duration: Int = Toast.LENGTH_SHORT) {
        Toast.makeText(activity, message, duration).show()
    }

    /** Releases the video players; the instance must be re-initialized after this. */
    fun release() {
        videoPlayerManager?.release()
        videoPlayerManager = null
    }

    /** Discards the transcript buffer without touching the TextView. */
    fun reset() {
        displayedText = ""
    }
}

View File

@@ -0,0 +1,60 @@
package com.digitalperson.util
import android.content.Context
import android.util.Log
import com.digitalperson.config.AppConfig
import java.io.File
import java.io.FileOutputStream
/**
 * File/asset utilities: asset existence checks, one-time asset extraction
 * into internal storage, and directory helpers.
 */
object FileHelper {
    private const val TAG = AppConfig.TAG

    /** Returns true if [path] names a readable asset in the APK. */
    fun assetExists(context: Context, path: String): Boolean {
        return try {
            context.assets.open(path).close()
            true
        } catch (_: Throwable) {
            false
        }
    }

    /**
     * Copies each of [files] from the [assetDir] asset folder into
     * [targetDir], skipping files that already exist with non-zero length.
     * Failures are logged per file and do not abort the remaining copies.
     *
     * @return [targetDir] for call chaining.
     */
    fun copyAssetsToInternal(context: Context, assetDir: String, targetDir: File, files: Array<String>): File {
        if (!targetDir.exists()) targetDir.mkdirs()
        for (name in files) {
            val assetPath = "$assetDir/$name"
            val outFile = File(targetDir, name)
            if (outFile.exists() && outFile.length() > 0) continue
            try {
                context.assets.open(assetPath).use { input ->
                    FileOutputStream(outFile).use { output ->
                        input.copyTo(output)
                    }
                }
            } catch (e: Exception) {
                Log.e(TAG, "Failed to copy asset $assetPath: ${e.message}")
                // Delete the partially-written file: otherwise its non-zero
                // length makes the guard above treat the truncated copy as
                // complete on every subsequent call.
                if (outFile.exists()) outFile.delete()
            }
        }
        return targetDir
    }

    /** Extracts the SenseVoice ASR model files into internal storage. */
    fun copySenseVoiceAssets(context: Context): File {
        val outDir = File(context.filesDir, AppConfig.Asr.MODEL_DIR)
        val files = arrayOf(
            "am.mvn",
            "chn_jpn_yue_eng_ko_spectok.bpe.model",
            "embedding.npy",
            "sense-voice-encoder.rknn"
        )
        return copyAssetsToInternal(context, AppConfig.Asr.MODEL_DIR, outDir, files)
    }

    /** Creates [dir] (including parents) if missing and returns it. */
    fun ensureDir(dir: File): File {
        if (!dir.exists()) dir.mkdirs()
        return dir
    }

    /** Returns (creating if needed) the directory used to dump ASR audio. */
    fun getAsrAudioDir(context: Context): File {
        return ensureDir(File(context.filesDir, "asr_audio"))
    }
}

View File

@@ -0,0 +1,216 @@
package com.digitalperson.vad
import android.content.Context
import android.util.Log
import com.digitalperson.config.AppConfig
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.VadModelConfig
import java.io.File
import kotlin.math.max
/**
 * Voice-activity detection and speech segmentation.
 *
 * Feeds fixed-size audio chunks to a Silero VAD (via sherpa-onnx) and runs a
 * small state machine (silence -> speech -> silence) that accumulates two
 * parallel buffers — the raw chunk and its processed counterpart — and emits
 * a complete segment through [VadCallback] once enough trailing silence (or
 * the maximum speech length) is reached.
 *
 * Not thread-safe except for the native compute/release pair, which is
 * serialized on [nativeLock].
 */
class VadManager(private val context: Context) {
    private var vad: Vad? = null
    // Serializes native VAD calls so compute() cannot race release().
    private val nativeLock = Any()

    // Segmentation state machine.
    private var inSpeech = false
    private var silenceSamples = 0

    // Raw and processed audio accumulated while inSpeech; <buf, len> pairs.
    private var speechBuf = FloatArray(0)
    private var speechLen = 0
    private var processedSpeechBuf = FloatArray(0)
    private var processedSpeechLen = 0

    // Duration thresholds converted from seconds to sample counts.
    private val minSilenceSamples = (AppConfig.Vad.MIN_SILENCE_DURATION * AppConfig.SAMPLE_RATE).toInt()
    private val minSpeechSamples = (AppConfig.Vad.MIN_SPEECH_DURATION * AppConfig.SAMPLE_RATE).toInt()
    private val maxSpeechSamples = (AppConfig.Vad.MAX_SPEECH_DURATION * AppConfig.SAMPLE_RATE).toInt()

    // Number of native VAD computations since the last reset (diagnostics).
    var vadComputeCount = 0
        private set

    /** Receiver for finalized speech segments. */
    interface VadCallback {
        /** Delivers one finalized segment: raw audio plus its processed form. */
        fun onSpeechSegmentReady(originalAudio: FloatArray, processedAudio: FloatArray)
        /** Return true to drop the pending segment instead of delivering it. */
        fun shouldSkipProcessing(): Boolean
    }

    private var callback: VadCallback? = null

    fun setCallback(callback: VadCallback) {
        this.callback = callback
    }

    /**
     * Loads the Silero VAD model from assets.
     * @return true on success; failures are logged and return false.
     */
    fun initVadModel(): Boolean {
        return try {
            val config = VadModelConfig(
                sileroVadModelConfig = SileroVadModelConfig(
                    model = "vad_model/silero_vad.onnx",
                    threshold = 0.5F,
                    minSilenceDuration = 0.25F,
                    minSpeechDuration = 0.25F,
                    windowSize = AppConfig.WINDOW_SIZE,
                ),
                sampleRate = AppConfig.SAMPLE_RATE,
                numThreads = 1,
                provider = "cpu",
            )
            vad = Vad(assetManager = context.assets, config = config)
            Log.i(TAG, "VAD model initialized successfully")
            true
        } catch (e: Exception) {
            Log.e(TAG, "Failed to initialize VAD model: ${e.message}", e)
            false
        }
    }

    /** Resets both the native VAD and all segmentation state and counters. */
    fun reset() {
        vad?.reset()
        inSpeech = false
        silenceSamples = 0
        speechLen = 0
        processedSpeechLen = 0
        vadComputeCount = 0
    }

    /** Releases the native VAD; the instance is unusable until re-initialized. */
    fun release() {
        try {
            vad?.release()
        } catch (e: Exception) {
            Log.e(TAG, "Error releasing VAD: ${e.message}")
        }
        vad = null
    }

    /**
     * Advances the state machine by one chunk.
     *
     * The VAD probability is computed on [processedChunk]; both [chunk] and
     * [processedChunk] are buffered while in speech. May synchronously invoke
     * the callback when a segment finalizes.
     *
     * @return the transition taken, carrying the VAD probability.
     */
    fun processAudioChunk(chunk: FloatArray, processedChunk: FloatArray): VadResult {
        // With no model loaded, prob stays 0 and the machine reports silence.
        val prob = synchronized(nativeLock) {
            vad?.compute(processedChunk) ?: 0f
        }
        vadComputeCount++
        val result = when {
            !inSpeech && prob >= AppConfig.Vad.START_THRESHOLD -> {
                inSpeech = true
                silenceSamples = 0
                appendSpeech(chunk, processedChunk)
                Log.d(TAG, "VAD: Entered speech state, prob=$prob, speechLen=$speechLen")
                VadResult.SpeechStarted(prob)
            }
            inSpeech && prob <= AppConfig.Vad.END_THRESHOLD -> {
                // Low-probability chunks still count toward the segment until
                // enough consecutive silence has accumulated.
                silenceSamples += chunk.size
                if (silenceSamples >= minSilenceSamples) {
                    Log.d(TAG, "VAD: Exiting speech state, prob=$prob, silenceSamples=$silenceSamples, speechLen=$speechLen")
                    finalizeSegmentIfAny()
                    VadResult.SpeechEnded(prob)
                } else {
                    appendSpeech(chunk, processedChunk)
                    VadResult.SpeechContinuing(prob)
                }
            }
            inSpeech -> {
                appendSpeech(chunk, processedChunk)
                silenceSamples = 0
                if (speechLen >= maxSpeechSamples) {
                    Log.d(TAG, "VAD: Max speech length reached, finalizing segment")
                    finalizeSegmentIfAny()
                    VadResult.MaxSpeechReached(prob)
                } else {
                    VadResult.SpeechContinuing(prob)
                }
            }
            else -> {
                VadResult.Silence(prob)
            }
        }
        return result
    }

    /** Finalizes whatever speech is buffered (e.g. at end of capture). */
    fun forceFinalize() {
        finalizeSegmentIfAny()
    }

    fun isInSpeech(): Boolean = inSpeech

    fun getSpeechLength(): Int = speechLen

    /** Drops any buffered speech without delivering it. */
    fun clearState() {
        resetSegmentState()
    }

    /**
     * Grows [buf] to hold at least [needed] samples (doubling, min 1024,
     * capped at [maxSpeechSamples]), preserving the first [usedLen] samples.
     * Returns [buf] unchanged when it is already large enough or already at
     * the cap — avoiding the same-size realloc+copy the capped path would
     * otherwise perform on every chunk once the buffer is full.
     */
    private fun ensureCapacity(buf: FloatArray, usedLen: Int, needed: Int): FloatArray {
        if (buf.size >= needed) return buf
        var newCap = maxOf(needed, maxOf(1024, buf.size * 2))
        if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
        if (newCap <= buf.size) return buf
        val grown = FloatArray(newCap)
        if (usedLen > 0) System.arraycopy(buf, 0, grown, 0, usedLen)
        return grown
    }

    /**
     * Appends the chunk pair to the raw/processed buffers. Data beyond
     * [maxSpeechSamples] in either buffer is silently truncated.
     */
    private fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) {
        speechBuf = ensureCapacity(speechBuf, speechLen, speechLen + chunk.size)
        val copyN = minOf(chunk.size, max(0, maxSpeechSamples - speechLen))
        if (copyN > 0) {
            System.arraycopy(chunk, 0, speechBuf, speechLen, copyN)
            speechLen += copyN
        }
        processedSpeechBuf = ensureCapacity(processedSpeechBuf, processedSpeechLen, processedSpeechLen + processedChunk.size)
        val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen))
        if (processedCopyN > 0) {
            System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN)
            processedSpeechLen += processedCopyN
        }
    }

    /** Returns the state machine to idle and forgets buffered audio. */
    private fun resetSegmentState() {
        inSpeech = false
        silenceSamples = 0
        speechLen = 0
        processedSpeechLen = 0
    }

    /**
     * Emits the buffered segment through the callback, unless it is shorter
     * than [minSpeechSamples] or the callback asks to skip it — in which case
     * it is discarded. Always leaves the machine idle.
     */
    private fun finalizeSegmentIfAny() {
        Log.d(TAG, "finalizeSegmentIfAny called: speechLen=$speechLen, minSpeechSamples=$minSpeechSamples")
        if (speechLen < minSpeechSamples) {
            Log.d(TAG, "finalizeSegmentIfAny: speech too short, discarding")
            resetSegmentState()
            return
        }
        if (callback?.shouldSkipProcessing() == true) {
            Log.d(TAG, "finalizeSegmentIfAny: skipping due to callback")
            resetSegmentState()
            return
        }
        // Snapshot before resetting so the callback sees a stable copy.
        val originalSeg = speechBuf.copyOf(speechLen)
        val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen)
        resetSegmentState()
        Log.d(TAG, "Sending audio segment to callback, size: ${processedSeg.size}")
        callback?.onSpeechSegmentReady(originalSeg, processedSeg)
    }

    /** One state-machine transition per processed chunk. */
    sealed class VadResult(val probability: Float) {
        class SpeechStarted(prob: Float) : VadResult(prob)
        class SpeechEnded(prob: Float) : VadResult(prob)
        class SpeechContinuing(prob: Float) : VadResult(prob)
        class MaxSpeechReached(prob: Float) : VadResult(prob)
        class Silence(prob: Float) : VadResult(prob)
    }

    companion object {
        private const val TAG = "VadManager"
    }
}