live2d model

This commit is contained in:
gcw_4spBpAfv
2026-03-02 09:25:50 +08:00
parent d63d4b03cf
commit 2f6166ab6c
179 changed files with 100625 additions and 2018 deletions

View File

@@ -1,957 +0,0 @@
package com.digitalperson
import android.Manifest
import android.content.pm.PackageManager
import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioManager
import android.media.AudioRecord
import android.media.AudioTrack
import android.media.MediaRecorder
import android.media.audiofx.AcousticEchoCanceler
import android.media.audiofx.NoiseSuppressor
import android.os.Bundle
import android.os.SystemClock
import android.text.method.ScrollingMovementMethod
import android.util.Log
import android.widget.Button
import android.widget.TextView
import android.widget.Toast
import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat
import com.digitalperson.cloud.CloudApiManager
import com.digitalperson.player.VideoPlayerManager
import com.google.android.exoplayer2.ui.PlayerView
import com.digitalperson.engine.SenseVoiceEngineRKNN
import com.digitalperson.metrics.TraceManager
import com.digitalperson.metrics.TraceSession
import com.k2fsa.sherpa.onnx.OfflineTts
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.VadModelConfig
import com.k2fsa.sherpa.onnx.getOfflineTtsConfig
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.isActive
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
import java.io.File
import java.io.FileOutputStream
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.atomic.AtomicBoolean
import kotlin.math.max
private const val TAG = "DigitalPerson"
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200
class MainActivity : AppCompatActivity() {
private lateinit var startButton: Button
private lateinit var stopButton: Button
private lateinit var textView: TextView
private lateinit var vad: Vad
private var senseVoice: SenseVoiceEngineRKNN? = null
private var tts: OfflineTts? = null
private var track: AudioTrack? = null
private var aec: AcousticEchoCanceler? = null
private var ns: NoiseSuppressor? = null
private var audioRecord: AudioRecord? = null
private val audioSource = MediaRecorder.AudioSource.MIC
private val sampleRateInHz = 16000
private val channelConfig = AudioFormat.CHANNEL_IN_MONO
private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
@Volatile
private var isRecording: Boolean = false
private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
private var recordingJob: Job? = null
private val nativeLock = Any()
private lateinit var cloudApiManager: CloudApiManager
private var videoPlayerManager: VideoPlayerManager? = null
private val segmenter = StreamingTextSegmenter(
maxLen = 30,
maxWaitMs = 600
)
private sealed class TtsQueueItem {
data class Segment(val text: String) : TtsQueueItem()
data object End : TtsQueueItem()
}
private val ttsQueue = LinkedBlockingQueue<TtsQueueItem>()
private val ttsStopped = AtomicBoolean(false)
private val ttsWorkerRunning = AtomicBoolean(false)
private val ttsPlaying = AtomicBoolean(false)
@Volatile private var ttsTotalSamplesWritten: Long = 0
private var currentTrace: TraceSession? = null
private var lastUiText: String = ""
@Volatile private var llmInFlight: Boolean = false
private var enableStreaming = true // 默认启用流式输出
// ASR 队列和工作器
private val asrQueue = Channel<Pair<FloatArray, TraceSession?>>()
private val asrWorkerRunning = AtomicBoolean(false)
override fun onRequestPermissionsResult(
    requestCode: Int,
    permissions: Array<String>,
    grantResults: IntArray
) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults)
    // The whole app is useless without microphone access: close the activity
    // when RECORD_AUDIO is refused.
    val granted = requestCode == REQUEST_RECORD_AUDIO_PERMISSION &&
        grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED
    if (!granted) {
        Log.e(TAG, "Audio record is disallowed")
        finish()
    }
}
// Entry point: wires the UI, kicks off heavy model initialization on a
// background coroutine, and installs the cloud-LLM listener that drives the
// streaming-text -> TTS pipeline.
override fun onCreate(savedInstanceState: Bundle?) {
super.onCreate(savedInstanceState)
setContentView(R.layout.activity_main)
// Initialize the dual-player manager: two stacked PlayerViews (silent / speaking).
try {
val silentPv = findViewById<PlayerView>(R.id.player_view_silent)
val speakingPv = findViewById<PlayerView>(R.id.player_view_speaking)
videoPlayerManager = VideoPlayerManager(this, silentPv, speakingPv)
// The avatar starts in the "not speaking" state.
videoPlayerManager?.setSpeaking(false)
} catch (e: Exception) {
Log.w(TAG, "PlayerViews not found or init failed: ${e.message}")
}
ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
startButton = findViewById(R.id.start_button)
stopButton = findViewById(R.id.stop_button)
textView = findViewById(R.id.my_text)
textView.movementMethod = ScrollingMovementMethod()
startButton.setOnClickListener { onStartClicked() }
stopButton.setOnClickListener { onStopClicked(userInitiated = true) }
// Wire up the streaming-output toggle.
// NOTE(review): the listener dereferences cloudApiManager, which is only
// assigned near the end of onCreate; a toggle cannot normally happen before
// onCreate returns, but confirm there is no earlier path.
try {
val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
streamingSwitch.isChecked = enableStreaming
streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
enableStreaming = isChecked
cloudApiManager.setEnableStreaming(isChecked)
Toast.makeText(this, "流式输出已${if (isChecked) "启用" else "禁用"}", Toast.LENGTH_SHORT).show()
}
} catch (e: Exception) {
Log.w(TAG, "Streaming switch not found in layout: ${e.message}")
}
// Initialize models and the AudioTrack off the UI thread to avoid ANR.
startButton.isEnabled = false
stopButton.isEnabled = false
textView.text = "初始化中…"
ioScope.launch {
try {
Log.i(TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)")
// Native VAD/ASR init shares the same lock as compute/release calls.
synchronized(nativeLock) {
initVadModel()
initSenseVoiceModel()
}
// TTS + AudioTrack are created on the main thread; buttons unlock on success.
withContext(Dispatchers.Main) {
initTtsAndAudioTrack()
textView.text = getString(R.string.hint)
startButton.isEnabled = true
stopButton.isEnabled = false
}
} catch (t: Throwable) {
Log.e(TAG, "Initialization failed: ${t.message}", t)
withContext(Dispatchers.Main) {
textView.text = "初始化失败:${t.javaClass.simpleName}: ${t.message}"
Toast.makeText(
this@MainActivity,
"初始化失败(请看 Logcat: ${t.javaClass.simpleName}",
Toast.LENGTH_LONG
).show()
startButton.isEnabled = false
stopButton.isEnabled = false
}
}
}
cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener {
private var llmFirstChunkMarked = false
override fun onLLMResponseReceived(response: String) {
currentTrace?.markLlmDone()
llmInFlight = false
// Route the finished response according to the streaming mode.
if (enableStreaming) {
// Streaming: flush whatever remains in the segmenter buffer.
for (seg in segmenter.flush()) {
enqueueTtsSegment(seg)
}
// Then signal end-of-queue so the TTS worker can finish the turn.
ttsQueue.offer(TtsQueueItem.End)
} else {
runOnUiThread {
appendToUi("${response}\n")
}
// Non-streaming: synthesize the whole response in one go.
enqueueTtsSegment(response)
// Then signal end-of-queue.
ttsQueue.offer(TtsQueueItem.End)
}
}
override fun onLLMStreamingChunkReceived(chunk: String) {
// Incremental chunks only matter in streaming mode.
if (enableStreaming) {
if (!llmFirstChunkMarked) {
llmFirstChunkMarked = true
currentTrace?.markLlmFirstChunk()
}
// NOTE(review): appendToUi is invoked here without a runOnUiThread
// wrapper; verify it is safe off the main thread.
appendToUi(chunk)
val segments = segmenter.processChunk(chunk)
for (seg in segments) {
enqueueTtsSegment(seg)
}
}
}
override fun onTTSAudioReceived(audioFilePath: String) {
// unused
}
override fun onError(errorMessage: String) {
llmInFlight = false
// NOTE(review): confirm this callback fires on the main thread; Toast and
// onStopClicked both touch UI state.
Toast.makeText(this@MainActivity, errorMessage, Toast.LENGTH_LONG).show()
onStopClicked(userInitiated = false)
}
}, applicationContext)
// Propagate the initial streaming mode.
cloudApiManager.setEnableStreaming(enableStreaming)
}
// Tears everything down: stops capture/playback, cancels coroutines, then
// releases native engines under the same lock that guards their use.
override fun onDestroy() {
super.onDestroy()
onStopClicked(userInitiated = false)
ioScope.cancel()
synchronized(nativeLock) {
// vad is lateinit; release() may throw if init never completed — swallowed.
try {
vad.release()
} catch (_: Throwable) {
}
try {
senseVoice?.deinitialize()
} catch (_: Throwable) {
}
}
try {
tts?.release()
} catch (_: Throwable) {
}
try {
videoPlayerManager?.release()
} catch (_: Throwable) {
}
}
// Starts a conversation turn: opens the microphone, resets per-turn state,
// launches the ASR worker and the capture/VAD loop.
private fun onStartClicked() {
    if (isRecording) return
    if (!initMicrophone()) {
        Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show()
        return
    }
    // Start a new trace turn
    currentTrace = TraceManager.getInstance().startNewTurn()
    currentTrace?.mark("turn_start")
    llmInFlight = false
    lastUiText = ""
    textView.text = ""
    // Reset TTS pipeline state for the new turn.
    ttsStopped.set(false)
    ttsPlaying.set(false)
    ttsTotalSamplesWritten = 0
    ttsQueue.clear()
    segmenter.reset()
    vad.reset()
    // FIX: ensureAsrWorker() was never called anywhere, so segments pushed into
    // asrQueue by processSamplesLoop suspended forever and no recognition ever
    // ran. Start (or restart) the ASR worker for this turn.
    ensureAsrWorker()
    audioRecord!!.startRecording() // non-null: initMicrophone() just succeeded
    isRecording = true
    startButton.isEnabled = false
    stopButton.isEnabled = true
    recordingJob?.cancel()
    recordingJob = ioScope.launch {
        processSamplesLoop()
    }
}
// Stops the current turn. Safe to call repeatedly and from onDestroy/error
// paths; `userInitiated` additionally closes the metrics turn.
private fun onStopClicked(userInitiated: Boolean) {
// Drop the flag first so the capture loop exits its read() promptly.
isRecording = false
try {
audioRecord?.stop()
} catch (_: Throwable) {
}
try {
audioRecord?.release()
} catch (_: Throwable) {
}
audioRecord = null
recordingJob?.cancel()
recordingJob = null
// Tell the TTS worker to stop and discard anything still queued.
ttsStopped.set(true)
ttsPlaying.set(false)
ttsTotalSamplesWritten = 0
ttsQueue.clear()
// wake worker if waiting
ttsQueue.offer(TtsQueueItem.End)
try {
track?.pause()
track?.flush()
} catch (_: Throwable) {
}
// Release per-session audio effects; initMicrophone() recreates them.
try { aec?.release() } catch (_: Throwable) {}
try { ns?.release() } catch (_: Throwable) {}
aec = null
ns = null
startButton.isEnabled = true
stopButton.isEnabled = false
// Programmatic stops keep the trace open so the TTS worker can finish it.
if (userInitiated) {
TraceManager.getInstance().endTurn()
currentTrace = null
}
}
// Loads the Silero VAD model shipped under assets/vad_model/.
private fun initVadModel() {
val config = VadModelConfig(
sileroVadModelConfig = SileroVadModelConfig(
model = "vad_model/silero_vad.onnx",
threshold = 0.5F,
minSilenceDuration = 0.25F,
minSpeechDuration = 0.25F,
windowSize = 512,
),
sampleRate = sampleRateInHz,
numThreads = 1,
provider = "cpu",
)
// NOTE(review): processSamplesLoop only calls vad.compute() and applies its
// own dual-threshold logic, so the threshold/duration values above may be
// unused at runtime — confirm against the sherpa-onnx Vad API.
vad = Vad(assetManager = application.assets, config = config)
}
// Initializes the SenseVoice RKNN ASR engine: copies model assets to internal
// storage, logs diagnostics, then loads the model. Throws IllegalStateException
// on any failure so the caller's init error path fires.
private fun initSenseVoiceModel() {
Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")
// Copy assets/sensevoice_models/* -> filesDir/sensevoice_models/*
val modelDir = copySenseVoiceAssetsToInternal()
val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath
val embeddingPath = File(modelDir, "embedding.npy").absolutePath
val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath
// Print quick diagnostics for native libs + model files
try {
val libDir = applicationInfo.nativeLibraryDir
Log.i(TAG, "nativeLibraryDir=$libDir")
try {
val names = File(libDir).list()?.joinToString(", ") ?: "(empty)"
Log.i(TAG, "nativeLibraryDir files: $names")
} catch (t: Throwable) {
Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}")
}
} catch (_: Throwable) {
}
Log.i(TAG, "SenseVoice model paths:")
Log.i(TAG, " model=$modelPath exists=${File(modelPath).exists()} size=${File(modelPath).length()}")
Log.i(TAG, " embedding=$embeddingPath exists=${File(embeddingPath).exists()} size=${File(embeddingPath).length()}")
Log.i(TAG, " bpe=$bpePath exists=${File(bpePath).exists()} size=${File(bpePath).length()}")
val t0 = SystemClock.elapsedRealtime()
val engine = try {
SenseVoiceEngineRKNN(this)
} catch (e: UnsatisfiedLinkError) {
// Most common: libsensevoiceEngine.so not packaged/built, or dependent libs missing
throw IllegalStateException("Load native libraries failed: ${e.message}", e)
}
val ok = try {
engine.loadModelDirectly(modelPath, embeddingPath, bpePath)
} catch (t: Throwable) {
throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t)
}
val dt = SystemClock.elapsedRealtime() - t0
Log.i(TAG, "SenseVoice loadModelDirectly ok=$ok costMs=$dt")
if (!ok) throw IllegalStateException("SenseVoiceEngineRKNN loadModelDirectly returned false")
senseVoice = engine
}
// Creates the sherpa-onnx VITS TTS engine and a float-PCM streaming AudioTrack
// at the model's sample rate. On TTS failure, `tts` stays null and no track is
// created (runTtsWorker then returns early).
private fun initTtsAndAudioTrack() {
try {
// sherpa-onnx VITS Chinese model directory:
// assets/tts_model/sherpa-onnx-vits-zh-ll/{model.onnx,tokens.txt,lexicon.txt,...}
val modelDir = "tts_model/sherpa-onnx-vits-zh-ll"
val modelName = "model.onnx"
val lexicon = "lexicon.txt"
val dataDir = ""
val ttsConfig = getOfflineTtsConfig(
modelDir = modelDir,
modelName = modelName,
acousticModelName = "",
vocoder = "",
voices = "",
lexicon = lexicon,
dataDir = dataDir,
dictDir = "",
// Chinese text-normalization rules (these fst files ship in the model dir)
ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst,$modelDir/new_heteronym.fst",
ruleFars = "",
numThreads = null,
isKitten = false
)
tts = OfflineTts(assetManager = application.assets, config = ttsConfig)
} catch (t: Throwable) {
Log.e(TAG, "Init TTS failed: ${t.message}", t)
tts = null
runOnUiThread {
Toast.makeText(
this,
"TTS 初始化失败:请确认 assets/tts_model/sherpa-onnx-vits-zh-ll/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst",
Toast.LENGTH_LONG
).show()
}
}
val t = tts ?: return
// The AudioTrack must match the model's output sample rate.
val sr = t.sampleRate()
val bufLength = AudioTrack.getMinBufferSize(
sr,
AudioFormat.CHANNEL_OUT_MONO,
AudioFormat.ENCODING_PCM_FLOAT
)
val attr = AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_MEDIA)
.build()
val format = AudioFormat.Builder()
.setEncoding(AudioFormat.ENCODING_PCM_FLOAT)
.setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
.setSampleRate(sr)
.build()
track = AudioTrack(
attr,
format,
bufLength,
AudioTrack.MODE_STREAM,
AudioManager.AUDIO_SESSION_ID_GENERATE
)
track?.play()
}
// True when an asset can be opened at [path]; the probe stream is closed
// immediately and any failure (missing asset, I/O error) yields false.
private fun assetExists(path: String): Boolean =
    runCatching { application.assets.open(path).use { } }.isSuccess
// Copies the SenseVoice model files from assets to internal storage (the RKNN
// loader needs real file paths). Files already present and non-empty are kept.
// Returns the destination directory.
private fun copySenseVoiceAssetsToInternal(): File {
    val outDir = File(filesDir, "sensevoice_models")
    if (!outDir.exists()) outDir.mkdirs()
    val files = arrayOf(
        "am.mvn",
        "chn_jpn_yue_eng_ko_spectok.bpe.model",
        "embedding.npy",
        "sense-voice-encoder.rknn"
    )
    for (name in files) {
        val outFile = File(outDir, name)
        // Already copied on a previous launch.
        if (outFile.exists() && outFile.length() > 0) continue
        // FIX: copy via a temp file, then rename. The original wrote straight to
        // the target, so an interrupted copy left a truncated non-empty file
        // that the length() > 0 check above would accept on every later launch.
        val tmpFile = File(outDir, "$name.tmp")
        application.assets.open("sensevoice_models/$name").use { input ->
            FileOutputStream(tmpFile).use { output ->
                input.copyTo(output)
            }
        }
        if (!tmpFile.renameTo(outFile)) {
            tmpFile.delete()
            throw IllegalStateException("Failed to move $tmpFile to $outFile")
        }
    }
    return outDir
}
// Opens the microphone at 16 kHz mono PCM16 and attaches hardware AEC/NS when
// available. Returns false (after requesting permission or releasing a broken
// recorder) when capture cannot start.
private fun initMicrophone(): Boolean {
    if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO)
        != PackageManager.PERMISSION_GRANTED
    ) {
        ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
        return false
    }
    val numBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
    val record = AudioRecord(
        audioSource,
        sampleRateInHz,
        channelConfig,
        audioFormat,
        numBytes * 2 // double the minimum buffer to reduce overrun risk
    )
    // FIX: the original never verified that the native recorder initialized;
    // calling startRecording() on an uninitialized AudioRecord throws
    // IllegalStateException. Fail gracefully instead.
    if (record.state != AudioRecord.STATE_INITIALIZED) {
        Log.e(TAG, "AudioRecord failed to initialize")
        try { record.release() } catch (_: Throwable) {}
        return false
    }
    audioRecord = record
    val sessionId = record.audioSessionId
    if (sessionId != 0) {
        // Hardware echo cancellation / noise suppression, when the device has it
        // (classes are imported at the top of the file; no need for FQNs).
        if (AcousticEchoCanceler.isAvailable()) {
            aec = AcousticEchoCanceler.create(sessionId)?.apply { enabled = true }
            Log.i(TAG, "AEC enabled=${aec?.enabled}")
        } else {
            Log.w(TAG, "AEC not available on this device")
        }
        if (NoiseSuppressor.isAvailable()) {
            ns = NoiseSuppressor.create(sessionId)?.apply { enabled = true }
            Log.i(TAG, "NS enabled=${ns?.enabled}")
        } else {
            Log.w(TAG, "NS not available on this device")
        }
    }
    return true
}
// Capture loop: reads 512-sample windows, applies RMS-based automatic gain,
// runs VAD on the processed audio, and segments speech with a dual-threshold
// hysteresis state machine. Finished segments are queued for ASR.
private suspend fun processSamplesLoop() {
// Avoid calling vad.front()/vad.pop() (native queue APIs) since it crashes on some builds.
// Use vad.compute() and implement a simple VAD segmenter in Kotlin instead.
val windowSize = 512
val buffer = ShortArray(windowSize)
// Dual-threshold (hysteresis) settings
val startThreshold = 0.2f // threshold to enter the speech state
val endThreshold = 0.15f // threshold to leave the speech state
val minSilenceSamples = (0.5f * sampleRateInHz).toInt()
val minSpeechSamples = (0.1f * sampleRateInHz).toInt()
val maxSpeechSamples = (5.0f * sampleRateInHz).toInt()
// VAD probability logging for offline analysis.
// NOTE(review): these lists grow unbounded for the whole recording session;
// fine for short sessions, confirm for long-running use.
val vadProbabilities = mutableListOf<Float>()
val vadTimestamps = mutableListOf<Long>()
val vadRMSValues = mutableListOf<Float>()
val vadSmoothedRMSValues = mutableListOf<Float>()
// Exponential-smoothing state
var smoothedRms = 0f
val alpha = 0.8f // smoothing coefficient
var inSpeech = false
var silenceSamples = 0
var speechBuf = FloatArray(0)
var speechLen = 0
var processedSpeechBuf = FloatArray(0)
var processedSpeechLen = 0
// Appends one window (raw + gain-adjusted) to the growable segment buffers,
// capping both at maxSpeechSamples.
fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) {
// append the raw audio
val needed = speechLen + chunk.size
if (speechBuf.size < needed) {
var newCap = maxOf(needed, maxOf(1024, speechBuf.size * 2))
if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
val n = FloatArray(newCap)
if (speechLen > 0) System.arraycopy(speechBuf, 0, n, 0, speechLen)
speechBuf = n
}
val copyN = minOf(chunk.size, max(0, maxSpeechSamples - speechLen))
if (copyN > 0) {
System.arraycopy(chunk, 0, speechBuf, speechLen, copyN)
speechLen += copyN
}
// append the gain-adjusted audio
val processedNeeded = processedSpeechLen + processedChunk.size
if (processedSpeechBuf.size < processedNeeded) {
var newCap = maxOf(processedNeeded, maxOf(1024, processedSpeechBuf.size * 2))
if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
val n = FloatArray(newCap)
if (processedSpeechLen > 0) System.arraycopy(processedSpeechBuf, 0, n, 0, processedSpeechLen)
processedSpeechBuf = n
}
val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen))
if (processedCopyN > 0) {
System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN)
processedSpeechLen += processedCopyN
}
}
// Ships the accumulated segment to ASR (or drops it) and resets the state machine.
suspend fun finalizeSegmentIfAny() {
if (speechLen < minSpeechSamples) {
speechLen = 0
processedSpeechLen = 0
inSpeech = false
silenceSamples = 0
return
}
// Drop the segment while TTS is playing or an LLM call is in flight (echo guard).
if (ttsPlaying.get() || llmInFlight) {
speechLen = 0
processedSpeechLen = 0
inSpeech = false
silenceSamples = 0
return
}
val originalSeg = speechBuf.copyOf(speechLen)
val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen)
speechLen = 0
processedSpeechLen = 0
inSpeech = false
silenceSamples = 0
// Queue the segment for asynchronous ASR.
// NOTE(review): asrQueue is declared Channel<Pair<FloatArray, TraceSession?>>
// but this sends Pair(FloatArray, FloatArray) — the second element's type does
// not match the declaration; confirm the intended channel element type (the
// Live2D variant passes both audio buffers).
asrQueue.send(Pair(originalSeg, processedSeg))
}
while (isRecording && ioScope.coroutineContext.isActive) {
val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break
if (ret <= 0) continue
if (ret != windowSize) continue
// Convert PCM16 to floats in [-1, 1).
val chunk = FloatArray(ret) { buffer[it] / 32768.0f }
// RMS of the current chunk
val rms = calculateRMS(chunk)
// Exponential smoothing (seeded with the first non-zero reading).
smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms
// Dynamic gain toward a target RMS of 0.1 (~ -20 dBFS).
val targetRMS = 0.1f
var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f
// Clamp the gain to avoid clipping from over-amplification.
gainFactor = gainFactor.coerceIn(0.1f, 10.0f)
// Apply the gain with a hard limiter.
val processedChunk = FloatArray(chunk.size) {
val value = chunk[it] * gainFactor
// hard-limit to [-1, 1] to avoid clipping
if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value
}
// VAD runs on the gain-adjusted audio, under the shared native lock.
val prob = synchronized(nativeLock) { vad.compute(processedChunk) }
// Log probability, timestamp, raw and smoothed RMS for later analysis.
vadProbabilities.add(prob)
vadTimestamps.add(System.currentTimeMillis())
vadRMSValues.add(rms)
vadSmoothedRMSValues.add(smoothedRms)
// Dual-threshold state machine
if (!inSpeech && prob >= startThreshold) {
// enter speech
inSpeech = true
silenceSamples = 0
appendSpeech(chunk, processedChunk)
} else if (inSpeech && prob <= endThreshold) {
// count silence samples
silenceSamples += ret
if (silenceSamples >= minSilenceSamples) {
// leave speech
finalizeSegmentIfAny()
} else {
// keep the trailing audio
appendSpeech(chunk, processedChunk)
}
} else if (inSpeech) {
// still in speech: keep appending
appendSpeech(chunk, processedChunk)
silenceSamples = 0 // reset silence counter
if (speechLen >= maxSpeechSamples) {
finalizeSegmentIfAny()
}
}
// Not in speech and below the start threshold: nothing to do.
// Time-based forced segmentation (keeps TTS latency low when the LLM emits
// long runs without punctuation).
val forced = segmenter.maybeForceByTime()
for (seg in forced) enqueueTtsSegment(seg)
}
// flush last partial segment
finalizeSegmentIfAny()
// Persist the VAD log for offline analysis/plotting.
saveVadData(vadTimestamps, vadProbabilities, vadRMSValues, vadSmoothedRMSValues)
}
/**
 * Persists the per-window VAD log as CSV under filesDir/vad_data/ for offline
 * analysis and plotting. Best-effort: any I/O error is logged and swallowed.
 */
private fun saveVadData(timestamps: List<Long>, probabilities: List<Float>, rmsValues: List<Float>, smoothedRmsValues: List<Float>) {
    try {
        val vadDataDir = File(filesDir, "vad_data")
        if (!vadDataDir.exists()) {
            vadDataDir.mkdirs()
        }
        // Unique file name per recording session.
        val outputFile = File(vadDataDir, "vad_data_${System.currentTimeMillis()}.csv")
        // Robustness: iterate only over indices present in ALL four lists, so a
        // size mismatch can never throw IndexOutOfBoundsException mid-write.
        val rows = minOf(timestamps.size, probabilities.size, rmsValues.size, smoothedRmsValues.size)
        // Buffered character writer instead of raw FileOutputStream byte writes.
        outputFile.bufferedWriter().use { writer ->
            writer.write("timestamp,probability,rms,smoothed_rms\n")
            for (i in 0 until rows) {
                writer.write("${timestamps[i]},${probabilities[i]},${rmsValues[i]},${smoothedRmsValues[i]}\n")
            }
        }
        Log.d(TAG, "Saved VAD data to: ${outputFile.absolutePath}")
    } catch (e: Exception) {
        Log.e(TAG, "Error saving VAD data: ${e.message}")
    }
}
// Strips SenseVoice control tokens (e.g. <|zh|>, <|NEUTRAL|>, <|Speech|>,
// <|woitn|>) plus stray '>'-like characters, then collapses whitespace.
private fun removeTokens(text: String): String =
    text
        .replace(Regex("<\\|[^>]+\\|>"), "")
        .replace(Regex("[>>≥≫]"), "")
        .trim()
        .replace(Regex("\\s+"), " ")
// Queues one text segment for synthesis, stripping trailing punctuation so the
// TTS engine does not vocalize it, and makes sure the worker is running.
private fun enqueueTtsSegment(seg: String) {
// NOTE(review): some char literals below render as empty here — presumably
// full-width punctuation lost in transit; verify against the original encoding.
val cleanedSeg = seg.trimEnd('.', '。', '!', '', '?', '', ',', '', ';', '', ':', '')
currentTrace?.markTtsRequestEnqueued()
ttsQueue.offer(TtsQueueItem.Segment(cleanedSeg))
ensureTtsWorker()
}
// Launches the TTS worker coroutine at most once; the guard flag is cleared
// when the worker exits so a later enqueue can spin up a fresh one.
private fun ensureTtsWorker() {
    val acquired = ttsWorkerRunning.compareAndSet(false, true)
    if (acquired) {
        ioScope.launch {
            try {
                runTtsWorker()
            } finally {
                ttsWorkerRunning.set(false)
            }
        }
    }
}
// Launches the ASR worker coroutine at most once; the guard flag is cleared
// when the worker exits so it can be restarted.
private fun ensureAsrWorker() {
    val acquired = asrWorkerRunning.compareAndSet(false, true)
    if (acquired) {
        ioScope.launch {
            try {
                runAsrWorker()
            } finally {
                asrWorkerRunning.set(false)
            }
        }
    }
}
// TTS worker: blocks on the segment queue, synthesizes each segment and writes
// float PCM to the AudioTrack. Exits on an End item or when ttsStopped is set.
// NOTE(review): LinkedBlockingQueue.take() blocks a Dispatchers.IO thread for
// the worker's whole lifetime — acceptable for IO, but confirm intended.
private fun runTtsWorker() {
val t = tts ?: return
val audioTrack = track ?: return
var firstAudioMarked = false
var isFirstSegment = true
while (true) {
val item = ttsQueue.take()
if (ttsStopped.get()) break
when (item) {
is TtsQueueItem.Segment -> {
// Mark "speaking" so VAD/ASR suppress the echo of our own audio.
ttsPlaying.set(true)
runOnUiThread { videoPlayerManager?.setSpeaking(true) }
val trace = currentTrace
trace?.markTtsSynthesisStart()
Log.d(TAG, "TTS started: processing segment '${item.text}'")
runOnUiThread {
appendToUi("\n[TTS] 开始合成...\n")
}
val startMs = System.currentTimeMillis()
var firstPcmMarked = false
// Reset the track once per turn so playbackHeadPosition starts at 0.
if (isFirstSegment) {
try {
audioTrack.pause()
audioTrack.flush()
audioTrack.play()
} catch (_: Throwable) {
}
isFirstSegment = false
}
t.generateWithCallback(
text = item.text,
sid = 2, // speaker id; change to pick a different voice
speed = 1.0f
) { samples ->
// Returning 0 aborts synthesis; 1 continues.
if (ttsStopped.get()) return@generateWithCallback 0
if (!firstPcmMarked && samples.isNotEmpty()) {
firstPcmMarked = true
trace?.markTtsFirstPcmReady()
}
if (!firstAudioMarked && samples.isNotEmpty()) {
firstAudioMarked = true
trace?.markTtsFirstAudioPlay()
}
// Blocking write paces synthesis to playback.
audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING)
ttsTotalSamplesWritten += samples.size
1
}
val ttsMs = System.currentTimeMillis() - startMs
trace?.addDuration("tts_segment_ms_total", ttsMs)
}
TtsQueueItem.End -> {
// Drain the ASR queue: pending segments are likely echo captured
// during playback.
while (asrQueue.tryReceive().isSuccess) { }
waitForPlaybackComplete(audioTrack)
val ttsCompleteTime = System.currentTimeMillis()
// Update the UI from the main thread.
runOnUiThread {
appendToUi("\n[LOG] TTS completed at: ${ttsCompleteTime}\n")
}
ttsPlaying.set(false)
runOnUiThread { videoPlayerManager?.setSpeaking(false) }
ttsTotalSamplesWritten = 0
isFirstSegment = true
// Close the metrics turn now that audio has fully drained.
currentTrace?.markTtsDone()
TraceManager.getInstance().endTurn()
currentTrace = null
break
}
}
}
}
// Busy-waits (20 ms polls) until the AudioTrack's playback head reaches the
// total number of samples written this turn, with a duration-based timeout.
// NOTE(review): playbackHeadPosition is a signed 32-bit frame counter that can
// wrap on very long sessions — confirm acceptable for expected turn lengths.
private fun waitForPlaybackComplete(audioTrack: AudioTrack) {
val totalSamples = ttsTotalSamplesWritten
if (totalSamples <= 0) return
val sampleRate = audioTrack.sampleRate
// Expected playback duration plus a 2 s safety margin.
val timeoutMs = (totalSamples * 1000 / sampleRate) + 2000
val startTime = System.currentTimeMillis()
while (true) {
if (ttsStopped.get()) break
val playbackPos = audioTrack.playbackHeadPosition.toLong()
if (playbackPos >= totalSamples) {
break
}
if (System.currentTimeMillis() - startTime > timeoutMs) {
Log.w(TAG, "waitForPlaybackComplete timeout, pos=$playbackPos, total=$totalSamples")
break
}
Thread.sleep(20)
}
// Fixed extra 1000 ms wait to let the audio sink drain completely.
Thread.sleep(1000)
}
// ASR worker: consumes speech segments from asrQueue, transcribes them with
// SenseVoice, filters junk results, and forwards the text to the LLM.
private suspend fun runAsrWorker() {
while (ioScope.coroutineContext.isActive) {
// NOTE(review): the declared channel type makes `trace` a TraceSession?, but
// processSamplesLoop sends a Pair of two FloatArrays — confirm which is
// intended; only `seg` (the first element) is ever transcribed here.
val (seg, trace) = try {
asrQueue.receive()
} catch (_: Throwable) {
break
}
// Allow only one in-flight LLM request at a time to avoid pile-ups/races.
// Skip ASR while TTS is playing to avoid transcribing our own audio.
if (llmInFlight || ttsPlaying.get()) continue
trace?.markASRStart()
Log.d(TAG, "ASR started: processing audio segment")
withContext(Dispatchers.Main) {
appendToUi("\n[ASR] 开始识别...\n")
}
// Native transcription under the shared lock (same lock as VAD/init/release).
val raw = synchronized(nativeLock) {
val e = senseVoice
if (e == null || !e.isInitialized) "" else e.transcribeBuffer(seg)
}
val text = removeTokens(raw)
// Result filtering
if (text.isBlank()) continue
// Drop a lone 'i' (common ASR artifact).
if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
Log.d(TAG, "ASR segment skipped: single 'i'")
continue
}
// Drop overly long (>50 chars) results.
if (text.length > 50) {
Log.d(TAG, "ASR segment skipped: too long (${text.length} chars)")
continue
}
trace?.markASREnd()
withContext(Dispatchers.Main) {
appendToUi("\n\n[ASR] ${text}\n")
}
trace?.markRecordingDone()
// NOTE(review): markLlmResponseReceived() is called BEFORE the LLM request is
// issued below — the mark looks misnamed or misplaced; confirm against
// TraceSession's intended event ordering.
trace?.markLlmResponseReceived()
if (BuildConfig.LLM_API_KEY.isBlank()) {
withContext(Dispatchers.Main) {
Toast.makeText(
this@MainActivity,
"未配置 LLM_API_KEY在 local.properties 或 gradle.properties 里设置)",
Toast.LENGTH_LONG
).show()
}
continue
}
llmInFlight = true
cloudApiManager.callLLM(text)
}
}
// Appends text to the transcript TextView.
// FIX: this method is called both from the main thread and directly from
// background callbacks (e.g. onLLMStreamingChunkReceived); mutating a TextView
// off the main thread is illegal. Hop to the UI thread here — runOnUiThread
// executes inline when already on the main thread, so existing pre-wrapped
// callers keep their ordering.
private fun appendToUi(s: String) {
    runOnUiThread {
        lastUiText += s
        textView.text = lastUiText
    }
}
}

View File

@@ -0,0 +1,20 @@
package com.digitalperson
import android.content.Intent
import android.os.Bundle
import androidx.appcompat.app.AppCompatActivity
import com.digitalperson.config.AppConfig
// Trampoline activity: routes launch to the Live2D chat UI or the legacy video
// UI based on the build-time avatar configuration, then removes itself from the
// back stack.
class EntryActivity : AppCompatActivity() {
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        val target =
            if (AppConfig.Avatar.USE_LIVE2D) Live2DChatActivity::class.java
            else MainActivity::class.java
        startActivity(Intent(this, target))
        finish()
    }
}

View File

@@ -0,0 +1,418 @@
package com.digitalperson
import android.Manifest
import android.content.pm.PackageManager
import android.os.Bundle
import android.util.Log
import android.widget.Toast
import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat
import com.digitalperson.cloud.CloudApiManager
import com.digitalperson.audio.AudioProcessor
import com.digitalperson.vad.VadManager
import com.digitalperson.asr.AsrManager
import com.digitalperson.tts.TtsManager
import com.digitalperson.ui.Live2DUiManager
import com.digitalperson.config.AppConfig
import com.digitalperson.metrics.TraceManager
import com.digitalperson.metrics.TraceSession
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.isActive
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
class Live2DChatActivity : AppCompatActivity() {
private lateinit var uiManager: Live2DUiManager
private lateinit var vadManager: VadManager
private lateinit var asrManager: AsrManager
private lateinit var ttsManager: TtsManager
private lateinit var audioProcessor: AudioProcessor
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
@Volatile
private var isRecording: Boolean = false
private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
private var recordingJob: Job? = null
private val nativeLock = Any()
private lateinit var cloudApiManager: CloudApiManager
private val segmenter = StreamingTextSegmenter(
maxLen = AppConfig.Tts.MAX_LEN,
maxWaitMs = AppConfig.Tts.MAX_WAIT_MS
)
private var currentTrace: TraceSession? = null
@Volatile private var llmInFlight: Boolean = false
private var enableStreaming = false
override fun onRequestPermissionsResult(
    requestCode: Int,
    permissions: Array<String>,
    grantResults: IntArray
) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults)
    // Shut the activity down when microphone access is refused.
    val granted = requestCode == AppConfig.REQUEST_RECORD_AUDIO_PERMISSION &&
        grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED
    if (!granted) {
        Log.e(AppConfig.TAG, "Audio record is disallowed")
        finish()
    }
}
// Entry point for the Live2D chat UI: wires views and managers, then runs the
// heavy model initialization on a background coroutine.
override fun onCreate(savedInstanceState: Bundle?) {
    super.onCreate(savedInstanceState)
    setContentView(R.layout.activity_live2d_chat)
    uiManager = Live2DUiManager(this)
    uiManager.initViews(
        textViewId = R.id.my_text,
        scrollViewId = R.id.scroll_view,
        startButtonId = R.id.start_button,
        stopButtonId = R.id.stop_button,
        silentPlayerViewId = 0,
        speakingPlayerViewId = 0,
        live2dViewId = R.id.live2d_view
    )
    uiManager.setStartButtonListener { onStartClicked() }
    uiManager.setStopButtonListener { onStopClicked(userInitiated = true) }
    ActivityCompat.requestPermissions(this, permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)
    // Streaming-output toggle (optional in the layout).
    try {
        val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
        streamingSwitch.isChecked = enableStreaming
        streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
            enableStreaming = isChecked
            cloudApiManager.setEnableStreaming(isChecked)
            uiManager.showToast("流式输出已${if (isChecked) "启用" else "禁用"}")
        }
    } catch (e: Exception) {
        Log.w(AppConfig.TAG, "Streaming switch not found in layout: ${e.message}")
    }
    uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
    uiManager.setText("初始化中…")
    audioProcessor = AudioProcessor(this)
    asrManager = AsrManager(this)
    asrManager.setAudioProcessor(audioProcessor)
    asrManager.setCallback(createAsrCallback())
    vadManager = VadManager(this)
    vadManager.setCallback(createVadCallback())
    // FIX: ttsManager and cloudApiManager were previously assigned AFTER the
    // init coroutine below was launched, but the coroutine dereferences
    // ttsManager — a fast dispatch could hit an
    // UninitializedPropertyAccessException. Construct every collaborator before
    // any background work starts.
    ttsManager = TtsManager(this)
    ttsManager.setCallback(createTtsCallback())
    cloudApiManager = CloudApiManager(createCloudApiListener(), applicationContext)
    cloudApiManager.setEnableStreaming(enableStreaming)
    ioScope.launch {
        try {
            Log.i(AppConfig.TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)")
            // Native VAD/ASR init shares the lock used for compute/release.
            synchronized(nativeLock) {
                vadManager.initVadModel()
                asrManager.initSenseVoiceModel()
            }
            val ttsOk = ttsManager.initTtsAndAudioTrack()
            withContext(Dispatchers.Main) {
                if (!ttsOk) {
                    uiManager.showToast(
                        "TTS 初始化失败:请确认 assets/${AppConfig.Tts.MODEL_DIR}/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst",
                        Toast.LENGTH_LONG
                    )
                }
                uiManager.setText(getString(R.string.hint))
                uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
            }
        } catch (t: Throwable) {
            Log.e(AppConfig.TAG, "Initialization failed: ${t.message}", t)
            withContext(Dispatchers.Main) {
                uiManager.setText("初始化失败:${t.javaClass.simpleName}: ${t.message}")
                uiManager.showToast("初始化失败(请看 Logcat: ${t.javaClass.simpleName}", Toast.LENGTH_LONG)
                uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
            }
        }
    }
    Log.d(AppConfig.TAG, "Pre-starting ASR worker")
    ioScope.launch {
        asrManager.runAsrWorker()
    }
}
// Builds the AsrManager callback: trace marking, UI echo of results, skip
// conditions (TTS playing / LLM in flight), and the LLM hand-off.
private fun createAsrCallback() = object : AsrManager.AsrCallback {
override fun onAsrStarted() {
currentTrace?.markASRStart()
runOnUiThread {
uiManager.appendToUi("\n[ASR] 开始识别...\n")
}
}
override fun onAsrResult(text: String) {
currentTrace?.markASREnd()
runOnUiThread {
uiManager.appendToUi("\n\n[ASR] ${text}\n")
}
currentTrace?.markRecordingDone()
// NOTE(review): marked before the LLM call is actually made (see
// onLlmCalled below) — confirm against TraceSession's event ordering.
currentTrace?.markLlmResponseReceived()
}
override fun onAsrSkipped(reason: String) {
Log.d(AppConfig.TAG, "ASR segment skipped: $reason")
}
// Suppress ASR while our own TTS audio is playing (echo guard).
override fun shouldSkipAsr(): Boolean = ttsManager.isPlaying()
override fun isLlmInFlight(): Boolean = llmInFlight
override fun onLlmCalled(text: String) {
llmInFlight = true
Log.d(AppConfig.TAG, "Calling LLM with text: $text")
cloudApiManager.callLLM(text)
}
}
// Builds the VadManager callback: forwards finished speech segments to the ASR
// queue and suppresses capture while TTS is speaking or an LLM call is pending.
private fun createVadCallback() = object : VadManager.VadCallback {
    override fun onSpeechSegmentReady(originalAudio: FloatArray, processedAudio: FloatArray) {
        Log.d(AppConfig.TAG, "Sending audio segment to ASR queue, size: ${processedAudio.size}")
        asrManager.enqueueAudioSegment(originalAudio, processedAudio)
    }
    override fun shouldSkipProcessing(): Boolean {
        val speaking = ttsManager.isPlaying()
        return speaking || llmInFlight
    }
}
// Builds the cloud-LLM listener: routes streaming chunks / complete responses
// into the text segmenter and the TTS queue, and stops the turn on errors.
private fun createCloudApiListener() = object : CloudApiManager.CloudApiListener {
private var llmFirstChunkMarked = false
override fun onLLMResponseReceived(response: String) {
currentTrace?.markLlmDone()
llmInFlight = false
if (enableStreaming) {
// Streaming: flush whatever remains in the segmenter buffer.
for (seg in segmenter.flush()) {
ttsManager.enqueueSegment(seg)
}
ttsManager.enqueueEnd()
} else {
runOnUiThread {
uiManager.appendToUi("${response}\n")
}
// Non-streaming: synthesize the whole response in one go.
ttsManager.enqueueSegment(response)
ttsManager.enqueueEnd()
}
}
override fun onLLMStreamingChunkReceived(chunk: String) {
if (enableStreaming) {
if (!llmFirstChunkMarked) {
llmFirstChunkMarked = true
currentTrace?.markLlmFirstChunk()
}
// NOTE(review): appendToUi is invoked here without a runOnUiThread
// wrapper — verify Live2DUiManager.appendToUi is main-thread-safe.
uiManager.appendToUi(chunk)
val segments = segmenter.processChunk(chunk)
for (seg in segments) {
ttsManager.enqueueSegment(seg)
}
}
}
override fun onTTSAudioReceived(audioFilePath: String) {}
override fun onError(errorMessage: String) {
llmInFlight = false
uiManager.showToast(errorMessage, Toast.LENGTH_LONG)
onStopClicked(userInitiated = false)
}
}
// Bridges TtsManager worker events back into the activity: UI updates are posted
// to the main thread, trace marks are forwarded to the current TraceSession.
private fun createTtsCallback() = object : TtsManager.TtsCallback {
    override fun onTtsStarted(text: String) {
        runOnUiThread {
            uiManager.appendToUi("\n[TTS] 开始合成...\n")
        }
    }
    override fun onTtsCompleted() {
        runOnUiThread {
            uiManager.appendToUi("\n[LOG] TTS completed at: ${System.currentTimeMillis()}\n")
        }
    }
    override fun onTtsSegmentCompleted(durationMs: Long) {}
    // The TTS worker treats "recording stopped" as its stop signal.
    override fun isTtsStopped(): Boolean = !isRecording
    // Drop any ASR segments captured while TTS was speaking (self-echo).
    override fun onClearAsrQueue() {
        asrManager.clearQueue()
    }
    // Drives the avatar's talking animation.
    override fun onSetSpeaking(speaking: Boolean) {
        uiManager.setSpeaking(speaking)
    }
    override fun getCurrentTrace(): TraceSession? = currentTrace
    // Trace forwarding shims — each delegates one latency mark to the active trace.
    override fun onTraceMarkTtsRequestEnqueued() {
        currentTrace?.markTtsRequestEnqueued()
    }
    override fun onTraceMarkTtsSynthesisStart() {
        currentTrace?.markTtsSynthesisStart()
    }
    override fun onTraceMarkTtsFirstPcmReady() {
        currentTrace?.markTtsFirstPcmReady()
    }
    override fun onTraceMarkTtsFirstAudioPlay() {
        currentTrace?.markTtsFirstAudioPlay()
    }
    override fun onTraceMarkTtsDone() {
        currentTrace?.markTtsDone()
    }
    override fun onTraceAddDuration(name: String, value: Long) {
        currentTrace?.addDuration(name, value)
    }
    // End of a full conversational turn: close out the trace.
    override fun onEndTurn() {
        TraceManager.getInstance().endTurn()
        currentTrace = null
    }
}
override fun onDestroy() {
    super.onDestroy()
    // Stop the pipeline first so no worker touches native engines during release.
    onStopClicked(userInitiated = false)
    ioScope.cancel()
    // VAD and ASR share native resources; release them under the same lock used
    // by the workers to avoid a use-after-free in native code.
    synchronized(nativeLock) {
        try { vadManager.release() } catch (_: Throwable) {}
        try { asrManager.release() } catch (_: Throwable) {}
    }
    // Best-effort teardown: ignore failures so one broken manager cannot block the rest.
    try { ttsManager.release() } catch (_: Throwable) {}
    try { uiManager.release() } catch (_: Throwable) {}
    try { audioProcessor.release() } catch (_: Throwable) {}
}
override fun onResume() {
    super.onResume()
    // Resume the avatar's GLSurfaceView rendering.
    uiManager.onResume()
}
override fun onPause() {
    // Pause GL rendering before the superclass tears the window down.
    uiManager.onPause()
    super.onPause()
}
// Starts a new conversational turn: mic capture, VAD loop, and a fresh trace.
// The reset order matters: trace -> TTS -> segmenter -> VAD -> recording.
private fun onStartClicked() {
    Log.d(AppConfig.TAG, "onStartClicked called")
    if (isRecording) {
        Log.d(AppConfig.TAG, "Already recording, returning")
        return
    }
    // initMicrophone also triggers the runtime permission request when missing.
    if (!audioProcessor.initMicrophone(permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)) {
        uiManager.showToast("麦克风初始化失败/无权限")
        return
    }
    currentTrace = TraceManager.getInstance().startNewTurn()
    currentTrace?.mark("turn_start")
    llmInFlight = false
    uiManager.clearText()
    ttsManager.reset()
    ttsManager.setCurrentTrace(currentTrace)
    segmenter.reset()
    vadManager.reset()
    audioProcessor.startRecording()
    // Flip the flag before launching so the loop's while-condition sees it set.
    isRecording = true
    uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = true)
    Log.d(AppConfig.TAG, "Starting processSamplesLoop coroutine")
    recordingJob?.cancel()
    recordingJob = ioScope.launch {
        processSamplesLoop()
    }
    Log.d(AppConfig.TAG, "onStartClicked completed")
}
// Stops capture and playback. When the user pressed Stop (userInitiated), the
// current trace is also closed; programmatic stops (errors, onDestroy) leave the
// trace to be closed by its normal end-of-turn path.
private fun onStopClicked(userInitiated: Boolean) {
    // Clear the flag first: processSamplesLoop and isTtsStopped() both poll it.
    isRecording = false
    audioProcessor.stopRecording()
    recordingJob?.cancel()
    recordingJob = null
    ttsManager.stop()
    uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
    if (userInitiated) {
        TraceManager.getInstance().endTurn()
        currentTrace = null
    }
}
/**
 * Main capture loop: reads fixed-size windows from the microphone and feeds
 * them through gain normalization into VAD. While TTS is playing, audio is
 * still drained from the mic (so the hardware buffer does not back up) but is
 * discarded and VAD state is cleared to avoid self-capture.
 */
private suspend fun processSamplesLoop() {
    Log.d(AppConfig.TAG, "processSamplesLoop started")
    val windowSize = AppConfig.WINDOW_SIZE
    val buffer = ShortArray(windowSize)
    var loopCount = 0
    // NOTE(review): ioScope.coroutineContext.isActive checks the SCOPE's job, not
    // this coroutine's — cancelling recordingJob alone is only observed via
    // isRecording. That flag is always cleared alongside the cancel, so the loop
    // still exits; confirm before relying on cancellation alone.
    while (isRecording && ioScope.coroutineContext.isActive) {
        loopCount++
        if (loopCount % 100 == 0) {
            Log.d(AppConfig.TAG, "processSamplesLoop running, loopCount=$loopCount, ttsPlaying=${ttsManager.isPlaying()}")
        }
        if (ttsManager.isPlaying()) {
            if (vadManager.isInSpeech()) {
                Log.d(AppConfig.TAG, "TTS playing, resetting VAD state")
                vadManager.clearState()
            }
            // Drain and discard mic input during playback. (Removed a dead
            // `if (ret <= 0) continue` that was immediately followed by an
            // unconditional `continue` — the read result is irrelevant here.)
            audioProcessor.readAudio(buffer)
            continue
        }
        val ret = audioProcessor.readAudio(buffer)
        if (ret <= 0) continue
        // Only full windows are valid VAD input; drop short reads.
        if (ret != windowSize) continue
        val chunk = audioProcessor.convertShortToFloat(buffer)
        val processedChunk = audioProcessor.applyGain(chunk)
        val result = vadManager.processAudioChunk(chunk, processedChunk)
        if (vadManager.vadComputeCount % 100 == 0) {
            Log.d(AppConfig.TAG, "VAD result: $result, inSpeech=${vadManager.isInSpeech()}")
        }
        if (loopCount % 1000 == 0) {
            Log.d(AppConfig.TAG, "VAD status: inSpeech=${vadManager.isInSpeech()}, speechLen=${vadManager.getSpeechLength()}")
        }
        // Time-based forced segmentation keeps TTS latency bounded on long utterances.
        val forced = segmenter.maybeForceByTime()
        for (seg in forced) ttsManager.enqueueSegment(seg)
    }
    // Flush any partial speech segment held by VAD before exiting.
    vadManager.forceFinalize()
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,223 @@
package com.digitalperson.asr
import android.content.Context
import android.os.SystemClock
import android.util.Log
import com.digitalperson.BuildConfig
import com.digitalperson.audio.AudioProcessor
import com.digitalperson.config.AppConfig
import com.digitalperson.engine.SenseVoiceEngineRKNN
import com.digitalperson.util.FileHelper
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.currentCoroutineContext
import kotlinx.coroutines.isActive
import kotlinx.coroutines.withContext
import java.io.File
/**
 * Owns the SenseVoice RKNN speech-recognition engine and a queue-driven ASR worker.
 *
 * VAD-produced audio segments are enqueued via [enqueueAudioSegment]; [runAsrWorker]
 * consumes them, transcribes, strips control tokens, filters unusable text, and
 * forwards accepted transcripts to the registered [AsrCallback] (which may then
 * trigger an LLM call).
 */
class AsrManager(private val context: Context) {
    companion object {
        private const val TAG = "AsrManager"
    }
    // Native engine; null until initSenseVoiceModel() succeeds.
    private var senseVoice: SenseVoiceEngineRKNN? = null
    // Serializes calls into native code against release() (avoids use-after-free).
    private val nativeLock = Any()
    // Unbounded queue of (original, gain-processed) segments awaiting transcription.
    private val asrQueue = Channel<Pair<FloatArray, FloatArray>>(capacity = Channel.UNLIMITED)
    private var audioProcessor: AudioProcessor? = null

    /** Host hooks for observing ASR progress and gating downstream LLM calls. */
    interface AsrCallback {
        fun onAsrStarted()
        fun onAsrResult(text: String)
        fun onAsrSkipped(reason: String)
        fun shouldSkipAsr(): Boolean
        fun isLlmInFlight(): Boolean
        fun onLlmCalled(text: String)
    }
    private var callback: AsrCallback? = null

    fun setCallback(callback: AsrCallback) {
        this.callback = callback
    }

    /** Optional processor used only for debug WAV dumps in [saveAsrAudio]. */
    fun setAudioProcessor(audioProcessor: AudioProcessor) {
        this.audioProcessor = audioProcessor
    }

    /**
     * Copies model assets to local storage and loads the SenseVoice RKNN engine.
     * Heavy work (file copy + native model load) — call off the main thread.
     *
     * @return true when the engine is ready for [runAsrWorker]; false on any failure.
     */
    fun initSenseVoiceModel(): Boolean {
        return try {
            Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")
            val modelDir = FileHelper.copySenseVoiceAssets(context)
            val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath
            val embeddingPath = File(modelDir, "embedding.npy").absolutePath
            val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath
            // Diagnostics only: log the native library dir contents so a later
            // UnsatisfiedLinkError is debuggable from logcat.
            try {
                val libDir = context.applicationInfo.nativeLibraryDir
                Log.i(TAG, "nativeLibraryDir=$libDir")
                try {
                    val names = File(libDir).list()?.joinToString(", ") ?: "(empty)"
                    Log.i(TAG, "nativeLibraryDir files: $names")
                } catch (t: Throwable) {
                    Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}")
                }
            } catch (_: Throwable) {
            }
            Log.i(TAG, "SenseVoice model paths:")
            Log.i(TAG, " model=$modelPath exists=${File(modelPath).exists()} size=${File(modelPath).length()}")
            Log.i(TAG, " embedding=$embeddingPath exists=${File(embeddingPath).exists()} size=${File(embeddingPath).length()}")
            Log.i(TAG, " bpe=$bpePath exists=${File(bpePath).exists()} size=${File(bpePath).length()}")
            val t0 = SystemClock.elapsedRealtime()
            val engine = try {
                SenseVoiceEngineRKNN(context)
            } catch (e: UnsatisfiedLinkError) {
                // Re-wrap so the outer catch logs a single, uniform failure path.
                throw IllegalStateException("Load native libraries failed: ${e.message}", e)
            }
            val ok = try {
                engine.loadModelDirectly(modelPath, embeddingPath, bpePath)
            } catch (t: Throwable) {
                throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t)
            }
            val dt = SystemClock.elapsedRealtime() - t0
            Log.i(TAG, "SenseVoice loadModelDirectly ok=$ok costMs=$dt")
            if (!ok) throw IllegalStateException("SenseVoiceEngineRKNN loadModelDirectly returned false")
            senseVoice = engine
            true
        } catch (e: Exception) {
            Log.e(TAG, "Failed to initialize SenseVoice model: ${e.message}", e)
            false
        }
    }

    /** Queues one speech segment for transcription; never blocks (unbounded channel). */
    fun enqueueAudioSegment(originalAudio: FloatArray, processedAudio: FloatArray) {
        // trySend never throws; on an UNLIMITED channel it only fails when the
        // channel is closed. The previous try/catch could never fire.
        val result = asrQueue.trySend(Pair(originalAudio, processedAudio))
        if (result.isFailure) {
            Log.e(TAG, "Failed to enqueue audio segment: ${result.exceptionOrNull()?.message}")
        }
    }

    /** Drops all pending segments (e.g. when TTS playback begins). */
    fun clearQueue() {
        while (asrQueue.tryReceive().isSuccess) { }
    }

    /**
     * Long-running worker loop: take a segment, transcribe it under [nativeLock],
     * clean and filter the text, then dispatch result / LLM call via the callback.
     * Runs until the channel fails or the calling coroutine is cancelled.
     */
    suspend fun runAsrWorker() {
        Log.d(TAG, "ASR worker started")
        try {
            while (currentCoroutineContext().isActive) {
                val (originalSeg, processedSeg) = try {
                    Log.d(TAG, "ASR worker waiting for audio segment")
                    asrQueue.receive()
                } catch (e: Throwable) {
                    Log.e(TAG, "ASR worker receive failed: ${e.message}")
                    break
                }
                Log.d(TAG, "ASR worker received audio segment, size=${processedSeg.size}")
                // Drop segments captured while TTS is speaking or an LLM call is pending.
                if (callback?.shouldSkipAsr() == true || callback?.isLlmInFlight() == true) {
                    Log.d(TAG, "ASR worker skipping segment: shouldSkip=${callback?.shouldSkipAsr()}, llmInFlight=${callback?.isLlmInFlight()}")
                    continue
                }
                callback?.onAsrStarted()
                Log.d(TAG, "ASR started: processing audio segment")
                saveAsrAudio(originalSeg, processedSeg)
                val raw = synchronized(nativeLock) {
                    // Renamed locals: the old `val e = senseVoice` was shadowed by the
                    // inner `catch (e: Throwable)`, which compiled but was confusing.
                    val engine = senseVoice
                    if (engine == null || !engine.isInitialized) {
                        Log.e(TAG, "ASR failed: SenseVoice engine not initialized")
                        ""
                    } else {
                        try {
                            engine.transcribeBuffer(processedSeg)
                        } catch (t: Throwable) {
                            Log.e(TAG, "ASR transcribe failed: ${t.message}")
                            ""
                        }
                    }
                }
                Log.d(TAG, "ASR raw result: $raw")
                val text = removeTokens(raw)
                val filterResult = filterText(text)
                if (filterResult != null) {
                    callback?.onAsrSkipped(filterResult)
                    continue
                }
                callback?.onAsrResult(text)
                if (BuildConfig.LLM_API_KEY.isBlank()) {
                    Log.w(TAG, "LLM API Key is not configured")
                    continue
                }
                callback?.onLlmCalled(text)
            }
        } catch (e: Throwable) {
            Log.e(TAG, "ASR worker error: ${e.message}", e)
        } finally {
            Log.d(TAG, "ASR worker exiting")
        }
    }

    /** Releases the native engine and drops any queued audio. Safe to call repeatedly. */
    fun release() {
        try {
            senseVoice?.deinitialize()
        } catch (e: Exception) {
            Log.e(TAG, "Error deinitializing SenseVoice: ${e.message}")
        }
        senseVoice = null
        clearQueue()
    }

    fun isInitialized(): Boolean = senseVoice?.isInitialized ?: false

    /** Best-effort debug dump of raw and gain-processed audio as WAV files. */
    private fun saveAsrAudio(originalAudio: FloatArray, processedAudio: FloatArray) {
        try {
            val timestamp = System.currentTimeMillis()
            val asrAudioDir = FileHelper.getAsrAudioDir(context)
            audioProcessor?.let { processor ->
                val originalFile = File(asrAudioDir, "asr_${timestamp}_original.wav")
                processor.saveAudioAsWav(originalFile, originalAudio, AppConfig.SAMPLE_RATE)
                val processedFile = File(asrAudioDir, "asr_${timestamp}_processed.wav")
                processor.saveAudioAsWav(processedFile, processedAudio, AppConfig.SAMPLE_RATE)
            }
        } catch (e: Exception) {
            Log.e(TAG, "Error saving ASR audio: ${e.message}")
        }
    }

    /** Strips SenseVoice control tokens (<|...|>) and stray arrow glyphs, collapses whitespace. */
    private fun removeTokens(text: String): String {
        var cleaned = text.replace(Regex("<\\|[^>]+\\|>"), "")
        cleaned = cleaned.replace(Regex("[>>≥≫]"), "")
        cleaned = cleaned.trim().replace(Regex("\\s+"), " ")
        return cleaned
    }

    /**
     * Returns a human-readable rejection reason for unusable transcripts, or null to accept.
     * Rejects blanks, a lone "i"/"I" (presumably a recurring recognizer artifact —
     * TODO confirm), and text exceeding [AppConfig.Asr.MAX_TEXT_LENGTH].
     */
    private fun filterText(text: String): String? {
        if (text.isBlank()) {
            return "blank text"
        }
        if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
            return "single 'i'"
        }
        if (text.length > AppConfig.Asr.MAX_TEXT_LENGTH) {
            return "too long (${text.length} chars)"
        }
        return null
    }
}

View File

@@ -0,0 +1,218 @@
package com.digitalperson.audio
import android.Manifest
import android.content.pm.PackageManager
import android.media.AudioFormat
import android.media.AudioRecord
import android.media.MediaRecorder
import android.media.audiofx.AcousticEchoCanceler
import android.media.audiofx.NoiseSuppressor
import android.util.Log
import androidx.core.app.ActivityCompat
import java.io.File
import java.io.FileOutputStream
private const val TAG = "AudioProcessor"
/**
 * Microphone capture plus small audio utilities: RMS measurement, AGC-style
 * gain normalization, short->float conversion, and 16-bit mono WAV export.
 * Enables hardware AEC/NS on the record session when the device supports them.
 */
class AudioProcessor(
    private val context: android.content.Context,
    private val sampleRateInHz: Int = 16000,
    private val channelConfig: Int = AudioFormat.CHANNEL_IN_MONO,
    private val audioFormat: Int = AudioFormat.ENCODING_PCM_16BIT
) {
    private val audioSource = MediaRecorder.AudioSource.MIC
    private var audioRecord: AudioRecord? = null
    private var aec: AcousticEchoCanceler? = null
    private var ns: NoiseSuppressor? = null
    // Exponentially smoothed RMS used by applyGain; alpha weights the NEW sample.
    private var smoothedRms = 0f
    private val alpha = 0.8f

    /**
     * Creates the AudioRecord and attaches AEC/NS effects. If RECORD_AUDIO is not
     * granted, requests the permission and returns false (caller retries later).
     * NOTE(review): assumes [context] is the hosting AppCompatActivity — the cast
     * below throws otherwise; confirm call sites.
     */
    fun initMicrophone(permissions: Array<String>, requestCode: Int): Boolean {
        if (ActivityCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO)
            != PackageManager.PERMISSION_GRANTED
        ) {
            ActivityCompat.requestPermissions(
                context as androidx.appcompat.app.AppCompatActivity,
                permissions,
                requestCode
            )
            return false
        }
        val numBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
        // Double the minimum buffer to reduce overrun risk on slow consumers.
        audioRecord = AudioRecord(
            audioSource,
            sampleRateInHz,
            channelConfig,
            audioFormat,
            numBytes * 2
        )
        val sessionId = audioRecord?.audioSessionId ?: 0
        if (sessionId != 0) {
            // Hardware echo cancellation, if the device provides it.
            if (AcousticEchoCanceler.isAvailable()) {
                aec = AcousticEchoCanceler.create(sessionId)?.apply {
                    enabled = true
                }
                Log.i(TAG, "AEC enabled=${aec?.enabled}")
            } else {
                Log.w(TAG, "AEC not available on this device")
            }
            // Hardware noise suppression, if available.
            if (NoiseSuppressor.isAvailable()) {
                ns = NoiseSuppressor.create(sessionId)?.apply {
                    enabled = true
                }
                Log.i(TAG, "NS enabled=${ns?.enabled}")
            } else {
                Log.w(TAG, "NS not available on this device")
            }
        }
        return true
    }

    fun startRecording() {
        audioRecord?.startRecording()
        Log.d(TAG, "Audio recording started")
    }

    fun stopRecording() {
        try {
            audioRecord?.stop()
        } catch (_: Throwable) {
        }
        Log.d(TAG, "Audio recording stopped")
    }

    /** Stops and releases the recorder and audio effects. Safe to call repeatedly. */
    fun release() {
        try {
            audioRecord?.stop()
        } catch (_: Throwable) {
        }
        try {
            audioRecord?.release()
        } catch (_: Throwable) {
        }
        audioRecord = null
        try {
            aec?.release()
        } catch (_: Throwable) {
        }
        try {
            ns?.release()
        } catch (_: Throwable) {
        }
        aec = null
        ns = null
        Log.d(TAG, "AudioProcessor released")
    }

    /** Blocking read into [buffer]; returns samples read, or -1 when uninitialized. */
    fun readAudio(buffer: ShortArray): Int {
        return audioRecord?.read(buffer, 0, buffer.size) ?: -1
    }

    /** Root-mean-square amplitude of [samples]; 0 for empty input. */
    fun calculateRMS(samples: FloatArray): Float {
        if (samples.isEmpty()) return 0.0f
        var sumSquared = 0.0f
        for (sample in samples) {
            sumSquared += sample * sample
        }
        val meanSquared = sumSquared / samples.size
        return kotlin.math.sqrt(meanSquared)
    }

    /**
     * Normalizes [chunk] toward [targetRMS] using a smoothed RMS estimate.
     * Gain is clamped to [0.1, 10] and output samples are clipped to [-1, 1].
     * Stateful: uses and updates [smoothedRms] across calls.
     */
    fun applyGain(chunk: FloatArray, targetRMS: Float = 0.1f): FloatArray {
        val rms = calculateRMS(chunk)
        smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms
        var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f
        gainFactor = gainFactor.coerceIn(0.1f, 10.0f)
        val processedChunk = FloatArray(chunk.size) {
            val value = chunk[it] * gainFactor
            if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value
        }
        return processedChunk
    }

    /** Converts 16-bit PCM shorts to floats in [-1, 1). */
    fun convertShortToFloat(buffer: ShortArray): FloatArray {
        return FloatArray(buffer.size) { buffer[it] / 32768.0f }
    }

    /**
     * Writes [samples] (floats in [-1, 1]) to [file] as a 16-bit mono
     * little-endian PCM WAV at [sampleRate].
     */
    fun saveAudioAsWav(file: File, samples: FloatArray, sampleRate: Int) {
        FileOutputStream(file).use { fos ->
            // 44-byte canonical RIFF/WAVE header.
            val header = ByteArray(44)
            header[0] = 'R'.code.toByte()
            header[1] = 'I'.code.toByte()
            header[2] = 'F'.code.toByte()
            header[3] = 'F'.code.toByte()
            val fileSize = 36 + samples.size * 2
            intToByteArray(fileSize, header, 4)
            header[8] = 'W'.code.toByte()
            header[9] = 'A'.code.toByte()
            header[10] = 'V'.code.toByte()
            header[11] = 'E'.code.toByte()
            header[12] = 'f'.code.toByte()
            header[13] = 'm'.code.toByte()
            header[14] = 't'.code.toByte()
            header[15] = ' '.code.toByte()
            intToByteArray(16, header, 16)          // fmt chunk size
            shortToByteArray(1, header, 20)         // PCM format
            shortToByteArray(1, header, 22)         // mono
            intToByteArray(sampleRate, header, 24)
            val byteRate = sampleRate * 1 * 16 / 8
            intToByteArray(byteRate, header, 28)
            val blockAlign = 1 * 16 / 8
            shortToByteArray(blockAlign.toShort(), header, 32)
            shortToByteArray(16, header, 34)        // bits per sample
            header[36] = 'd'.code.toByte()
            header[37] = 'a'.code.toByte()
            header[38] = 't'.code.toByte()
            header[39] = 'a'.code.toByte()
            val dataSize = samples.size * 2
            intToByteArray(dataSize, header, 40)
            fos.write(header)
            // PERF: convert all samples into one buffer and write once instead of
            // allocating and writing a 2-byte array per sample. Output bytes are
            // identical (little-endian signed 16-bit).
            val data = ByteArray(dataSize)
            for (i in samples.indices) {
                val clamped = samples[i].coerceIn(-1.0f, 1.0f)
                val s = (clamped * 32767.0f).toInt()
                data[2 * i] = (s and 0xFF).toByte()
                data[2 * i + 1] = (s shr 8 and 0xFF).toByte()
            }
            fos.write(data)
        }
    }

    /** Little-endian 32-bit write into [dest] at [offset]. */
    private fun intToByteArray(value: Int, dest: ByteArray, offset: Int) {
        dest[offset] = (value and 0xFF).toByte()
        dest[offset + 1] = (value shr 8 and 0xFF).toByte()
        dest[offset + 2] = (value shr 16 and 0xFF).toByte()
        dest[offset + 3] = (value shr 24 and 0xFF).toByte()
    }

    /** Little-endian 16-bit write into [dest] at [offset]. */
    private fun shortToByteArray(value: Short, dest: ByteArray, offset: Int) {
        dest[offset] = (value.toInt() and 0xFF).toByte()
        dest[offset + 1] = (value.toInt() shr 8 and 0xFF).toByte()
    }
}

View File

@@ -0,0 +1,50 @@
package com.digitalperson.config
import com.digitalperson.BuildConfig
/** Central compile-time configuration for the voice-assistant pipeline. */
object AppConfig {
    const val TAG = "DigitalPerson"
    const val REQUEST_RECORD_AUDIO_PERMISSION = 200
    // Capture sample rate in Hz; shared by VAD, ASR, and debug WAV dumps.
    const val SAMPLE_RATE = 16000
    // Samples per VAD analysis window (512 @ 16 kHz = 32 ms).
    const val WINDOW_SIZE = 512
    const val SHOW_DEBUG_TEXT = true
    /** Local sherpa-onnx VITS TTS settings. */
    object Tts {
        const val MODEL_DIR = "tts_model/sherpa-onnx-vits-zh-ll"
        const val MODEL_NAME = "model.onnx"
        const val LEXICON = "lexicon.txt"
        const val SPEAKER_ID = 2
        const val SPEED = 1.0f
        // Segmenter limits: max characters per segment / max wait before forcing one.
        const val MAX_LEN = 30
        const val MAX_WAIT_MS: Long = 600
    }
    /** Voice-activity-detection thresholds (probabilities) and durations (seconds). */
    object Vad {
        const val START_THRESHOLD = 0.2f
        const val END_THRESHOLD = 0.15f
        const val MIN_SILENCE_DURATION = 0.5f
        const val MIN_SPEECH_DURATION = 0.1f
        const val MAX_SPEECH_DURATION = 5.0f
    }
    object Asr {
        // Transcripts longer than this are rejected as recognition noise.
        const val MAX_TEXT_LENGTH = 50
        const val MODEL_DIR = "sensevoice_models"
    }
    object Audio {
        const val GAIN_SMOOTHING_FACTOR = 0.1f
        const val TARGET_RMS = 0.1f
    }
    /** Avatar rendering: Live2D model selection and feature switch. */
    object Avatar {
        // Compile-time switch in gradle.properties/local.properties: USE_LIVE2D=true|false
        const val USE_LIVE2D = BuildConfig.USE_LIVE2D
        // const val MODEL_DIR = "live2d_model/mao_pro_zh"
        // const val MODEL_JSON = "mao_pro.model3.json"
        const val MODEL_DIR = "live2d_model/Haru_pro_jp"
        const val MODEL_JSON = "haru_greeter_t05.model3.json"
    }
}

View File

@@ -0,0 +1,29 @@
package com.digitalperson.live2d
import android.opengl.GLSurfaceView
/**
 * Thin facade wiring a [Live2DRenderer] onto the host's GLSurfaceView and
 * forwarding lifecycle/speaking state. The GL view must be configured
 * (context version, renderer, render mode — in that order) before first use.
 */
class Live2DAvatarManager(private val glSurfaceView: GLSurfaceView) {
    private val renderer = Live2DRenderer(glSurfaceView.context)
    init {
        glSurfaceView.setEGLContextClientVersion(2)
        glSurfaceView.setRenderer(renderer)
        // Continuous rendering keeps idle/breathing animation running.
        glSurfaceView.renderMode = GLSurfaceView.RENDERMODE_CONTINUOUSLY
    }
    /** Toggles the lip-sync "talking" animation. */
    fun setSpeaking(speaking: Boolean) {
        renderer.setSpeaking(speaking)
    }
    fun onResume() {
        glSurfaceView.onResume()
    }
    fun onPause() {
        glSurfaceView.onPause()
    }
    fun release() {
        renderer.release()
    }
}

View File

@@ -0,0 +1,182 @@
package com.digitalperson.live2d
import android.content.res.AssetManager
import android.graphics.BitmapFactory
import android.opengl.GLES20
import android.opengl.GLUtils
import com.live2d.sdk.cubism.framework.CubismFramework
import com.live2d.sdk.cubism.framework.CubismModelSettingJson
import com.live2d.sdk.cubism.framework.id.CubismId
import com.live2d.sdk.cubism.framework.math.CubismMatrix44
import com.live2d.sdk.cubism.framework.model.CubismUserModel
import com.live2d.sdk.cubism.framework.motion.CubismMotion
import com.live2d.sdk.cubism.framework.rendering.android.CubismRendererAndroid
import kotlin.math.sin
/**
 * A single Live2D Cubism model: loads model/physics/pose/motions from assets,
 * drives per-frame animation (idle motions, procedural breathing/sway, simple
 * sine-based lip-sync), and renders via CubismRendererAndroid.
 * All methods must be called on the GL thread.
 */
class Live2DCharacter : CubismUserModel() {
    private lateinit var setting: CubismModelSettingJson
    // Parameter IDs the model declares for lip-sync (falls back to ParamA/ParamMouthOpenY).
    private val lipSyncParams = mutableListOf<CubismId>()
    private val idleMotions = mutableListOf<CubismMotion>()
    private var idleMotionIndex = 0
    // Timestamp of the previous frame, used to derive a clamped delta time.
    private var lastElapsedSec = 0f
    // GL texture names owned by this character; freed in release()/bindTextures().
    private val textureIds = mutableListOf<Int>()

    /** Parses the model3.json and loads model, physics, pose, and idle motions. */
    fun loadFromAssets(assets: AssetManager, modelDir: String, modelJsonName: String) {
        val settingBytes = readAssetBytes(assets, "$modelDir/$modelJsonName")
        setting = CubismModelSettingJson(settingBytes)
        loadModel(readAssetBytes(assets, "$modelDir/${setting.modelFileName}"))
        setupRenderer(CubismRendererAndroid.create())
        getModelMatrix().setWidth(2.0f)
        getModelMatrix().setCenterPosition(0f, 0f)
        val physicsFile = setting.physicsFileName
        if (physicsFile.isNotEmpty()) {
            loadPhysics(readAssetBytes(assets, "$modelDir/$physicsFile"))
        }
        val poseFile = setting.poseFileName
        if (poseFile.isNotEmpty()) {
            loadPose(readAssetBytes(assets, "$modelDir/$poseFile"))
        }
        initLipSyncParams()
        loadIdleMotions(assets, modelDir)
        startNextIdleMotion()
    }

    /** Uploads all model textures to GL and binds them to the renderer slots. */
    fun bindTextures(assets: AssetManager, modelDir: String) {
        val renderer = getRenderer<CubismRendererAndroid>()
        renderer.isPremultipliedAlpha(true)
        renderer.isUsingHighPrecisionMask(true)
        // Free any textures from a previous surface before re-uploading.
        textureIds.forEach { id ->
            GLES20.glDeleteTextures(1, intArrayOf(id), 0)
        }
        textureIds.clear()
        for (i in 0 until setting.textureCount) {
            val texturePath = "$modelDir/${setting.getTextureFileName(i)}"
            val texId = loadTexture(assets, texturePath)
            renderer.bindTexture(i, texId)
            textureIds.add(texId)
        }
    }

    /**
     * Advances one animation frame. Pipeline order matters:
     * motion -> lip-sync params -> procedural sway/breath -> physics -> pose -> update.
     */
    fun updateFrame(elapsedSec: Float, speaking: Boolean) {
        val model = getModel() ?: return
        // Clamp delta to 100 ms so a long GL stall does not jump the animation.
        val dt = (elapsedSec - lastElapsedSec).coerceAtLeast(0f).coerceAtMost(0.1f)
        lastElapsedSec = elapsedSec
        // Keep motions running. If finished, continue idle loop.
        motionManager.updateMotion(model, dt)
        if (motionManager.isFinished()) {
            startNextIdleMotion()
        }
        // Procedural mouth value: ~14 rad/s sine in [0.2, 0.55] while speaking.
        val mouth = if (speaking) {
            0.2f + 0.35f * ((sin(elapsedSec * 14.0f) + 1.0f) * 0.5f)
        } else {
            0.0f
        }
        // Apply lip-sync to model-defined LipSync params (this model uses ParamA).
        for (id in lipSyncParams) {
            model.setParameterValue(id, mouth, 0.8f)
        }
        // Add small idle breathing/sway on top, so character is never "frozen".
        val sway = sin(elapsedSec * 0.8f)
        val breathe = sin(elapsedSec * 1.2f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamAngleX"), sway * 8f, 0.2f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamAngleY"), sin(elapsedSec * 0.6f) * 4f, 0.2f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamBodyAngleX"), sway * 6f, 0.15f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamBreath"), (breathe + 1f) * 0.5f, 0.1f)
        physics?.evaluate(model, dt)
        pose?.updateParameters(model, dt)
        model.update()
    }

    /** Draws the model with the given model-view-projection matrix. */
    fun draw(mvp: CubismMatrix44) {
        val renderer = getRenderer<CubismRendererAndroid>()
        renderer.setMvpMatrix(mvp)
        renderer.drawModel()
    }

    /** Frees GL textures and the underlying Cubism model. */
    fun release() {
        textureIds.forEach { id ->
            GLES20.glDeleteTextures(1, intArrayOf(id), 0)
        }
        textureIds.clear()
        delete()
    }

    /** Decodes an asset bitmap into a new GL texture; returns the texture name. */
    private fun loadTexture(assets: AssetManager, path: String): Int {
        val bitmap = assets.open(path).use { stream ->
            BitmapFactory.decodeStream(stream)
        } ?: error("Decode texture failed: $path")
        val ids = IntArray(1)
        GLES20.glGenTextures(1, ids, 0)
        val textureId = ids[0]
        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, textureId)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE)
        GLUtils.texImage2D(GLES20.GL_TEXTURE_2D, 0, bitmap, 0)
        bitmap.recycle()
        return textureId
    }

    private fun readAssetBytes(assets: AssetManager, path: String): ByteArray {
        return assets.open(path).use { input ->
            input.readBytes()
        }
    }

    /** Reads lip-sync parameter IDs from the model setting, with a common fallback. */
    private fun initLipSyncParams() {
        lipSyncParams.clear()
        for (i in 0 until setting.lipSyncParameterCount) {
            lipSyncParams.add(setting.getLipSyncParameterId(i))
        }
        if (lipSyncParams.isEmpty()) {
            lipSyncParams.add(CubismFramework.getIdManager().getId("ParamA"))
            lipSyncParams.add(CubismFramework.getIdManager().getId("ParamMouthOpenY"))
        }
    }

    /** Loads every motion in the idle group as a looping CubismMotion. */
    private fun loadIdleMotions(assets: AssetManager, modelDir: String) {
        idleMotions.clear()
        val groupName = findIdleGroupName()
        if (groupName.isEmpty()) return
        for (i in 0 until setting.getMotionCount(groupName)) {
            val fileName = setting.getMotionFileName(groupName, i)
            if (fileName.isBlank()) continue
            // runCatching: a single corrupt motion file should not abort loading.
            runCatching {
                val motion = loadMotion(readAssetBytes(assets, "$modelDir/$fileName"))
                motion?.setLoop(true)
                motion?.setLoopFadeIn(true)
                if (motion != null) idleMotions.add(motion)
            }
        }
    }

    /** Starts the next idle motion in round-robin order (no-op when none loaded). */
    private fun startNextIdleMotion() {
        if (idleMotions.isEmpty()) return
        val index = idleMotionIndex % idleMotions.size
        idleMotionIndex++
        motionManager.startMotionPriority(idleMotions[index], 1)
    }

    /** Prefers a group literally named "Idle"; otherwise falls back to the first group. */
    private fun findIdleGroupName(): String {
        for (i in 0 until setting.motionGroupCount) {
            val name = setting.getMotionGroupName(i)
            if (name.equals("Idle", ignoreCase = true)) return name
        }
        if (setting.motionGroupCount > 0) {
            return setting.getMotionGroupName(0) ?: ""
        }
        return ""
    }
}

View File

@@ -0,0 +1,78 @@
package com.digitalperson.live2d
import android.content.Context
import android.opengl.GLES20
import android.opengl.GLSurfaceView
import android.os.SystemClock
import android.util.Log
import com.digitalperson.config.AppConfig
import com.live2d.sdk.cubism.framework.CubismFramework
import com.live2d.sdk.cubism.framework.math.CubismMatrix44
import javax.microedition.khronos.egl.EGLConfig
import javax.microedition.khronos.opengles.GL10
/**
 * GLSurfaceView.Renderer hosting a single [Live2DCharacter].
 * All onSurface*/onDrawFrame callbacks run on the GL thread; [setSpeaking] may
 * be called from any thread (the flag is @Volatile).
 */
class Live2DRenderer(
    private val context: Context
) : GLSurfaceView.Renderer {
    @Volatile
    private var speaking = false
    private var character: Live2DCharacter? = null
    private val mvp = CubismMatrix44.create()
    // Animation time origin; reset on every surface (re)creation.
    private var startTimeMs: Long = 0L

    override fun onSurfaceCreated(gl: GL10?, config: EGLConfig?) {
        // Transparent clear color so the avatar composites over the layout behind it.
        GLES20.glClearColor(0f, 0f, 0f, 0f)
        ensureFrameworkInitialized()
        startTimeMs = SystemClock.elapsedRealtime()
        // Model load failure is non-fatal: the app runs without an avatar.
        runCatching {
            val model = Live2DCharacter()
            model.loadFromAssets(
                assets = context.assets,
                modelDir = AppConfig.Avatar.MODEL_DIR,
                modelJsonName = AppConfig.Avatar.MODEL_JSON
            )
            model.bindTextures(context.assets, AppConfig.Avatar.MODEL_DIR)
            character = model
        }.onFailure {
            Log.e(AppConfig.TAG, "Load Live2D model failed: ${it.message}", it)
            character = null
        }
    }

    override fun onSurfaceChanged(gl: GL10?, width: Int, height: Int) {
        GLES20.glViewport(0, 0, width, height)
        // Aspect-ratio correction: scale the shorter axis so the model is not stretched.
        mvp.loadIdentity()
        if (width > height) {
            mvp.scale(1f, width.toFloat() / height.toFloat())
        } else {
            mvp.scale(height.toFloat() / width.toFloat(), 1f)
        }
    }

    override fun onDrawFrame(gl: GL10?) {
        GLES20.glClear(GLES20.GL_COLOR_BUFFER_BIT)
        val elapsedSec = (SystemClock.elapsedRealtime() - startTimeMs) / 1000f
        character?.updateFrame(elapsedSec = elapsedSec, speaking = speaking)
        character?.draw(mvp)
    }

    /** Thread-safe toggle for the lip-sync animation. */
    fun setSpeaking(speaking: Boolean) {
        this.speaking = speaking
    }

    fun release() {
        character?.release()
        character = null
    }

    /** Starts/initializes the Cubism framework exactly once per process. */
    private fun ensureFrameworkInitialized() {
        if (!CubismFramework.isStarted()) {
            CubismFramework.startUp(CubismFramework.Option())
        }
        if (!CubismFramework.isInitialized()) {
            CubismFramework.initialize()
        }
    }
}

View File

@@ -17,7 +17,7 @@ class VideoPlayerManager(
private var playerSilent: ExoPlayer? = null
private var playerSpeaking: ExoPlayer? = null
private var currentState: Boolean = false
private var transitionDuration = 300L // 淡入淡出时长
private var transitionDuration = 100L // 淡入淡出时长
init {
// 确保初始 alpha

View File

@@ -0,0 +1,293 @@
package com.digitalperson.tts
import android.content.Context
import android.media.AudioAttributes
import android.media.AudioFormat
import android.media.AudioManager
import android.media.AudioTrack
import android.util.Log
import android.widget.Toast
import com.digitalperson.config.AppConfig
import com.digitalperson.metrics.TraceManager
import com.digitalperson.metrics.TraceSession
import com.k2fsa.sherpa.onnx.OfflineTts
import com.k2fsa.sherpa.onnx.getOfflineTtsConfig
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.launch
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.atomic.AtomicBoolean
class TtsManager(private val context: Context) {
companion object {
private const val TAG = "TtsManager"
}
private var tts: OfflineTts? = null
private var track: AudioTrack? = null
private sealed class TtsQueueItem {
data class Segment(val text: String) : TtsQueueItem()
data object End : TtsQueueItem()
}
private val ttsQueue = LinkedBlockingQueue<TtsQueueItem>()
private val ttsStopped = AtomicBoolean(false)
private val ttsWorkerRunning = AtomicBoolean(false)
private val ttsPlaying = AtomicBoolean(false)
@Volatile private var ttsTotalSamplesWritten: Long = 0
private var currentTrace: TraceSession? = null
private val ioScope = CoroutineScope(Dispatchers.IO)
interface TtsCallback {
fun onTtsStarted(text: String)
fun onTtsCompleted()
fun onTtsSegmentCompleted(durationMs: Long)
fun isTtsStopped(): Boolean
fun onClearAsrQueue()
fun onSetSpeaking(speaking: Boolean)
fun getCurrentTrace(): TraceSession?
fun onTraceMarkTtsRequestEnqueued()
fun onTraceMarkTtsSynthesisStart()
fun onTraceMarkTtsFirstPcmReady()
fun onTraceMarkTtsFirstAudioPlay()
fun onTraceMarkTtsDone()
fun onTraceAddDuration(name: String, value: Long)
fun onEndTurn()
}
private var callback: TtsCallback? = null
fun setCallback(callback: TtsCallback) {
this.callback = callback
}
fun initTtsAndAudioTrack(): Boolean {
return try {
val modelDir = AppConfig.Tts.MODEL_DIR
val modelName = AppConfig.Tts.MODEL_NAME
val lexicon = AppConfig.Tts.LEXICON
val dataDir = ""
val ttsConfig = getOfflineTtsConfig(
modelDir = modelDir,
modelName = modelName,
acousticModelName = "",
vocoder = "",
voices = "",
lexicon = lexicon,
dataDir = dataDir,
dictDir = "",
ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst,$modelDir/new_heteronym.fst",
ruleFars = "",
numThreads = null,
isKitten = false
)
tts = OfflineTts(assetManager = context.assets, config = ttsConfig)
initAudioTrack()
true
} catch (t: Throwable) {
Log.e(TAG, "Init TTS failed: ${t.message}", t)
tts = null
false
}
}
private fun initAudioTrack() {
val t = tts ?: return
val sr = t.sampleRate()
val bufLength = AudioTrack.getMinBufferSize(
sr,
AudioFormat.CHANNEL_OUT_MONO,
AudioFormat.ENCODING_PCM_FLOAT
)
val attr = AudioAttributes.Builder()
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
.setUsage(AudioAttributes.USAGE_MEDIA)
.build()
val format = AudioFormat.Builder()
.setEncoding(AudioFormat.ENCODING_PCM_FLOAT)
.setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
.setSampleRate(sr)
.build()
track = AudioTrack(
attr,
format,
bufLength,
AudioTrack.MODE_STREAM,
AudioManager.AUDIO_SESSION_ID_GENERATE
)
track?.play()
}
fun enqueueSegment(seg: String) {
val cleanedSeg = seg.trimEnd('.', '。', '!', '', '?', '', ',', '', ';', '', ':', '')
callback?.onTraceMarkTtsRequestEnqueued()
ttsQueue.offer(TtsQueueItem.Segment(cleanedSeg))
ensureTtsWorker()
}
fun enqueueEnd() {
ttsQueue.offer(TtsQueueItem.End)
}
fun isPlaying(): Boolean = ttsPlaying.get()
fun reset() {
ttsStopped.set(false)
ttsPlaying.set(false)
ttsTotalSamplesWritten = 0
ttsQueue.clear()
}
fun stop() {
ttsStopped.set(true)
ttsPlaying.set(false)
ttsTotalSamplesWritten = 0
ttsQueue.clear()
ttsQueue.offer(TtsQueueItem.End)
try {
track?.pause()
track?.flush()
} catch (_: Throwable) {
}
}
fun release() {
try {
tts?.release()
} catch (_: Throwable) {
}
try {
track?.release()
} catch (_: Throwable) {
}
tts = null
track = null
}
fun setCurrentTrace(trace: TraceSession?) {
currentTrace = trace
}
private fun ensureTtsWorker() {
if (!ttsWorkerRunning.compareAndSet(false, true)) return
ioScope.launch {
try {
runTtsWorker()
} finally {
ttsWorkerRunning.set(false)
}
}
}
private fun runTtsWorker() {
val t = tts ?: return
val audioTrack = track ?: return
var firstAudioMarked = false
var isFirstSegment = true
while (true) {
val item = ttsQueue.take()
if (ttsStopped.get()) break
when (item) {
is TtsQueueItem.Segment -> {
ttsPlaying.set(true)
callback?.onSetSpeaking(true)
val trace = currentTrace
trace?.markTtsSynthesisStart()
callback?.onTraceMarkTtsSynthesisStart()
Log.d(TAG, "TTS started: processing segment '${item.text}'")
callback?.onTtsStarted(item.text)
val startMs = System.currentTimeMillis()
var firstPcmMarked = false
if (isFirstSegment) {
try {
audioTrack.pause()
audioTrack.flush()
audioTrack.play()
} catch (_: Throwable) {
}
isFirstSegment = false
}
t.generateWithCallback(
text = item.text,
sid = AppConfig.Tts.SPEAKER_ID,
speed = AppConfig.Tts.SPEED
) { samples ->
if (ttsStopped.get()) return@generateWithCallback 0
if (!firstPcmMarked && samples.isNotEmpty()) {
firstPcmMarked = true
trace?.markTtsFirstPcmReady()
callback?.onTraceMarkTtsFirstPcmReady()
}
if (!firstAudioMarked && samples.isNotEmpty()) {
firstAudioMarked = true
trace?.markTtsFirstAudioPlay()
callback?.onTraceMarkTtsFirstAudioPlay()
}
audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING)
ttsTotalSamplesWritten += samples.size
1
}
val ttsMs = System.currentTimeMillis() - startMs
trace?.addDuration("tts_segment_ms_total", ttsMs)
callback?.onTraceAddDuration("tts_segment_ms_total", ttsMs)
callback?.onTtsSegmentCompleted(ttsMs)
}
TtsQueueItem.End -> {
callback?.onClearAsrQueue()
waitForPlaybackComplete(audioTrack)
callback?.onTtsCompleted()
ttsPlaying.set(false)
callback?.onSetSpeaking(false)
ttsTotalSamplesWritten = 0
currentTrace?.markTtsDone()
callback?.onTraceMarkTtsDone()
callback?.onEndTurn()
break
}
}
}
}
// Blocks until the AudioTrack's playback head has consumed every sample
// written during this TTS turn, with a timeout derived from the audio
// length, then sleeps briefly to let the sink drain its latency buffer.
private fun waitForPlaybackComplete(audioTrack: AudioTrack) {
    val totalSamples = ttsTotalSamplesWritten
    if (totalSamples <= 0) return
    val sampleRate = audioTrack.sampleRate
    // Compute in Long: totalSamples * 1000 overflows Int after ~2.1M samples
    // (~134 s at 16 kHz), which would yield a negative timeout and an
    // immediate spurious timeout break.
    val timeoutMs = totalSamples.toLong() * 1000L / sampleRate + 2000L
    val startTime = System.currentTimeMillis()
    while (true) {
        if (ttsStopped.get()) break
        // NOTE: playbackHeadPosition is a wrapping 32-bit frame counter —
        // fine for per-turn durations, would need wrap handling for
        // multi-hour continuous playback.
        val playbackPos = audioTrack.playbackHeadPosition.toLong()
        if (playbackPos >= totalSamples) {
            break
        }
        if (System.currentTimeMillis() - startTime > timeoutMs) {
            Log.w(TAG, "waitForPlaybackComplete timeout, pos=$playbackPos, total=$totalSamples")
            break
        }
        Thread.sleep(20)
    }
    // Grace period for the sink's internal latency; skip it when the turn
    // was stopped, since nothing is waiting for the tail audio.
    if (!ttsStopped.get()) {
        Thread.sleep(1000)
    }
}
}

View File

@@ -0,0 +1,95 @@
package com.digitalperson.ui
import android.app.Activity
import android.opengl.GLSurfaceView
import android.text.method.ScrollingMovementMethod
import android.widget.Button
import android.widget.ScrollView
import android.widget.TextView
import android.widget.Toast
import com.digitalperson.live2d.Live2DAvatarManager
/**
 * Binds the Live2D conversation screen's widgets (transcript text, scroll
 * container, start/stop buttons) and owns the [Live2DAvatarManager] that
 * drives the avatar's speaking animation.
 */
class Live2DUiManager(private val activity: Activity) {
    private var textView: TextView? = null
    private var scrollView: ScrollView? = null
    private var startButton: Button? = null
    private var stopButton: Button? = null
    private var avatarManager: Live2DAvatarManager? = null

    // Full transcript shown so far; re-assigned to the TextView on append.
    private var displayedText: String = ""

    /**
     * Looks up all views by id and creates the avatar manager on the GL
     * surface. The two player-view ids are accepted for signature parity
     * with the video-based UI manager and are not used here.
     */
    fun initViews(
        textViewId: Int,
        scrollViewId: Int,
        startButtonId: Int,
        stopButtonId: Int,
        silentPlayerViewId: Int,
        speakingPlayerViewId: Int,
        live2dViewId: Int
    ) {
        with(activity) {
            textView = findViewById(textViewId)
            scrollView = findViewById(scrollViewId)
            startButton = findViewById(startButtonId)
            stopButton = findViewById(stopButtonId)
        }
        textView?.movementMethod = ScrollingMovementMethod()
        val glView = activity.findViewById<GLSurfaceView>(live2dViewId)
        avatarManager = Live2DAvatarManager(glView).also { it.setSpeaking(false) }
    }

    /** Installs the click handler for the start button. */
    fun setStartButtonListener(listener: () -> Unit) {
        startButton?.setOnClickListener { listener() }
    }

    /** Installs the click handler for the stop button. */
    fun setStopButtonListener(listener: () -> Unit) {
        stopButton?.setOnClickListener { listener() }
    }

    /** Appends [s] to the transcript and scrolls to the bottom. */
    fun appendToUi(s: String) {
        displayedText += s
        textView?.text = displayedText
        scrollView?.post { scrollView?.fullScroll(ScrollView.FOCUS_DOWN) }
    }

    /** Empties both the transcript buffer and the on-screen text. */
    fun clearText() {
        displayedText = ""
        textView?.text = ""
    }

    /** Replaces the transcript buffer and on-screen text with [text]. */
    fun setText(text: String) {
        displayedText = text
        textView?.text = text
    }

    /** Enables/disables the two control buttons. */
    fun setButtonsEnabled(startEnabled: Boolean, stopEnabled: Boolean) {
        startButton?.isEnabled = startEnabled
        stopButton?.isEnabled = stopEnabled
    }

    /** Toggles the avatar's speaking animation; safe to call off the UI thread. */
    fun setSpeaking(speaking: Boolean) {
        activity.runOnUiThread { avatarManager?.setSpeaking(speaking) }
    }

    /** Shows a toast; safe to call off the UI thread. */
    fun showToast(message: String, duration: Int = Toast.LENGTH_SHORT) {
        activity.runOnUiThread { Toast.makeText(activity, message, duration).show() }
    }

    /** Forwards the activity's onResume to the avatar's GL surface. */
    fun onResume() {
        avatarManager?.onResume()
    }

    /** Forwards the activity's onPause to the avatar's GL surface. */
    fun onPause() {
        avatarManager?.onPause()
    }

    /** Releases the avatar manager; the instance must be re-initialized after this. */
    fun release() {
        avatarManager?.release()
        avatarManager = null
    }
}

View File

@@ -0,0 +1,106 @@
package com.digitalperson.ui
import android.app.Activity
import android.text.method.ScrollingMovementMethod
import android.util.Log
import android.widget.Button
import android.widget.ScrollView
import android.widget.TextView
import android.widget.Toast
import com.digitalperson.config.AppConfig
import com.digitalperson.player.VideoPlayerManager
import com.google.android.exoplayer2.ui.PlayerView
/**
 * Binds the video-avatar conversation screen's widgets (transcript text,
 * scroll container, start/stop buttons) and owns the [VideoPlayerManager]
 * that switches between the silent and speaking video loops.
 */
class UiManager(private val activity: Activity) {
    private var textView: TextView? = null
    private var scrollView: ScrollView? = null
    private var startButton: Button? = null
    private var stopButton: Button? = null
    private var videoPlayerManager: VideoPlayerManager? = null

    // Full transcript shown so far; re-assigned to the TextView on append.
    private var displayedText: String = ""

    /**
     * Looks up all views by id and, when both player views are present in
     * the layout, creates the video player manager in the non-speaking state.
     */
    fun initViews(
        textViewId: Int,
        scrollViewId: Int,
        startButtonId: Int,
        stopButtonId: Int,
        silentPlayerViewId: Int,
        speakingPlayerViewId: Int
    ) {
        with(activity) {
            textView = findViewById(textViewId)
            scrollView = findViewById(scrollViewId)
            startButton = findViewById(startButtonId)
            stopButton = findViewById(stopButtonId)
        }
        textView?.movementMethod = ScrollingMovementMethod()
        // The player views are optional in some layouts; degrade gracefully.
        try {
            val silentPv = activity.findViewById<PlayerView>(silentPlayerViewId)
            val speakingPv = activity.findViewById<PlayerView>(speakingPlayerViewId)
            videoPlayerManager = VideoPlayerManager(activity, silentPv, speakingPv).also {
                it.setSpeaking(false)
            }
        } catch (e: Exception) {
            Log.w(AppConfig.TAG, "PlayerViews not found or init failed: ${e.message}")
        }
    }

    /** Installs the click handler for the start button. */
    fun setStartButtonListener(listener: () -> Unit) {
        startButton?.setOnClickListener { listener() }
    }

    /** Installs the click handler for the stop button. */
    fun setStopButtonListener(listener: () -> Unit) {
        stopButton?.setOnClickListener { listener() }
    }

    /**
     * Appends [s] to the transcript and scrolls to the bottom.
     * No-op unless [AppConfig.SHOW_DEBUG_TEXT] is enabled.
     */
    fun appendToUi(s: String) {
        if (!AppConfig.SHOW_DEBUG_TEXT) return
        displayedText += s
        textView?.text = displayedText
        scrollView?.post { scrollView?.fullScroll(ScrollView.FOCUS_DOWN) }
    }

    /** Empties both the transcript buffer and the on-screen text. */
    fun clearText() {
        displayedText = ""
        textView?.text = ""
    }

    /** Replaces the transcript buffer and on-screen text with [text]. */
    fun setText(text: String) {
        displayedText = text
        textView?.text = text
    }

    /** Enables/disables the two control buttons. */
    fun setButtonsEnabled(startEnabled: Boolean, stopEnabled: Boolean) {
        startButton?.isEnabled = startEnabled
        stopButton?.isEnabled = stopEnabled
    }

    /** Switches the avatar video loop; safe to call off the UI thread. */
    fun setSpeaking(speaking: Boolean) {
        activity.runOnUiThread { videoPlayerManager?.setSpeaking(speaking) }
    }

    /** Shows a toast; safe to call off the UI thread. */
    fun showToast(message: String, duration: Int = Toast.LENGTH_SHORT) {
        activity.runOnUiThread { Toast.makeText(activity, message, duration).show() }
    }

    /** Shows a toast; caller must already be on the UI thread. */
    fun showToastOnUi(message: String, duration: Int = Toast.LENGTH_SHORT) {
        Toast.makeText(activity, message, duration).show()
    }

    /** Releases the video players; the instance must be re-initialized after this. */
    fun release() {
        videoPlayerManager?.release()
        videoPlayerManager = null
    }

    /** Discards the transcript buffer without touching the TextView. */
    fun reset() {
        displayedText = ""
    }
}

View File

@@ -0,0 +1,60 @@
package com.digitalperson.util
import android.content.Context
import android.util.Log
import com.digitalperson.config.AppConfig
import java.io.File
import java.io.FileOutputStream
/**
 * File/asset utilities: asset existence checks, one-time asset extraction
 * into internal storage, and directory helpers.
 */
object FileHelper {
    private const val TAG = AppConfig.TAG

    /** Returns true if [path] names a readable asset in the APK. */
    fun assetExists(context: Context, path: String): Boolean {
        return try {
            context.assets.open(path).close()
            true
        } catch (_: Throwable) {
            false
        }
    }

    /**
     * Copies each of [files] from the [assetDir] asset folder into
     * [targetDir], skipping files that already exist with non-zero length.
     * Failures are logged per file and do not abort the remaining copies.
     *
     * @return [targetDir] for call chaining.
     */
    fun copyAssetsToInternal(context: Context, assetDir: String, targetDir: File, files: Array<String>): File {
        if (!targetDir.exists()) targetDir.mkdirs()
        for (name in files) {
            val assetPath = "$assetDir/$name"
            val outFile = File(targetDir, name)
            if (outFile.exists() && outFile.length() > 0) continue
            try {
                context.assets.open(assetPath).use { input ->
                    FileOutputStream(outFile).use { output ->
                        input.copyTo(output)
                    }
                }
            } catch (e: Exception) {
                Log.e(TAG, "Failed to copy asset $assetPath: ${e.message}")
                // Delete the partially-written file: otherwise its non-zero
                // length makes the guard above treat the truncated copy as
                // complete on every subsequent call.
                if (outFile.exists()) outFile.delete()
            }
        }
        return targetDir
    }

    /** Extracts the SenseVoice ASR model files into internal storage. */
    fun copySenseVoiceAssets(context: Context): File {
        val outDir = File(context.filesDir, AppConfig.Asr.MODEL_DIR)
        val files = arrayOf(
            "am.mvn",
            "chn_jpn_yue_eng_ko_spectok.bpe.model",
            "embedding.npy",
            "sense-voice-encoder.rknn"
        )
        return copyAssetsToInternal(context, AppConfig.Asr.MODEL_DIR, outDir, files)
    }

    /** Creates [dir] (including parents) if missing and returns it. */
    fun ensureDir(dir: File): File {
        if (!dir.exists()) dir.mkdirs()
        return dir
    }

    /** Returns (creating if needed) the directory used to dump ASR audio. */
    fun getAsrAudioDir(context: Context): File {
        return ensureDir(File(context.filesDir, "asr_audio"))
    }
}

View File

@@ -0,0 +1,216 @@
package com.digitalperson.vad
import android.content.Context
import android.util.Log
import com.digitalperson.config.AppConfig
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
import com.k2fsa.sherpa.onnx.Vad
import com.k2fsa.sherpa.onnx.VadModelConfig
import java.io.File
import kotlin.math.max
/**
 * Voice-activity detection and speech segmentation.
 *
 * Feeds fixed-size audio chunks to a Silero VAD (via sherpa-onnx) and runs a
 * small state machine (silence -> speech -> silence) that accumulates two
 * parallel buffers — the raw chunk and its processed counterpart — and emits
 * a complete segment through [VadCallback] once enough trailing silence (or
 * the maximum speech length) is reached.
 *
 * Not thread-safe except for the native compute/release pair, which is
 * serialized on [nativeLock].
 */
class VadManager(private val context: Context) {
    private var vad: Vad? = null
    // Serializes native VAD calls so compute() cannot race release().
    private val nativeLock = Any()

    // Segmentation state machine.
    private var inSpeech = false
    private var silenceSamples = 0

    // Raw and processed audio accumulated while inSpeech; <buf, len> pairs.
    private var speechBuf = FloatArray(0)
    private var speechLen = 0
    private var processedSpeechBuf = FloatArray(0)
    private var processedSpeechLen = 0

    // Duration thresholds converted from seconds to sample counts.
    private val minSilenceSamples = (AppConfig.Vad.MIN_SILENCE_DURATION * AppConfig.SAMPLE_RATE).toInt()
    private val minSpeechSamples = (AppConfig.Vad.MIN_SPEECH_DURATION * AppConfig.SAMPLE_RATE).toInt()
    private val maxSpeechSamples = (AppConfig.Vad.MAX_SPEECH_DURATION * AppConfig.SAMPLE_RATE).toInt()

    // Number of native VAD computations since the last reset (diagnostics).
    var vadComputeCount = 0
        private set

    /** Receiver for finalized speech segments. */
    interface VadCallback {
        /** Delivers one finalized segment: raw audio plus its processed form. */
        fun onSpeechSegmentReady(originalAudio: FloatArray, processedAudio: FloatArray)
        /** Return true to drop the pending segment instead of delivering it. */
        fun shouldSkipProcessing(): Boolean
    }

    private var callback: VadCallback? = null

    fun setCallback(callback: VadCallback) {
        this.callback = callback
    }

    /**
     * Loads the Silero VAD model from assets.
     * @return true on success; failures are logged and return false.
     */
    fun initVadModel(): Boolean {
        return try {
            val config = VadModelConfig(
                sileroVadModelConfig = SileroVadModelConfig(
                    model = "vad_model/silero_vad.onnx",
                    threshold = 0.5F,
                    minSilenceDuration = 0.25F,
                    minSpeechDuration = 0.25F,
                    windowSize = AppConfig.WINDOW_SIZE,
                ),
                sampleRate = AppConfig.SAMPLE_RATE,
                numThreads = 1,
                provider = "cpu",
            )
            vad = Vad(assetManager = context.assets, config = config)
            Log.i(TAG, "VAD model initialized successfully")
            true
        } catch (e: Exception) {
            Log.e(TAG, "Failed to initialize VAD model: ${e.message}", e)
            false
        }
    }

    /** Resets both the native VAD and all segmentation state and counters. */
    fun reset() {
        vad?.reset()
        inSpeech = false
        silenceSamples = 0
        speechLen = 0
        processedSpeechLen = 0
        vadComputeCount = 0
    }

    /** Releases the native VAD; the instance is unusable until re-initialized. */
    fun release() {
        try {
            vad?.release()
        } catch (e: Exception) {
            Log.e(TAG, "Error releasing VAD: ${e.message}")
        }
        vad = null
    }

    /**
     * Advances the state machine by one chunk.
     *
     * The VAD probability is computed on [processedChunk]; both [chunk] and
     * [processedChunk] are buffered while in speech. May synchronously invoke
     * the callback when a segment finalizes.
     *
     * @return the transition taken, carrying the VAD probability.
     */
    fun processAudioChunk(chunk: FloatArray, processedChunk: FloatArray): VadResult {
        // With no model loaded, prob stays 0 and the machine reports silence.
        val prob = synchronized(nativeLock) {
            vad?.compute(processedChunk) ?: 0f
        }
        vadComputeCount++
        val result = when {
            !inSpeech && prob >= AppConfig.Vad.START_THRESHOLD -> {
                inSpeech = true
                silenceSamples = 0
                appendSpeech(chunk, processedChunk)
                Log.d(TAG, "VAD: Entered speech state, prob=$prob, speechLen=$speechLen")
                VadResult.SpeechStarted(prob)
            }
            inSpeech && prob <= AppConfig.Vad.END_THRESHOLD -> {
                // Low-probability chunks still count toward the segment until
                // enough consecutive silence has accumulated.
                silenceSamples += chunk.size
                if (silenceSamples >= minSilenceSamples) {
                    Log.d(TAG, "VAD: Exiting speech state, prob=$prob, silenceSamples=$silenceSamples, speechLen=$speechLen")
                    finalizeSegmentIfAny()
                    VadResult.SpeechEnded(prob)
                } else {
                    appendSpeech(chunk, processedChunk)
                    VadResult.SpeechContinuing(prob)
                }
            }
            inSpeech -> {
                appendSpeech(chunk, processedChunk)
                silenceSamples = 0
                if (speechLen >= maxSpeechSamples) {
                    Log.d(TAG, "VAD: Max speech length reached, finalizing segment")
                    finalizeSegmentIfAny()
                    VadResult.MaxSpeechReached(prob)
                } else {
                    VadResult.SpeechContinuing(prob)
                }
            }
            else -> {
                VadResult.Silence(prob)
            }
        }
        return result
    }

    /** Finalizes whatever speech is buffered (e.g. at end of capture). */
    fun forceFinalize() {
        finalizeSegmentIfAny()
    }

    fun isInSpeech(): Boolean = inSpeech

    fun getSpeechLength(): Int = speechLen

    /** Drops any buffered speech without delivering it. */
    fun clearState() {
        resetSegmentState()
    }

    /**
     * Grows [buf] to hold at least [needed] samples (doubling, min 1024,
     * capped at [maxSpeechSamples]), preserving the first [usedLen] samples.
     * Returns [buf] unchanged when it is already large enough or already at
     * the cap — avoiding the same-size realloc+copy the capped path would
     * otherwise perform on every chunk once the buffer is full.
     */
    private fun ensureCapacity(buf: FloatArray, usedLen: Int, needed: Int): FloatArray {
        if (buf.size >= needed) return buf
        var newCap = maxOf(needed, maxOf(1024, buf.size * 2))
        if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
        if (newCap <= buf.size) return buf
        val grown = FloatArray(newCap)
        if (usedLen > 0) System.arraycopy(buf, 0, grown, 0, usedLen)
        return grown
    }

    /**
     * Appends the chunk pair to the raw/processed buffers. Data beyond
     * [maxSpeechSamples] in either buffer is silently truncated.
     */
    private fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) {
        speechBuf = ensureCapacity(speechBuf, speechLen, speechLen + chunk.size)
        val copyN = minOf(chunk.size, max(0, maxSpeechSamples - speechLen))
        if (copyN > 0) {
            System.arraycopy(chunk, 0, speechBuf, speechLen, copyN)
            speechLen += copyN
        }
        processedSpeechBuf = ensureCapacity(processedSpeechBuf, processedSpeechLen, processedSpeechLen + processedChunk.size)
        val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen))
        if (processedCopyN > 0) {
            System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN)
            processedSpeechLen += processedCopyN
        }
    }

    /** Returns the state machine to idle and forgets buffered audio. */
    private fun resetSegmentState() {
        inSpeech = false
        silenceSamples = 0
        speechLen = 0
        processedSpeechLen = 0
    }

    /**
     * Emits the buffered segment through the callback, unless it is shorter
     * than [minSpeechSamples] or the callback asks to skip it — in which case
     * it is discarded. Always leaves the machine idle.
     */
    private fun finalizeSegmentIfAny() {
        Log.d(TAG, "finalizeSegmentIfAny called: speechLen=$speechLen, minSpeechSamples=$minSpeechSamples")
        if (speechLen < minSpeechSamples) {
            Log.d(TAG, "finalizeSegmentIfAny: speech too short, discarding")
            resetSegmentState()
            return
        }
        if (callback?.shouldSkipProcessing() == true) {
            Log.d(TAG, "finalizeSegmentIfAny: skipping due to callback")
            resetSegmentState()
            return
        }
        // Snapshot before resetting so the callback sees a stable copy.
        val originalSeg = speechBuf.copyOf(speechLen)
        val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen)
        resetSegmentState()
        Log.d(TAG, "Sending audio segment to callback, size: ${processedSeg.size}")
        callback?.onSpeechSegmentReady(originalSeg, processedSeg)
    }

    /** One state-machine transition per processed chunk. */
    sealed class VadResult(val probability: Float) {
        class SpeechStarted(prob: Float) : VadResult(prob)
        class SpeechEnded(prob: Float) : VadResult(prob)
        class SpeechContinuing(prob: Float) : VadResult(prob)
        class MaxSpeechReached(prob: Float) : VadResult(prob)
        class Silence(prob: Float) : VadResult(prob)
    }

    companion object {
        private const val TAG = "VadManager"
    }
}