live2d model
This commit is contained in:
@@ -1,957 +0,0 @@
|
||||
package com.digitalperson
|
||||
|
||||
import android.Manifest
|
||||
import android.content.pm.PackageManager
|
||||
import android.media.AudioAttributes
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioManager
|
||||
import android.media.AudioRecord
|
||||
import android.media.AudioTrack
|
||||
import android.media.MediaRecorder
|
||||
import android.media.audiofx.AcousticEchoCanceler
|
||||
import android.media.audiofx.NoiseSuppressor
|
||||
import android.os.Bundle
|
||||
import android.os.SystemClock
|
||||
import android.text.method.ScrollingMovementMethod
|
||||
import android.util.Log
|
||||
import android.widget.Button
|
||||
import android.widget.TextView
|
||||
import android.widget.Toast
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.digitalperson.cloud.CloudApiManager
|
||||
import com.digitalperson.player.VideoPlayerManager
|
||||
import com.google.android.exoplayer2.ui.PlayerView
|
||||
import com.digitalperson.engine.SenseVoiceEngineRKNN
|
||||
import com.digitalperson.metrics.TraceManager
|
||||
import com.digitalperson.metrics.TraceSession
|
||||
import com.k2fsa.sherpa.onnx.OfflineTts
|
||||
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
|
||||
import com.k2fsa.sherpa.onnx.Vad
|
||||
import com.k2fsa.sherpa.onnx.VadModelConfig
|
||||
import com.k2fsa.sherpa.onnx.getOfflineTtsConfig
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.Job
|
||||
import kotlinx.coroutines.SupervisorJob
|
||||
import kotlinx.coroutines.cancel
|
||||
import kotlinx.coroutines.channels.Channel
|
||||
import kotlinx.coroutines.isActive
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
import java.io.FileOutputStream
|
||||
import java.util.concurrent.LinkedBlockingQueue
|
||||
import java.util.concurrent.atomic.AtomicBoolean
|
||||
import kotlin.math.max
|
||||
|
||||
private const val TAG = "DigitalPerson"
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200

class MainActivity : AppCompatActivity() {

    // --- UI widgets ---
    private lateinit var startButton: Button
    private lateinit var stopButton: Button
    private lateinit var textView: TextView

    // --- Speech pipeline engines ---
    private lateinit var vad: Vad
    private var senseVoice: SenseVoiceEngineRKNN? = null
    private var tts: OfflineTts? = null
    private var track: AudioTrack? = null

    // Echo cancellation / noise suppression effects bound to the mic session.
    private var aec: AcousticEchoCanceler? = null
    private var ns: NoiseSuppressor? = null

    // --- Microphone capture configuration ---
    private var audioRecord: AudioRecord? = null
    private val audioSource = MediaRecorder.AudioSource.MIC
    private val sampleRateInHz = 16000
    private val channelConfig = AudioFormat.CHANNEL_IN_MONO
    private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
    private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)

    @Volatile
    private var isRecording: Boolean = false

    private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
    private var recordingJob: Job? = null
    // Guards every call into the native VAD/ASR code.
    private val nativeLock = Any()

    private lateinit var cloudApiManager: CloudApiManager
    private var videoPlayerManager: VideoPlayerManager? = null
    private val segmenter = StreamingTextSegmenter(
        maxLen = 30,
        maxWaitMs = 600,
    )

    // Work items for the TTS playback queue: text segments, then an End marker.
    private sealed class TtsQueueItem {
        data class Segment(val text: String) : TtsQueueItem()
        data object End : TtsQueueItem()
    }

    // --- TTS worker state ---
    private val ttsQueue = LinkedBlockingQueue<TtsQueueItem>()
    private val ttsStopped = AtomicBoolean(false)
    private val ttsWorkerRunning = AtomicBoolean(false)
    private val ttsPlaying = AtomicBoolean(false)
    @Volatile private var ttsTotalSamplesWritten: Long = 0

    // Per-turn latency trace.
    private var currentTrace: TraceSession? = null

    private var lastUiText: String = ""
    @Volatile private var llmInFlight: Boolean = false
    private var enableStreaming = true // streaming LLM output is on by default

    // --- ASR queue and worker state ---
    private val asrQueue = Channel<Pair<FloatArray, TraceSession?>>()
    private val asrWorkerRunning = AtomicBoolean(false)
|
||||
|
||||
override fun onRequestPermissionsResult(
    requestCode: Int,
    permissions: Array<String>,
    grantResults: IntArray
) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults)
    // The app cannot work without the microphone: leave when permission is denied.
    val granted = requestCode == REQUEST_RECORD_AUDIO_PERMISSION &&
        grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED
    if (!granted) {
        Log.e(TAG, "Audio record is disallowed")
        finish()
    }
}
|
||||
|
||||
override fun onCreate(savedInstanceState: Bundle?) {
    super.onCreate(savedInstanceState)
    setContentView(R.layout.activity_main)

    // Set up the dual-player manager (two stacked PlayerViews: silent + speaking).
    try {
        val silentPv = findViewById<PlayerView>(R.id.player_view_silent)
        val speakingPv = findViewById<PlayerView>(R.id.player_view_speaking)
        videoPlayerManager = VideoPlayerManager(this, silentPv, speakingPv)
        // The avatar starts out not speaking.
        videoPlayerManager?.setSpeaking(false)
    } catch (e: Exception) {
        Log.w(TAG, "PlayerViews not found or init failed: ${e.message}")
    }

    ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)

    startButton = findViewById(R.id.start_button)
    stopButton = findViewById(R.id.stop_button)
    textView = findViewById(R.id.my_text)
    textView.movementMethod = ScrollingMovementMethod()

    startButton.setOnClickListener { onStartClicked() }
    stopButton.setOnClickListener { onStopClicked(userInitiated = true) }

    // Wire up the streaming-output toggle, if present in the layout.
    try {
        val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
        streamingSwitch.isChecked = enableStreaming
        streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
            enableStreaming = isChecked
            cloudApiManager.setEnableStreaming(isChecked)
            Toast.makeText(this, "流式输出已${if (isChecked) "启用" else "禁用"}", Toast.LENGTH_SHORT).show()
        }
    } catch (e: Exception) {
        Log.w(TAG, "Streaming switch not found in layout: ${e.message}")
    }

    // Initialize the models and the AudioTrack off the UI thread to avoid ANR.
    startButton.isEnabled = false
    stopButton.isEnabled = false
    textView.text = "初始化中…"
    ioScope.launch {
        try {
            Log.i(TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)")
            synchronized(nativeLock) {
                initVadModel()
                initSenseVoiceModel()
            }
            withContext(Dispatchers.Main) {
                initTtsAndAudioTrack()
                textView.text = getString(R.string.hint)
                startButton.isEnabled = true
                stopButton.isEnabled = false
            }
        } catch (t: Throwable) {
            Log.e(TAG, "Initialization failed: ${t.message}", t)
            withContext(Dispatchers.Main) {
                textView.text = "初始化失败:${t.javaClass.simpleName}: ${t.message}"
                Toast.makeText(
                    this@MainActivity,
                    "初始化失败(请看 Logcat): ${t.javaClass.simpleName}",
                    Toast.LENGTH_LONG
                ).show()
                startButton.isEnabled = false
                stopButton.isEnabled = false
            }
        }
    }

    cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener {
        private var llmFirstChunkMarked = false

        override fun onLLMResponseReceived(response: String) {
            currentTrace?.markLlmDone()
            llmInFlight = false

            if (enableStreaming) {
                // Streaming mode: flush whatever is left in the segmenter buffer.
                for (seg in segmenter.flush()) {
                    enqueueTtsSegment(seg)
                }
                // Signal the end of this turn's TTS queue.
                ttsQueue.offer(TtsQueueItem.End)
            } else {
                runOnUiThread {
                    appendToUi("${response}\n")
                }
                // Non-streaming mode: synthesize the whole response in one go.
                enqueueTtsSegment(response)
                // Signal the end of this turn's TTS queue.
                ttsQueue.offer(TtsQueueItem.End)
            }
        }

        override fun onLLMStreamingChunkReceived(chunk: String) {
            // Streaming chunks are only handled when streaming mode is on.
            if (!enableStreaming) return

            if (!llmFirstChunkMarked) {
                llmFirstChunkMarked = true
                currentTrace?.markLlmFirstChunk()
            }
            appendToUi(chunk)

            // Cut the incoming stream into TTS-sized segments as text arrives.
            for (seg in segmenter.processChunk(chunk)) {
                enqueueTtsSegment(seg)
            }
        }

        override fun onTTSAudioReceived(audioFilePath: String) {
            // unused
        }

        override fun onError(errorMessage: String) {
            llmInFlight = false
            Toast.makeText(this@MainActivity, errorMessage, Toast.LENGTH_LONG).show()
            onStopClicked(userInitiated = false)
        }
    }, applicationContext)

    // Apply the initial streaming-output mode.
    cloudApiManager.setEnableStreaming(enableStreaming)
}
|
||||
|
||||
override fun onDestroy() {
    super.onDestroy()
    onStopClicked(userInitiated = false)
    ioScope.cancel()
    // Native resources must be released under the same lock that guards their use.
    synchronized(nativeLock) {
        runCatching { vad.release() }
        runCatching { senseVoice?.deinitialize() }
    }
    runCatching { tts?.release() }
    runCatching { videoPlayerManager?.release() }
}
|
||||
|
||||
/**
 * Begins a capture turn: opens the mic, resets per-turn TTS/trace/UI state,
 * and launches the VAD capture loop plus the ASR consumer.
 */
private fun onStartClicked() {
    if (isRecording) return

    if (!initMicrophone()) {
        Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show()
        return
    }

    // Start a new trace turn
    currentTrace = TraceManager.getInstance().startNewTurn()
    currentTrace?.mark("turn_start")
    llmInFlight = false

    lastUiText = ""
    textView.text = ""

    ttsStopped.set(false)
    ttsPlaying.set(false)
    ttsTotalSamplesWritten = 0
    ttsQueue.clear()
    segmenter.reset()

    vad.reset()
    // Avoid `!!`: initMicrophone() just succeeded, but guard against a race with
    // onStopClicked() nulling the field instead of crashing with an NPE.
    val record = audioRecord
    if (record == null) {
        Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show()
        return
    }
    record.startRecording()
    isRecording = true

    startButton.isEnabled = false
    stopButton.isEnabled = true

    // Make sure the ASR consumer coroutine is running. asrQueue is a rendezvous
    // Channel, so without a receiver the capture loop would suspend forever on
    // asrQueue.send() the first time a speech segment is finalized.
    ensureAsrWorker()

    recordingJob?.cancel()
    recordingJob = ioScope.launch {
        processSamplesLoop()
    }
}
|
||||
|
||||
/**
 * Tears down capture and playback. When [userInitiated] is true the current
 * trace turn is also closed.
 */
private fun onStopClicked(userInitiated: Boolean) {
    isRecording = false
    runCatching { audioRecord?.stop() }
    runCatching { audioRecord?.release() }
    audioRecord = null

    recordingJob?.cancel()
    recordingJob = null

    ttsStopped.set(true)
    ttsPlaying.set(false)
    ttsTotalSamplesWritten = 0
    ttsQueue.clear()
    // wake worker if waiting
    ttsQueue.offer(TtsQueueItem.End)

    runCatching {
        track?.pause()
        track?.flush()
    }
    runCatching { aec?.release() }
    runCatching { ns?.release() }
    aec = null
    ns = null
    startButton.isEnabled = true
    stopButton.isEnabled = false

    if (userInitiated) {
        TraceManager.getInstance().endTurn()
        currentTrace = null
    }
}
|
||||
|
||||
/** Loads the Silero VAD model shipped under assets/vad_model/. */
private fun initVadModel() {
    val sileroConfig = SileroVadModelConfig(
        model = "vad_model/silero_vad.onnx",
        threshold = 0.5F,
        minSilenceDuration = 0.25F,
        minSpeechDuration = 0.25F,
        windowSize = 512,
    )
    vad = Vad(
        assetManager = application.assets,
        config = VadModelConfig(
            sileroVadModelConfig = sileroConfig,
            sampleRate = sampleRateInHz,
            numThreads = 1,
            provider = "cpu",
        ),
    )
}
|
||||
|
||||
/**
 * Initializes the SenseVoice RKNN ASR engine ("scheme A"). Model files are
 * copied out of assets onto the real filesystem first, then loaded natively.
 * Throws IllegalStateException when native libraries or the model fail to load.
 */
private fun initSenseVoiceModel() {
    Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")
    // Copy assets/sensevoice_models/* -> filesDir/sensevoice_models/*
    val modelDir = copySenseVoiceAssetsToInternal()
    val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath
    val embeddingPath = File(modelDir, "embedding.npy").absolutePath
    val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath

    // Quick diagnostics: native library dir contents + model file presence.
    try {
        val libDir = applicationInfo.nativeLibraryDir
        Log.i(TAG, "nativeLibraryDir=$libDir")
        try {
            val names = File(libDir).list()?.joinToString(", ") ?: "(empty)"
            Log.i(TAG, "nativeLibraryDir files: $names")
        } catch (t: Throwable) {
            Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}")
        }
    } catch (_: Throwable) {
    }
    Log.i(TAG, "SenseVoice model paths:")
    Log.i(TAG, " model=$modelPath exists=${File(modelPath).exists()} size=${File(modelPath).length()}")
    Log.i(TAG, " embedding=$embeddingPath exists=${File(embeddingPath).exists()} size=${File(embeddingPath).length()}")
    Log.i(TAG, " bpe=$bpePath exists=${File(bpePath).exists()} size=${File(bpePath).length()}")

    val startedAt = SystemClock.elapsedRealtime()
    val engine = try {
        SenseVoiceEngineRKNN(this)
    } catch (e: UnsatisfiedLinkError) {
        // Most common: libsensevoiceEngine.so not packaged/built, or dependent libs missing
        throw IllegalStateException("Load native libraries failed: ${e.message}", e)
    }

    val loaded = try {
        engine.loadModelDirectly(modelPath, embeddingPath, bpePath)
    } catch (t: Throwable) {
        throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t)
    }

    val elapsed = SystemClock.elapsedRealtime() - startedAt
    Log.i(TAG, "SenseVoice loadModelDirectly ok=$loaded costMs=$elapsed")
    if (!loaded) throw IllegalStateException("SenseVoiceEngineRKNN loadModelDirectly returned false")

    senseVoice = engine
}
|
||||
|
||||
/**
 * Loads the sherpa-onnx VITS Chinese TTS model from assets, then creates a
 * streaming-mode float-PCM AudioTrack matched to the model's sample rate.
 * On TTS failure, `tts` stays null and a Toast explains the required files.
 */
private fun initTtsAndAudioTrack() {
    try {
        // Expected asset layout:
        // assets/tts_model/sherpa-onnx-vits-zh-ll/{model.onnx,tokens.txt,lexicon.txt,...}
        val modelDir = "tts_model/sherpa-onnx-vits-zh-ll"
        val modelName = "model.onnx"
        val lexicon = "lexicon.txt"
        val dataDir = ""

        val ttsConfig = getOfflineTtsConfig(
            modelDir = modelDir,
            modelName = modelName,
            acousticModelName = "",
            vocoder = "",
            voices = "",
            lexicon = lexicon,
            dataDir = dataDir,
            dictDir = "",
            // Chinese text-normalization rules (these fst files ship in the model dir).
            ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst,$modelDir/new_heteronym.fst",
            ruleFars = "",
            numThreads = null,
            isKitten = false
        )
        tts = OfflineTts(assetManager = application.assets, config = ttsConfig)
    } catch (t: Throwable) {
        Log.e(TAG, "Init TTS failed: ${t.message}", t)
        tts = null
        runOnUiThread {
            Toast.makeText(
                this,
                "TTS 初始化失败:请确认 assets/tts_model/sherpa-onnx-vits-zh-ll/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst",
                Toast.LENGTH_LONG
            ).show()
        }
    }

    val t = tts ?: return
    val sr = t.sampleRate()
    // getMinBufferSize() returns a negative error code on failure; passing that
    // straight into the AudioTrack constructor would throw. Fall back to one
    // second of mono float audio instead.
    val minBuf = AudioTrack.getMinBufferSize(
        sr,
        AudioFormat.CHANNEL_OUT_MONO,
        AudioFormat.ENCODING_PCM_FLOAT
    )
    val bufLength = if (minBuf > 0) minBuf else sr * Float.SIZE_BYTES
    val attr = AudioAttributes.Builder()
        .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
        .setUsage(AudioAttributes.USAGE_MEDIA)
        .build()
    val format = AudioFormat.Builder()
        .setEncoding(AudioFormat.ENCODING_PCM_FLOAT)
        .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
        .setSampleRate(sr)
        .build()
    track = AudioTrack(
        attr,
        format,
        bufLength,
        AudioTrack.MODE_STREAM,
        AudioManager.AUDIO_SESSION_ID_GENERATE
    )
    track?.play()
}
|
||||
|
||||
/** Returns true when an asset at [path] can be opened (i.e. it exists). */
private fun assetExists(path: String): Boolean =
    runCatching { application.assets.open(path).close() }.isSuccess
|
||||
|
||||
/**
 * Copies the SenseVoice model files from assets into filesDir/sensevoice_models
 * (skipping files already present and non-empty) and returns that directory.
 */
private fun copySenseVoiceAssetsToInternal(): File {
    val outDir = File(filesDir, "sensevoice_models")
    if (!outDir.exists()) outDir.mkdirs()

    val files = arrayOf(
        "am.mvn",
        "chn_jpn_yue_eng_ko_spectok.bpe.model",
        "embedding.npy",
        "sense-voice-encoder.rknn"
    )

    for (name in files) {
        val outFile = File(outDir, name)
        if (outFile.exists() && outFile.length() > 0) continue
        // Copy into a temp file first, then rename: an interrupted copy must not
        // leave a truncated file that the exists()/length() check above would
        // accept forever on subsequent launches.
        val tmpFile = File(outDir, "$name.tmp")
        application.assets.open("sensevoice_models/$name").use { input ->
            FileOutputStream(tmpFile).use { output ->
                input.copyTo(output)
            }
        }
        if (!tmpFile.renameTo(outFile)) {
            tmpFile.delete()
            throw IllegalStateException("Failed to move $tmpFile to $outFile")
        }
    }
    return outDir
}
|
||||
|
||||
/**
 * Opens the 16 kHz mono PCM16 AudioRecord and attaches AEC/NS effects when the
 * device supports them. Returns false when permission is missing or the
 * recorder cannot be created.
 */
private fun initMicrophone(): Boolean {
    if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO)
        != PackageManager.PERMISSION_GRANTED
    ) {
        ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
        return false
    }

    // getMinBufferSize() returns a negative error code on failure; do not pass
    // that into the AudioRecord constructor.
    val minBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
    if (minBytes <= 0) {
        Log.e(TAG, "AudioRecord.getMinBufferSize failed: $minBytes")
        return false
    }
    val record = AudioRecord(
        audioSource,
        sampleRateInHz,
        channelConfig,
        audioFormat,
        minBytes * 2
    )
    // The constructor does not throw on failure; the instance may be unusable.
    if (record.state != AudioRecord.STATE_INITIALIZED) {
        Log.e(TAG, "AudioRecord failed to initialize")
        record.release()
        return false
    }
    audioRecord = record

    val sessionId = record.audioSessionId
    if (sessionId != 0) {
        if (AcousticEchoCanceler.isAvailable()) {
            aec = AcousticEchoCanceler.create(sessionId)?.apply {
                enabled = true
            }
            Log.i(TAG, "AEC enabled=${aec?.enabled}")
        } else {
            Log.w(TAG, "AEC not available on this device")
        }

        if (NoiseSuppressor.isAvailable()) {
            ns = NoiseSuppressor.create(sessionId)?.apply {
                enabled = true
            }
            Log.i(TAG, "NS enabled=${ns?.enabled}")
        } else {
            Log.w(TAG, "NS not available on this device")
        }
    }
    return true
}
|
||||
|
||||
/**
 * Capture loop: reads mic windows, normalizes gain, runs the Silero VAD, and
 * segments speech with a hysteresis state machine. Finished segments are sent
 * to the ASR queue; per-window telemetry is dumped to CSV when the loop exits.
 */
private suspend fun processSamplesLoop() {
    // Avoid calling vad.front()/vad.pop() (native queue APIs) since it crashes on some builds.
    // Use vad.compute() and implement a simple VAD segmenter in Kotlin instead.
    val windowSize = 512
    val buffer = ShortArray(windowSize)
    // Hysteresis thresholds: enter speech above startThreshold, leave below endThreshold.
    val startThreshold = 0.2f
    val endThreshold = 0.15f
    val minSilenceSamples = (0.5f * sampleRateInHz).toInt()
    val minSpeechSamples = (0.1f * sampleRateInHz).toInt()
    val maxSpeechSamples = (5.0f * sampleRateInHz).toInt()

    // Per-window VAD telemetry for offline analysis.
    val vadProbabilities = mutableListOf<Float>()
    val vadTimestamps = mutableListOf<Long>()
    val vadRMSValues = mutableListOf<Float>()
    val vadSmoothedRMSValues = mutableListOf<Float>()

    // Exponentially smoothed RMS level.
    var smoothedRms = 0f
    val alpha = 0.8f

    var inSpeech = false
    var silenceSamples = 0

    var speechBuf = FloatArray(0)
    var speechLen = 0
    var processedSpeechBuf = FloatArray(0)
    var processedSpeechLen = 0

    // Appends one window to the growing segment, keeping both the raw and the
    // gain-adjusted audio, each capped at maxSpeechSamples.
    fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) {
        // raw audio
        val needed = speechLen + chunk.size
        if (speechBuf.size < needed) {
            var newCap = maxOf(needed, maxOf(1024, speechBuf.size * 2))
            if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
            val n = FloatArray(newCap)
            if (speechLen > 0) System.arraycopy(speechBuf, 0, n, 0, speechLen)
            speechBuf = n
        }
        val copyN = minOf(chunk.size, max(0, maxSpeechSamples - speechLen))
        if (copyN > 0) {
            System.arraycopy(chunk, 0, speechBuf, speechLen, copyN)
            speechLen += copyN
        }

        // gain-adjusted audio
        val processedNeeded = processedSpeechLen + processedChunk.size
        if (processedSpeechBuf.size < processedNeeded) {
            var newCap = maxOf(processedNeeded, maxOf(1024, processedSpeechBuf.size * 2))
            if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
            val n = FloatArray(newCap)
            if (processedSpeechLen > 0) System.arraycopy(processedSpeechBuf, 0, n, 0, processedSpeechLen)
            processedSpeechBuf = n
        }
        val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen))
        if (processedCopyN > 0) {
            System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN)
            processedSpeechLen += processedCopyN
        }
    }

    // Closes out the current segment (if long enough) and hands it to the ASR queue.
    suspend fun finalizeSegmentIfAny() {
        if (speechLen < minSpeechSamples) {
            speechLen = 0
            processedSpeechLen = 0
            inSpeech = false
            silenceSamples = 0
            return
        }
        // Drop the segment while TTS is playing or an LLM call is in flight (echo guard).
        if (ttsPlaying.get() || llmInFlight) {
            speechLen = 0
            processedSpeechLen = 0
            inSpeech = false
            silenceSamples = 0
            return
        }
        val originalSeg = speechBuf.copyOf(speechLen)
        val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen)
        speechLen = 0
        processedSpeechLen = 0
        inSpeech = false
        silenceSamples = 0

        // Hand the segment to the ASR worker for asynchronous processing.
        // NOTE(review): asrQueue is declared as Channel<Pair<FloatArray, TraceSession?>>
        // but this sends Pair<FloatArray, FloatArray> — confirm the intended element type.
        asrQueue.send(Pair(originalSeg, processedSeg))
    }

    while (isRecording && ioScope.coroutineContext.isActive) {
        val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break
        if (ret <= 0) continue
        if (ret != windowSize) continue
        val chunk = FloatArray(ret) { buffer[it] / 32768.0f }

        // Current window energy (root mean square).
        val rms = calculateRMS(chunk)

        // Exponential smoothing of the level.
        smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms

        // Dynamic gain toward a target RMS of 0.1 (about -20 dB).
        val targetRMS = 0.1f
        var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f

        // Bound the gain to avoid clipping from over-amplification.
        gainFactor = gainFactor.coerceIn(0.1f, 10.0f)

        // Apply the gain, hard-limiting to [-1, 1].
        val processedChunk = FloatArray(chunk.size) {
            val value = chunk[it] * gainFactor
            if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value
        }

        // Run VAD on the gain-adjusted audio.
        val prob = synchronized(nativeLock) { vad.compute(processedChunk) }

        // Record probability, timestamp, raw and smoothed RMS for later analysis.
        vadProbabilities.add(prob)
        vadTimestamps.add(System.currentTimeMillis())
        vadRMSValues.add(rms)
        vadSmoothedRMSValues.add(smoothedRms)

        // Hysteresis state machine.
        if (!inSpeech && prob >= startThreshold) {
            // entering speech
            inSpeech = true
            silenceSamples = 0
            appendSpeech(chunk, processedChunk)
        } else if (inSpeech && prob <= endThreshold) {
            // count silence samples
            silenceSamples += ret
            if (silenceSamples >= minSilenceSamples) {
                // leaving speech
                finalizeSegmentIfAny()
            } else {
                // keep the trailing audio
                appendSpeech(chunk, processedChunk)
            }
        } else if (inSpeech) {
            // in speech: keep accumulating
            appendSpeech(chunk, processedChunk)
            silenceSamples = 0 // reset silence counter

            if (speechLen >= maxSpeechSamples) {
                finalizeSegmentIfAny()
            }
        }
        // not in speech and probability below the start threshold: nothing to do

        // Time-based forced segmentation (keeps first-packet latency bounded
        // when text arrives without punctuation for a long while).
        val forced = segmenter.maybeForceByTime()
        for (seg in forced) enqueueTtsSegment(seg)
    }

    // flush last partial segment
    finalizeSegmentIfAny()

    // Dump the VAD telemetry to a CSV file.
    saveVadData(vadTimestamps, vadProbabilities, vadRMSValues, vadSmoothedRMSValues)
}
|
||||
|
||||
/**
 * Persists per-window VAD probabilities and RMS levels as a CSV file under
 * filesDir/vad_data/ for offline analysis and plotting. Failures are logged
 * and otherwise ignored (telemetry is best-effort).
 */
private fun saveVadData(timestamps: List<Long>, probabilities: List<Float>, rmsValues: List<Float>, smoothedRmsValues: List<Float>) {
    try {
        val vadDataDir = File(filesDir, "vad_data")
        if (!vadDataDir.exists()) {
            vadDataDir.mkdirs()
        }

        // Timestamp in the name keeps each run's file unique.
        val outputFile = File(vadDataDir, "vad_data_${System.currentTimeMillis()}.csv")

        outputFile.bufferedWriter().use { writer ->
            writer.write("timestamp,probability,rms,smoothed_rms\n")
            for (i in timestamps.indices) {
                writer.write("${timestamps[i]},${probabilities[i]},${rmsValues[i]},${smoothedRmsValues[i]}\n")
            }
        }

        Log.d(TAG, "Saved VAD data to: ${outputFile.absolutePath}")
    } catch (e: Exception) {
        Log.e(TAG, "Error saving VAD data: ${e.message}")
    }
}
|
||||
|
||||
/**
 * Strips SenseVoice control tokens like <|zh|>, <|NEUTRAL|>, <|Speech|>,
 * <|woitn|>, drops stray '>'-like characters, and normalizes whitespace.
 */
private fun removeTokens(text: String): String =
    text
        .replace(Regex("<\\|[^>]+\\|>"), "")
        .replace(Regex("[>>≥≫]"), "")
        .trim()
        .replace(Regex("\\s+"), " ")
|
||||
|
||||
/**
 * Queues one text segment for synthesis after stripping trailing punctuation.
 * Segments that are blank after cleaning are dropped instead of being handed
 * to the TTS engine as empty text.
 */
private fun enqueueTtsSegment(seg: String) {
    // Strip trailing punctuation (both ASCII and fullwidth forms).
    val cleanedSeg = seg.trimEnd('.', '。', '!', '!', '?', '?', ',', ',', ';', ';', ':', ':')

    // A punctuation-only segment becomes blank here; enqueueing it would make
    // the worker synthesize empty text for no benefit.
    if (cleanedSeg.isBlank()) return

    currentTrace?.markTtsRequestEnqueued()
    ttsQueue.offer(TtsQueueItem.Segment(cleanedSeg))
    ensureTtsWorker()
}
|
||||
|
||||
/** Launches the TTS worker coroutine unless one is already running (single-flight). */
private fun ensureTtsWorker() {
    val acquired = ttsWorkerRunning.compareAndSet(false, true)
    if (!acquired) return
    ioScope.launch {
        try {
            runTtsWorker()
        } finally {
            ttsWorkerRunning.set(false)
        }
    }
}
|
||||
|
||||
/** Launches the ASR worker coroutine unless one is already running (single-flight). */
private fun ensureAsrWorker() {
    val acquired = asrWorkerRunning.compareAndSet(false, true)
    if (!acquired) return
    ioScope.launch {
        try {
            runAsrWorker()
        } finally {
            asrWorkerRunning.set(false)
        }
    }
}
|
||||
|
||||
/**
 * TTS consumer loop: synthesizes each queued Segment and streams its PCM into
 * the shared AudioTrack; an End item drains playback, resets per-turn state,
 * and closes the trace turn.
 */
private fun runTtsWorker() {
    val engine = tts ?: return
    val audioTrack = track ?: return

    var firstAudioMarked = false
    var isFirstSegment = true
    while (true) {
        val item = ttsQueue.take()
        if (ttsStopped.get()) break

        when (item) {
            is TtsQueueItem.Segment -> {
                ttsPlaying.set(true)
                runOnUiThread { videoPlayerManager?.setSpeaking(true) }
                val trace = currentTrace
                trace?.markTtsSynthesisStart()
                Log.d(TAG, "TTS started: processing segment '${item.text}'")
                runOnUiThread {
                    appendToUi("\n[TTS] 开始合成...\n")
                }

                val startMs = System.currentTimeMillis()
                var firstPcmMarked = false

                // Reset the track once per turn, right before the first segment plays.
                if (isFirstSegment) {
                    try {
                        audioTrack.pause()
                        audioTrack.flush()
                        audioTrack.play()
                    } catch (_: Throwable) {
                    }
                    isFirstSegment = false
                }

                engine.generateWithCallback(
                    text = item.text,
                    sid = 2, // speaker id — change to pick a different voice
                    speed = 1.0f
                ) { samples ->
                    // Returning 0 tells the engine to stop generating.
                    if (ttsStopped.get()) return@generateWithCallback 0
                    if (!firstPcmMarked && samples.isNotEmpty()) {
                        firstPcmMarked = true
                        trace?.markTtsFirstPcmReady()
                    }
                    if (!firstAudioMarked && samples.isNotEmpty()) {
                        firstAudioMarked = true
                        trace?.markTtsFirstAudioPlay()
                    }
                    audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING)
                    ttsTotalSamplesWritten += samples.size
                    1
                }

                val ttsMs = System.currentTimeMillis() - startMs
                trace?.addDuration("tts_segment_ms_total", ttsMs)
            }

            TtsQueueItem.End -> {
                // Drain the ASR queue: anything captured during playback is
                // likely echo of our own voice.
                while (asrQueue.tryReceive().isSuccess) { }

                waitForPlaybackComplete(audioTrack)
                val ttsCompleteTime = System.currentTimeMillis()

                // UI updates on the main thread.
                runOnUiThread {
                    appendToUi("\n[LOG] TTS completed at: ${ttsCompleteTime}\n")
                }

                ttsPlaying.set(false)
                runOnUiThread { videoPlayerManager?.setSpeaking(false) }
                ttsTotalSamplesWritten = 0
                isFirstSegment = true
                currentTrace?.markTtsDone()
                TraceManager.getInstance().endTurn()
                currentTrace = null
                break
            }
        }
    }
}
|
||||
|
||||
/**
 * Blocks until the AudioTrack's playback head reaches everything written this
 * turn, or until a timeout derived from the audio duration expires, then waits
 * a fixed extra second so buffered audio fully drains.
 */
private fun waitForPlaybackComplete(audioTrack: AudioTrack) {
    val expectedSamples = ttsTotalSamplesWritten
    if (expectedSamples <= 0) return

    val deadlineMs = (expectedSamples * 1000 / audioTrack.sampleRate) + 2000
    val begunAt = System.currentTimeMillis()

    while (!ttsStopped.get()) {
        val headPos = audioTrack.playbackHeadPosition.toLong()
        if (headPos >= expectedSamples) break

        if (System.currentTimeMillis() - begunAt > deadlineMs) {
            Log.w(TAG, "waitForPlaybackComplete timeout, pos=$headPos, total=$expectedSamples")
            break
        }

        Thread.sleep(20)
    }
    // Fixed extra wait so any remaining buffered audio drains completely.
    Thread.sleep(1000)
}
|
||||
|
||||
/**
 * ASR consumer loop: receives speech segments from asrQueue, runs SenseVoice
 * recognition, filters out junk results, and forwards accepted text to the LLM.
 */
private suspend fun runAsrWorker() {
    while (ioScope.coroutineContext.isActive) {
        val (seg, trace) = try {
            asrQueue.receive()
        } catch (_: Throwable) {
            break
        }

        // Only one LLM request in flight at a time, to avoid pile-ups and races.
        // Skip ASR entirely while TTS is playing so we never transcribe its audio.
        if (llmInFlight || ttsPlaying.get()) continue

        trace?.markASRStart()
        Log.d(TAG, "ASR started: processing audio segment")
        withContext(Dispatchers.Main) {
            appendToUi("\n[ASR] 开始识别...\n")
        }
        val raw = synchronized(nativeLock) {
            val engine = senseVoice
            if (engine == null || !engine.isInitialized) "" else engine.transcribeBuffer(seg)
        }
        val text = removeTokens(raw)

        // Result filtering.
        if (text.isBlank()) continue
        // Filter out a lone English character 'i'/'I'.
        if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
            Log.d(TAG, "ASR segment skipped: single 'i'")
            continue
        }
        // Filter out results longer than 50 characters.
        if (text.length > 50) {
            Log.d(TAG, "ASR segment skipped: too long (${text.length} chars)")
            continue
        }

        trace?.markASREnd()

        withContext(Dispatchers.Main) {
            appendToUi("\n\n[ASR] ${text}\n")
        }

        trace?.markRecordingDone()
        trace?.markLlmResponseReceived()

        if (BuildConfig.LLM_API_KEY.isBlank()) {
            withContext(Dispatchers.Main) {
                Toast.makeText(
                    this@MainActivity,
                    "未配置 LLM_API_KEY(在 local.properties 或 gradle.properties 里设置)",
                    Toast.LENGTH_LONG
                ).show()
            }
            continue
        }

        llmInFlight = true
        cloudApiManager.callLLM(text)
    }
}
|
||||
|
||||
/**
 * Appends [s] to the accumulated transcript and re-renders the TextView.
 * Callers invoke this from both the UI thread and background threads (e.g.
 * streaming LLM chunks arrive off the main thread), so the view update is
 * always posted via runOnUiThread — which runs inline when already on it.
 */
private fun appendToUi(s: String) {
    runOnUiThread {
        lastUiText += s
        textView.text = lastUiText
    }
}
|
||||
}
|
||||
20
app/src/main/java/com/digitalperson/EntryActivity.kt
Normal file
20
app/src/main/java/com/digitalperson/EntryActivity.kt
Normal file
@@ -0,0 +1,20 @@
|
||||
package com.digitalperson
|
||||
|
||||
import android.content.Intent
|
||||
import android.os.Bundle
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import com.digitalperson.config.AppConfig
|
||||
|
||||
/**
 * Trampoline activity: immediately routes to the avatar UI chosen at build
 * time (Live2D vs. the video-based MainActivity) and removes itself from the
 * back stack.
 */
class EntryActivity : AppCompatActivity() {
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        // Compile-time switch; see AppConfig.Avatar.USE_LIVE2D.
        val target =
            if (AppConfig.Avatar.USE_LIVE2D) Live2DChatActivity::class.java
            else MainActivity::class.java
        startActivity(Intent(this, target))
        finish() // do not keep the trampoline on the back stack
    }
}
|
||||
418
app/src/main/java/com/digitalperson/Live2DChatActivity.kt
Normal file
418
app/src/main/java/com/digitalperson/Live2DChatActivity.kt
Normal file
@@ -0,0 +1,418 @@
|
||||
package com.digitalperson
|
||||
|
||||
import android.Manifest
|
||||
import android.content.pm.PackageManager
|
||||
import android.os.Bundle
|
||||
import android.util.Log
|
||||
import android.widget.Toast
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.digitalperson.cloud.CloudApiManager
|
||||
import com.digitalperson.audio.AudioProcessor
|
||||
import com.digitalperson.vad.VadManager
|
||||
import com.digitalperson.asr.AsrManager
|
||||
import com.digitalperson.tts.TtsManager
|
||||
import com.digitalperson.ui.Live2DUiManager
|
||||
import com.digitalperson.config.AppConfig
|
||||
import com.digitalperson.metrics.TraceManager
|
||||
import com.digitalperson.metrics.TraceSession
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.Job
|
||||
import kotlinx.coroutines.SupervisorJob
|
||||
import kotlinx.coroutines.cancel
|
||||
import kotlinx.coroutines.isActive
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.withContext
|
||||
|
||||
/**
 * Chat screen driven by a Live2D avatar.
 *
 * Pipeline: microphone capture -> VAD segmentation -> ASR (SenseVoice RKNN)
 * -> cloud LLM -> streaming text segmentation -> local TTS playback, with the
 * avatar's speaking state toggled while TTS audio is playing.
 *
 * Threading: model init, the capture loop and the ASR worker run on [ioScope]
 * (Dispatchers.IO); UI mutation goes through [uiManager] on the main thread.
 * [isRecording] and [llmInFlight] are @Volatile because they are read and
 * written across those threads.
 */
class Live2DChatActivity : AppCompatActivity() {

    // Collaborators; created in onCreate, released in onDestroy.
    private lateinit var uiManager: Live2DUiManager
    private lateinit var vadManager: VadManager
    private lateinit var asrManager: AsrManager
    private lateinit var ttsManager: TtsManager
    private lateinit var audioProcessor: AudioProcessor

    private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)

    // True while the capture loop should keep reading microphone audio.
    @Volatile
    private var isRecording: Boolean = false

    // SupervisorJob: a failure in one background child must not cancel the rest.
    private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
    private var recordingJob: Job? = null
    // Serializes init/release of the native VAD/ASR engines.
    private val nativeLock = Any()

    private lateinit var cloudApiManager: CloudApiManager
    // Splits (streaming) LLM text into TTS-sized segments.
    private val segmenter = StreamingTextSegmenter(
        maxLen = AppConfig.Tts.MAX_LEN,
        maxWaitMs = AppConfig.Tts.MAX_WAIT_MS
    )

    // Latency trace for the current conversational turn; null between turns.
    private var currentTrace: TraceSession? = null
    // At most one LLM request in flight at a time (prevents pile-up / races).
    @Volatile private var llmInFlight: Boolean = false
    private var enableStreaming = false

    override fun onRequestPermissionsResult(
        requestCode: Int,
        permissions: Array<String>,
        grantResults: IntArray
    ) {
        super.onRequestPermissionsResult(requestCode, permissions, grantResults)
        val ok = requestCode == AppConfig.REQUEST_RECORD_AUDIO_PERMISSION &&
            grantResults.isNotEmpty() &&
            grantResults[0] == PackageManager.PERMISSION_GRANTED
        if (!ok) {
            // Without RECORD_AUDIO this screen is useless; bail out.
            Log.e(AppConfig.TAG, "Audio record is disallowed")
            finish()
        }
    }

    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        setContentView(R.layout.activity_live2d_chat)

        uiManager = Live2DUiManager(this)
        uiManager.initViews(
            textViewId = R.id.my_text,
            scrollViewId = R.id.scroll_view,
            startButtonId = R.id.start_button,
            stopButtonId = R.id.stop_button,
            silentPlayerViewId = 0, // no video player views on the Live2D screen
            speakingPlayerViewId = 0,
            live2dViewId = R.id.live2d_view
        )

        uiManager.setStartButtonListener { onStartClicked() }
        uiManager.setStopButtonListener { onStopClicked(userInitiated = true) }

        ActivityCompat.requestPermissions(this, permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)

        // FIX: create cloudApiManager and ttsManager BEFORE launching the
        // background init coroutine below. The original assigned these
        // lateinit fields after ioScope.launch { ... }, so the IO coroutine
        // (which calls ttsManager.initTtsAndAudioTrack()) and the streaming
        // switch listener could observe them uninitialized and crash with
        // UninitializedPropertyAccessException.
        cloudApiManager = CloudApiManager(createCloudApiListener(), applicationContext)
        cloudApiManager.setEnableStreaming(enableStreaming)

        ttsManager = TtsManager(this)
        ttsManager.setCallback(createTtsCallback())

        try {
            val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
            streamingSwitch.isChecked = enableStreaming
            streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
                enableStreaming = isChecked
                cloudApiManager.setEnableStreaming(isChecked)
                uiManager.showToast("流式输出已${if (isChecked) "启用" else "禁用"}")
            }
        } catch (e: Exception) {
            // The switch is optional in the layout; keep the default when absent.
            Log.w(AppConfig.TAG, "Streaming switch not found in layout: ${e.message}")
        }

        uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
        uiManager.setText("初始化中…")

        audioProcessor = AudioProcessor(this)

        asrManager = AsrManager(this)
        asrManager.setAudioProcessor(audioProcessor)
        asrManager.setCallback(createAsrCallback())

        vadManager = VadManager(this)
        vadManager.setCallback(createVadCallback())

        // Heavy model loading happens off the main thread; buttons stay
        // disabled until every engine is ready (or init fails).
        ioScope.launch {
            try {
                Log.i(AppConfig.TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)")
                synchronized(nativeLock) {
                    vadManager.initVadModel()
                    asrManager.initSenseVoiceModel()
                }
                val ttsOk = ttsManager.initTtsAndAudioTrack()
                withContext(Dispatchers.Main) {
                    if (!ttsOk) {
                        uiManager.showToast(
                            "TTS 初始化失败:请确认 assets/${AppConfig.Tts.MODEL_DIR}/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst",
                            Toast.LENGTH_LONG
                        )
                    }
                    uiManager.setText(getString(R.string.hint))
                    uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
                }
            } catch (t: Throwable) {
                Log.e(AppConfig.TAG, "Initialization failed: ${t.message}", t)
                withContext(Dispatchers.Main) {
                    uiManager.setText("初始化失败:${t.javaClass.simpleName}: ${t.message}")
                    uiManager.showToast("初始化失败(请看 Logcat): ${t.javaClass.simpleName}", Toast.LENGTH_LONG)
                    uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
                }
            }
        }

        // The ASR worker lives for the whole activity and drains the segment
        // queue as VAD produces speech segments.
        Log.d(AppConfig.TAG, "Pre-starting ASR worker")
        ioScope.launch {
            asrManager.runAsrWorker()
        }
    }

    /** Hooks ASR lifecycle events into tracing, the UI transcript, and the LLM. */
    private fun createAsrCallback() = object : AsrManager.AsrCallback {
        override fun onAsrStarted() {
            currentTrace?.markASRStart()
            runOnUiThread {
                uiManager.appendToUi("\n[ASR] 开始识别...\n")
            }
        }

        override fun onAsrResult(text: String) {
            currentTrace?.markASREnd()
            runOnUiThread {
                uiManager.appendToUi("\n\n[ASR] ${text}\n")
            }
            currentTrace?.markRecordingDone()
            currentTrace?.markLlmResponseReceived()
        }

        override fun onAsrSkipped(reason: String) {
            Log.d(AppConfig.TAG, "ASR segment skipped: $reason")
        }

        // Do not recognize while we are speaking (TTS playing).
        override fun shouldSkipAsr(): Boolean = ttsManager.isPlaying()

        override fun isLlmInFlight(): Boolean = llmInFlight

        override fun onLlmCalled(text: String) {
            llmInFlight = true
            Log.d(AppConfig.TAG, "Calling LLM with text: $text")
            cloudApiManager.callLLM(text)
        }
    }

    /** Forwards finalized VAD speech segments to the ASR queue. */
    private fun createVadCallback() = object : VadManager.VadCallback {
        override fun onSpeechSegmentReady(originalAudio: FloatArray, processedAudio: FloatArray) {
            Log.d(AppConfig.TAG, "Sending audio segment to ASR queue, size: ${processedAudio.size}")
            asrManager.enqueueAudioSegment(originalAudio, processedAudio)
        }

        // Ignore microphone input while we are speaking or waiting on the LLM.
        override fun shouldSkipProcessing(): Boolean = ttsManager.isPlaying() || llmInFlight
    }

    /** Bridges LLM responses (streaming or whole-message) into UI text and TTS. */
    private fun createCloudApiListener() = object : CloudApiManager.CloudApiListener {
        // Whether the first streamed chunk of the CURRENT request was traced.
        private var llmFirstChunkMarked = false

        override fun onLLMResponseReceived(response: String) {
            currentTrace?.markLlmDone()
            llmInFlight = false
            // FIX: reset per request — the original never cleared this flag,
            // so markLlmFirstChunk() only ever fired on the session's first turn.
            llmFirstChunkMarked = false

            if (enableStreaming) {
                // Chunks were already shown/enqueued as they streamed in; just
                // flush whatever the segmenter is still holding back.
                for (seg in segmenter.flush()) {
                    ttsManager.enqueueSegment(seg)
                }
                ttsManager.enqueueEnd()
            } else {
                runOnUiThread {
                    uiManager.appendToUi("${response}\n")
                }
                ttsManager.enqueueSegment(response)
                ttsManager.enqueueEnd()
            }
        }

        override fun onLLMStreamingChunkReceived(chunk: String) {
            if (enableStreaming) {
                if (!llmFirstChunkMarked) {
                    llmFirstChunkMarked = true
                    currentTrace?.markLlmFirstChunk()
                }
                // FIX: this callback may arrive off the main thread; hop to
                // the UI thread like every other UI update in this class.
                runOnUiThread {
                    uiManager.appendToUi(chunk)
                }

                val segments = segmenter.processChunk(chunk)
                for (seg in segments) {
                    ttsManager.enqueueSegment(seg)
                }
            }
        }

        // Cloud TTS is unused on this screen (local TTS does the synthesis).
        override fun onTTSAudioReceived(audioFilePath: String) {}

        override fun onError(errorMessage: String) {
            llmInFlight = false
            uiManager.showToast(errorMessage, Toast.LENGTH_LONG)
            onStopClicked(userInitiated = false)
        }
    }

    /** Hooks TTS lifecycle events into the UI, the avatar, and the trace. */
    private fun createTtsCallback() = object : TtsManager.TtsCallback {
        override fun onTtsStarted(text: String) {
            runOnUiThread {
                uiManager.appendToUi("\n[TTS] 开始合成...\n")
            }
        }

        override fun onTtsCompleted() {
            runOnUiThread {
                uiManager.appendToUi("\n[LOG] TTS completed at: ${System.currentTimeMillis()}\n")
            }
        }

        override fun onTtsSegmentCompleted(durationMs: Long) {}

        override fun isTtsStopped(): Boolean = !isRecording

        override fun onClearAsrQueue() {
            asrManager.clearQueue()
        }

        // Drives the avatar's lip-sync / speaking animation.
        override fun onSetSpeaking(speaking: Boolean) {
            uiManager.setSpeaking(speaking)
        }

        override fun getCurrentTrace(): TraceSession? = currentTrace

        override fun onTraceMarkTtsRequestEnqueued() {
            currentTrace?.markTtsRequestEnqueued()
        }

        override fun onTraceMarkTtsSynthesisStart() {
            currentTrace?.markTtsSynthesisStart()
        }

        override fun onTraceMarkTtsFirstPcmReady() {
            currentTrace?.markTtsFirstPcmReady()
        }

        override fun onTraceMarkTtsFirstAudioPlay() {
            currentTrace?.markTtsFirstAudioPlay()
        }

        override fun onTraceMarkTtsDone() {
            currentTrace?.markTtsDone()
        }

        override fun onTraceAddDuration(name: String, value: Long) {
            currentTrace?.addDuration(name, value)
        }

        override fun onEndTurn() {
            TraceManager.getInstance().endTurn()
            currentTrace = null
        }
    }

    override fun onDestroy() {
        super.onDestroy()
        onStopClicked(userInitiated = false)
        ioScope.cancel()
        // Native engines are released under the same lock that guarded init.
        synchronized(nativeLock) {
            try { vadManager.release() } catch (_: Throwable) {}
            try { asrManager.release() } catch (_: Throwable) {}
        }
        try { ttsManager.release() } catch (_: Throwable) {}
        try { uiManager.release() } catch (_: Throwable) {}
        try { audioProcessor.release() } catch (_: Throwable) {}
    }

    override fun onResume() {
        super.onResume()
        uiManager.onResume() // resumes the GL surface rendering
    }

    override fun onPause() {
        uiManager.onPause() // pauses GL rendering before the activity pauses
        super.onPause()
    }

    /** Starts a new conversational turn: mic, VAD, trace, and the capture loop. */
    private fun onStartClicked() {
        Log.d(AppConfig.TAG, "onStartClicked called")
        if (isRecording) {
            Log.d(AppConfig.TAG, "Already recording, returning")
            return
        }

        if (!audioProcessor.initMicrophone(permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)) {
            uiManager.showToast("麦克风初始化失败/无权限")
            return
        }

        currentTrace = TraceManager.getInstance().startNewTurn()
        currentTrace?.mark("turn_start")
        llmInFlight = false

        uiManager.clearText()

        ttsManager.reset()
        ttsManager.setCurrentTrace(currentTrace)
        segmenter.reset()

        vadManager.reset()
        audioProcessor.startRecording()
        isRecording = true

        uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = true)

        Log.d(AppConfig.TAG, "Starting processSamplesLoop coroutine")
        recordingJob?.cancel() // defensive: never run two capture loops at once
        recordingJob = ioScope.launch {
            processSamplesLoop()
        }
        Log.d(AppConfig.TAG, "onStartClicked completed")
    }

    /**
     * Stops capture and playback. [userInitiated] distinguishes an explicit
     * stop (which also ends the trace turn) from internal shutdown paths.
     */
    private fun onStopClicked(userInitiated: Boolean) {
        isRecording = false
        audioProcessor.stopRecording()

        recordingJob?.cancel()
        recordingJob = null

        ttsManager.stop()

        uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)

        if (userInitiated) {
            TraceManager.getInstance().endTurn()
            currentTrace = null
        }
    }

    /**
     * Capture loop: reads WINDOW_SIZE-sample chunks from the microphone and
     * feeds them to VAD. While TTS is playing the mic is still drained (so the
     * driver buffer does not back up) but the audio is discarded, preventing
     * the recognizer from hearing our own synthesized speech.
     */
    private suspend fun processSamplesLoop() {
        Log.d(AppConfig.TAG, "processSamplesLoop started")
        val windowSize = AppConfig.WINDOW_SIZE
        val buffer = ShortArray(windowSize)
        var loopCount = 0

        while (isRecording && ioScope.coroutineContext.isActive) {
            loopCount++
            if (loopCount % 100 == 0) {
                Log.d(AppConfig.TAG, "processSamplesLoop running, loopCount=$loopCount, ttsPlaying=${ttsManager.isPlaying()}")
            }

            if (ttsManager.isPlaying()) {
                if (vadManager.isInSpeech()) {
                    Log.d(AppConfig.TAG, "TTS playing, resetting VAD state")
                    vadManager.clearState()
                }
                // FIX: the original had `if (ret <= 0) continue` immediately
                // followed by an unconditional `continue` — dead code. Drain
                // and discard the mic, then loop.
                audioProcessor.readAudio(buffer)
                continue
            }

            val ret = audioProcessor.readAudio(buffer)
            if (ret <= 0) continue
            if (ret != windowSize) continue // only full windows go to VAD

            val chunk = audioProcessor.convertShortToFloat(buffer)
            val processedChunk = audioProcessor.applyGain(chunk)

            val result = vadManager.processAudioChunk(chunk, processedChunk)

            if (vadManager.vadComputeCount % 100 == 0) {
                Log.d(AppConfig.TAG, "VAD result: $result, inSpeech=${vadManager.isInSpeech()}")
            }

            if (loopCount % 1000 == 0) {
                Log.d(AppConfig.TAG, "VAD status: inSpeech=${vadManager.isInSpeech()}, speechLen=${vadManager.getSpeechLength()}")
            }

            // Time-based forced segmentation so a slow LLM stream still speaks.
            val forced = segmenter.maybeForceByTime()
            for (seg in forced) ttsManager.enqueueSegment(seg)
        }

        // Flush any trailing speech once recording stops.
        vadManager.forceFinalize()
    }
}
|
||||
File diff suppressed because it is too large
Load Diff
223
app/src/main/java/com/digitalperson/asr/AsrManager.kt
Normal file
223
app/src/main/java/com/digitalperson/asr/AsrManager.kt
Normal file
@@ -0,0 +1,223 @@
|
||||
package com.digitalperson.asr
|
||||
|
||||
import android.content.Context
|
||||
import android.os.SystemClock
|
||||
import android.util.Log
|
||||
import com.digitalperson.BuildConfig
|
||||
import com.digitalperson.audio.AudioProcessor
|
||||
import com.digitalperson.config.AppConfig
|
||||
import com.digitalperson.engine.SenseVoiceEngineRKNN
|
||||
import com.digitalperson.util.FileHelper
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.channels.Channel
|
||||
import kotlinx.coroutines.currentCoroutineContext
|
||||
import kotlinx.coroutines.isActive
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
|
||||
/**
 * Owns the SenseVoice (RKNN) speech recognizer and a queue-draining worker
 * coroutine that turns VAD speech segments into text and hands the text to
 * the LLM via [AsrCallback.onLlmCalled].
 *
 * Thread-safety: native engine calls are serialized with [nativeLock];
 * segments arrive through an unbounded [Channel] so producers never block.
 */
class AsrManager(private val context: Context) {

    companion object {
        private const val TAG = "AsrManager"

        // FIX (perf, free): hoisted — the original rebuilt these regexes on
        // every removeTokens() call on the hot ASR path.
        private val SPECIAL_TOKEN_REGEX = Regex("<\\|[^>]+\\|>")
        private val ANGLE_NOISE_REGEX = Regex("[>>≥≫]")
        private val WHITESPACE_REGEX = Regex("\\s+")
    }

    private var senseVoice: SenseVoiceEngineRKNN? = null
    private val nativeLock = Any()

    // Unbounded so the audio thread never blocks when handing off segments.
    private val asrQueue = Channel<Pair<FloatArray, FloatArray>>(capacity = Channel.UNLIMITED)

    private var audioProcessor: AudioProcessor? = null

    /** Hooks for the hosting activity: UI updates, gating, and LLM dispatch. */
    interface AsrCallback {
        fun onAsrStarted()
        fun onAsrResult(text: String)
        fun onAsrSkipped(reason: String)
        fun shouldSkipAsr(): Boolean
        fun isLlmInFlight(): Boolean
        fun onLlmCalled(text: String)
    }

    private var callback: AsrCallback? = null

    fun setCallback(callback: AsrCallback) {
        this.callback = callback
    }

    fun setAudioProcessor(audioProcessor: AudioProcessor) {
        this.audioProcessor = audioProcessor
    }

    /**
     * Copies the SenseVoice assets to storage and loads the RKNN engine.
     * Returns true on success; logs and returns false on any failure.
     */
    fun initSenseVoiceModel(): Boolean {
        return try {
            Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")

            val modelDir = FileHelper.copySenseVoiceAssets(context)
            val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath
            val embeddingPath = File(modelDir, "embedding.npy").absolutePath
            val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath

            // Diagnostics: dump the native library dir so missing .so files
            // are obvious in Logcat.
            try {
                val libDir = context.applicationInfo.nativeLibraryDir
                Log.i(TAG, "nativeLibraryDir=$libDir")
                try {
                    val names = File(libDir).list()?.joinToString(", ") ?: "(empty)"
                    Log.i(TAG, "nativeLibraryDir files: $names")
                } catch (t: Throwable) {
                    Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}")
                }
            } catch (_: Throwable) {
            }

            Log.i(TAG, "SenseVoice model paths:")
            Log.i(TAG, "  model=$modelPath exists=${File(modelPath).exists()} size=${File(modelPath).length()}")
            Log.i(TAG, "  embedding=$embeddingPath exists=${File(embeddingPath).exists()} size=${File(embeddingPath).length()}")
            Log.i(TAG, "  bpe=$bpePath exists=${File(bpePath).exists()} size=${File(bpePath).length()}")

            val t0 = SystemClock.elapsedRealtime()
            val engine = try {
                SenseVoiceEngineRKNN(context)
            } catch (e: UnsatisfiedLinkError) {
                throw IllegalStateException("Load native libraries failed: ${e.message}", e)
            }

            val ok = try {
                engine.loadModelDirectly(modelPath, embeddingPath, bpePath)
            } catch (t: Throwable) {
                throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t)
            }

            val dt = SystemClock.elapsedRealtime() - t0
            Log.i(TAG, "SenseVoice loadModelDirectly ok=$ok costMs=$dt")
            if (!ok) throw IllegalStateException("SenseVoiceEngineRKNN loadModelDirectly returned false")

            senseVoice = engine
            true
        } catch (e: Exception) {
            Log.e(TAG, "Failed to initialize SenseVoice model: ${e.message}", e)
            false
        }
    }

    /**
     * Enqueues one speech segment (original + gain-processed audio).
     * Never blocks: the channel is unbounded.
     */
    fun enqueueAudioSegment(originalAudio: FloatArray, processedAudio: FloatArray) {
        // FIX: trySend() does not throw — it reports failure via its result,
        // so the original try/catch could never observe a failed send.
        val result = asrQueue.trySend(Pair(originalAudio, processedAudio))
        if (!result.isSuccess) {
            Log.e(TAG, "Failed to enqueue audio segment: ${result.exceptionOrNull()?.message}")
        }
    }

    /** Drops all pending segments (e.g. after TTS playback ends). */
    fun clearQueue() {
        while (asrQueue.tryReceive().isSuccess) { }
    }

    /**
     * Long-running worker: receives segments, runs recognition under
     * [nativeLock], filters junk results, and dispatches good text to the LLM.
     * Returns when the channel closes or the coroutine is cancelled.
     */
    suspend fun runAsrWorker() {
        Log.d(TAG, "ASR worker started")
        try {
            while (currentCoroutineContext().isActive) {
                val (originalSeg, processedSeg) = try {
                    Log.d(TAG, "ASR worker waiting for audio segment")
                    asrQueue.receive()
                } catch (e: Throwable) {
                    // Channel closed or coroutine cancelled — exit the loop.
                    Log.e(TAG, "ASR worker receive failed: ${e.message}")
                    break
                }

                Log.d(TAG, "ASR worker received audio segment, size=${processedSeg.size}")

                // Skip recognition while TTS plays or an LLM call is pending.
                if (callback?.shouldSkipAsr() == true || callback?.isLlmInFlight() == true) {
                    Log.d(TAG, "ASR worker skipping segment: shouldSkip=${callback?.shouldSkipAsr()}, llmInFlight=${callback?.isLlmInFlight()}")
                    continue
                }

                callback?.onAsrStarted()
                Log.d(TAG, "ASR started: processing audio segment")

                saveAsrAudio(originalSeg, processedSeg)

                val raw = synchronized(nativeLock) {
                    val engine = senseVoice
                    if (engine == null || !engine.isInitialized) {
                        Log.e(TAG, "ASR failed: SenseVoice engine not initialized")
                        ""
                    } else {
                        try {
                            engine.transcribeBuffer(processedSeg)
                        } catch (t: Throwable) {
                            // FIX: the original `catch (e: Throwable)` shadowed
                            // the local `val e = senseVoice`.
                            Log.e(TAG, "ASR transcribe failed: ${t.message}")
                            ""
                        }
                    }
                }
                Log.d(TAG, "ASR raw result: $raw")
                val text = removeTokens(raw)

                val filterResult = filterText(text)
                if (filterResult != null) {
                    callback?.onAsrSkipped(filterResult)
                    continue
                }

                callback?.onAsrResult(text)

                if (BuildConfig.LLM_API_KEY.isBlank()) {
                    Log.w(TAG, "LLM API Key is not configured")
                    continue
                }

                callback?.onLlmCalled(text)
            }
        } catch (e: Throwable) {
            Log.e(TAG, "ASR worker error: ${e.message}", e)
        } finally {
            Log.d(TAG, "ASR worker exiting")
        }
    }

    /** Releases the native engine and drops any queued segments. */
    fun release() {
        try {
            senseVoice?.deinitialize()
        } catch (e: Exception) {
            Log.e(TAG, "Error deinitializing SenseVoice: ${e.message}")
        }
        senseVoice = null
        clearQueue()
    }

    fun isInitialized(): Boolean = senseVoice?.isInitialized ?: false

    // Best-effort debug dump of each recognized segment as WAV files.
    private fun saveAsrAudio(originalAudio: FloatArray, processedAudio: FloatArray) {
        try {
            val timestamp = System.currentTimeMillis()
            val asrAudioDir = FileHelper.getAsrAudioDir(context)

            audioProcessor?.let { processor ->
                val originalFile = File(asrAudioDir, "asr_${timestamp}_original.wav")
                processor.saveAudioAsWav(originalFile, originalAudio, AppConfig.SAMPLE_RATE)
                val processedFile = File(asrAudioDir, "asr_${timestamp}_processed.wav")
                processor.saveAudioAsWav(processedFile, processedAudio, AppConfig.SAMPLE_RATE)
            }
        } catch (e: Exception) {
            Log.e(TAG, "Error saving ASR audio: ${e.message}")
        }
    }

    // Strips SenseVoice special tokens (<|...|>), stray angle-bracket noise,
    // and collapses whitespace.
    private fun removeTokens(text: String): String {
        var cleaned = text.replace(SPECIAL_TOKEN_REGEX, "")
        cleaned = cleaned.replace(ANGLE_NOISE_REGEX, "")
        cleaned = cleaned.trim().replace(WHITESPACE_REGEX, " ")
        return cleaned
    }

    // Returns a skip reason for junk recognition results, or null to keep.
    private fun filterText(text: String): String? {
        if (text.isBlank()) {
            return "blank text"
        }
        // A lone "i"/"I" is a common hallucination of the recognizer.
        if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
            return "single 'i'"
        }
        if (text.length > AppConfig.Asr.MAX_TEXT_LENGTH) {
            return "too long (${text.length} chars)"
        }
        return null
    }
}
|
||||
218
app/src/main/java/com/digitalperson/audio/AudioProcessor.kt
Normal file
218
app/src/main/java/com/digitalperson/audio/AudioProcessor.kt
Normal file
@@ -0,0 +1,218 @@
|
||||
package com.digitalperson.audio
|
||||
|
||||
import android.Manifest
|
||||
import android.content.pm.PackageManager
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioRecord
|
||||
import android.media.MediaRecorder
|
||||
import android.media.audiofx.AcousticEchoCanceler
|
||||
import android.media.audiofx.NoiseSuppressor
|
||||
import android.util.Log
|
||||
import androidx.core.app.ActivityCompat
|
||||
import java.io.File
|
||||
import java.io.FileOutputStream
|
||||
|
||||
private const val TAG = "AudioProcessor"

/**
 * Thin wrapper around [AudioRecord]: microphone setup (with echo-cancel and
 * noise-suppress effects when the device offers them), PCM read/convert
 * helpers, adaptive gain normalization, and 16-bit mono WAV export.
 */
class AudioProcessor(
    private val context: android.content.Context,
    private val sampleRateInHz: Int = 16000,
    private val channelConfig: Int = AudioFormat.CHANNEL_IN_MONO,
    private val audioFormat: Int = AudioFormat.ENCODING_PCM_16BIT
) {
    private val audioSource = MediaRecorder.AudioSource.MIC
    private var audioRecord: AudioRecord? = null
    private var aec: AcousticEchoCanceler? = null
    private var ns: NoiseSuppressor? = null

    // Exponentially smoothed RMS used by applyGain(); 0 means "not yet seeded".
    private var smoothedRms = 0f
    // Smoothing weight applied to the NEWEST rms sample (higher = faster react).
    private val alpha = 0.8f

    /**
     * Creates the [AudioRecord] plus AEC/NS effects where available.
     * Returns false (and requests the permission when possible) if
     * RECORD_AUDIO has not been granted yet.
     */
    fun initMicrophone(permissions: Array<String>, requestCode: Int): Boolean {
        if (ActivityCompat.checkSelfPermission(context, Manifest.permission.RECORD_AUDIO)
            != PackageManager.PERMISSION_GRANTED
        ) {
            // FIX: the original unconditionally cast context to
            // AppCompatActivity, which throws ClassCastException when this
            // class is constructed with a non-activity context.
            val activity = context as? android.app.Activity
            if (activity != null) {
                ActivityCompat.requestPermissions(activity, permissions, requestCode)
            } else {
                Log.w(TAG, "RECORD_AUDIO not granted and context is not an Activity")
            }
            return false
        }

        val numBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
        audioRecord = AudioRecord(
            audioSource,
            sampleRateInHz,
            channelConfig,
            audioFormat,
            numBytes * 2 // double the minimum to reduce the chance of overruns
        )
        // Attach platform audio effects to this record session when supported.
        val sessionId = audioRecord?.audioSessionId ?: 0
        if (sessionId != 0) {
            if (AcousticEchoCanceler.isAvailable()) {
                aec = AcousticEchoCanceler.create(sessionId)?.apply {
                    enabled = true
                }
                Log.i(TAG, "AEC enabled=${aec?.enabled}")
            } else {
                Log.w(TAG, "AEC not available on this device")
            }

            if (NoiseSuppressor.isAvailable()) {
                ns = NoiseSuppressor.create(sessionId)?.apply {
                    enabled = true
                }
                Log.i(TAG, "NS enabled=${ns?.enabled}")
            } else {
                Log.w(TAG, "NS not available on this device")
            }
        }
        return true
    }

    fun startRecording() {
        audioRecord?.startRecording()
        Log.d(TAG, "Audio recording started")
    }

    fun stopRecording() {
        try {
            audioRecord?.stop()
        } catch (_: Throwable) {
            // stop() throws IllegalStateException if never started; ignore.
        }
        Log.d(TAG, "Audio recording stopped")
    }

    /** Releases the record session and any attached effects. Safe to call twice. */
    fun release() {
        try {
            audioRecord?.stop()
        } catch (_: Throwable) {
        }
        try {
            audioRecord?.release()
        } catch (_: Throwable) {
        }
        audioRecord = null

        try {
            aec?.release()
        } catch (_: Throwable) {
        }
        try {
            ns?.release()
        } catch (_: Throwable) {
        }
        aec = null
        ns = null

        Log.d(TAG, "AudioProcessor released")
    }

    /** Reads up to buffer.size samples; returns the count read or -1 when uninitialized. */
    fun readAudio(buffer: ShortArray): Int {
        return audioRecord?.read(buffer, 0, buffer.size) ?: -1
    }

    /** Root-mean-square of the samples; 0 for an empty array. */
    fun calculateRMS(samples: FloatArray): Float {
        if (samples.isEmpty()) return 0.0f

        var sumSquared = 0.0f
        for (sample in samples) {
            sumSquared += sample * sample
        }

        val meanSquared = sumSquared / samples.size
        return kotlin.math.sqrt(meanSquared)
    }

    /**
     * Normalizes chunk loudness toward [targetRMS] using a smoothed gain
     * (clamped to [0.1, 10]) and hard-clips the result to [-1, 1].
     * Stateful: updates [smoothedRms] across calls.
     */
    fun applyGain(chunk: FloatArray, targetRMS: Float = 0.1f): FloatArray {
        val rms = calculateRMS(chunk)

        smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms

        var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f
        gainFactor = gainFactor.coerceIn(0.1f, 10.0f)

        val processedChunk = FloatArray(chunk.size) {
            val value = chunk[it] * gainFactor
            if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value
        }

        return processedChunk
    }

    /** 16-bit PCM shorts -> floats in [-1, 1). */
    fun convertShortToFloat(buffer: ShortArray): FloatArray {
        return FloatArray(buffer.size) { buffer[it] / 32768.0f }
    }

    /** Writes samples as a 16-bit mono little-endian PCM WAV file. */
    fun saveAudioAsWav(file: File, samples: FloatArray, sampleRate: Int) {
        FileOutputStream(file).use { fos ->
            val header = ByteArray(44)

            // "RIFF" chunk descriptor.
            header[0] = 'R'.code.toByte()
            header[1] = 'I'.code.toByte()
            header[2] = 'F'.code.toByte()
            header[3] = 'F'.code.toByte()

            // Chunk size = 36 + data bytes (2 bytes per 16-bit sample).
            val fileSize = 36 + samples.size * 2
            intToByteArray(fileSize, header, 4)

            header[8] = 'W'.code.toByte()
            header[9] = 'A'.code.toByte()
            header[10] = 'V'.code.toByte()
            header[11] = 'E'.code.toByte()

            // "fmt " sub-chunk: PCM, mono, 16 bits.
            header[12] = 'f'.code.toByte()
            header[13] = 'm'.code.toByte()
            header[14] = 't'.code.toByte()
            header[15] = ' '.code.toByte()

            intToByteArray(16, header, 16) // fmt sub-chunk size

            shortToByteArray(1, header, 20) // audio format 1 = PCM

            shortToByteArray(1, header, 22) // channels = mono

            intToByteArray(sampleRate, header, 24)

            val byteRate = sampleRate * 1 * 16 / 8
            intToByteArray(byteRate, header, 28)

            val blockAlign = 1 * 16 / 8
            shortToByteArray(blockAlign.toShort(), header, 32)

            shortToByteArray(16, header, 34) // bits per sample

            // "data" sub-chunk.
            header[36] = 'd'.code.toByte()
            header[37] = 'a'.code.toByte()
            header[38] = 't'.code.toByte()
            header[39] = 'a'.code.toByte()

            val dataSize = samples.size * 2
            intToByteArray(dataSize, header, 40)

            fos.write(header)

            // FIX: reuse one scratch buffer instead of allocating a fresh
            // 2-byte array per sample (the original allocated O(n) arrays).
            val pcm = ByteArray(2)
            for (sample in samples) {
                val clampedSample = sample.coerceIn(-1.0f, 1.0f)
                val shortSample = (clampedSample * 32767.0f).toInt().toShort()
                pcm[0] = (shortSample.toInt() and 0xFF).toByte()
                pcm[1] = (shortSample.toInt() shr 8 and 0xFF).toByte()
                fos.write(pcm)
            }
        }
    }

    // Little-endian 32-bit write into dest at offset.
    private fun intToByteArray(value: Int, dest: ByteArray, offset: Int) {
        dest[offset] = (value and 0xFF).toByte()
        dest[offset + 1] = (value shr 8 and 0xFF).toByte()
        dest[offset + 2] = (value shr 16 and 0xFF).toByte()
        dest[offset + 3] = (value shr 24 and 0xFF).toByte()
    }

    // Little-endian 16-bit write into dest at offset.
    private fun shortToByteArray(value: Short, dest: ByteArray, offset: Int) {
        dest[offset] = (value.toInt() and 0xFF).toByte()
        dest[offset + 1] = (value.toInt() shr 8 and 0xFF).toByte()
    }
}
|
||||
50
app/src/main/java/com/digitalperson/config/AppConfig.kt
Normal file
50
app/src/main/java/com/digitalperson/config/AppConfig.kt
Normal file
@@ -0,0 +1,50 @@
|
||||
package com.digitalperson.config
|
||||
|
||||
import com.digitalperson.BuildConfig
|
||||
|
||||
/**
 * Central compile-time configuration for the digital-person app: audio
 * capture, VAD, ASR, TTS, and avatar selection constants.
 */
object AppConfig {
    const val TAG = "DigitalPerson"
    const val REQUEST_RECORD_AUDIO_PERMISSION = 200

    // Microphone capture: 16 kHz mono, processed in 512-sample windows.
    const val SAMPLE_RATE = 16000
    const val WINDOW_SIZE = 512

    const val SHOW_DEBUG_TEXT = true

    /** Local sherpa-onnx VITS TTS model and streaming-segmentation settings. */
    object Tts {
        const val MODEL_DIR = "tts_model/sherpa-onnx-vits-zh-ll"
        const val MODEL_NAME = "model.onnx"
        const val LEXICON = "lexicon.txt"
        const val SPEAKER_ID = 2
        const val SPEED = 1.0f
        // Streaming segmenter: cut a segment at MAX_LEN chars, or force one
        // out after MAX_WAIT_MS even if shorter.
        const val MAX_LEN = 30
        const val MAX_WAIT_MS: Long = 600
    }

    /** Voice-activity-detection thresholds (probabilities) and durations (seconds). */
    object Vad {
        const val START_THRESHOLD = 0.2f
        const val END_THRESHOLD = 0.15f
        const val MIN_SILENCE_DURATION = 0.5f
        const val MIN_SPEECH_DURATION = 0.1f
        const val MAX_SPEECH_DURATION = 5.0f
    }

    /** ASR (SenseVoice) settings. */
    object Asr {
        // Recognition results longer than this are discarded as junk.
        const val MAX_TEXT_LENGTH = 50
        const val MODEL_DIR = "sensevoice_models"
    }

    /** Adaptive-gain parameters for microphone normalization. */
    object Audio {
        const val GAIN_SMOOTHING_FACTOR = 0.1f
        const val TARGET_RMS = 0.1f
    }

    /** Avatar selection and Live2D model asset locations. */
    object Avatar {
        // Compile-time switch in gradle.properties/local.properties: USE_LIVE2D=true|false
        const val USE_LIVE2D = BuildConfig.USE_LIVE2D
        // const val MODEL_DIR = "live2d_model/mao_pro_zh"
        // const val MODEL_JSON = "mao_pro.model3.json"
        const val MODEL_DIR = "live2d_model/Haru_pro_jp"
        const val MODEL_JSON = "haru_greeter_t05.model3.json"
    }
}
|
||||
@@ -0,0 +1,29 @@
|
||||
package com.digitalperson.live2d
|
||||
|
||||
import android.opengl.GLSurfaceView
|
||||
|
||||
/**
 * Owns the GL surface hosting the Live2D avatar: wires the renderer to the
 * view, forwards the speaking flag for lip-sync, and relays lifecycle events.
 */
class Live2DAvatarManager(private val glSurfaceView: GLSurfaceView) {
    private val renderer = Live2DRenderer(glSurfaceView.context)

    init {
        // EGL version must be set before setRenderer(); continuous mode keeps
        // the avatar's idle animation running.
        glSurfaceView.setEGLContextClientVersion(2)
        glSurfaceView.setRenderer(renderer)
        glSurfaceView.renderMode = GLSurfaceView.RENDERMODE_CONTINUOUSLY
    }

    /** Toggles the avatar's speaking (lip-sync) animation. */
    fun setSpeaking(speaking: Boolean) {
        renderer.setSpeaking(speaking)
    }

    /** Call from Activity.onResume to resume GL rendering. */
    fun onResume() {
        glSurfaceView.onResume()
    }

    /** Call from Activity.onPause to pause GL rendering. */
    fun onPause() {
        glSurfaceView.onPause()
    }

    /** Releases renderer resources; the manager must not be used afterwards. */
    fun release() {
        renderer.release()
    }
}
|
||||
182
app/src/main/java/com/digitalperson/live2d/Live2DCharacter.kt
Normal file
182
app/src/main/java/com/digitalperson/live2d/Live2DCharacter.kt
Normal file
@@ -0,0 +1,182 @@
|
||||
package com.digitalperson.live2d
|
||||
|
||||
import android.content.res.AssetManager
|
||||
import android.graphics.BitmapFactory
|
||||
import android.opengl.GLES20
|
||||
import android.opengl.GLUtils
|
||||
import com.live2d.sdk.cubism.framework.CubismFramework
|
||||
import com.live2d.sdk.cubism.framework.CubismModelSettingJson
|
||||
import com.live2d.sdk.cubism.framework.id.CubismId
|
||||
import com.live2d.sdk.cubism.framework.math.CubismMatrix44
|
||||
import com.live2d.sdk.cubism.framework.model.CubismUserModel
|
||||
import com.live2d.sdk.cubism.framework.motion.CubismMotion
|
||||
import com.live2d.sdk.cubism.framework.rendering.android.CubismRendererAndroid
|
||||
import kotlin.math.sin
|
||||
|
||||
/**
 * One Live2D (Cubism) character: loading from APK assets, GL texture binding,
 * per-frame animation update (looping idle motions + procedural lip-sync) and
 * drawing.
 *
 * NOTE(review): bindTextures/draw/release/loadTexture issue GL calls and are
 * presumably expected to run on the GL thread — confirm against the callers
 * in Live2DRenderer.
 */
class Live2DCharacter : CubismUserModel() {
    // Parsed *.model3.json; initialized by loadFromAssets() before other calls.
    private lateinit var setting: CubismModelSettingJson
    // Parameter ids that receive the synthetic mouth-open value each frame.
    private val lipSyncParams = mutableListOf<CubismId>()
    // Idle motions played in a round-robin loop.
    private val idleMotions = mutableListOf<CubismMotion>()
    private var idleMotionIndex = 0
    // Elapsed time at the previous updateFrame() call; used to derive dt.
    private var lastElapsedSec = 0f
    // GL texture names owned by this model; deleted on rebind and in release().
    private val textureIds = mutableListOf<Int>()

    /**
     * Loads the model json, moc, optional physics/pose files and idle motions
     * from [assets], sets up the renderer and starts the first idle motion.
     */
    fun loadFromAssets(assets: AssetManager, modelDir: String, modelJsonName: String) {
        val settingBytes = readAssetBytes(assets, "$modelDir/$modelJsonName")
        setting = CubismModelSettingJson(settingBytes)

        loadModel(readAssetBytes(assets, "$modelDir/${setting.modelFileName}"))
        setupRenderer(CubismRendererAndroid.create())
        // Scale and center the model within the normalized device space.
        getModelMatrix().setWidth(2.0f)
        getModelMatrix().setCenterPosition(0f, 0f)

        // Physics and pose files are optional parts of a model bundle.
        val physicsFile = setting.physicsFileName
        if (physicsFile.isNotEmpty()) {
            loadPhysics(readAssetBytes(assets, "$modelDir/$physicsFile"))
        }
        val poseFile = setting.poseFileName
        if (poseFile.isNotEmpty()) {
            loadPose(readAssetBytes(assets, "$modelDir/$poseFile"))
        }

        initLipSyncParams()
        loadIdleMotions(assets, modelDir)
        startNextIdleMotion()
    }

    /**
     * Decodes every texture declared in the model setting, uploads it to GL
     * and binds it to the renderer. Any previously bound textures are freed
     * first, so this is safe to call again after a GL context recreation.
     */
    fun bindTextures(assets: AssetManager, modelDir: String) {
        val renderer = getRenderer<CubismRendererAndroid>()
        renderer.isPremultipliedAlpha(true)
        renderer.isUsingHighPrecisionMask(true)
        // Free textures from a previous bind before re-uploading.
        textureIds.forEach { id ->
            GLES20.glDeleteTextures(1, intArrayOf(id), 0)
        }
        textureIds.clear()

        for (i in 0 until setting.textureCount) {
            val texturePath = "$modelDir/${setting.getTextureFileName(i)}"
            val texId = loadTexture(assets, texturePath)
            renderer.bindTexture(i, texId)
            textureIds.add(texId)
        }
    }

    /**
     * Advances the animation to [elapsedSec] and applies lip-sync when
     * [speaking] is true. Call once per rendered frame.
     */
    fun updateFrame(elapsedSec: Float, speaking: Boolean) {
        val model = getModel() ?: return
        // Clamp dt to [0, 0.1] s so pauses/clock jumps don't snap the motion.
        val dt = (elapsedSec - lastElapsedSec).coerceAtLeast(0f).coerceAtMost(0.1f)
        lastElapsedSec = elapsedSec

        // Keep motions running. If finished, continue idle loop.
        motionManager.updateMotion(model, dt)
        if (motionManager.isFinished()) {
            startNextIdleMotion()
        }

        // Synthetic mouth value: ~14 rad/s oscillation in [0.2, 0.55] while
        // speaking, fully closed otherwise.
        val mouth = if (speaking) {
            0.2f + 0.35f * ((sin(elapsedSec * 14.0f) + 1.0f) * 0.5f)
        } else {
            0.0f
        }

        // Apply lip-sync to model-defined LipSync params (this model uses ParamA).
        for (id in lipSyncParams) {
            // Weight 0.8 blends with whatever the motion already set.
            model.setParameterValue(id, mouth, 0.8f)
        }

        // Add small idle breathing/sway on top, so character is never "frozen".
        val sway = sin(elapsedSec * 0.8f)
        val breathe = sin(elapsedSec * 1.2f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamAngleX"), sway * 8f, 0.2f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamAngleY"), sin(elapsedSec * 0.6f) * 4f, 0.2f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamBodyAngleX"), sway * 6f, 0.15f)
        model.addParameterValue(CubismFramework.getIdManager().getId("ParamBreath"), (breathe + 1f) * 0.5f, 0.1f)

        // Physics and pose are evaluated after parameters, then the model
        // is committed for drawing.
        physics?.evaluate(model, dt)
        pose?.updateParameters(model, dt)
        model.update()
    }

    /** Draws the model with the given model-view-projection matrix. */
    fun draw(mvp: CubismMatrix44) {
        val renderer = getRenderer<CubismRendererAndroid>()
        renderer.setMvpMatrix(mvp)
        renderer.drawModel()
    }

    /** Deletes owned GL textures and releases the underlying Cubism model. */
    fun release() {
        textureIds.forEach { id ->
            GLES20.glDeleteTextures(1, intArrayOf(id), 0)
        }
        textureIds.clear()
        delete()
    }

    // Decodes one asset bitmap and uploads it as a 2D GL texture
    // (linear filtering, clamp-to-edge). Returns the GL texture name.
    private fun loadTexture(assets: AssetManager, path: String): Int {
        val bitmap = assets.open(path).use { stream ->
            BitmapFactory.decodeStream(stream)
        } ?: error("Decode texture failed: $path")

        val ids = IntArray(1)
        GLES20.glGenTextures(1, ids, 0)
        val textureId = ids[0]
        GLES20.glBindTexture(GLES20.GL_TEXTURE_2D, textureId)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MIN_FILTER, GLES20.GL_LINEAR)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_MAG_FILTER, GLES20.GL_LINEAR)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_S, GLES20.GL_CLAMP_TO_EDGE)
        GLES20.glTexParameteri(GLES20.GL_TEXTURE_2D, GLES20.GL_TEXTURE_WRAP_T, GLES20.GL_CLAMP_TO_EDGE)
        GLUtils.texImage2D(GLES20.GL_TEXTURE_2D, 0, bitmap, 0)
        // CPU-side bitmap no longer needed once uploaded to GL.
        bitmap.recycle()
        return textureId
    }

    // Reads a whole asset file into memory.
    private fun readAssetBytes(assets: AssetManager, path: String): ByteArray {
        return assets.open(path).use { input ->
            input.readBytes()
        }
    }

    // Collects the model's declared LipSync parameter ids; if the model
    // declares none, falls back to the conventional ParamA/ParamMouthOpenY.
    private fun initLipSyncParams() {
        lipSyncParams.clear()
        for (i in 0 until setting.lipSyncParameterCount) {
            lipSyncParams.add(setting.getLipSyncParameterId(i))
        }
        if (lipSyncParams.isEmpty()) {
            lipSyncParams.add(CubismFramework.getIdManager().getId("ParamA"))
            lipSyncParams.add(CubismFramework.getIdManager().getId("ParamMouthOpenY"))
        }
    }

    // Loads all motions of the "Idle" group (or the first group as fallback)
    // as looping motions. Individual load failures are silently skipped.
    private fun loadIdleMotions(assets: AssetManager, modelDir: String) {
        idleMotions.clear()
        val groupName = findIdleGroupName()
        if (groupName.isEmpty()) return

        for (i in 0 until setting.getMotionCount(groupName)) {
            val fileName = setting.getMotionFileName(groupName, i)
            if (fileName.isBlank()) continue
            // NOTE(review): runCatching swallows per-motion load errors without
            // logging — consider logging the failure for diagnosability.
            runCatching {
                val motion = loadMotion(readAssetBytes(assets, "$modelDir/$fileName"))
                motion?.setLoop(true)
                motion?.setLoopFadeIn(true)
                if (motion != null) idleMotions.add(motion)
            }
        }
    }

    // Starts the next idle motion in round-robin order at priority 1.
    private fun startNextIdleMotion() {
        if (idleMotions.isEmpty()) return
        val index = idleMotionIndex % idleMotions.size
        idleMotionIndex++
        motionManager.startMotionPriority(idleMotions[index], 1)
    }

    // Returns the motion group named "Idle" (case-insensitive), else the
    // first declared group, else "".
    private fun findIdleGroupName(): String {
        for (i in 0 until setting.motionGroupCount) {
            val name = setting.getMotionGroupName(i)
            if (name.equals("Idle", ignoreCase = true)) return name
        }
        if (setting.motionGroupCount > 0) {
            return setting.getMotionGroupName(0) ?: ""
        }
        return ""
    }
}
|
||||
78
app/src/main/java/com/digitalperson/live2d/Live2DRenderer.kt
Normal file
78
app/src/main/java/com/digitalperson/live2d/Live2DRenderer.kt
Normal file
@@ -0,0 +1,78 @@
|
||||
package com.digitalperson.live2d
|
||||
|
||||
import android.content.Context
|
||||
import android.opengl.GLES20
|
||||
import android.opengl.GLSurfaceView
|
||||
import android.os.SystemClock
|
||||
import android.util.Log
|
||||
import com.digitalperson.config.AppConfig
|
||||
import com.live2d.sdk.cubism.framework.CubismFramework
|
||||
import com.live2d.sdk.cubism.framework.math.CubismMatrix44
|
||||
import javax.microedition.khronos.egl.EGLConfig
|
||||
import javax.microedition.khronos.opengles.GL10
|
||||
|
||||
/**
 * GLSurfaceView.Renderer that owns and draws one [Live2DCharacter].
 *
 * The Renderer callbacks run on the GL thread; [setSpeaking] may be called
 * from any thread (the flag is @Volatile).
 *
 * NOTE(review): release() deletes GL textures via the character — if callers
 * invoke it off the GL thread those GL calls have no current context; confirm
 * how Live2DAvatarManager.release() is dispatched.
 */
class Live2DRenderer(
    private val context: Context
) : GLSurfaceView.Renderer {
    // Written from UI/worker threads, read once per frame on the GL thread.
    @Volatile
    private var speaking = false

    private var character: Live2DCharacter? = null
    private val mvp = CubismMatrix44.create()
    // Animation clock origin, set when the GL surface is (re)created.
    private var startTimeMs: Long = 0L

    override fun onSurfaceCreated(gl: GL10?, config: EGLConfig?) {
        // Transparent clear color so the avatar composites over the layout.
        GLES20.glClearColor(0f, 0f, 0f, 0f)
        ensureFrameworkInitialized()
        startTimeMs = SystemClock.elapsedRealtime()

        // Model loading failure is non-fatal: log and render nothing.
        runCatching {
            val model = Live2DCharacter()
            model.loadFromAssets(
                assets = context.assets,
                modelDir = AppConfig.Avatar.MODEL_DIR,
                modelJsonName = AppConfig.Avatar.MODEL_JSON
            )
            model.bindTextures(context.assets, AppConfig.Avatar.MODEL_DIR)
            character = model
        }.onFailure {
            Log.e(AppConfig.TAG, "Load Live2D model failed: ${it.message}", it)
            character = null
        }
    }

    override fun onSurfaceChanged(gl: GL10?, width: Int, height: Int) {
        GLES20.glViewport(0, 0, width, height)
        // Aspect-preserving scale: stretch the shorter axis so the model
        // keeps its proportions in either orientation.
        mvp.loadIdentity()
        if (width > height) {
            mvp.scale(1f, width.toFloat() / height.toFloat())
        } else {
            mvp.scale(height.toFloat() / width.toFloat(), 1f)
        }
    }

    override fun onDrawFrame(gl: GL10?) {
        GLES20.glClear(GLES20.GL_COLOR_BUFFER_BIT)
        val elapsedSec = (SystemClock.elapsedRealtime() - startTimeMs) / 1000f
        character?.updateFrame(elapsedSec = elapsedSec, speaking = speaking)
        character?.draw(mvp)
    }

    /** Sets the lip-sync flag picked up on the next rendered frame. */
    fun setSpeaking(speaking: Boolean) {
        this.speaking = speaking
    }

    /** Frees the character's resources and drops the reference. */
    fun release() {
        character?.release()
        character = null
    }

    // Cubism must be started and initialized exactly once per process;
    // both checks guard against repeated surface creation.
    private fun ensureFrameworkInitialized() {
        if (!CubismFramework.isStarted()) {
            CubismFramework.startUp(CubismFramework.Option())
        }
        if (!CubismFramework.isInitialized()) {
            CubismFramework.initialize()
        }
    }
}
|
||||
@@ -17,7 +17,7 @@ class VideoPlayerManager(
|
||||
private var playerSilent: ExoPlayer? = null
|
||||
private var playerSpeaking: ExoPlayer? = null
|
||||
private var currentState: Boolean = false
|
||||
private var transitionDuration = 300L // 淡入淡出时长
|
||||
private var transitionDuration = 100L // 淡入淡出时长
|
||||
|
||||
init {
|
||||
// 确保初始 alpha
|
||||
|
||||
293
app/src/main/java/com/digitalperson/tts/TtsManager.kt
Normal file
293
app/src/main/java/com/digitalperson/tts/TtsManager.kt
Normal file
@@ -0,0 +1,293 @@
|
||||
package com.digitalperson.tts
|
||||
|
||||
import android.content.Context
|
||||
import android.media.AudioAttributes
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioManager
|
||||
import android.media.AudioTrack
|
||||
import android.util.Log
|
||||
import android.widget.Toast
|
||||
import com.digitalperson.config.AppConfig
|
||||
import com.digitalperson.metrics.TraceManager
|
||||
import com.digitalperson.metrics.TraceSession
|
||||
import com.k2fsa.sherpa.onnx.OfflineTts
|
||||
import com.k2fsa.sherpa.onnx.getOfflineTtsConfig
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.launch
|
||||
import java.util.concurrent.LinkedBlockingQueue
|
||||
import java.util.concurrent.atomic.AtomicBoolean
|
||||
|
||||
/**
 * Offline TTS pipeline: text segments are queued, synthesized by sherpa-onnx
 * [OfflineTts] on an IO coroutine, and streamed into an [AudioTrack].
 *
 * Threading: enqueue*/stop/reset may be called from any thread; synthesis and
 * playback happen on a single worker coroutine guarded by [ttsWorkerRunning].
 */
class TtsManager(private val context: Context) {

    companion object {
        private const val TAG = "TtsManager"
    }

    private var tts: OfflineTts? = null
    private var track: AudioTrack? = null

    // Queue protocol: Segment items carry text to synthesize; End marks the
    // turn boundary and makes the worker drain playback and exit.
    private sealed class TtsQueueItem {
        data class Segment(val text: String) : TtsQueueItem()
        data object End : TtsQueueItem()
    }

    private val ttsQueue = LinkedBlockingQueue<TtsQueueItem>()
    // Set by stop(); the worker checks it after every dequeue and in the
    // synthesis callback to abort promptly.
    private val ttsStopped = AtomicBoolean(false)
    // Ensures at most one worker coroutine runs at a time.
    private val ttsWorkerRunning = AtomicBoolean(false)
    private val ttsPlaying = AtomicBoolean(false)
    // Samples written since the turn started; compared against the track's
    // playback head to detect end of playback.
    @Volatile private var ttsTotalSamplesWritten: Long = 0

    // NOTE(review): trace marks are emitted both via currentTrace and via the
    // callback (e.g. markTtsSynthesisStart + onTraceMarkTtsSynthesisStart) —
    // confirm this duplication is intentional and not double-counting.
    private var currentTrace: TraceSession? = null
    private val ioScope = CoroutineScope(Dispatchers.IO)

    /** Hooks back into the host for UI state, tracing and turn management. */
    interface TtsCallback {
        fun onTtsStarted(text: String)
        fun onTtsCompleted()
        fun onTtsSegmentCompleted(durationMs: Long)
        fun isTtsStopped(): Boolean
        fun onClearAsrQueue()
        fun onSetSpeaking(speaking: Boolean)
        fun getCurrentTrace(): TraceSession?
        fun onTraceMarkTtsRequestEnqueued()
        fun onTraceMarkTtsSynthesisStart()
        fun onTraceMarkTtsFirstPcmReady()
        fun onTraceMarkTtsFirstAudioPlay()
        fun onTraceMarkTtsDone()
        fun onTraceAddDuration(name: String, value: Long)
        fun onEndTurn()
    }

    private var callback: TtsCallback? = null

    fun setCallback(callback: TtsCallback) {
        this.callback = callback
    }

    /**
     * Loads the TTS model from assets and creates the output AudioTrack.
     * Returns false (and leaves [tts] null) on any failure.
     */
    fun initTtsAndAudioTrack(): Boolean {
        return try {
            val modelDir = AppConfig.Tts.MODEL_DIR
            val modelName = AppConfig.Tts.MODEL_NAME
            val lexicon = AppConfig.Tts.LEXICON
            val dataDir = ""

            val ttsConfig = getOfflineTtsConfig(
                modelDir = modelDir,
                modelName = modelName,
                acousticModelName = "",
                vocoder = "",
                voices = "",
                lexicon = lexicon,
                dataDir = dataDir,
                dictDir = "",
                // Text-normalization FSTs for phone numbers, dates, numbers
                // and heteronyms, bundled next to the model.
                ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst,$modelDir/new_heteronym.fst",
                ruleFars = "",
                numThreads = null,
                isKitten = false
            )
            tts = OfflineTts(assetManager = context.assets, config = ttsConfig)

            initAudioTrack()
            true
        } catch (t: Throwable) {
            Log.e(TAG, "Init TTS failed: ${t.message}", t)
            tts = null
            false
        }
    }

    // Creates a streaming mono float-PCM AudioTrack matching the TTS model's
    // sample rate and starts it immediately.
    private fun initAudioTrack() {
        val t = tts ?: return
        val sr = t.sampleRate()
        val bufLength = AudioTrack.getMinBufferSize(
            sr,
            AudioFormat.CHANNEL_OUT_MONO,
            AudioFormat.ENCODING_PCM_FLOAT
        )
        val attr = AudioAttributes.Builder()
            .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
            .setUsage(AudioAttributes.USAGE_MEDIA)
            .build()
        val format = AudioFormat.Builder()
            .setEncoding(AudioFormat.ENCODING_PCM_FLOAT)
            .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
            .setSampleRate(sr)
            .build()
        track = AudioTrack(
            attr,
            format,
            bufLength,
            AudioTrack.MODE_STREAM,
            AudioManager.AUDIO_SESSION_ID_GENERATE
        )
        track?.play()
    }

    /**
     * Queues one text segment for synthesis, stripping trailing punctuation
     * (ASCII and fullwidth) first, and starts the worker if idle.
     */
    fun enqueueSegment(seg: String) {
        val cleanedSeg = seg.trimEnd('.', '。', '!', '!', '?', '?', ',', ',', ';', ';', ':', ':')

        callback?.onTraceMarkTtsRequestEnqueued()
        ttsQueue.offer(TtsQueueItem.Segment(cleanedSeg))
        ensureTtsWorker()
    }

    /** Marks the end of the current turn; the worker drains and finishes. */
    fun enqueueEnd() {
        ttsQueue.offer(TtsQueueItem.End)
    }

    fun isPlaying(): Boolean = ttsPlaying.get()

    /** Clears state for a new turn without touching the AudioTrack. */
    fun reset() {
        ttsStopped.set(false)
        ttsPlaying.set(false)
        ttsTotalSamplesWritten = 0
        ttsQueue.clear()
    }

    /**
     * Aborts synthesis and playback. The extra End item unblocks a worker
     * parked in ttsQueue.take(); pause+flush discards buffered audio.
     */
    fun stop() {
        ttsStopped.set(true)
        ttsPlaying.set(false)
        ttsTotalSamplesWritten = 0
        ttsQueue.clear()
        ttsQueue.offer(TtsQueueItem.End)

        try {
            track?.pause()
            track?.flush()
        } catch (_: Throwable) {
        }
    }

    /** Releases the TTS engine and the AudioTrack; instance unusable after. */
    fun release() {
        try {
            tts?.release()
        } catch (_: Throwable) {
        }
        try {
            track?.release()
        } catch (_: Throwable) {
        }
        tts = null
        track = null
    }

    fun setCurrentTrace(trace: TraceSession?) {
        currentTrace = trace
    }

    // Starts the worker coroutine unless one is already running; the flag is
    // always cleared when the worker exits, even on exception.
    private fun ensureTtsWorker() {
        if (!ttsWorkerRunning.compareAndSet(false, true)) return
        ioScope.launch {
            try {
                runTtsWorker()
            } finally {
                ttsWorkerRunning.set(false)
            }
        }
    }

    // Main worker loop: blocks on the queue, synthesizes Segment items into
    // the AudioTrack, and on End waits for playback to drain then exits.
    private fun runTtsWorker() {
        val t = tts ?: return
        val audioTrack = track ?: return

        var firstAudioMarked = false
        var isFirstSegment = true
        while (true) {
            val item = ttsQueue.take()
            if (ttsStopped.get()) break

            when (item) {
                is TtsQueueItem.Segment -> {
                    ttsPlaying.set(true)
                    callback?.onSetSpeaking(true)
                    val trace = currentTrace
                    trace?.markTtsSynthesisStart()
                    callback?.onTraceMarkTtsSynthesisStart()
                    Log.d(TAG, "TTS started: processing segment '${item.text}'")
                    callback?.onTtsStarted(item.text)

                    val startMs = System.currentTimeMillis()
                    var firstPcmMarked = false

                    // Reset the track (and its playback head) at the start of
                    // a turn so drain detection counts from zero.
                    if (isFirstSegment) {
                        try {
                            audioTrack.pause()
                            audioTrack.flush()
                            audioTrack.play()
                        } catch (_: Throwable) {
                        }
                        isFirstSegment = false
                    }

                    t.generateWithCallback(
                        text = item.text,
                        sid = AppConfig.Tts.SPEAKER_ID,
                        speed = AppConfig.Tts.SPEED
                    ) { samples ->
                        // Returning 0 tells the engine to abort generation.
                        if (ttsStopped.get()) return@generateWithCallback 0
                        if (!firstPcmMarked && samples.isNotEmpty()) {
                            firstPcmMarked = true
                            trace?.markTtsFirstPcmReady()
                            callback?.onTraceMarkTtsFirstPcmReady()
                        }
                        if (!firstAudioMarked && samples.isNotEmpty()) {
                            firstAudioMarked = true
                            trace?.markTtsFirstAudioPlay()
                            callback?.onTraceMarkTtsFirstAudioPlay()
                        }
                        // Blocking write provides backpressure against the
                        // synthesis callback.
                        audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING)
                        ttsTotalSamplesWritten += samples.size
                        // Non-zero return: keep generating.
                        1
                    }

                    val ttsMs = System.currentTimeMillis() - startMs
                    trace?.addDuration("tts_segment_ms_total", ttsMs)
                    callback?.onTraceAddDuration("tts_segment_ms_total", ttsMs)
                    callback?.onTtsSegmentCompleted(ttsMs)
                }

                TtsQueueItem.End -> {
                    // Drop any ASR results queued while we were speaking
                    // (echo suppression), then wait for audio to drain.
                    callback?.onClearAsrQueue()

                    waitForPlaybackComplete(audioTrack)

                    callback?.onTtsCompleted()

                    ttsPlaying.set(false)
                    callback?.onSetSpeaking(false)
                    ttsTotalSamplesWritten = 0
                    currentTrace?.markTtsDone()
                    callback?.onTraceMarkTtsDone()
                    callback?.onEndTurn()
                    break
                }
            }
        }
    }

    // Polls the track's playback head until all written samples have played,
    // with a duration-based timeout (+2 s margin). Aborts early on stop().
    private fun waitForPlaybackComplete(audioTrack: AudioTrack) {
        val totalSamples = ttsTotalSamplesWritten
        if (totalSamples <= 0) return

        val sampleRate = audioTrack.sampleRate
        val timeoutMs = (totalSamples * 1000 / sampleRate) + 2000
        val startTime = System.currentTimeMillis()

        while (true) {
            if (ttsStopped.get()) break

            val playbackPos = audioTrack.playbackHeadPosition.toLong()
            if (playbackPos >= totalSamples) {
                break
            }

            if (System.currentTimeMillis() - startTime > timeoutMs) {
                Log.w(TAG, "waitForPlaybackComplete timeout, pos=$playbackPos, total=$totalSamples")
                break
            }

            Thread.sleep(20)
        }
        // Fixed 1 s tail — presumably to let the device sink fully drain
        // before re-enabling the mic; TODO confirm this is still needed.
        Thread.sleep(1000)
    }
}
|
||||
95
app/src/main/java/com/digitalperson/ui/Live2DUiManager.kt
Normal file
95
app/src/main/java/com/digitalperson/ui/Live2DUiManager.kt
Normal file
@@ -0,0 +1,95 @@
|
||||
package com.digitalperson.ui
|
||||
|
||||
import android.app.Activity
|
||||
import android.opengl.GLSurfaceView
|
||||
import android.text.method.ScrollingMovementMethod
|
||||
import android.widget.Button
|
||||
import android.widget.ScrollView
|
||||
import android.widget.TextView
|
||||
import android.widget.Toast
|
||||
import com.digitalperson.live2d.Live2DAvatarManager
|
||||
|
||||
/**
 * UI plumbing for the Live2D avatar screen: subtitle text view, start/stop
 * buttons and the GLSurfaceView-backed avatar.
 */
class Live2DUiManager(private val activity: Activity) {
    private var textView: TextView? = null
    private var scrollView: ScrollView? = null
    private var startButton: Button? = null
    private var stopButton: Button? = null
    private var avatarManager: Live2DAvatarManager? = null

    // Accumulated subtitle text mirrored into textView.
    private var lastUiText: String = ""

    /**
     * Resolves views by id and attaches the Live2D avatar to [live2dViewId].
     * The player-view ids are not used here; they keep the call signature
     * parallel to the video-based UiManager.
     */
    fun initViews(
        textViewId: Int,
        scrollViewId: Int,
        startButtonId: Int,
        stopButtonId: Int,
        silentPlayerViewId: Int,
        speakingPlayerViewId: Int,
        live2dViewId: Int
    ) {
        textView = activity.findViewById(textViewId)
        scrollView = activity.findViewById(scrollViewId)
        startButton = activity.findViewById(startButtonId)
        stopButton = activity.findViewById(stopButtonId)

        textView?.movementMethod = ScrollingMovementMethod()

        val glView = activity.findViewById<GLSurfaceView>(live2dViewId)
        avatarManager = Live2DAvatarManager(glView).also { it.setSpeaking(false) }
    }

    /** Registers the start-button click handler. */
    fun setStartButtonListener(listener: () -> Unit) {
        startButton?.setOnClickListener { listener() }
    }

    /** Registers the stop-button click handler. */
    fun setStopButtonListener(listener: () -> Unit) {
        stopButton?.setOnClickListener { listener() }
    }

    /** Appends [s] to the subtitle view and scrolls to the bottom. */
    fun appendToUi(s: String) {
        lastUiText += s
        textView?.text = lastUiText
        scrollView?.post { scrollView?.fullScroll(ScrollView.FOCUS_DOWN) }
    }

    /** Empties the subtitle view and the cached text. */
    fun clearText() {
        lastUiText = ""
        textView?.text = ""
    }

    /** Replaces the subtitle view content with [text]. */
    fun setText(text: String) {
        lastUiText = text
        textView?.text = text
    }

    /** Enables/disables the two control buttons. */
    fun setButtonsEnabled(startEnabled: Boolean, stopEnabled: Boolean) {
        startButton?.isEnabled = startEnabled
        stopButton?.isEnabled = stopEnabled
    }

    /** Forwards the speaking flag to the avatar on the UI thread. */
    fun setSpeaking(speaking: Boolean) {
        activity.runOnUiThread { avatarManager?.setSpeaking(speaking) }
    }

    /** Shows a toast, hopping to the UI thread first. */
    fun showToast(message: String, duration: Int = Toast.LENGTH_SHORT) {
        activity.runOnUiThread { Toast.makeText(activity, message, duration).show() }
    }

    /** Relays Activity onResume to the GL surface. */
    fun onResume() {
        avatarManager?.onResume()
    }

    /** Relays Activity onPause to the GL surface. */
    fun onPause() {
        avatarManager?.onPause()
    }

    /** Frees the avatar's resources and drops the reference. */
    fun release() {
        avatarManager?.release()
        avatarManager = null
    }
}
|
||||
106
app/src/main/java/com/digitalperson/ui/UiManager.kt
Normal file
106
app/src/main/java/com/digitalperson/ui/UiManager.kt
Normal file
@@ -0,0 +1,106 @@
|
||||
package com.digitalperson.ui
|
||||
|
||||
import android.app.Activity
|
||||
import android.text.method.ScrollingMovementMethod
|
||||
import android.util.Log
|
||||
import android.widget.Button
|
||||
import android.widget.ScrollView
|
||||
import android.widget.TextView
|
||||
import android.widget.Toast
|
||||
import com.digitalperson.config.AppConfig
|
||||
import com.digitalperson.player.VideoPlayerManager
|
||||
import com.google.android.exoplayer2.ui.PlayerView
|
||||
|
||||
/**
 * UI plumbing for the video-avatar screen: subtitle text view, start/stop
 * buttons and the two ExoPlayer views (silent / speaking).
 */
class UiManager(private val activity: Activity) {

    private var textView: TextView? = null
    private var scrollView: ScrollView? = null
    private var startButton: Button? = null
    private var stopButton: Button? = null
    private var videoPlayerManager: VideoPlayerManager? = null

    // Accumulated subtitle text mirrored into textView.
    private var lastUiText: String = ""

    /**
     * Resolves views by id and, when the two player views exist in the
     * layout, wires up the video avatar. Missing player views are tolerated.
     */
    fun initViews(
        textViewId: Int,
        scrollViewId: Int,
        startButtonId: Int,
        stopButtonId: Int,
        silentPlayerViewId: Int,
        speakingPlayerViewId: Int
    ) {
        textView = activity.findViewById(textViewId)
        scrollView = activity.findViewById(scrollViewId)
        startButton = activity.findViewById(startButtonId)
        stopButton = activity.findViewById(stopButtonId)

        textView?.movementMethod = ScrollingMovementMethod()

        try {
            val silentView = activity.findViewById<PlayerView>(silentPlayerViewId)
            val speakingView = activity.findViewById<PlayerView>(speakingPlayerViewId)
            videoPlayerManager = VideoPlayerManager(activity, silentView, speakingView).also {
                it.setSpeaking(false)
            }
        } catch (e: Exception) {
            Log.w(AppConfig.TAG, "PlayerViews not found or init failed: ${e.message}")
        }
    }

    /** Registers the start-button click handler. */
    fun setStartButtonListener(listener: () -> Unit) {
        startButton?.setOnClickListener { listener() }
    }

    /** Registers the stop-button click handler. */
    fun setStopButtonListener(listener: () -> Unit) {
        stopButton?.setOnClickListener { listener() }
    }

    /**
     * Appends [s] to the subtitle view and scrolls to the bottom.
     * No-op unless debug text display is enabled in config.
     */
    fun appendToUi(s: String) {
        if (!AppConfig.SHOW_DEBUG_TEXT) return

        lastUiText += s
        textView?.text = lastUiText
        scrollView?.post { scrollView?.fullScroll(ScrollView.FOCUS_DOWN) }
    }

    /** Empties the subtitle view and the cached text. */
    fun clearText() {
        lastUiText = ""
        textView?.text = ""
    }

    /** Replaces the subtitle view content with [text]. */
    fun setText(text: String) {
        lastUiText = text
        textView?.text = text
    }

    /** Enables/disables the two control buttons. */
    fun setButtonsEnabled(startEnabled: Boolean, stopEnabled: Boolean) {
        startButton?.isEnabled = startEnabled
        stopButton?.isEnabled = stopEnabled
    }

    /** Switches the video avatar between silent/speaking on the UI thread. */
    fun setSpeaking(speaking: Boolean) {
        activity.runOnUiThread { videoPlayerManager?.setSpeaking(speaking) }
    }

    /** Shows a toast, hopping to the UI thread first. */
    fun showToast(message: String, duration: Int = Toast.LENGTH_SHORT) {
        activity.runOnUiThread { Toast.makeText(activity, message, duration).show() }
    }

    /** Shows a toast; caller must already be on the UI thread. */
    fun showToastOnUi(message: String, duration: Int = Toast.LENGTH_SHORT) {
        Toast.makeText(activity, message, duration).show()
    }

    /** Releases the video players and drops the reference. */
    fun release() {
        videoPlayerManager?.release()
        videoPlayerManager = null
    }

    /** Clears only the cached subtitle text (view untouched). */
    fun reset() {
        lastUiText = ""
    }
}
|
||||
60
app/src/main/java/com/digitalperson/util/FileHelper.kt
Normal file
60
app/src/main/java/com/digitalperson/util/FileHelper.kt
Normal file
@@ -0,0 +1,60 @@
|
||||
package com.digitalperson.util
|
||||
|
||||
import android.content.Context
|
||||
import android.util.Log
|
||||
import com.digitalperson.config.AppConfig
|
||||
import java.io.File
|
||||
import java.io.FileOutputStream
|
||||
|
||||
/**
 * Asset/file utilities: asset existence checks, copying model files from the
 * APK assets into internal storage, and directory helpers.
 */
object FileHelper {
    private const val TAG = AppConfig.TAG

    /** Returns true if the asset at [path] exists and can be opened. */
    fun assetExists(context: Context, path: String): Boolean {
        return try {
            context.assets.open(path).close()
            true
        } catch (_: Throwable) {
            false
        }
    }

    /**
     * Copies each of [files] from asset directory [assetDir] into [targetDir],
     * skipping files that already exist with non-zero size. Returns [targetDir].
     *
     * Bug fix: a copy that failed midway used to leave a truncated file behind,
     * which the non-zero-size check then treated as complete on every later
     * run. The partial file is now deleted on failure so the copy is retried.
     */
    fun copyAssetsToInternal(context: Context, assetDir: String, targetDir: File, files: Array<String>): File {
        if (!targetDir.exists()) targetDir.mkdirs()

        for (name in files) {
            val assetPath = "$assetDir/$name"
            val outFile = File(targetDir, name)
            // Already copied on a previous run — skip.
            if (outFile.exists() && outFile.length() > 0) continue
            try {
                context.assets.open(assetPath).use { input ->
                    FileOutputStream(outFile).use { output ->
                        input.copyTo(output)
                    }
                }
            } catch (e: Exception) {
                Log.e(TAG, "Failed to copy asset $assetPath: ${e.message}")
                // Remove any partially written file so a later run retries it
                // instead of treating the truncated copy as complete.
                if (outFile.exists()) outFile.delete()
            }
        }
        return targetDir
    }

    /** Copies the SenseVoice ASR model files into filesDir; returns that dir. */
    fun copySenseVoiceAssets(context: Context): File {
        val outDir = File(context.filesDir, AppConfig.Asr.MODEL_DIR)
        val files = arrayOf(
            "am.mvn",
            "chn_jpn_yue_eng_ko_spectok.bpe.model",
            "embedding.npy",
            "sense-voice-encoder.rknn"
        )
        return copyAssetsToInternal(context, AppConfig.Asr.MODEL_DIR, outDir, files)
    }

    /** Creates [dir] (and parents) if missing and returns it. */
    fun ensureDir(dir: File): File {
        if (!dir.exists()) dir.mkdirs()
        return dir
    }

    /** Directory under filesDir used for captured ASR audio dumps. */
    fun getAsrAudioDir(context: Context): File {
        return ensureDir(File(context.filesDir, "asr_audio"))
    }
}
|
||||
216
app/src/main/java/com/digitalperson/vad/VadManager.kt
Normal file
216
app/src/main/java/com/digitalperson/vad/VadManager.kt
Normal file
@@ -0,0 +1,216 @@
|
||||
package com.digitalperson.vad
|
||||
|
||||
import android.content.Context
|
||||
import android.util.Log
|
||||
import com.digitalperson.config.AppConfig
|
||||
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
|
||||
import com.k2fsa.sherpa.onnx.Vad
|
||||
import com.k2fsa.sherpa.onnx.VadModelConfig
|
||||
import java.io.File
|
||||
import kotlin.math.max
|
||||
|
||||
class VadManager(private val context: Context) {
|
||||
|
||||
companion object {
|
||||
private const val TAG = "VadManager"
|
||||
}
|
||||
|
||||
private var vad: Vad? = null
|
||||
private val nativeLock = Any()
|
||||
|
||||
private var inSpeech = false
|
||||
private var silenceSamples = 0
|
||||
private var speechBuf = FloatArray(0)
|
||||
private var speechLen = 0
|
||||
private var processedSpeechBuf = FloatArray(0)
|
||||
private var processedSpeechLen = 0
|
||||
|
||||
private val minSilenceSamples = (AppConfig.Vad.MIN_SILENCE_DURATION * AppConfig.SAMPLE_RATE).toInt()
|
||||
private val minSpeechSamples = (AppConfig.Vad.MIN_SPEECH_DURATION * AppConfig.SAMPLE_RATE).toInt()
|
||||
private val maxSpeechSamples = (AppConfig.Vad.MAX_SPEECH_DURATION * AppConfig.SAMPLE_RATE).toInt()
|
||||
|
||||
var vadComputeCount = 0
|
||||
private set
|
||||
|
||||
interface VadCallback {
|
||||
fun onSpeechSegmentReady(originalAudio: FloatArray, processedAudio: FloatArray)
|
||||
fun shouldSkipProcessing(): Boolean
|
||||
}
|
||||
|
||||
private var callback: VadCallback? = null
|
||||
|
||||
fun setCallback(callback: VadCallback) {
|
||||
this.callback = callback
|
||||
}
|
||||
|
||||
/**
 * Creates the sherpa-onnx Silero VAD from the bundled asset model.
 * Returns true on success; on failure logs and leaves [vad] null.
 *
 * NOTE(review): the thresholds/durations here (0.5 / 0.25 s) configure the
 * Silero model internally and are separate from the app-level hysteresis in
 * AppConfig.Vad used by processAudioChunk — confirm this layering is intended.
 */
fun initVadModel(): Boolean {
    return try {
        val config = VadModelConfig(
            sileroVadModelConfig = SileroVadModelConfig(
                model = "vad_model/silero_vad.onnx",
                threshold = 0.5F,
                minSilenceDuration = 0.25F,
                minSpeechDuration = 0.25F,
                windowSize = AppConfig.WINDOW_SIZE,
            ),
            sampleRate = AppConfig.SAMPLE_RATE,
            numThreads = 1,
            provider = "cpu",
        )
        vad = Vad(assetManager = context.assets, config = config)
        Log.i(TAG, "VAD model initialized successfully")
        true
    } catch (e: Exception) {
        Log.e(TAG, "Failed to initialize VAD model: ${e.message}", e)
        false
    }
}
|
||||
|
||||
fun reset() {
|
||||
vad?.reset()
|
||||
inSpeech = false
|
||||
silenceSamples = 0
|
||||
speechLen = 0
|
||||
processedSpeechLen = 0
|
||||
vadComputeCount = 0
|
||||
}
|
||||
|
||||
fun release() {
|
||||
try {
|
||||
vad?.release()
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "Error releasing VAD: ${e.message}")
|
||||
}
|
||||
vad = null
|
||||
}
|
||||
|
||||
/**
 * Feeds one audio chunk through the VAD state machine.
 *
 * [processedChunk] (e.g. the noise-suppressed stream — TODO confirm which
 * pre-processing the caller applies) drives the speech-probability compute;
 * both the raw [chunk] and [processedChunk] are buffered so the finalized
 * segment can expose both variants.
 *
 * State machine:
 *  - silence -> speech when prob >= START_THRESHOLD
 *  - speech  -> silence after MIN_SILENCE_DURATION of prob <= END_THRESHOLD
 *  - speech is force-finalized at MAX_SPEECH_DURATION
 */
fun processAudioChunk(chunk: FloatArray, processedChunk: FloatArray): VadResult {
    // Native VAD handle is not thread-safe; serialize compute calls.
    val prob = synchronized(nativeLock) {
        vad?.compute(processedChunk) ?: 0f
    }
    vadComputeCount++

    val result = when {
        // Silence -> speech transition: start buffering immediately.
        !inSpeech && prob >= AppConfig.Vad.START_THRESHOLD -> {
            inSpeech = true
            silenceSamples = 0
            appendSpeech(chunk, processedChunk)
            Log.d(TAG, "VAD: Entered speech state, prob=$prob, speechLen=$speechLen")
            VadResult.SpeechStarted(prob)
        }
        // In speech but the frame looks silent: accumulate silence until the
        // minimum silence duration closes the segment. Sub-threshold silence
        // is still buffered so short pauses stay inside the segment.
        inSpeech && prob <= AppConfig.Vad.END_THRESHOLD -> {
            silenceSamples += chunk.size
            if (silenceSamples >= minSilenceSamples) {
                Log.d(TAG, "VAD: Exiting speech state, prob=$prob, silenceSamples=$silenceSamples, speechLen=$speechLen")
                finalizeSegmentIfAny()
                VadResult.SpeechEnded(prob)
            } else {
                appendSpeech(chunk, processedChunk)
                VadResult.SpeechContinuing(prob)
            }
        }
        // In speech with an active frame: buffer and reset the silence run.
        inSpeech -> {
            appendSpeech(chunk, processedChunk)
            silenceSamples = 0

            // Hard cap: never let one segment exceed MAX_SPEECH_DURATION.
            if (speechLen >= maxSpeechSamples) {
                Log.d(TAG, "VAD: Max speech length reached, finalizing segment")
                finalizeSegmentIfAny()
                VadResult.MaxSpeechReached(prob)
            } else {
                VadResult.SpeechContinuing(prob)
            }
        }
        // Not in speech and below start threshold: plain silence.
        else -> {
            VadResult.Silence(prob)
        }
    }

    return result
}
|
||||
|
||||
fun forceFinalize() {
|
||||
finalizeSegmentIfAny()
|
||||
}
|
||||
|
||||
fun isInSpeech(): Boolean = inSpeech
|
||||
|
||||
fun getSpeechLength(): Int = speechLen
|
||||
|
||||
fun clearState() {
|
||||
inSpeech = false
|
||||
silenceSamples = 0
|
||||
speechLen = 0
|
||||
processedSpeechLen = 0
|
||||
}
|
||||
|
||||
/**
 * Appends one frame to both accumulation buffers, capping each at
 * [maxSpeechSamples]. Samples beyond the cap are silently dropped
 * ([processAudioChunk] finalizes the segment when the cap is reached).
 *
 * Note: the original used a bare `max(0, ...)` here while the rest of the
 * class uses `maxOf`; normalized to `maxOf` (no `kotlin.math.max` import
 * is visible in this file).
 *
 * @param chunk raw (unprocessed) samples for this frame.
 * @param processedChunk the same frame after pre-processing.
 */
private fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) {
    // Raw buffer.
    speechBuf = growIfNeeded(speechBuf, speechLen, speechLen + chunk.size)
    val copyN = minOf(chunk.size, maxOf(0, maxSpeechSamples - speechLen))
    if (copyN > 0) {
        System.arraycopy(chunk, 0, speechBuf, speechLen, copyN)
        speechLen += copyN
    }

    // Processed buffer.
    processedSpeechBuf =
        growIfNeeded(processedSpeechBuf, processedSpeechLen, processedSpeechLen + processedChunk.size)
    val processedCopyN = minOf(processedChunk.size, maxOf(0, maxSpeechSamples - processedSpeechLen))
    if (processedCopyN > 0) {
        System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN)
        processedSpeechLen += processedCopyN
    }
}

/**
 * Returns a buffer with room for [needed] samples, growing geometrically
 * (at least 1024, at most doubling) but never beyond [maxSpeechSamples].
 * The first [len] samples are preserved when a new array is allocated;
 * returns [buf] unchanged when it is already large enough or already at
 * the hard cap.
 */
private fun growIfNeeded(buf: FloatArray, len: Int, needed: Int): FloatArray {
    if (buf.size >= needed) return buf
    val newCap = minOf(maxOf(needed, maxOf(1024, buf.size * 2)), maxSpeechSamples)
    // Already at the cap: avoid a pointless same-size reallocation.
    if (newCap <= buf.size) return buf
    val grown = FloatArray(newCap)
    if (len > 0) System.arraycopy(buf, 0, grown, 0, len)
    return grown
}
|
||||
|
||||
/**
 * Ends the current speech segment, if any, and hands it to the callback.
 *
 * Segments shorter than [minSpeechSamples] are discarded, and the callback
 * may veto processing via `shouldSkipProcessing()`. In all paths the
 * tracking state is reset via the existing [clearState] — the original
 * duplicated the same four-assignment reset three times. The segment
 * copies are taken *before* the reset so the delivered arrays are stable.
 */
private fun finalizeSegmentIfAny() {
    Log.d(TAG, "finalizeSegmentIfAny called: speechLen=$speechLen, minSpeechSamples=$minSpeechSamples")

    if (speechLen < minSpeechSamples) {
        Log.d(TAG, "finalizeSegmentIfAny: speech too short, discarding")
        clearState()
        return
    }

    if (callback?.shouldSkipProcessing() == true) {
        Log.d(TAG, "finalizeSegmentIfAny: skipping due to callback")
        clearState()
        return
    }

    // Snapshot both segments before resetting state.
    val originalSeg = speechBuf.copyOf(speechLen)
    val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen)
    clearState()

    Log.d(TAG, "Sending audio segment to callback, size: ${processedSeg.size}")
    callback?.onSpeechSegmentReady(originalSeg, processedSeg)
}
|
||||
|
||||
/**
 * Outcome of processing a single audio chunk through the VAD, carrying the
 * raw speech [probability] the model reported for that chunk.
 */
sealed class VadResult(val probability: Float) {
    /** Transitioned from silence into speech (probability crossed the start threshold). */
    class SpeechStarted(prob: Float) : VadResult(prob)
    /** Speech ended: enough trailing silence accumulated and the segment was finalized. */
    class SpeechEnded(prob: Float) : VadResult(prob)
    /** Still inside a speech segment (including pauses shorter than the silence window). */
    class SpeechContinuing(prob: Float) : VadResult(prob)
    /** Segment hit the maximum length and was force-finalized. */
    class MaxSpeechReached(prob: Float) : VadResult(prob)
    /** No speech activity detected for this chunk. */
    class Silence(prob: Float) : VadResult(prob)
}
|
||||
}
|
||||
Reference in New Issue
Block a user