Files
digital_person/app/src/main/java/com/digitalperson/asr/AsrManager.kt
gcw_4spBpAfv 4e33063a98 add photo
2026-04-23 15:21:24 +08:00

253 lines
9.6 KiB
Kotlin
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package com.digitalperson.asr
import android.content.Context
import android.os.SystemClock
import android.util.Log
import com.digitalperson.BuildConfig
import com.digitalperson.audio.AudioProcessor
import com.digitalperson.config.AppConfig
import com.digitalperson.engine.SenseVoiceEngineRKNN
import com.digitalperson.env.RuntimeEnv
import com.digitalperson.util.FileHelper
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.channels.Channel
import kotlinx.coroutines.currentCoroutineContext
import kotlinx.coroutines.isActive
import kotlinx.coroutines.withContext
import java.io.File
import kotlin.coroutines.cancellation.CancellationException
/**
 * Owns the speech-recognition pipeline: a SenseVoice RKNN engine, an unbounded queue of
 * audio segments, and a worker loop ([runAsrWorker]) that transcribes each segment and
 * reports the outcome through an [AsrCallback].
 *
 * Calls into the native engine made by this class are serialised on [nativeLock].
 */
class AsrManager(private val context: Context) {
    companion object {
        private const val TAG = "AsrManager"

        // Compiled once; removeTokens() runs for every recognised segment.
        private val SPECIAL_TOKEN_REGEX = Regex("<\\|[^>]+\\|>")

        // NOTE(review): the character class lists '>' twice as it appears here; the hosting page
        // warned about ambiguous Unicode, so one of them may originally be a look-alike
        // character — confirm against the repository copy.
        private val ARROW_CHARS_REGEX = Regex("[>>≥≫]")
        private val WHITESPACE_RUN_REGEX = Regex("\\s+")
    }

    /**
     * Hooks invoked by [runAsrWorker] while it processes segments.
     */
    interface AsrCallback {
        /** Called right before a segment that was not skipped is transcribed. */
        fun onAsrStarted()

        /** Called with the recognised text once it has passed the internal text filter. */
        fun onAsrResult(text: String)

        /** Called with a short reason when the recognised text was rejected by the filter. */
        fun onAsrSkipped(reason: String)

        /** When this returns true the worker drops the segment without transcribing it. */
        fun shouldSkipAsr(): Boolean

        /** When this returns true the worker drops the segment without transcribing it. */
        fun isLlmInFlight(): Boolean

        /**
         * Called after [onAsrResult] when `BuildConfig.LLM_API_KEY` is not blank.
         * Presumably the point where the implementer forwards the text to the LLM.
         */
        fun onLlmCalled(text: String)
    }

    // Written by initSenseVoiceModel()/release() and read by isInitialized() without the lock,
    // possibly from a different thread than the worker, hence volatile.
    @Volatile
    private var senseVoice: SenseVoiceEngineRKNN? = null

    // Guards transcription and teardown of the native engine.
    private val nativeLock = Any()

    // Pairs are (original audio, processed audio) as passed to enqueueAudioSegment().
    private val asrQueue = Channel<Pair<FloatArray, FloatArray>>(capacity = Channel.UNLIMITED)
    private var audioProcessor: AudioProcessor? = null
    private var callback: AsrCallback? = null

    fun setCallback(callback: AsrCallback) {
        this.callback = callback
    }

    fun setAudioProcessor(audioProcessor: AudioProcessor) {
        this.audioProcessor = audioProcessor
    }

    /**
     * Copies the SenseVoice assets and loads the RKNN model.
     *
     * @return true when the engine was loaded and stored; false on the emulator or on any
     * failure (the failure is logged, not thrown).
     */
    fun initSenseVoiceModel(): Boolean {
        if (RuntimeEnv.isEmulator()) {
            Log.w(TAG, "ASR: emulator detected; skip local RKNN init and use cloud ASR")
            return false
        }
        return try {
            Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")
            val modelDir = FileHelper.copySenseVoiceAssets(context)
            val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath
            val embeddingPath = File(modelDir, "embedding.npy").absolutePath
            val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath

            logNativeLibraryDir()
            Log.i(TAG, "SenseVoice model paths:")
            logModelFile("model", modelPath)
            logModelFile("embedding", embeddingPath)
            logModelFile("bpe", bpePath)

            val startMs = SystemClock.elapsedRealtime()
            val engine = try {
                SenseVoiceEngineRKNN(context)
            } catch (e: UnsatisfiedLinkError) {
                // Converted to an Exception so the outer handler reports it and returns false.
                throw IllegalStateException("Load native libraries failed: ${e.message}", e)
            }
            val ok = try {
                engine.loadModelDirectly(modelPath, embeddingPath, bpePath)
            } catch (t: Throwable) {
                throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t)
            }
            val dt = SystemClock.elapsedRealtime() - startMs
            Log.i(TAG, "SenseVoice loadModelDirectly ok=$ok costMs=$dt")
            check(ok) { "SenseVoiceEngineRKNN loadModelDirectly returned false" }
            senseVoice = engine
            true
        } catch (e: Exception) {
            Log.e(TAG, "Failed to initialize SenseVoice model: ${e.message}", e)
            false
        }
    }

    /**
     * Queues one audio segment for the worker.
     *
     * The channel is unbounded, so a send only fails when the channel has been closed;
     * that case is logged instead of being dropped silently.
     */
    fun enqueueAudioSegment(originalAudio: FloatArray, processedAudio: FloatArray) {
        val result = asrQueue.trySend(Pair(originalAudio, processedAudio))
        if (result.isFailure) {
            val reason = result.exceptionOrNull()?.message
                ?: if (result.isClosed) "channel closed" else "channel rejected element"
            Log.e(TAG, "Failed to enqueue audio segment: $reason")
        }
    }

    /** Discards every segment currently waiting in the queue. */
    fun clearQueue() {
        // tryReceive() never suspends; loop until it reports nothing left.
        while (asrQueue.tryReceive().isSuccess) { }
    }

    /**
     * Worker loop: receives segments, transcribes them and notifies the callback.
     *
     * Runs until the calling coroutine is cancelled or receiving from the queue fails.
     * Cancellation is propagated to the caller rather than being logged as an error;
     * any other failure is logged and ends the loop.
     */
    suspend fun runAsrWorker() {
        Log.d(TAG, "ASR worker started")
        try {
            while (currentCoroutineContext().isActive) {
                val (originalSeg, processedSeg) = try {
                    Log.d(TAG, "ASR worker waiting for audio segment")
                    asrQueue.receive()
                } catch (e: CancellationException) {
                    throw e
                } catch (e: Throwable) {
                    Log.e(TAG, "ASR worker receive failed: ${e.message}")
                    break
                }
                Log.d(TAG, "ASR worker received audio segment, size=${processedSeg.size}")

                val cb = callback
                // Query each flag once so the log line shows the values that drove the decision.
                val shouldSkip = cb?.shouldSkipAsr()
                val llmInFlight = cb?.isLlmInFlight()
                if (shouldSkip == true || llmInFlight == true) {
                    Log.d(TAG, "ASR worker skipping segment: shouldSkip=$shouldSkip, llmInFlight=$llmInFlight")
                    continue
                }

                cb?.onAsrStarted()
                Log.d(TAG, "ASR started: processing audio segment")
                saveAsrAudio(originalSeg, processedSeg)

                val localText = transcribeLocally(processedSeg)
                val text = if (localText.isNotBlank()) localText else transcribeWithCloud(originalSeg)

                val skipReason = filterText(text)
                if (skipReason != null) {
                    cb?.onAsrSkipped(skipReason)
                    continue
                }
                cb?.onAsrResult(text)
                if (BuildConfig.LLM_API_KEY.isBlank()) {
                    Log.w(TAG, "LLM API Key is not configured")
                    continue
                }
                cb?.onLlmCalled(text)
            }
        } catch (e: CancellationException) {
            throw e
        } catch (e: Throwable) {
            Log.e(TAG, "ASR worker error: ${e.message}", e)
        } finally {
            Log.d(TAG, "ASR worker exiting")
        }
    }

    /**
     * Deinitialises the engine and empties the queue.
     *
     * Takes [nativeLock] so the engine is not torn down while the worker is inside
     * `transcribeBuffer`; the call therefore waits for an in-progress transcription.
     */
    fun release() {
        synchronized(nativeLock) {
            try {
                senseVoice?.deinitialize()
            } catch (e: Exception) {
                Log.e(TAG, "Error deinitializing SenseVoice: ${e.message}")
            }
            senseVoice = null
        }
        clearQueue()
    }

    /** True when an engine is present and reports itself initialised. */
    fun isInitialized(): Boolean = senseVoice?.isInitialized ?: false

    /** Logs the app's native library directory and its contents (diagnostics only). */
    private fun logNativeLibraryDir() {
        try {
            val libDir = context.applicationInfo.nativeLibraryDir
            Log.i(TAG, "nativeLibraryDir=$libDir")
            val names = File(libDir).list()?.joinToString(", ") ?: "(empty)"
            Log.i(TAG, "nativeLibraryDir files: $names")
        } catch (t: Throwable) {
            // Best effort: diagnostics must never prevent model initialisation.
            Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}")
        }
    }

    /** Logs path, existence and size of one model file. */
    private fun logModelFile(label: String, path: String) {
        val file = File(path)
        Log.i(TAG, " $label=$path exists=${file.exists()} size=${file.length()}")
    }

    /**
     * Transcribes with the local engine under [nativeLock].
     *
     * @return cleaned, trimmed text; empty when the engine is missing, not initialised,
     * or the transcription threw.
     */
    private fun transcribeLocally(processedSeg: FloatArray): String = synchronized(nativeLock) {
        val engine = senseVoice
        if (engine == null || !engine.isInitialized) {
            ""
        } else {
            try {
                removeTokens(engine.transcribeBuffer(processedSeg))
            } catch (t: Throwable) {
                Log.e(TAG, "ASR transcribe failed: ${t.message}")
                ""
            }
        }
    }.trim()

    /**
     * Fallback used when local transcription produced no text.
     *
     * On the emulator, or when the local RKNN engine is not ready, uses the Tencent Cloud
     * "one-sentence recognition" SDK (app/libs/asr-one-sentence-release.aar) if it is bundled.
     *
     * @return trimmed cloud text, or an empty string when the fallback is not used or fails.
     */
    private suspend fun transcribeWithCloud(originalSeg: FloatArray): String {
        val localReady = isInitialized()
        val shouldTryTencent =
            BuildConfig.HAS_TENCENT_ASR_SDK && (RuntimeEnv.isEmulator() || !localReady)
        if (!shouldTryTencent) {
            if (localReady) {
                // The engine ran but recognised nothing; "not ready" would be misleading here.
                Log.w(TAG, "ASR: local RKNN returned no text; cloud fallback not used")
            } else {
                Log.e(
                    TAG,
                    "ASR failed: local RKNN not ready and Tencent SDK unavailable " +
                        "(add libs/asr-one-sentence-release.aar or fix SenseVoice init)"
                )
            }
            return ""
        }
        return withContext(Dispatchers.IO) {
            try {
                // Cloud ASR uses the original recording (without AEC/NS): on the emulator
                // AEC/NS are unavailable and processedSeg may have been reduced to near silence.
                TencentOneSentenceAsr.transcribePcm16Mono(originalSeg)
            } catch (e: CancellationException) {
                throw e
            } catch (t: Throwable) {
                Log.e(TAG, "Tencent ASR failed: ${t.message}")
                ""
            }
        }.trim()
    }

    /**
     * Writes both versions of a segment as WAV files into the ASR audio directory.
     * Does nothing when no [AudioProcessor] has been set; failures are logged only.
     */
    private fun saveAsrAudio(originalAudio: FloatArray, processedAudio: FloatArray) {
        try {
            val timestamp = System.currentTimeMillis()
            val asrAudioDir = FileHelper.getAsrAudioDir(context)
            audioProcessor?.let { processor ->
                // Both files share one timestamp so they can be matched up later.
                val originalFile = File(asrAudioDir, "asr_${timestamp}_original.wav")
                processor.saveAudioAsWav(originalFile, originalAudio, AppConfig.SAMPLE_RATE)
                val processedFile = File(asrAudioDir, "asr_${timestamp}_processed.wav")
                processor.saveAudioAsWav(processedFile, processedAudio, AppConfig.SAMPLE_RATE)
            }
        } catch (e: Exception) {
            Log.e(TAG, "Error saving ASR audio: ${e.message}")
        }
    }

    /**
     * Strips `<|...|>` markers and stray arrow-like characters from engine output,
     * then trims it and collapses whitespace runs to single spaces.
     */
    private fun removeTokens(text: String): String =
        text.replace(SPECIAL_TOKEN_REGEX, "")
            .replace(ARROW_CHARS_REGEX, "")
            .trim()
            .replace(WHITESPACE_RUN_REGEX, " ")

    /**
     * Decides whether recognised text should be discarded.
     *
     * @return a short reason when the text is rejected, or null when it is acceptable.
     */
    private fun filterText(text: String): String? = when {
        text.isBlank() -> "blank text"
        text.length == 1 && text[0].equals('i', ignoreCase = true) -> "single 'i'"
        text.length > AppConfig.Asr.MAX_TEXT_LENGTH -> "too long (${text.length} chars)"
        else -> null
    }
}