253 lines
9.6 KiB
Kotlin
253 lines
9.6 KiB
Kotlin
package com.digitalperson.asr
|
||
|
||
import android.content.Context
|
||
import android.os.SystemClock
|
||
import android.util.Log
|
||
import com.digitalperson.BuildConfig
|
||
import com.digitalperson.audio.AudioProcessor
|
||
import com.digitalperson.config.AppConfig
|
||
import com.digitalperson.env.RuntimeEnv
|
||
import com.digitalperson.engine.SenseVoiceEngineRKNN
|
||
import com.digitalperson.util.FileHelper
|
||
import kotlinx.coroutines.Dispatchers
|
||
import kotlinx.coroutines.channels.Channel
|
||
import kotlinx.coroutines.currentCoroutineContext
|
||
import kotlinx.coroutines.isActive
|
||
import kotlinx.coroutines.withContext
|
||
import java.io.File
|
||
|
||
/**
 * Coordinates speech recognition for queued audio segments.
 *
 * The manager owns an optional on-device [SenseVoiceEngineRKNN] instance, an unbounded queue of
 * `(original, processed)` audio segment pairs, and a worker loop ([runAsrWorker]) that turns each
 * segment into text and reports it through [AsrCallback]. When the local engine yields no text and
 * is not initialised (or the app runs on an emulator), the worker falls back to
 * `TencentOneSentenceAsr` if `BuildConfig.HAS_TENCENT_ASR_SDK` is set.
 *
 * Every call into the native engine (transcription, publication, deinitialisation) happens while
 * holding an internal lock, so [release] cannot tear the engine down in the middle of a transcription.
 */
class AsrManager(private val context: Context) {

    /** Hooks through which the worker reports progress and asks the host whether to proceed. */
    interface AsrCallback {
        /** Invoked right before a dequeued segment is transcribed. */
        fun onAsrStarted()

        /** Invoked with the recognised text once it has passed the internal text filter. */
        fun onAsrResult(text: String)

        /** Invoked with a short reason when the recognised text is rejected by the text filter. */
        fun onAsrSkipped(reason: String)

        /** When this returns `true`, the dequeued segment is dropped without being transcribed. */
        fun shouldSkipAsr(): Boolean

        /** When this returns `true`, the dequeued segment is dropped without being transcribed. */
        fun isLlmInFlight(): Boolean

        /** Invoked after [onAsrResult] when `BuildConfig.LLM_API_KEY` is not blank. */
        fun onLlmCalled(text: String)
    }

    // Written under nativeLock; @Volatile so the lock-free read in isInitialized() sees updates.
    @Volatile
    private var senseVoice: SenseVoiceEngineRKNN? = null

    // Serialises every call into the native engine (transcribe / publish / deinitialize).
    private val nativeLock = Any()

    // Pending (original, processed) segment pairs waiting for the worker.
    private val asrQueue = Channel<Pair<FloatArray, FloatArray>>(capacity = Channel.UNLIMITED)

    private var audioProcessor: AudioProcessor? = null

    private var callback: AsrCallback? = null

    /** Registers the listener that receives worker events. */
    fun setCallback(callback: AsrCallback) {
        this.callback = callback
    }

    /** Registers the processor used to dump each segment to WAV files; without it nothing is saved. */
    fun setAudioProcessor(audioProcessor: AudioProcessor) {
        this.audioProcessor = audioProcessor
    }

    /**
     * Copies the SenseVoice assets and loads the RKNN model.
     *
     * @return `true` when the engine was loaded and stored; `false` on an emulator (local init is
     * skipped) or when any step of the initialisation fails. Failures are logged, not thrown.
     */
    fun initSenseVoiceModel(): Boolean {
        if (RuntimeEnv.isEmulator()) {
            Log.w(TAG, "ASR: emulator detected; skip local RKNN init and use cloud ASR")
            return false
        }
        return try {
            Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")

            val modelDir = FileHelper.copySenseVoiceAssets(context)
            val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath
            val embeddingPath = File(modelDir, "embedding.npy").absolutePath
            val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath

            logNativeLibraryDir()

            Log.i(TAG, "SenseVoice model paths:")
            logModelFile("model", modelPath)
            logModelFile("embedding", embeddingPath)
            logModelFile("bpe", bpePath)

            val startMs = SystemClock.elapsedRealtime()
            val engine = try {
                SenseVoiceEngineRKNN(context)
            } catch (e: UnsatisfiedLinkError) {
                // Convert the Error into an Exception so the outer handler reports it as a failed init.
                throw IllegalStateException("Load native libraries failed: ${e.message}", e)
            }

            val ok = try {
                engine.loadModelDirectly(modelPath, embeddingPath, bpePath)
            } catch (t: Throwable) {
                throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t)
            }

            val dt = SystemClock.elapsedRealtime() - startMs
            Log.i(TAG, "SenseVoice loadModelDirectly ok=$ok costMs=$dt")
            if (!ok) throw IllegalStateException("SenseVoiceEngineRKNN loadModelDirectly returned false")

            // Publish under the same lock the worker and release() use.
            synchronized(nativeLock) { senseVoice = engine }
            true
        } catch (e: Exception) {
            Log.e(TAG, "Failed to initialize SenseVoice model: ${e.message}", e)
            false
        }
    }

    /**
     * Queues one audio segment for recognition.
     *
     * @param originalAudio the unprocessed recording; used for the WAV dump and the cloud fallback.
     * @param processedAudio the processed recording; used for the WAV dump and local transcription.
     */
    fun enqueueAudioSegment(originalAudio: FloatArray, processedAudio: FloatArray) {
        // trySend reports failure through its result rather than by throwing, so inspect it.
        val result = asrQueue.trySend(Pair(originalAudio, processedAudio))
        if (result.isFailure) {
            Log.e(TAG, "Failed to enqueue audio segment: ${result.exceptionOrNull()?.message}")
        }
    }

    /** Discards every segment that is currently waiting in the queue. */
    fun clearQueue() {
        while (asrQueue.tryReceive().isSuccess) {
            // Drain until the queue reports no more elements.
        }
    }

    /**
     * Worker loop: receives queued segments one by one, transcribes them and notifies [AsrCallback].
     *
     * The loop ends when the coroutine is no longer active or when receiving from the queue fails.
     * Cancellation is rethrown to the caller; any other failure is logged and ends the loop.
     */
    suspend fun runAsrWorker() {
        Log.d(TAG, "ASR worker started")
        try {
            while (currentCoroutineContext().isActive) {
                val (originalSeg, processedSeg) = try {
                    Log.d(TAG, "ASR worker waiting for audio segment")
                    asrQueue.receive()
                } catch (e: kotlinx.coroutines.CancellationException) {
                    throw e
                } catch (e: Throwable) {
                    Log.e(TAG, "ASR worker receive failed: ${e.message}")
                    break
                }

                Log.d(TAG, "ASR worker received audio segment, size=${processedSeg.size}")

                // Snapshot the listener so one segment is reported to a single callback instance,
                // and evaluate each predicate once so the log matches the decision that was taken.
                val listener = callback
                val shouldSkip = listener?.shouldSkipAsr() == true
                val llmInFlight = listener?.isLlmInFlight() == true
                if (shouldSkip || llmInFlight) {
                    Log.d(TAG, "ASR worker skipping segment: shouldSkip=$shouldSkip, llmInFlight=$llmInFlight")
                    continue
                }

                listener?.onAsrStarted()
                Log.d(TAG, "ASR started: processing audio segment")

                saveAsrAudio(originalSeg, processedSeg)

                val localText = transcribeLocally(processedSeg)
                val text = if (localText.isNotBlank()) localText else transcribeWithCloudFallback(originalSeg)

                val skipReason = filterText(text)
                if (skipReason != null) {
                    listener?.onAsrSkipped(skipReason)
                    continue
                }

                listener?.onAsrResult(text)

                if (BuildConfig.LLM_API_KEY.isBlank()) {
                    Log.w(TAG, "LLM API Key is not configured")
                    continue
                }

                listener?.onLlmCalled(text)
            }
        } catch (e: kotlinx.coroutines.CancellationException) {
            // Cancellation is not an error: let it propagate.
            Log.d(TAG, "ASR worker cancelled")
            throw e
        } catch (e: Throwable) {
            Log.e(TAG, "ASR worker error: ${e.message}", e)
        } finally {
            Log.d(TAG, "ASR worker exiting")
        }
    }

    /**
     * Deinitialises the local engine (if any) and drops all queued segments.
     *
     * Takes the native lock, so it waits for an in-progress local transcription to finish before
     * deinitialising the engine.
     */
    fun release() {
        synchronized(nativeLock) {
            try {
                senseVoice?.deinitialize()
            } catch (e: Exception) {
                Log.e(TAG, "Error deinitializing SenseVoice: ${e.message}")
            }
            senseVoice = null
        }
        clearQueue()
    }

    /** @return `true` when a local engine is stored and reports itself as initialised. */
    fun isInitialized(): Boolean = senseVoice?.isInitialized ?: false

    /** Logs the native library directory and the files it contains; purely diagnostic. */
    private fun logNativeLibraryDir() {
        try {
            val libDir = context.applicationInfo.nativeLibraryDir
            Log.i(TAG, "nativeLibraryDir=$libDir")
            val names = File(libDir).list()?.joinToString(", ") ?: "(empty)"
            Log.i(TAG, "nativeLibraryDir files: $names")
        } catch (t: Throwable) {
            Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}")
        }
    }

    /** Logs path, existence and size of one model file. */
    private fun logModelFile(label: String, path: String) {
        val file = File(path)
        Log.i(TAG, " $label=$path exists=${file.exists()} size=${file.length()}")
    }

    /**
     * Transcribes [processedSeg] with the local engine while holding the native lock.
     *
     * @return the cleaned, trimmed text, or an empty string when no initialised engine is available
     * or the transcription throws.
     */
    private fun transcribeLocally(processedSeg: FloatArray): String =
        synchronized(nativeLock) {
            val engine = senseVoice
            if (engine == null || !engine.isInitialized) {
                ""
            } else {
                try {
                    removeTokens(engine.transcribeBuffer(processedSeg))
                } catch (t: Throwable) {
                    Log.e(TAG, "ASR transcribe failed: ${t.message}")
                    ""
                }
            }
        }.trim()

    /**
     * Cloud fallback used when local transcription produced no text.
     *
     * Emulator, or local RKNN not ready: use the Tencent Cloud "one-sentence recognition" SDK
     * (app/libs/asr-one-sentence-release.aar).
     *
     * @return the trimmed cloud result, or an empty string when the fallback is not applicable or fails.
     */
    private suspend fun transcribeWithCloudFallback(originalSeg: FloatArray): String {
        val localReady = isInitialized()
        val shouldTryTencent =
            BuildConfig.HAS_TENCENT_ASR_SDK && (RuntimeEnv.isEmulator() || !localReady)
        if (!shouldTryTencent) {
            if (localReady) {
                // The local engine works and simply recognised nothing; this is not a failure.
                Log.d(TAG, "ASR: local engine returned empty text; cloud fallback not applicable")
            } else {
                Log.e(
                    TAG,
                    "ASR failed: local RKNN not ready and Tencent SDK unavailable " +
                        "(add libs/asr-one-sentence-release.aar or fix SenseVoice init)"
                )
            }
            return ""
        }
        return withContext(Dispatchers.IO) {
            try {
                // Cloud ASR uses the original recording (without AEC/NS): on the emulator AEC/NS
                // is unavailable, so processedSeg may have been processed into near-silence.
                TencentOneSentenceAsr.transcribePcm16Mono(originalSeg)
            } catch (e: kotlinx.coroutines.CancellationException) {
                throw e
            } catch (t: Throwable) {
                Log.e(TAG, "Tencent ASR failed: ${t.message}")
                ""
            }
        }.trim()
    }

    /**
     * Writes both variants of a segment as WAV files into the ASR audio directory.
     * Does nothing beyond resolving the directory when no audio processor has been set;
     * failures are logged and ignored.
     */
    private fun saveAsrAudio(originalAudio: FloatArray, processedAudio: FloatArray) {
        try {
            val timestamp = System.currentTimeMillis()
            val asrAudioDir = FileHelper.getAsrAudioDir(context)

            audioProcessor?.let { processor ->
                val originalFile = File(asrAudioDir, "asr_${timestamp}_original.wav")
                processor.saveAudioAsWav(originalFile, originalAudio, AppConfig.SAMPLE_RATE)
                val processedFile = File(asrAudioDir, "asr_${timestamp}_processed.wav")
                processor.saveAudioAsWav(processedFile, processedAudio, AppConfig.SAMPLE_RATE)
            }
        } catch (e: Exception) {
            Log.e(TAG, "Error saving ASR audio: ${e.message}")
        }
    }

    /**
     * Removes `<|...|>` marker tokens and stray arrow-like characters from [text], then trims it
     * and collapses every whitespace run into a single space.
     */
    private fun removeTokens(text: String): String =
        text.replace(SPECIAL_TOKEN_REGEX, "")
            .replace(ARROW_CHARS_REGEX, "")
            .trim()
            .replace(WHITESPACE_RUN_REGEX, " ")

    /**
     * Decides whether recognised [text] should be dropped.
     *
     * @return a short human-readable reason when the text must be skipped, or `null` when it is acceptable.
     */
    private fun filterText(text: String): String? = when {
        text.isBlank() -> "blank text"
        text.length == 1 && text[0].equals('i', ignoreCase = true) -> "single 'i'"
        text.length > AppConfig.Asr.MAX_TEXT_LENGTH -> "too long (${text.length} chars)"
        else -> null
    }

    companion object {
        private const val TAG = "AsrManager"

        // Compiled once instead of on every removeTokens() call.
        private val SPECIAL_TOKEN_REGEX = Regex("<\\|[^>]+\\|>")

        // NOTE(review): the duplicated '>' may originally have been a full-width '＞' — confirm.
        private val ARROW_CHARS_REGEX = Regex("[>>≥≫]")

        private val WHITESPACE_RUN_REGEX = Regex("\\s+")
    }
}
|