add photo

This commit is contained in:
gcw_4spBpAfv
2026-04-23 15:21:24 +08:00
parent 1550783eef
commit 4e33063a98
44 changed files with 3567 additions and 64 deletions

View File

@@ -6,6 +6,7 @@ import android.content.Context
import android.content.pm.PackageManager
import android.content.res.ColorStateList
import android.graphics.Color
import android.graphics.BitmapFactory
import android.os.Build
import android.os.Bundle
import android.os.Handler
@@ -16,6 +17,7 @@ import android.util.Log
import android.view.MotionEvent
import android.view.ViewGroup
import android.widget.Button
import android.widget.ImageView
import android.widget.TextView
import android.widget.Toast
import androidx.core.app.ActivityCompat
@@ -25,7 +27,6 @@ import android.view.View
import androidx.lifecycle.Lifecycle
import androidx.lifecycle.LifecycleOwner
import androidx.lifecycle.LifecycleRegistry
import android.widget.ImageView
import com.unity3d.player.UnityPlayer
import com.unity3d.player.UnityPlayerActivity
import com.digitalperson.audio.AudioProcessor
@@ -33,6 +34,7 @@ import com.digitalperson.asr.AsrManager
import com.digitalperson.cloud.CloudApiManager
import com.digitalperson.cloud.CloudReflectionHelper
import com.digitalperson.config.AppConfig
import com.digitalperson.embedding.RefImageMatcher
import com.digitalperson.question.QuestionGenerationAgent
import com.digitalperson.data.AppDatabase
import com.digitalperson.face.FaceDetectionPipeline
@@ -48,8 +50,6 @@ import com.digitalperson.tts.TtsController
import com.digitalperson.util.FileHelper
import com.digitalperson.vad.VadManager
import kotlinx.coroutines.*
import com.digitalperson.embedding.RefImageMatcher
import android.graphics.BitmapFactory
class UnityDigitalPersonActivity : UnityPlayerActivity(), LifecycleOwner {
@@ -260,6 +260,11 @@ class UnityDigitalPersonActivity : UnityPlayerActivity(), LifecycleOwner {
recordButtonGlow = chatLayout.findViewById(R.id.record_button_glow)
refMatchImageView = chatLayout.findViewById(R.id.ref_match_image)
if (!AppConfig.SHOW_DEBUG_TEXT) {
chatHistoryText.visibility = View.GONE
chatLayout.findViewById<View>(R.id.scroll_view).visibility = View.GONE
}
// 根据配置设置按钮可见性
if (AppConfig.USE_HOLD_TO_SPEAK) {
holdToSpeakButton.visibility = View.VISIBLE
@@ -366,6 +371,8 @@ class UnityDigitalPersonActivity : UnityPlayerActivity(), LifecycleOwner {
override fun onLlmCalled(text: String) {
Log.d("UnityDigitalPerson", "LLM called with: $text")
interactionCoordinator.onUserAsrText(text)
// 用用户问题提前匹配:比等 LLM 回复更早显示图片(模拟器/真机通用)
maybeShowMatchedRefImage(text)
}
})
setAudioProcessor(audioProcessor)
@@ -664,6 +671,7 @@ class UnityDigitalPersonActivity : UnityPlayerActivity(), LifecycleOwner {
}
private fun appendChat(text: String) {
if (!AppConfig.SHOW_DEBUG_TEXT) return
runOnUiThread {
chatHistoryText.append(text + "\n")
}
@@ -696,6 +704,8 @@ class UnityDigitalPersonActivity : UnityPlayerActivity(), LifecycleOwner {
override fun onSpeak(text: String) {
ttsController.enqueueSegment(text)
ttsController.enqueueEnd()
// 主动发言(问候/主动提问)也尝试匹配参考图片
maybeShowMatchedRefImage(text)
}
override fun onRequestCloudReply(prompt: String) {
@@ -759,13 +769,22 @@ class UnityDigitalPersonActivity : UnityPlayerActivity(), LifecycleOwner {
private fun maybeShowMatchedRefImage(text: String) {
val imageView = refMatchImageView ?: return
// Unity Activity already has coroutines
CoroutineScope(SupervisorJob() + Dispatchers.IO).launch {
// 每次匹配前先清掉上一张图
runOnUiThread {
imageView.setImageBitmap(null)
imageView.visibility = View.GONE
}
ioScope.launch {
val match = RefImageMatcher.findBestMatch(applicationContext, text)
if (match == null) return@launch
if (match == null) {
Log.d("RefImageMatch", "未找到匹配图片 query=\"${text.take(80)}\"")
return@launch
}
Log.d("RefImageMatch", "匹配成功 score=${match.score} path=${match.pngAssetPath} query=\"${text.take(80)}\"")
val bitmap = try {
assets.open(match.pngAssetPath).use { BitmapFactory.decodeStream(it) }
} catch (_: Throwable) {
} catch (e: Throwable) {
Log.w("RefImageMatch", "图片加载失败 path=${match.pngAssetPath}", e)
null
}
if (bitmap == null) return@launch

View File

@@ -6,6 +6,7 @@ import android.util.Log
import com.digitalperson.BuildConfig
import com.digitalperson.audio.AudioProcessor
import com.digitalperson.config.AppConfig
import com.digitalperson.env.RuntimeEnv
import com.digitalperson.engine.SenseVoiceEngineRKNN
import com.digitalperson.util.FileHelper
import kotlinx.coroutines.Dispatchers
@@ -23,7 +24,6 @@ class AsrManager(private val context: Context) {
private var senseVoice: SenseVoiceEngineRKNN? = null
private val nativeLock = Any()
private val asrQueue = Channel<Pair<FloatArray, FloatArray>>(capacity = Channel.UNLIMITED)
private var audioProcessor: AudioProcessor? = null
@@ -48,6 +48,10 @@ class AsrManager(private val context: Context) {
}
fun initSenseVoiceModel(): Boolean {
if (RuntimeEnv.isEmulator()) {
Log.w(TAG, "ASR: emulator detected; skip local RKNN init and use cloud ASR")
return false
}
return try {
Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")
@@ -133,23 +137,47 @@ class AsrManager(private val context: Context) {
Log.d(TAG, "ASR started: processing audio segment")
saveAsrAudio(originalSeg, processedSeg)
val raw = synchronized(nativeLock) {
val localText = synchronized(nativeLock) {
val e = senseVoice
if (e == null || !e.isInitialized) {
Log.e(TAG, "ASR failed: SenseVoice engine not initialized")
""
} else {
try {
e.transcribeBuffer(processedSeg)
} catch (e: Throwable) {
Log.e(TAG, "ASR transcribe failed: ${e.message}")
removeTokens(e.transcribeBuffer(processedSeg))
} catch (t: Throwable) {
Log.e(TAG, "ASR transcribe failed: ${t.message}")
""
}
}
}.trim()
val text = if (localText.isNotBlank()) {
localText
} else {
// 模拟器或本地 RKNN 未就绪使用腾讯云「一句话识别」SDKapp/libs/asr-one-sentence-release.aar
val shouldTryTencent =
BuildConfig.HAS_TENCENT_ASR_SDK && (RuntimeEnv.isEmulator() || !isInitialized())
if (!shouldTryTencent) {
Log.e(
TAG,
"ASR failed: local RKNN not ready and Tencent SDK unavailable " +
"(add libs/asr-one-sentence-release.aar or fix SenseVoice init)"
)
""
} else {
withContext(Dispatchers.IO) {
try {
// 云端 ASR 使用原始录音(未经 AEC/NS
// 模拟器上 AEC/NS 不可用processedSeg 可能被处理成近似静音
TencentOneSentenceAsr.transcribePcm16Mono(originalSeg)
} catch (t: Throwable) {
Log.e(TAG, "Tencent ASR failed: ${t.message}")
""
}
}.trim()
}
}
Log.d(TAG, "ASR raw result: $raw")
val text = removeTokens(raw)
val filterResult = filterText(text)
if (filterResult != null) {
@@ -220,4 +248,5 @@ class AsrManager(private val context: Context) {
}
return null
}
}

View File

@@ -0,0 +1,216 @@
package com.digitalperson.asr
import android.util.Base64
import android.util.Log
import com.digitalperson.config.AppConfig
import okhttp3.MediaType.Companion.toMediaType
import okhttp3.OkHttpClient
import okhttp3.Request
import okhttp3.RequestBody.Companion.toRequestBody
import org.json.JSONObject
import java.nio.ByteBuffer
import java.nio.ByteOrder
import java.security.MessageDigest
import java.text.SimpleDateFormat
import java.util.Date
import java.util.Locale
import java.util.TimeZone
import java.util.concurrent.TimeUnit
import javax.crypto.Mac
import javax.crypto.spec.SecretKeySpec
/**
 * Direct REST implementation of Tencent Cloud "one-sentence recognition"
 * using TC3-HMAC-SHA256 request signing.
 *
 * Does not depend on the SDK AAR: it signs and issues the HTTP request itself
 * via OkHttp. The signing timestamp is taken from the server's `Date` response
 * header, which sidesteps AuthFailure.SignatureExpire errors caused by clock
 * drift on emulators.
 *
 * Docs: https://cloud.tencent.com/document/product/1093/35646
 */
object TencentOneSentenceAsr {
    private const val TAG = "TencentOneSentenceAsr"
    private const val HOST = "asr.tencentcloudapi.com"
    private const val ACTION = "SentenceRecognition"
    private const val VERSION = "2019-06-14"

    // One shared client: connection pooling plus fixed timeouts for both the
    // timestamp probe and the recognition request.
    private val client = OkHttpClient.Builder()
        .connectTimeout(10, TimeUnit.SECONDS)
        .readTimeout(30, TimeUnit.SECONDS)
        .build()

    /**
     * Transcribes a FloatArray (16 kHz mono, samples in -1..1) via Tencent
     * Cloud one-sentence recognition. Blocks until the HTTP response returns
     * or times out; call from an IO thread.
     *
     * @return recognized text, or "" on any failure (missing credentials,
     *         empty input, HTTP/signing error, API error response).
     */
    fun transcribePcm16Mono(pcmFloat: FloatArray): String {
        val appId = AppConfig.QCloud.APP_ID.trim()
        val sid = AppConfig.QCloud.SECRET_ID.trim()
        val skey = AppConfig.QCloud.SECRET_KEY.trim()
        if (appId.isEmpty() || sid.isEmpty() || skey.isEmpty()) {
            Log.e(TAG, "APP_ID / SECRET_ID / SECRET_KEY 为空")
            return ""
        }
        if (pcmFloat.isEmpty()) return ""
        val pcmBytes = floatToPcm16Bytes(pcmFloat)
        val pcmBase64 = Base64.encodeToString(pcmBytes, Base64.NO_WRAP)
        // Diagnostics: if RMS is near zero, the microphone most likely captured silence.
        val rms = kotlin.math.sqrt(pcmFloat.fold(0.0) { acc, v -> acc + v * v } / pcmFloat.size)
        val maxAmp = pcmFloat.maxOf { kotlin.math.abs(it) }
        Log.d(TAG, "一句话识别:${pcmFloat.size} 采样点,${pcmFloat.size / 16000.0}s${pcmBytes.size} bytes RMS=${"%.4f".format(rms)} maxAmp=${"%.4f".format(maxAmp)}")
        if (maxAmp < 0.01f) {
            Log.w(TAG, "⚠ 音频幅度极低maxAmp=${"%.5f".format(maxAmp)}),模拟器麦克风可能没有采集到声音!请检查:模拟器扩展控制 → 麦克风 → 使用宿主机麦克风")
        }
        // Use the server clock so emulator clock drift cannot invalidate the signature.
        val timestamp = fetchServerTimestamp()
        val date = utcDate(timestamp)
        val payload = buildPayload(appId, pcmBase64, pcmBytes.size)
        val auth = buildAuthorization(sid, skey, date, timestamp, payload)
        val request = Request.Builder()
            .url("https://$HOST")
            .addHeader("Authorization", auth)
            .addHeader("Content-Type", "application/json; charset=utf-8")
            .addHeader("Host", HOST)
            .addHeader("X-TC-Action", ACTION)
            .addHeader("X-TC-Version", VERSION)
            .addHeader("X-TC-Timestamp", timestamp.toString())
            .post(payload.toRequestBody("application/json; charset=utf-8".toMediaType()))
            .build()
        return try {
            // use {} guarantees the Response is closed even if reading the body throws.
            client.newCall(request).execute().use { response ->
                val body = response.body?.string().orEmpty()
                Log.d(TAG, "API 响应: ${body.take(400)}")
                parseResult(body)
            }
        } catch (e: Exception) {
            Log.e(TAG, "HTTP 请求失败: ${e.message}", e)
            ""
        }
    }

    // ─── Helpers ──────────────────────────────────────────────────────────

    /**
     * Issues a HEAD request and reads the server's Date response header for a
     * trusted timestamp. Falls back to the device clock (possibly skewed) on
     * any failure.
     */
    private fun fetchServerTimestamp(): Long {
        return try {
            val req = Request.Builder().url("https://$HOST").head().build()
            // use {} closes the response on every path, including exceptions.
            val dateHeader = client.newCall(req).execute().use { it.header("Date") }
            if (dateHeader != null) {
                val sdf = SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z", Locale.ENGLISH)
                val serverTs = sdf.parse(dateHeader)?.time?.div(1000) ?: deviceTimestamp()
                val deviceTs = deviceTimestamp()
                val offset = serverTs - deviceTs
                if (kotlin.math.abs(offset) > 60) {
                    Log.w(TAG, "设备时钟偏差 ${offset}s使用服务器时间修正设备=${deviceTs}, 服务器=${serverTs}")
                }
                serverTs
            } else {
                deviceTimestamp()
            }
        } catch (e: Exception) {
            Log.w(TAG, "获取服务器时间失败: ${e.message},使用设备时间")
            deviceTimestamp()
        }
    }

    /** Device clock, seconds since epoch. */
    private fun deviceTimestamp() = System.currentTimeMillis() / 1000

    /** Formats the credential-scope date (yyyy-MM-dd, UTC) required by TC3 signing. */
    private fun utcDate(timestamp: Long): String {
        val sdf = SimpleDateFormat("yyyy-MM-dd", Locale.US)
        sdf.timeZone = TimeZone.getTimeZone("UTC")
        return sdf.format(Date(timestamp * 1000))
    }

    /** Builds the SentenceRecognition JSON body (16k Mandarin, raw PCM data stream). */
    private fun buildPayload(appId: String, base64: String, dataLen: Int): String =
        JSONObject().apply {
            put("ProjectId", 0)
            put("SubServiceType", 2)
            put("EngSerViceType", "16k_zh")
            put("SourceType", 1) // 1 = inline data stream
            put("VoiceFormat", "pcm")
            put("UsrAudioKey", "digital-person-asr")
            put("FilterDirty", 0)
            put("FilterModal", 0)
            put("FilterPunc", 0)
            put("ConvertNumMode", 1)
            put("Data", base64)
            put("DataLen", dataLen)
        }.toString()

    // ─── TC3-HMAC-SHA256 signing ──────────────────────────────────────────

    /**
     * Builds the Authorization header per the TC3-HMAC-SHA256 spec: canonical
     * request → string-to-sign → derived key chain (date → service → request).
     * Line order and the signed-header list must match the actual request.
     */
    private fun buildAuthorization(
        secretId: String,
        secretKey: String,
        date: String,
        timestamp: Long,
        payload: String,
    ): String {
        val payloadHash = sha256Hex(payload)
        val canonicalRequest = listOf(
            "POST", "/", "",
            "content-type:application/json; charset=utf-8",
            "host:$HOST",
            "",
            "content-type;host",
            payloadHash,
        ).joinToString("\n")
        val credentialScope = "$date/asr/tc3_request"
        val stringToSign = "TC3-HMAC-SHA256\n$timestamp\n$credentialScope\n${sha256Hex(canonicalRequest)}"
        val signingKey = hmacSha256(
            hmacSha256(hmacSha256("TC3$secretKey".toByteArray(), date), "asr"),
            "tc3_request",
        )
        val signature = hmacSha256(signingKey, stringToSign).joinToString("") { "%02x".format(it) }
        return "TC3-HMAC-SHA256 Credential=$secretId/$credentialScope, SignedHeaders=content-type;host, Signature=$signature"
    }

    /** Extracts Response.Result from the API JSON; logs and returns "" on API errors or bad JSON. */
    private fun parseResult(json: String): String {
        if (json.isBlank()) return ""
        return try {
            val response = JSONObject(json).optJSONObject("Response") ?: return ""
            val error = response.optJSONObject("Error")
            if (error != null) {
                Log.e(TAG, "API 错误: ${error.optString("Code")} - ${error.optString("Message")}")
                return ""
            }
            response.optString("Result").also { text ->
                if (text.isNotBlank()) Log.d(TAG, "识别结果: \"$text\"")
            }
        } catch (e: Exception) {
            Log.w(TAG, "解析响应失败: ${json.take(300)}", e)
            ""
        }
    }

    /** Lowercase hex SHA-256 of a UTF-8 string. */
    private fun sha256Hex(data: String): String {
        val md = MessageDigest.getInstance("SHA-256")
        return md.digest(data.toByteArray(Charsets.UTF_8)).joinToString("") { "%02x".format(it) }
    }

    /** HMAC-SHA256 of a UTF-8 string with a raw-byte key. */
    private fun hmacSha256(key: ByteArray, data: String): ByteArray {
        val mac = Mac.getInstance("HmacSHA256")
        mac.init(SecretKeySpec(key, "HmacSHA256"))
        return mac.doFinal(data.toByteArray(Charsets.UTF_8))
    }

    /** Converts float samples (-1..1) to little-endian 16-bit PCM bytes. */
    private fun floatToPcm16Bytes(samples: FloatArray): ByteArray {
        val buf = ByteBuffer.allocate(samples.size * 2).order(ByteOrder.LITTLE_ENDIAN)
        samples.forEach { buf.putShort((it.coerceIn(-1f, 1f) * 32767f).toInt().toShort()) }
        return buf.array()
    }
}

View File

@@ -116,6 +116,19 @@ object AppConfig {
const val MODEL_FILE = "bge-small-zh-v1.5.rknn"
}
/**
* 模拟器上 [RefImageMatcher] 使用编辑距离时的最低归一化分(与 BGE 余弦阈值不可混用)。
* 分数 = 1 - Levenshtein / max(len),越接近 1 越像。
*/
object RefMatchEmulator {
/**
* 模拟器混合评分(路径关键词命中率 + 编辑距离)阈值。
* 路径关键词1 个词命中 ≈ 0.25,已足够确认话题相关性。
* 原 0.82 是纯编辑距离阈值,字面差异大时根本达不到,故降至 0.20。
*/
const val MIN_NORMALIZED_EDIT_SCORE = 0.20f
}
/**
* app/note/ref 通过 Gradle 额外 assets 目录打入 apk 后,在 assets 中的根路径为 `ref/`。
*/

View File

@@ -0,0 +1,49 @@
package com.digitalperson.embedding
import kotlin.math.max
import kotlin.math.min
/**
 * Character-level similarity based on Levenshtein distance (emulator fallback:
 * no semantics, intended only for integration testing / demos).
 *
 * Score: 1 - dist / max(len1, len2). Not directly comparable with cosine
 * similarity thresholds.
 */
object EditDistanceSimilarity {

    /**
     * Normalized similarity of [a] and [b] after trimming whitespace.
     *
     * @return 1.0 when both are blank, 0.0 when exactly one is blank,
     *         otherwise `1 - levenshtein / max(length)` in [0, 1].
     */
    fun normalizedScore(a: String, b: String): Float {
        val s1 = a.trim()
        val s2 = b.trim()
        if (s1.isEmpty() && s2.isEmpty()) return 1f
        if (s1.isEmpty() || s2.isEmpty()) return 0f
        val dist = levenshtein(s1, s2)
        val denom = max(s1.length, s2.length).coerceAtLeast(1)
        return 1f - dist.toFloat() / denom.toFloat()
    }

    /**
     * Classic two-row dynamic-programming Levenshtein distance:
     * O(n·m) time, O(min(n, m)) extra space. Adequate for the
     * medium-sized emulator corpus.
     */
    fun levenshtein(s1: String, s2: String): Int {
        // Fast paths: identical or empty inputs need no DP table.
        if (s1 == s2) return 0
        if (s1.isEmpty()) return s2.length
        if (s2.isEmpty()) return s1.length
        // Distance is symmetric, so keep the shorter string on the column
        // axis to minimize the two working rows.
        val (rows, cols) = if (s1.length >= s2.length) s1 to s2 else s2 to s1
        val m = cols.length
        var prev = IntArray(m + 1) { it }
        var curr = IntArray(m + 1)
        for (i in 1..rows.length) {
            curr[0] = i
            val rowChar = rows[i - 1]
            for (j in 1..m) {
                val cost = if (rowChar == cols[j - 1]) 0 else 1
                curr[j] = min(
                    min(prev[j] + 1, curr[j - 1] + 1),
                    prev[j - 1] + cost
                )
            }
            val tmp = prev
            prev = curr
            curr = tmp
        }
        return prev[m]
    }
}

View File

@@ -4,6 +4,7 @@ import android.content.Context
import android.util.Log
import com.digitalperson.config.AppConfig
import com.digitalperson.data.AppDatabase
import com.digitalperson.data.dao.QuestionDao
import com.digitalperson.data.entity.Question
import com.digitalperson.data.entity.RefTextEmbedding
import com.digitalperson.data.util.floatArrayToEmbeddingBytes
@@ -27,15 +28,15 @@ object RefEmbeddingIndexer {
val dao = db.refTextEmbeddingDao()
val questionDao = db.questionDao()
if (!BgeEmbedding.initialize(app)) {
Log.e(TAG, "[RefEmbed] BGE 初始化失败,跳过 ref 语料索引")
return@withContext
}
val root = AppConfig.RefCorpus.ASSETS_ROOT
val paths = RefCorpusAssetScanner.listTxtFilesUnder(app, root)
Log.i(TAG, "[RefEmbed] 发现 ${paths.size} 个 txtroot=$root")
val bgeOk = BgeEmbedding.initialize(app)
if (!bgeOk) {
Log.w(TAG, "[RefEmbed] BGE 未就绪常见于模拟器仅扫描题库ref 配图匹配可用编辑距离")
}
var skipped = 0
var embedded = 0
var empty = 0
@@ -50,28 +51,9 @@ object RefEmbeddingIndexer {
continue
}
// 题库:遇到包含 ?/ 的行,写入 questions
val subject = extractSubjectFromRaw(raw)
val grade = extractGradeFromPath(path)
val questionLines = extractQuestionLines(raw)
for (line in questionLines) {
val content = line.trim()
if (content.isEmpty()) continue
val exists = questionDao.findByContentSubjectGrade(content, subject, grade)
if (exists == null) {
questionDao.insert(
Question(
id = 0,
content = content,
answer = null,
subject = subject,
grade = grade,
difficulty = 1,
createdAt = System.currentTimeMillis()
)
)
}
}
ingestQuestionsFromRaw(raw, path, questionDao)
if (!bgeOk) continue
val embedText = RefTxtEmbedText.fromRawFileContent(raw)
if (embedText.isEmpty()) {
@@ -110,10 +92,34 @@ object RefEmbeddingIndexer {
Log.i(
TAG,
"[RefEmbed] 完成 embedded=$embedded skipped=$skipped empty=$empty failed=$failed cacheSize=${RefEmbeddingMemoryCache.size()}"
"[RefEmbed] 完成 embedded=$embedded skipped=$skipped empty=$empty failed=$failed cacheSize=${RefEmbeddingMemoryCache.size()} bgeOk=$bgeOk"
)
}
/**
 * Extracts question lines from a raw ref txt and stores each one in the
 * question bank, skipping entries that already exist with the same
 * content/subject/grade.
 */
private fun ingestQuestionsFromRaw(raw: String, path: String, questionDao: QuestionDao) {
    val subject = extractSubjectFromRaw(raw)
    val grade = extractGradeFromPath(path)
    extractQuestionLines(raw)
        .asSequence()
        .map { it.trim() }
        .filter { it.isNotEmpty() }
        .forEach { content ->
            // Insert only when no identical question is already stored.
            val alreadyStored =
                questionDao.findByContentSubjectGrade(content, subject, grade) != null
            if (!alreadyStored) {
                questionDao.insert(
                    Question(
                        id = 0,
                        content = content,
                        answer = null,
                        subject = subject,
                        grade = grade,
                        difficulty = 1,
                        createdAt = System.currentTimeMillis()
                    )
                )
            }
        }
}
private fun extractSubjectFromRaw(raw: String): String? {
val line = raw.lineSequence()
.map { it.trimEnd() }

View File

@@ -3,6 +3,7 @@ package com.digitalperson.embedding
import android.content.Context
import android.util.Log
import com.digitalperson.config.AppConfig
import com.digitalperson.env.RuntimeEnv
import kotlin.math.sqrt
data class RefImageMatch(
@@ -16,7 +17,8 @@ object RefImageMatcher {
private const val TAG = AppConfig.TAG
/**
* @param threshold 余弦相似度阈值(向量已归一化时等价于 dot product
* @param threshold 真机 BGE余弦相似度阈值(向量已归一化时等价于 dot product
* 模拟器:忽略该参数,使用 [AppConfig.RefMatchEmulator.MIN_NORMALIZED_EDIT_SCORE](编辑距离归一化分)。
*/
fun findBestMatch(
context: Context,
@@ -26,6 +28,10 @@ object RefImageMatcher {
val query = text.trim()
if (query.isEmpty()) return null
if (RuntimeEnv.isEmulator()) {
return findBestMatchEditDistance(context, query)
}
if (!BgeEmbedding.isReady()) {
val ok = BgeEmbedding.initialize(context.applicationContext)
if (!ok) {
@@ -78,6 +84,203 @@ object RefImageMatcher {
)
}
/**
 * Emulator path: no BGE model loaded; uses a hybrid score of **path-keyword
 * hit ratio** (primary) plus substring/edit-distance content matching
 * (secondary).
 *
 * Path keywords: the deepest directory name split on "-", e.g.
 * "一年级上-生活适应-社会生活-元旦" → ["一年级上", "生活适应", "社会生活", "元旦"].
 * Hit ratio = hits / keyword count (one hit ≈ 0.25, enough to pass the 0.20
 * threshold).
 *
 * Pure edit distance originally used a 0.82 threshold, but LLM reply text
 * differs heavily from the short reference sentences, so even same-topic
 * pairs rarely reached it; the keyword scheme improved accuracy markedly.
 *
 * Exposed as `internal` so androidTest can regress
 * "should-have-matched-but-didn't" cases.
 */
internal fun findBestMatchEditDistance(context: Context, query: String): RefImageMatch? {
    val app = context.applicationContext
    val root = AppConfig.RefCorpus.ASSETS_ROOT
    val paths = RefCorpusAssetScanner.listTxtFilesUnder(app, root)
    val minScore = AppConfig.RefMatchEmulator.MIN_NORMALIZED_EDIT_SCORE
    // Normalize punctuation width first, so substring checks are not defeated
    // by full-width vs half-width differences.
    val qNorm = normalizeTextForEmuMatch(query.trim())
    if (qNorm.isEmpty()) return null
    var bestPath: String? = null
    var bestScore = -1f
    var bestSubstr = -1f
    var bestEdit = -1f
    for (path in paths) {
        // Primary: path-keyword hit ratio (no IO, O(1)).
        val kwScore = pathKeywordMatchScore(path, qNorm)
        // Secondary: content match (does IO).
        // Strategy: (1) substring containment (a query sentence ⊆ candidate,
        //               or a candidate sentence ⊆ a query sentence);
        //           (2) sentence-by-sentence edit distance.
        // `candidate` is the txt content with '#' lines stripped (it may hold
        // both the question and the answer); if a query sentence appears
        // verbatim inside it, the topic is a definite hit.
        // Every txt must be content-scored: gating the read on bestScore<0 or
        // kwScore>0 lets a weak edit score from another file "claim the slot"
        // first, so a file whose body contains the exact sentence is never
        // even opened (e.g. the "上厕所" directory: no keyword hit, but its
        // body contains the original sentence).
        var substrScore = 0f
        var editScore = 0f
        try {
            val raw = app.assets.open(path).bufferedReader(Charsets.UTF_8).use { it.readText() }
            val candidate = normalizeTextForEmuMatch(RefTxtEmbedText.fromRawFileContent(raw))
            if (candidate.isNotEmpty()) {
                val querySentences = splitSentences(qNorm)
                val candidateSentences = splitSentences(candidate)
                // (1) Substring: query sentence ⊆ candidate, or candidate
                //     sentence ⊆ query sentence. Very short fragments (e.g.
                //     "小朋友") occur in many lessons, so a flat high score
                //     would mismatch; grade the score by match length.
                substrScore = querySentences.maxOfOrNull { qs ->
                    var s = 0f
                    if (qs.length >= 4 && candidate.contains(qs)) {
                        s = maxOf(s, emulatorSubstringScoreForLength(qs.length))
                    }
                    for (cs in candidateSentences) {
                        if (cs.length >= 6 && qs.contains(cs)) {
                            // Slight discount: candidate-in-query is weaker evidence.
                            s = maxOf(s, emulatorSubstringScoreForLength(cs.length) * 0.92f)
                        }
                    }
                    s
                } ?: 0f
                // (2) Edit distance: best sentence-vs-sentence normalized score.
                editScore = querySentences.maxOfOrNull { qs ->
                    candidateSentences.maxOfOrNull { cs ->
                        EditDistanceSimilarity.normalizedScore(qs, cs)
                    } ?: 0f
                } ?: 0f
            }
        } catch (e: Exception) {
            Log.w(TAG, "[RefMatchEmu] read fail $path: ${e.message}")
        }
        val score = maxOf(kwScore, substrScore, editScore)
        if (score > 0f) {
            Log.v(TAG, "[RefMatchEmu] candidate score=$score (kw=$kwScore substr=$substrScore edit=$editScore) path=$path")
        }
        if (isBetterEmulatorCandidate(score, substrScore, editScore, bestScore, bestSubstr, bestEdit)) {
            bestScore = score
            bestSubstr = substrScore
            bestEdit = editScore
            bestPath = path
        }
    }
    val txtPath = bestPath ?: run {
        Log.d(TAG, "[RefMatchEmu] 无候选文件 query=${qNorm.take(60)}")
        return null
    }
    if (bestScore < minScore) {
        Log.d(TAG, "[RefMatchEmu] 分数不足 bestScore=$bestScore minScore=$minScore bestPath=$txtPath query=${qNorm.take(60)}")
        return null
    }
    // The matching image sits next to the txt with the same basename.
    val pngPath = if (txtPath.endsWith(".txt", ignoreCase = true)) {
        txtPath.dropLast(4) + ".png"
    } else {
        "$txtPath.png"
    }
    // Best-effort existence probe; a missing png means there is nothing to show.
    val exists = try {
        context.assets.open(pngPath).close()
        true
    } catch (_: Throwable) {
        false
    }
    if (!exists) return null
    Log.d(TAG, "[RefMatchEmu] best=$txtPath score=$bestScore query=${qNorm.take(30)}")
    return RefImageMatch(
        txtAssetPath = txtPath,
        pngAssetPath = pngPath,
        score = bestScore
    )
}
/**
 * Extracts topic keywords from the file path and returns the fraction of
 * them matched by the query text. E.g. a path containing the directory
 * "一年级上-生活适应-社会生活-元旦" yields the keywords
 * ["一年级上","生活适应","社会生活","元旦"].
 */
private fun pathKeywordMatchScore(path: String, query: String): Float {
    val keywords = extractPathTopicKeywords(path)
    if (keywords.isEmpty()) return 0f
    var hits = 0
    for (kw in keywords) {
        if (queryMatchesPathKeyword(query, kw)) hits++
    }
    return hits.toFloat() / keywords.size
}
/**
 * Unifies full-width and half-width punctuation before matching, so a
 * half-width `:` from code or ASR output still matches a full-width `:`
 * in the corpus (otherwise long-sentence substring matching fails).
 */
private fun normalizeTextForEmuMatch(s: String): String {
    val out = StringBuilder(s.length)
    for (ch in s) {
        val mapped = when (ch) {
            '\uFF1A', '\uFE55', ':' -> ':'
            '\uFF0C' -> ','
            '\uFF01' -> '!'
            '\uFF1F' -> '?'
            '\uFF1B' -> ';'
            else -> ch
        }
        out.append(mapped)
    }
    return out.toString()
}
/**
 * Tie-break ordering for emulator candidates: compare total score first,
 * then substring score, then edit score (all with a small epsilon). This
 * prevents a common prefix like "元旦到了,小朋友" that scores equally across
 * several lessons from always favoring whichever file was scanned first.
 */
private fun isBetterEmulatorCandidate(
    score: Float,
    substr: Float,
    edit: Float,
    bestScore: Float,
    bestSubstr: Float,
    bestEdit: Float,
): Boolean {
    val eps = 1e-5f
    // No candidate yet: anything wins.
    if (bestScore < 0f) return true
    if (kotlin.math.abs(score - bestScore) > eps) return score > bestScore
    if (kotlin.math.abs(substr - bestSubstr) > eps) return substr > bestSubstr
    return edit > bestEdit + eps
}
/**
 * Containment check between a path fragment and the query; question stems
 * often drop the leading word (directory "上厕所" vs a sentence containing
 * only "厕所"), so a one-character-trimmed variant of the keyword is also
 * tried — but only for keywords of 3+ chars, to keep e.g. "个人生活" from
 * degrading into the overly generic "生活" via a takeLast(2)-style match.
 */
private fun queryMatchesPathKeyword(query: String, kw: String): Boolean = when {
    kw in query -> true
    kw.length >= 3 -> kw.drop(1).let { it.length >= 2 && it in query }
    else -> false
}
/**
 * Score for a substring hit, graded by match length: longer matches are more
 * discriminative; very short ones (e.g. "小朋友") score low to reduce
 * cross-lesson mismatches.
 */
private fun emulatorSubstringScoreForLength(len: Int): Float {
    // Descending (minLength, score) tiers; first tier that fits wins.
    val tiers = listOf(
        18 to 0.95f,
        12 to 0.90f,
        8 to 0.82f,
        6 to 0.68f,
        4 to 0.48f,
    )
    return tiers.firstOrNull { len >= it.first }?.second ?: 0f
}
/**
 * Splits on Chinese/ASCII sentence separators and returns the non-empty
 * sentences (length >= 2). Used by the emulator edit-distance scoring so a
 * leading LLM pleasantry sentence cannot drag the whole score down.
 * Falls back to the original text when nothing survives the filter.
 */
private fun splitSentences(text: String): List<String> {
    val sentences = Regex("[,。!?;,!?;\n]+")
        .split(text)
        .mapNotNull { seg -> seg.trim().takeIf { it.length >= 2 } }
    return if (sentences.isEmpty()) listOf(text) else sentences
}
/**
 * Takes the deepest directory name, splits it on "-", strips digits, and
 * drops single-character or empty fragments; result preserves first-seen
 * order with duplicates removed.
 */
private fun extractPathTopicKeywords(path: String): List<String> {
    // Deepest directory = everything after the last '/' of the parent part;
    // a path without '/' has no directory and yields no keywords.
    val deepestDir = path.substringBeforeLast('/', "").substringAfterLast('/')
    val digits = Regex("\\d+")
    val keywords = LinkedHashSet<String>()
    for (fragment in deepestDir.split("-")) {
        val cleaned = digits.replace(fragment, "").trim()
        if (cleaned.length >= 2) keywords.add(cleaned)
    }
    return keywords.toList()
}
private fun dot(a: FloatArray, b: FloatArray): Float {
var s = 0f
for (i in a.indices) s += a[i] * b[i]

View File

@@ -0,0 +1,34 @@
package com.digitalperson.env
import android.os.Build
object RuntimeEnv {

    // Build.* values are fixed for the process lifetime, so run the heuristic
    // once and cache the verdict: isEmulator() is called on hot paths
    // (per camera frame in FaceDetectionPipeline, per ASR segment).
    private val emulatorDetected: Boolean by lazy { detectEmulator() }

    /**
     * Heuristic emulator check based on android.os.Build fields. Requires at
     * least two independent signals to avoid false positives on unusual ROMs.
     */
    fun isEmulator(): Boolean = emulatorDetected

    private fun detectEmulator(): Boolean {
        val fingerprint = Build.FINGERPRINT.orEmpty()
        val model = Build.MODEL.orEmpty()
        val brand = Build.BRAND.orEmpty()
        val device = Build.DEVICE.orEmpty()
        val product = Build.PRODUCT.orEmpty()
        val hardware = Build.HARDWARE.orEmpty()
        val manufacturer = Build.MANUFACTURER.orEmpty()
        var hits = 0
        fun hit(b: Boolean) { if (b) hits++ }
        hit(fingerprint.startsWith("generic", ignoreCase = true))
        hit(fingerprint.contains("unknown", ignoreCase = true))
        hit(model.contains("google_sdk", ignoreCase = true))
        hit(model.contains("emulator", ignoreCase = true))
        hit(model.contains("android sdk built for", ignoreCase = true))
        hit(manufacturer.contains("genymotion", ignoreCase = true))
        hit(brand.startsWith("generic", ignoreCase = true) && device.startsWith("generic", ignoreCase = true))
        hit(product.contains("sdk", ignoreCase = true))
        hit(product.contains("emulator", ignoreCase = true))
        hit(hardware.contains("goldfish", ignoreCase = true))
        hit(hardware.contains("ranchu", ignoreCase = true))
        // Require multiple signals to avoid false positives on weird ROMs.
        return hits >= 2
    }
}

View File

@@ -5,6 +5,7 @@ import android.graphics.Bitmap
import android.util.Log
import com.digitalperson.config.AppConfig
import com.digitalperson.engine.RetinaFaceEngineRKNN
import com.digitalperson.env.RuntimeEnv
import java.util.ArrayDeque
import java.util.concurrent.atomic.AtomicBoolean
import kotlinx.coroutines.CoroutineScope
@@ -35,6 +36,12 @@ class FaceDetectionPipeline(
private val onResult: (FaceDetectionResult) -> Unit,
private val onPresenceChanged: (present: Boolean, isFrontal: Boolean, faceIdentityId: String?, recognizedName: String?) -> Unit,
) {
companion object {
    /** Fixed face ID used on emulators; corresponds to the userId in UserMemory. */
    const val EMULATOR_FACE_ID = "face_emulator"
    /** Fixed display name on emulators; passed to the coordinator directly as recognizedName. */
    const val EMULATOR_FACE_NAME = "小黑"
}
private val appContext = context.applicationContext
private val engine = RetinaFaceEngineRKNN()
private val recognizer = FaceRecognizer(appContext)
@@ -50,6 +57,11 @@ class FaceDetectionPipeline(
private val fusionQualities = ArrayDeque<Float>()
fun initialize(): Boolean {
if (RuntimeEnv.isEmulator()) {
Log.i(AppConfig.TAG, "[Face] 模拟器模式:跳过 RKNN 初始化,固定返回身份「$EMULATOR_FACE_NAME」")
initialized.set(true)
return true
}
val detectorOk = engine.initialize(appContext)
val recognizerOk = recognizer.initialize()
val ok = detectorOk && recognizerOk
@@ -68,6 +80,31 @@ class FaceDetectionPipeline(
return
}
// 模拟器:跳过 RKNN 检测,固定上报一张居中正脸
if (RuntimeEnv.isEmulator()) {
scope.launch {
try {
val w = bitmap.width
val h = bitmap.height
val fakeBox = FaceBox(
left = w * 0.25f,
top = h * 0.15f,
right = w * 0.75f,
bottom = h * 0.85f,
score = 0.99f,
)
withContext(Dispatchers.Main) {
onPresenceChanged(true, true, EMULATOR_FACE_ID, EMULATOR_FACE_NAME)
onResult(FaceDetectionResult(w, h, listOf(fakeBox)))
}
} finally {
bitmap.recycle()
frameInFlight.set(false)
}
}
return
}
scope.launch {
try {
val width = bitmap.width

View File

@@ -148,6 +148,8 @@ abstract class BaseDigitalPersonCoordinator(
* (i.e. after a cloud LLM response), NOT after greeting / farewell / proactive TTS.
*/
fun onTtsPlaybackCompleted() {
// Let the controller advance its own timers (greeting/proactive/dlg all count as assistant speaking).
controller.onAssistantTtsPlaybackCompleted()
if (pendingDialogueFinish) {
pendingDialogueFinish = false
controller.onDialogueResponseFinished()

View File

@@ -64,6 +64,10 @@ class DigitalHumanInteractionController(
private var memoryJob: Job? = null
private var farewellJob: Job? = null
// 让超时/间隔从 TTS 播放完成后开始计时,而不是从 speak() 调用时开始
private var pendingWaitReplyTimeoutAfterTts: Boolean = false
private var pendingProactiveFollowupAfterTts: Boolean = false
fun start() {
transitionTo(InteractionState.IDLE)
scheduleMemoryMode()
@@ -204,7 +208,7 @@ fun onFacePresenceChanged(present: Boolean, isFrontal: Boolean = true) { // 添
return
}
transitionTo(InteractionState.WAITING_REPLY)
scheduleWaitingReplyTimeout()
scheduleWaitingReplyTimeoutAfterTts()
}
private fun enterGreeting() {
@@ -224,7 +228,7 @@ fun onFacePresenceChanged(present: Boolean, isFrontal: Boolean = true) { // 添
handler.addToChatHistory("assistant", greeting)
handler.addAssistantMessageToCloudHistory(greeting)
transitionTo(InteractionState.WAITING_REPLY)
scheduleWaitingReplyTimeout()
scheduleWaitingReplyTimeoutAfterTts()
} else {
useDefaultGreeting()
}
@@ -243,7 +247,11 @@ fun onFacePresenceChanged(present: Boolean, isFrontal: Boolean = true) { // 添
handler.addAssistantMessageToCloudHistory(greeting)
transitionTo(InteractionState.WAITING_REPLY)
scheduleWaitingReplyTimeout()
scheduleWaitingReplyTimeoutAfterTts()
}
/**
 * Defers the waiting-reply timeout: sets a flag consumed when the current TTS
 * playback completes, so the timeout counts from the end of speech rather
 * than from the speak() call.
 */
private fun scheduleWaitingReplyTimeoutAfterTts() {
    pendingWaitReplyTimeoutAfterTts = true
}
private fun scheduleWaitingReplyTimeout() {
@@ -282,21 +290,34 @@ fun onFacePresenceChanged(present: Boolean, isFrontal: Boolean = true) { // 添
// 触发题目生成检查
handler.onQuestionAsked(currentFaceId ?: "guest")
proactiveJob = scope.launch {
hasPendingUserReply = false
delay(20_000)
if (state != InteractionState.PROACTIVE || hasPendingUserReply) return@launch
if (!facePresent) {
enterFarewell()
return@launch
}
proactiveRound += 1
if (proactiveRound < 3) {
askProactiveTopic()
} else {
transitionTo(InteractionState.WAITING_REPLY)
// handler.playMotion("haru_g_m17.motion3.json")
scheduleWaitingReplyTimeout()
// 不立刻开始 20s 计时;等 TTS 播放完再开始计时,避免“刚说完几秒就又问”
pendingProactiveFollowupAfterTts = true
}
/** 由 Activity 在「本轮 TTS 完整播放完成」时调用(包括问候/主动提问/对话回复)。 */
fun onAssistantTtsPlaybackCompleted() {
if (pendingWaitReplyTimeoutAfterTts && state == InteractionState.WAITING_REPLY) {
pendingWaitReplyTimeoutAfterTts = false
scheduleWaitingReplyTimeout()
}
if (pendingProactiveFollowupAfterTts && state == InteractionState.PROACTIVE) {
pendingProactiveFollowupAfterTts = false
proactiveJob?.cancel()
proactiveJob = scope.launch {
hasPendingUserReply = false
delay(20_000)
if (state != InteractionState.PROACTIVE || hasPendingUserReply) return@launch
if (!facePresent) {
enterFarewell()
return@launch
}
proactiveRound += 1
if (proactiveRound < 3) {
askProactiveTopic()
} else {
transitionTo(InteractionState.WAITING_REPLY)
scheduleWaitingReplyTimeoutAfterTts()
}
}
}
}

View File

@@ -27,6 +27,11 @@ class TtsController(private val context: Context) {
private var callback: TtsCallback? = null
// 防止 WebSocket 重连或多路回调导致同一段文案短时间内重复入队、重复播报
@Volatile private var lastEnqueuedText: String? = null
@Volatile private var lastEnqueuedAtMs: Long = 0L
private val dedupeWindowMs = 2500L
fun setCallback(callback: TtsCallback) {
this.callback = callback
bindCallbacksIfReady()
@@ -147,6 +152,14 @@ class TtsController(private val context: Context) {
fun enqueueSegment(seg: String) {
val cleaned = seg.replace(Regex("\\[.*?\\]"), "").trim()
if (cleaned.isEmpty()) return
val now = System.currentTimeMillis()
val lastText = lastEnqueuedText
if (lastText != null && lastText == cleaned && (now - lastEnqueuedAtMs) <= dedupeWindowMs) {
Log.w(TAG, "Skip duplicate TTS segment within ${dedupeWindowMs}ms: ${cleaned.take(60)}")
return
}
lastEnqueuedText = cleaned
lastEnqueuedAtMs = now
if (useQCloudTts) {
qcloudTts?.enqueueSegment(cleaned)
} else {