initial commit

This commit is contained in:
gcw_4spBpAfv
2026-02-25 18:13:26 +08:00
commit 6aa84d6b77
239 changed files with 995156 additions and 0 deletions

View File

@@ -0,0 +1,11 @@
package com.k2fsa.sherpa.onnx
/**
 * Feature-extraction options passed to the native sherpa-onnx recognizer.
 *
 * @property sampleRate Expected audio sample rate in Hz (default 16000).
 * @property featureDim Number of feature bins per frame (default 80).
 * @property dither Dithering amount; default 0.0f.
 */
data class FeatureConfig(
    var sampleRate: Int = 16000,
    var featureDim: Int = 80,
    var dither: Float = 0.0f
)

/** Builds a [FeatureConfig] with the given rate/dimension and the default dither. */
fun getFeatureConfig(sampleRate: Int, featureDim: Int): FeatureConfig =
    FeatureConfig(sampleRate = sampleRate, featureDim = featureDim)

View File

@@ -0,0 +1,7 @@
package com.k2fsa.sherpa.onnx
/**
 * Configuration for the homophone-replacer post-processing step.
 *
 * Empty strings mean "not set".
 *
 * @property dictDir Unused; kept for interface compatibility with the native side.
 * @property lexicon Path to the lexicon file.
 * @property ruleFsts Path(s) to rule FST files — presumably comma-separated
 *   when there is more than one; confirm against the native config parser.
 */
data class HomophoneReplacerConfig(
    var dictDir: String = "", // unused
    var lexicon: String = "",
    var ruleFsts: String = "",
)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,32 @@
package com.k2fsa.sherpa.onnx
/**
 * Thin JNI wrapper around a native sherpa-onnx offline stream.
 *
 * [ptr] is the address of the native object; 0 means the stream has been
 * released. The native resource must be freed via [release] (or [use]),
 * otherwise it is only reclaimed when the GC runs [finalize].
 */
class OfflineStream(var ptr: Long) {
    /** Feeds a buffer of audio samples at [sampleRate] Hz to the native stream. */
    fun acceptWaveform(samples: FloatArray, sampleRate: Int) =
        acceptWaveform(ptr, samples, sampleRate)

    protected fun finalize() {
        release()
    }

    /** Frees the native object. Safe to call more than once. */
    fun release() {
        if (ptr != 0L) {
            delete(ptr)
            ptr = 0
        }
    }

    /**
     * Runs [block] with this stream and releases the stream afterwards, even
     * if [block] throws. Generalized (backward-compatibly) from a Unit-only
     * block to return the block's result, mirroring kotlin.io.Closeable.use.
     */
    fun <T> use(block: (OfflineStream) -> T): T {
        try {
            return block(this)
        } finally {
            release()
        }
    }

    private external fun acceptWaveform(ptr: Long, samples: FloatArray, sampleRate: Int)
    private external fun delete(ptr: Long)

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}

View File

@@ -0,0 +1,7 @@
package com.k2fsa.sherpa.onnx
/**
 * Configuration for the Qualcomm QNN execution provider.
 *
 * Empty strings mean "not set".
 *
 * @property backendLib Path to the QNN backend shared library.
 * @property contextBinary Path to a prebuilt QNN context binary.
 * @property systemLib Path to the QNN system shared library.
 */
data class QnnConfig(
    var backendLib: String = "",
    var contextBinary: String = "",
    var systemLib: String = "",
)

View File

@@ -0,0 +1,373 @@
// Copyright (c) 2023 Xiaomi Corporation
package com.k2fsa.sherpa.onnx
import android.content.res.AssetManager
/**
 * Model files and synthesis knobs for a VITS TTS voice.
 * Empty strings mean "not used".
 *
 * @property model Path to the VITS .onnx model.
 * @property lexicon Path to the lexicon file.
 * @property tokens Path to tokens.txt.
 * @property dataDir Auxiliary data directory required by some models.
 * @property dictDir Unused; kept for interface compatibility.
 * @property noiseScale Sampling noise scale (default 0.667f).
 * @property noiseScaleW Duration-predictor noise scale (default 0.8f) —
 *   presumably; confirm against the native VITS config.
 * @property lengthScale Speed control; 1.0f is the model's native pace.
 */
data class OfflineTtsVitsModelConfig(
    var model: String = "",
    var lexicon: String = "",
    var tokens: String = "",
    var dataDir: String = "",
    var dictDir: String = "", // unused
    var noiseScale: Float = 0.667f,
    var noiseScaleW: Float = 0.8f,
    var lengthScale: Float = 1.0f,
)
/**
 * Model files and synthesis knobs for a Matcha TTS voice.
 * Matcha needs both an acoustic model and a separate vocoder.
 *
 * @property acousticModel Path to the Matcha acoustic .onnx model.
 * @property vocoder Path to the vocoder .onnx model.
 * @property lexicon Path to the lexicon file ("" if unused).
 * @property tokens Path to tokens.txt.
 * @property dataDir Auxiliary data directory required by some models.
 * @property dictDir Unused; kept for interface compatibility.
 * @property noiseScale Sampling noise scale (default 1.0f).
 * @property lengthScale Speed control; 1.0f is the model's native pace.
 */
data class OfflineTtsMatchaModelConfig(
    var acousticModel: String = "",
    var vocoder: String = "",
    var lexicon: String = "",
    var tokens: String = "",
    var dataDir: String = "",
    var dictDir: String = "", // unused
    var noiseScale: Float = 1.0f,
    var lengthScale: Float = 1.0f,
)
/**
 * Model files and synthesis knobs for a Kokoro TTS voice.
 *
 * @property model Path to the Kokoro .onnx model.
 * @property voices Path to the voices file (speaker embeddings — presumably;
 *   confirm against the native side).
 * @property tokens Path to tokens.txt.
 * @property dataDir Auxiliary data directory required by some models.
 * @property lexicon Lexicon path(s); may contain several comma-separated
 *   entries (see getOfflineTtsConfig, which leaves comma values unprefixed).
 * @property lang Language code ("" if unused).
 * @property dictDir Unused; kept for interface compatibility.
 * @property lengthScale Speed control; 1.0f is the model's native pace.
 */
data class OfflineTtsKokoroModelConfig(
    var model: String = "",
    var voices: String = "",
    var tokens: String = "",
    var dataDir: String = "",
    var lexicon: String = "",
    var lang: String = "",
    var dictDir: String = "", // unused
    var lengthScale: Float = 1.0f,
)
/**
 * Model files and synthesis knobs for a Kitten TTS voice.
 *
 * @property model Path to the Kitten .onnx model.
 * @property voices Path to the voices file.
 * @property tokens Path to tokens.txt.
 * @property dataDir Auxiliary data directory required by some models.
 * @property lengthScale Speed control; 1.0f is the model's native pace.
 */
data class OfflineTtsKittenModelConfig(
    var model: String = "",
    var voices: String = "",
    var tokens: String = "",
    var dataDir: String = "",
    var lengthScale: Float = 1.0f,
)
/**
 * Configuration for Pocket TTS models. Empty strings mean "not set".
 *
 * See https://k2-fsa.github.io/sherpa/onnx/tts/pocket/index.html for details.
 *
 * @property lmFlow Path to the LM flow model (.onnx)
 * @property lmMain Path to the LM main model (.onnx)
 * @property encoder Path to the encoder model (.onnx)
 * @property decoder Path to the decoder model (.onnx)
 * @property textConditioner Path to the text conditioner model (.onnx)
 * @property vocabJson Path to vocabulary JSON file
 * @property tokenScoresJson Path to token scores JSON file
 */
data class OfflineTtsPocketModelConfig(
    var lmFlow: String = "",
    var lmMain: String = "",
    var encoder: String = "",
    var decoder: String = "",
    var textConditioner: String = "",
    var vocabJson: String = "",
    var tokenScoresJson: String = "",
)
/**
 * Aggregates the per-engine model configs; exactly one of them is expected
 * to be populated (see getOfflineTtsConfig).
 *
 * Note: `pocket` was `val` while every sibling field is `var`; it is now
 * `var` for consistency, so callers can swap any model config after
 * construction. Adding a setter is backward compatible.
 *
 * @property numThreads Number of native worker threads.
 * @property debug Whether the native side emits debug logs.
 * @property provider ONNX Runtime execution provider, e.g. "cpu".
 */
data class OfflineTtsModelConfig(
    var vits: OfflineTtsVitsModelConfig = OfflineTtsVitsModelConfig(),
    var matcha: OfflineTtsMatchaModelConfig = OfflineTtsMatchaModelConfig(),
    var kokoro: OfflineTtsKokoroModelConfig = OfflineTtsKokoroModelConfig(),
    var kitten: OfflineTtsKittenModelConfig = OfflineTtsKittenModelConfig(),
    var pocket: OfflineTtsPocketModelConfig = OfflineTtsPocketModelConfig(),
    var numThreads: Int = 1,
    var debug: Boolean = false,
    var provider: String = "cpu",
)
/**
 * Top-level configuration handed to the native OfflineTts engine.
 *
 * @property model The model bundle to load.
 * @property ruleFsts Path(s) to text-normalization rule FSTs ("" if unused).
 * @property ruleFars Path(s) to rule FAR archives ("" if unused).
 * @property maxNumSentences Maximum sentences processed per native call.
 * @property silenceScale Scale applied to silence durations (default 0.2f).
 */
data class OfflineTtsConfig(
    var model: OfflineTtsModelConfig = OfflineTtsModelConfig(),
    var ruleFsts: String = "",
    var ruleFars: String = "",
    var maxNumSentences: Int = 1,
    var silenceScale: Float = 0.2f,
)
/**
 * Audio produced by OfflineTts.
 *
 * @property samples Audio samples; the native generateImpl contract documents
 *   them as normalized to the range [-1, 1].
 * @property sampleRate Sample rate of [samples] in Hz.
 */
class GeneratedAudio(
    val samples: FloatArray,
    val sampleRate: Int,
) {
    // Writes the audio to [filename] via the native library; returns the
    // Boolean result of the native call (presumably success — confirm in JNI).
    fun save(filename: String) =
        saveImpl(filename = filename, samples = samples, sampleRate = sampleRate)

    private external fun saveImpl(
        filename: String,
        samples: FloatArray,
        sampleRate: Int
    ): Boolean
}
/**
 * Rich per-call synthesis options for OfflineTts.generateWithConfig.
 *
 * NOTE(review): FloatArray in a data class breaks structural equals/hashCode
 * (arrays compare by reference) — avoid relying on equality of this type.
 *
 * @property silenceScale Scale applied to silence durations.
 * @property speed Speech speed; 1.0f is normal.
 * @property sid Speaker id for multi-speaker models.
 * @property referenceAudio Optional reference audio samples (voice cloning —
 *   presumably; confirm against the native implementation).
 * @property referenceSampleRate Sample rate of [referenceAudio].
 * @property referenceText Optional transcript of [referenceAudio].
 * @property numSteps Number of inference steps (default 5).
 * @property extra Extra string key/value options forwarded to the native side.
 */
data class GenerationConfig(
    var silenceScale: Float = 0.2f,
    var speed: Float = 1.0f,
    var sid: Int = 0,
    var referenceAudio: FloatArray? = null,
    var referenceSampleRate: Int = 0,
    var referenceText: String? = null,
    var numSteps: Int = 5,
    var extra: Map<String, String>? = null
)
/**
 * Kotlin wrapper around the native sherpa-onnx offline (non-streaming)
 * text-to-speech engine.
 *
 * The native object is created eagerly in `init`; release it with
 * [free]/[release] (or rely on GC via [finalize]) to avoid leaking
 * native memory.
 */
class OfflineTts(
    assetManager: AssetManager? = null,
    var config: OfflineTtsConfig,
) {
    // Address of the native engine; 0 means "not allocated".
    private var ptr: Long

    init {
        ptr = newPtr(assetManager)
    }

    // Single place deciding between asset-based and file-based construction;
    // shared by the constructor and allocate() to avoid duplicated logic.
    private fun newPtr(assetManager: AssetManager?): Long =
        if (assetManager != null) {
            newFromAsset(assetManager, config)
        } else {
            newFromFile(config)
        }

    /** Output sample rate of the loaded model, in Hz. */
    fun sampleRate() = getSampleRate(ptr)

    /** Number of speakers supported by the loaded model. */
    fun numSpeakers() = getNumSpeakers(ptr)

    /**
     * Synthesizes [text] into audio.
     *
     * @param sid speaker id (for multi-speaker models)
     * @param speed speech speed; 1.0f is normal
     */
    fun generate(
        text: String,
        sid: Int = 0,
        speed: Float = 1.0f
    ): GeneratedAudio {
        return toGeneratedAudio(generateImpl(ptr, text = text, sid = sid, speed = speed))
    }

    /**
     * Like [generate], but [callback] receives intermediate sample chunks.
     * The callback's Int result is forwarded to the native side — presumably
     * non-zero means "continue"; confirm against the JNI implementation.
     */
    fun generateWithCallback(
        text: String,
        sid: Int = 0,
        speed: Float = 1.0f,
        callback: (samples: FloatArray) -> Int
    ): GeneratedAudio {
        return toGeneratedAudio(generateWithCallbackImpl(
            ptr,
            text = text,
            sid = sid,
            speed = speed,
            callback = callback
        ))
    }

    /** Synthesizes [text] using the richer [GenerationConfig] options. */
    fun generateWithConfig(
        text: String,
        config: GenerationConfig
    ): GeneratedAudio {
        return toGeneratedAudio(generateWithConfigImpl(ptr, text, config, null))
    }

    /** [generateWithConfig] plus a per-chunk [callback]. */
    fun generateWithConfigAndCallback(
        text: String,
        config: GenerationConfig,
        callback: (samples: FloatArray) -> Int
    ): GeneratedAudio {
        return toGeneratedAudio(generateWithConfigImpl(ptr, text, config, callback))
    }

    // Normalizes the two shapes the native layer may return: either a
    // ready-made GeneratedAudio, or Object[]{ float[] samples, int sampleRate }.
    @Suppress("UNCHECKED_CAST")
    private fun toGeneratedAudio(obj: Any): GeneratedAudio {
        return when (obj) {
            is GeneratedAudio -> obj
            is Array<*> -> {
                val samples = obj.getOrNull(0) as? FloatArray
                    ?: error("Unexpected native TTS return: element[0] is not FloatArray")
                val sampleRate = (obj.getOrNull(1) as? Number)?.toInt()
                    ?: error("Unexpected native TTS return: element[1] is not Int/Number")
                GeneratedAudio(samples = samples, sampleRate = sampleRate)
            }
            else -> error("Unexpected native TTS return type: ${obj::class.java.name}")
        }
    }

    /** Re-creates the native engine after [free]; no-op if still allocated. */
    fun allocate(assetManager: AssetManager? = null) {
        if (ptr == 0L) {
            ptr = newPtr(assetManager)
        }
    }

    /** Frees the native engine. Safe to call repeatedly. */
    fun free() {
        if (ptr != 0L) {
            delete(ptr)
            ptr = 0
        }
    }

    protected fun finalize() {
        free()
    }

    fun release() = free()

    private external fun newFromAsset(
        assetManager: AssetManager,
        config: OfflineTtsConfig,
    ): Long

    private external fun newFromFile(
        config: OfflineTtsConfig,
    ): Long

    private external fun delete(ptr: Long)
    private external fun getSampleRate(ptr: Long): Int
    private external fun getNumSpeakers(ptr: Long): Int

    // The returned array has two entries:
    //  - the first entry is a 1-D float array containing audio samples.
    //    Each sample is normalized to the range [-1, 1]
    //  - the second entry is the sample rate
    private external fun generateImpl(
        ptr: Long,
        text: String,
        sid: Int = 0,
        speed: Float = 1.0f
    ): Any

    private external fun generateWithCallbackImpl(
        ptr: Long,
        text: String,
        sid: Int = 0,
        speed: Float = 1.0f,
        callback: (samples: FloatArray) -> Int
    ): Any

    private external fun generateWithConfigImpl(
        ptr: Long,
        text: String,
        config: GenerationConfig,
        callback: ((samples: FloatArray) -> Int)?
    ): Any

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
// please refer to
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html
// to download models

/**
 * Builds an [OfflineTtsConfig] for VITS, Matcha, Kokoro, or Kitten models
 * stored under [modelDir].
 *
 * - VITS:   set [modelName]
 * - Matcha: set [acousticModelName] and [vocoder]
 * - Kokoro: set [modelName] and [voices]
 * - Kitten: set [modelName], [voices] and [isKitten] = true
 *
 * Fix: lexicon resolution (keep "" and comma-separated values unprefixed)
 * was only applied to the Kokoro branch; VITS/Matcha used to produce a bogus
 * "$modelDir/" path when [lexicon] was empty. The guard now applies uniformly.
 *
 * @param dictDir Unused; kept for interface compatibility.
 * @param numThreads Overrides the thread count; when null, 4 threads are used
 *   for Kokoro/Kitten (voices non-empty) and 2 otherwise.
 * @throws IllegalArgumentException on missing or contradictory model names.
 */
fun getOfflineTtsConfig(
    modelDir: String,
    modelName: String, // for VITS
    acousticModelName: String, // for Matcha
    vocoder: String, // for Matcha
    voices: String, // for Kokoro or Kitten
    lexicon: String,
    dataDir: String,
    dictDir: String, // unused
    ruleFsts: String,
    ruleFars: String,
    numThreads: Int? = null,
    isKitten: Boolean = false
): OfflineTtsConfig {
    val numberOfThreads = if (numThreads != null) {
        numThreads
    } else if (voices.isNotEmpty()) {
        // for Kokoro and Kitten TTS models, we use more threads
        4
    } else {
        2
    }

    if (modelName.isEmpty() && acousticModelName.isEmpty()) {
        throw IllegalArgumentException("Please specify a TTS model")
    }
    if (modelName.isNotEmpty() && acousticModelName.isNotEmpty()) {
        throw IllegalArgumentException("Please specify either a VITS or a Matcha model, but not both")
    }
    if (acousticModelName.isNotEmpty() && vocoder.isEmpty()) {
        throw IllegalArgumentException("Please provide vocoder for Matcha TTS")
    }

    // Resolve the lexicon once, consistently for every model type:
    //  - "" stays "" (no lexicon)
    //  - comma-separated multi-file values stay untouched
    //  - otherwise prefix with modelDir
    val resolvedLexicon = when {
        lexicon.isEmpty() -> lexicon
        "," in lexicon -> lexicon
        else -> "$modelDir/$lexicon"
    }

    val vits = if (modelName.isNotEmpty() && voices.isEmpty()) {
        OfflineTtsVitsModelConfig(
            model = "$modelDir/$modelName",
            lexicon = resolvedLexicon,
            tokens = "$modelDir/tokens.txt",
            dataDir = dataDir,
        )
    } else {
        OfflineTtsVitsModelConfig()
    }

    val matcha = if (acousticModelName.isNotEmpty()) {
        OfflineTtsMatchaModelConfig(
            acousticModel = "$modelDir/$acousticModelName",
            vocoder = vocoder,
            lexicon = resolvedLexicon,
            tokens = "$modelDir/tokens.txt",
            dataDir = dataDir,
        )
    } else {
        OfflineTtsMatchaModelConfig()
    }

    val kokoro = if (voices.isNotEmpty() && !isKitten) {
        OfflineTtsKokoroModelConfig(
            model = "$modelDir/$modelName",
            voices = "$modelDir/$voices",
            tokens = "$modelDir/tokens.txt",
            dataDir = dataDir,
            lexicon = resolvedLexicon,
        )
    } else {
        OfflineTtsKokoroModelConfig()
    }

    val kitten = if (isKitten) {
        OfflineTtsKittenModelConfig(
            model = "$modelDir/$modelName",
            voices = "$modelDir/$voices",
            tokens = "$modelDir/tokens.txt",
            dataDir = dataDir,
        )
    } else {
        OfflineTtsKittenModelConfig()
    }

    return OfflineTtsConfig(
        model = OfflineTtsModelConfig(
            vits = vits,
            matcha = matcha,
            kokoro = kokoro,
            kitten = kitten,
            numThreads = numberOfThreads,
            debug = true,
            provider = "cpu",
        ),
        ruleFsts = ruleFsts,
        ruleFars = ruleFars,
    )
}

View File

@@ -0,0 +1,149 @@
// Copyright (c) 2023 Xiaomi Corporation
package com.k2fsa.sherpa.onnx
import android.content.res.AssetManager
/**
 * Configuration for the Silero VAD model.
 *
 * @property model Path to silero_vad.onnx.
 * @property threshold Detection threshold (default 0.5F).
 * @property minSilenceDuration Minimum silence, in seconds, before a speech
 *   segment is considered finished — presumably; confirm against native docs.
 * @property minSpeechDuration Minimum duration, in seconds, for a segment to
 *   count as speech.
 * @property windowSize Samples fed to the model per step (512 for Silero).
 * @property maxSpeechDuration Maximum speech-segment duration in seconds.
 */
data class SileroVadModelConfig(
    var model: String = "",
    var threshold: Float = 0.5F,
    var minSilenceDuration: Float = 0.25F,
    var minSpeechDuration: Float = 0.25F,
    var windowSize: Int = 512,
    var maxSpeechDuration: Float = 5.0F,
)
/**
 * Configuration for the TEN VAD model. Mirrors SileroVadModelConfig but
 * with the window size this model expects (256 instead of 512).
 *
 * @property model Path to ten-vad.onnx.
 * @property threshold Detection threshold (default 0.5F).
 * @property minSilenceDuration Minimum silence (seconds) ending a segment.
 * @property minSpeechDuration Minimum duration (seconds) to count as speech.
 * @property windowSize Samples fed to the model per step (256 for TEN VAD).
 * @property maxSpeechDuration Maximum speech-segment duration in seconds.
 */
data class TenVadModelConfig(
    var model: String = "",
    var threshold: Float = 0.5F,
    var minSilenceDuration: Float = 0.25F,
    var minSpeechDuration: Float = 0.25F,
    var windowSize: Int = 256,
    var maxSpeechDuration: Float = 5.0F,
)
/**
 * Top-level VAD configuration; populate exactly one of the two model configs
 * (see getVadModelConfig).
 *
 * @property sampleRate Expected input sample rate in Hz.
 * @property numThreads Number of native worker threads.
 * @property provider ONNX Runtime execution provider, e.g. "cpu".
 * @property debug Whether the native side emits debug logs.
 */
data class VadModelConfig(
    var sileroVadModelConfig: SileroVadModelConfig = SileroVadModelConfig(),
    var tenVadModelConfig: TenVadModelConfig = TenVadModelConfig(),
    var sampleRate: Int = 16000,
    var numThreads: Int = 1,
    var provider: String = "cpu",
    var debug: Boolean = false,
)
// A detected run of speech: [start] offset (presumably a sample index —
// confirm against the native front() implementation) plus the raw samples.
class SpeechSegment(val start: Int, val samples: FloatArray)
/**
 * JNI wrapper around the native sherpa-onnx voice-activity detector.
 *
 * Feed audio with [acceptWaveform]; completed speech segments queue up and
 * are consumed with [empty]/[front]/[pop]. The native object must be freed
 * with [release] (or by GC via [finalize]).
 */
class Vad(
    assetManager: AssetManager? = null,
    var config: VadModelConfig,
) {
    // Address of the native VAD object; 0 after release.
    private var ptr: Long

    init {
        if (assetManager != null) {
            ptr = newFromAsset(assetManager, config)
        } else {
            ptr = newFromFile(config)
        }
    }

    protected fun finalize() {
        if (ptr != 0L) {
            delete(ptr)
            ptr = 0
        }
    }

    /** Frees the native object. Safe to call more than once. */
    fun release() = finalize()

    // Returns a score computed natively for [samples] — presumably a speech
    // probability; confirm against the JNI implementation.
    fun compute(samples: FloatArray): Float = compute(ptr, samples)

    /** Feeds audio samples to the detector. */
    fun acceptWaveform(samples: FloatArray) = acceptWaveform(ptr, samples)

    /** True when no finished speech segment is queued. */
    fun empty(): Boolean = empty(ptr)

    /** Removes the oldest queued speech segment. */
    fun pop() = pop(ptr)

    /** Returns (without removing) the oldest queued speech segment. */
    fun front(): SpeechSegment {
        return front(ptr)
    }

    /** Discards all queued speech segments. */
    fun clear() = clear(ptr)

    /** True while the detector currently considers the input to be speech. */
    fun isSpeechDetected(): Boolean = isSpeechDetected(ptr)

    /** Resets the detector's internal state. */
    fun reset() = reset(ptr)

    /** Flushes buffered audio so any pending segment is finalized. */
    fun flush() = flush(ptr)

    private external fun delete(ptr: Long)
    private external fun newFromAsset(
        assetManager: AssetManager,
        config: VadModelConfig,
    ): Long
    private external fun newFromFile(
        config: VadModelConfig,
    ): Long
    private external fun acceptWaveform(ptr: Long, samples: FloatArray)
    private external fun compute(ptr: Long, samples: FloatArray): Float
    private external fun empty(ptr: Long): Boolean
    private external fun pop(ptr: Long)
    private external fun clear(ptr: Long)
    private external fun front(ptr: Long): SpeechSegment
    private external fun isSpeechDetected(ptr: Long): Boolean
    private external fun reset(ptr: Long)
    private external fun flush(ptr: Long)

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
// Please visit
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
// to download silero_vad.onnx
// and put it inside the assets/
// directory
//
// For ten-vad, please use
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
//

/**
 * Returns a ready-made [VadModelConfig] for the given model [type]:
 * 0 = Silero VAD, 1 = TEN VAD, anything else = null.
 */
fun getVadModelConfig(type: Int): VadModelConfig? = when (type) {
    0 -> VadModelConfig(
        sileroVadModelConfig = SileroVadModelConfig(
            model = "silero_vad.onnx",
            threshold = 0.5F,
            minSilenceDuration = 0.25F,
            minSpeechDuration = 0.25F,
            windowSize = 512,
        ),
        sampleRate = 16000,
        numThreads = 1,
        provider = "cpu",
    )
    1 -> VadModelConfig(
        tenVadModelConfig = TenVadModelConfig(
            model = "ten-vad.onnx",
            threshold = 0.5F,
            minSilenceDuration = 0.25F,
            minSpeechDuration = 0.25F,
            windowSize = 256,
        ),
        sampleRate = 16000,
        numThreads = 1,
        provider = "cpu",
    )
    else -> null
}