initial commit
This commit is contained in:
11
app/src/main/java/com/k2fsa/sherpa/onnx/FeatureConfig.kt
Normal file
11
app/src/main/java/com/k2fsa/sherpa/onnx/FeatureConfig.kt
Normal file
@@ -0,0 +1,11 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/**
 * Feature-extraction configuration passed to the native sherpa-onnx layer.
 *
 * @property sampleRate Expected sample rate of the input audio in Hz.
 * @property featureDim Number of feature bins per frame (80 is typical for a
 *   mel filterbank front end — confirm against the native config).
 * @property dither Dithering amount applied during feature extraction;
 *   0.0f disables dithering.
 */
data class FeatureConfig(
    var sampleRate: Int = 16000,
    var featureDim: Int = 80,
    var dither: Float = 0.0f
)
|
||||
|
||||
/**
 * Builds a [FeatureConfig] with the given sample rate and feature dimension,
 * keeping the default dither value.
 */
fun getFeatureConfig(sampleRate: Int, featureDim: Int): FeatureConfig =
    FeatureConfig(sampleRate = sampleRate, featureDim = featureDim)
|
||||
@@ -0,0 +1,7 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/**
 * Configuration for homophone replacement post-processing.
 *
 * @property dictDir Unused; kept for JNI field-layout compatibility.
 * @property lexicon Path to the lexicon file.
 * @property ruleFsts Rule FST file path(s) — presumably comma-separated;
 *   confirm against the native sherpa-onnx documentation.
 */
data class HomophoneReplacerConfig(
    var dictDir: String = "", // unused
    var lexicon: String = "",
    var ruleFsts: String = "",
)
|
||||
1241
app/src/main/java/com/k2fsa/sherpa/onnx/OfflineRecognizer.kt
Normal file
1241
app/src/main/java/com/k2fsa/sherpa/onnx/OfflineRecognizer.kt
Normal file
File diff suppressed because it is too large
Load Diff
32
app/src/main/java/com/k2fsa/sherpa/onnx/OfflineStream.kt
Normal file
32
app/src/main/java/com/k2fsa/sherpa/onnx/OfflineStream.kt
Normal file
@@ -0,0 +1,32 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/**
 * Thin JNI wrapper around a native sherpa-onnx offline stream.
 *
 * The instance owns the native object referenced by [ptr] (0 means already
 * freed). Call [release] — or let [use] do it — to free native memory
 * promptly instead of waiting for the garbage collector.
 */
class OfflineStream(var ptr: Long) {
    /** Feeds [samples] recorded at [sampleRate] Hz into the native stream. */
    fun acceptWaveform(samples: FloatArray, sampleRate: Int) =
        acceptWaveform(ptr, samples, sampleRate)

    protected fun finalize() {
        // Guard clause: nothing left to free once ptr has been zeroed.
        if (ptr == 0L) return
        delete(ptr)
        ptr = 0
    }

    /** Explicitly frees the native object; safe to call more than once. */
    fun release() = finalize()

    /** Runs [block] with this stream, releasing it afterwards even on exception. */
    fun use(block: (OfflineStream) -> Unit) {
        try {
            block(this)
        } finally {
            release()
        }
    }

    private external fun acceptWaveform(ptr: Long, samples: FloatArray, sampleRate: Int)
    private external fun delete(ptr: Long)

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
7
app/src/main/java/com/k2fsa/sherpa/onnx/QnnConfig.kt
Normal file
7
app/src/main/java/com/k2fsa/sherpa/onnx/QnnConfig.kt
Normal file
@@ -0,0 +1,7 @@
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
/**
 * Configuration for the QNN (Qualcomm AI Engine) execution provider.
 *
 * @property backendLib Path to the QNN backend library.
 * @property contextBinary Path to a pre-built QNN context binary.
 * @property systemLib Path to the QNN system library.
 */
data class QnnConfig(
    var backendLib: String = "",
    var contextBinary: String = "",
    var systemLib: String = "",
)
|
||||
373
app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt
Normal file
373
app/src/main/java/com/k2fsa/sherpa/onnx/Tts.kt
Normal file
@@ -0,0 +1,373 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
/**
 * Configuration for a VITS TTS model.
 *
 * @property model Path to the VITS model (.onnx).
 * @property lexicon Path to the lexicon file.
 * @property tokens Path to tokens.txt.
 * @property dataDir Data directory (presumably espeak-ng data — confirm
 *   against the sherpa-onnx docs).
 * @property dictDir Unused; kept for JNI field-layout compatibility.
 * @property noiseScale Sampling noise scale.
 * @property noiseScaleW Noise scale for the stochastic duration predictor.
 * @property lengthScale Duration scale; 1.0f is the model default.
 */
data class OfflineTtsVitsModelConfig(
    var model: String = "",
    var lexicon: String = "",
    var tokens: String = "",
    var dataDir: String = "",
    var dictDir: String = "", // unused
    var noiseScale: Float = 0.667f,
    var noiseScaleW: Float = 0.8f,
    var lengthScale: Float = 1.0f,
)
|
||||
|
||||
/**
 * Configuration for a Matcha TTS model (acoustic model + separate vocoder).
 *
 * @property acousticModel Path to the Matcha acoustic model (.onnx).
 * @property vocoder Path to the vocoder model (.onnx).
 * @property lexicon Path to the lexicon file.
 * @property tokens Path to tokens.txt.
 * @property dataDir Data directory (presumably espeak-ng data — confirm
 *   against the sherpa-onnx docs).
 * @property dictDir Unused; kept for JNI field-layout compatibility.
 * @property noiseScale Sampling noise scale.
 * @property lengthScale Duration scale; 1.0f is the model default.
 */
data class OfflineTtsMatchaModelConfig(
    var acousticModel: String = "",
    var vocoder: String = "",
    var lexicon: String = "",
    var tokens: String = "",
    var dataDir: String = "",
    var dictDir: String = "", // unused
    var noiseScale: Float = 1.0f,
    var lengthScale: Float = 1.0f,
)
|
||||
|
||||
/**
 * Configuration for a Kokoro TTS model.
 *
 * @property model Path to the Kokoro model (.onnx).
 * @property voices Path to the voices file.
 * @property tokens Path to tokens.txt.
 * @property dataDir Data directory for the model.
 * @property lexicon Lexicon path(s); may be a comma-separated list (see
 *   [getOfflineTtsConfig], which passes comma-containing values through
 *   untouched).
 * @property lang Language code — semantics defined by the native layer.
 * @property dictDir Unused; kept for JNI field-layout compatibility.
 * @property lengthScale Duration scale; 1.0f is the model default.
 */
data class OfflineTtsKokoroModelConfig(
    var model: String = "",
    var voices: String = "",
    var tokens: String = "",
    var dataDir: String = "",
    var lexicon: String = "",
    var lang: String = "",
    var dictDir: String = "", // unused
    var lengthScale: Float = 1.0f,
)
|
||||
|
||||
/**
 * Configuration for a Kitten TTS model.
 *
 * @property model Path to the Kitten model (.onnx).
 * @property voices Path to the voices file.
 * @property tokens Path to tokens.txt.
 * @property dataDir Data directory for the model.
 * @property lengthScale Duration scale; 1.0f is the model default.
 */
data class OfflineTtsKittenModelConfig(
    var model: String = "",
    var voices: String = "",
    var tokens: String = "",
    var dataDir: String = "",
    var lengthScale: Float = 1.0f,
)
|
||||
|
||||
/**
 * Configuration for Pocket TTS models.
 *
 * See https://k2-fsa.github.io/sherpa/onnx/tts/pocket/index.html for details.
 *
 * @property lmFlow Path to the LM flow model (.onnx)
 * @property lmMain Path to the LM main model (.onnx)
 * @property encoder Path to the encoder model (.onnx)
 * @property decoder Path to the decoder model (.onnx)
 * @property textConditioner Path to the text conditioner model (.onnx)
 * @property vocabJson Path to vocabulary JSON file
 * @property tokenScoresJson Path to token scores JSON file
 */
data class OfflineTtsPocketModelConfig(
    var lmFlow: String = "",
    var lmMain: String = "",
    var encoder: String = "",
    var decoder: String = "",
    var textConditioner: String = "",
    var vocabJson: String = "",
    var tokenScoresJson: String = "",
)
|
||||
|
||||
/**
 * Top-level TTS model configuration. Only one of the model-family sub-configs
 * is expected to be populated for a given engine instance.
 *
 * @property numThreads Number of threads used by the native inference engine.
 * @property debug Whether the native layer emits debug output.
 * @property provider Inference provider name, e.g. "cpu".
 */
data class OfflineTtsModelConfig(
    var vits: OfflineTtsVitsModelConfig = OfflineTtsVitsModelConfig(),
    var matcha: OfflineTtsMatchaModelConfig = OfflineTtsMatchaModelConfig(),
    var kokoro: OfflineTtsKokoroModelConfig = OfflineTtsKokoroModelConfig(),
    var kitten: OfflineTtsKittenModelConfig = OfflineTtsKittenModelConfig(),
    // Changed from `val` to `var` for consistency with the sibling sub-configs
    // (backward compatible: only adds a setter).
    var pocket: OfflineTtsPocketModelConfig = OfflineTtsPocketModelConfig(),

    var numThreads: Int = 1,
    var debug: Boolean = false,
    var provider: String = "cpu",
)
|
||||
|
||||
/**
 * Full configuration for an [OfflineTts] engine.
 *
 * @property model Model-family configuration and inference options.
 * @property ruleFsts Rule FST path(s) for text normalization.
 * @property ruleFars Rule FAR path(s) for text normalization.
 * @property maxNumSentences Maximum number of sentences processed per batch —
 *   presumably; confirm against the native API.
 * @property silenceScale Scale applied to silence durations — presumably
 *   between sentences; confirm against the native API.
 */
data class OfflineTtsConfig(
    var model: OfflineTtsModelConfig = OfflineTtsModelConfig(),
    var ruleFsts: String = "",
    var ruleFars: String = "",
    var maxNumSentences: Int = 1,
    var silenceScale: Float = 0.2f,
)
|
||||
|
||||
/**
 * Audio produced by [OfflineTts].
 *
 * @property samples Audio samples, normalized to the range [-1, 1].
 * @property sampleRate Sample rate of [samples] in Hz.
 */
class GeneratedAudio(
    val samples: FloatArray,
    val sampleRate: Int,
) {
    // Returns the native layer's Boolean result (presumably true on success).
    fun save(filename: String) =
        saveImpl(filename = filename, samples = samples, sampleRate = sampleRate)

    private external fun saveImpl(
        filename: String,
        samples: FloatArray,
        sampleRate: Int
    ): Boolean
}
|
||||
|
||||
/**
 * Per-call generation options for [OfflineTts.generateWithConfig].
 *
 * NOTE(review): this data class holds a [FloatArray] ([referenceAudio]);
 * the generated equals()/hashCode() compare that array by reference, not by
 * content. Fine for a config carrier, but do not rely on structural equality.
 *
 * @property silenceScale Silence duration scale — presumably between
 *   sentences; confirm against the native API.
 * @property speed Speech-speed factor; 1.0f is the default speed.
 * @property sid Speaker id.
 * @property referenceAudio Optional reference audio samples — presumably for
 *   voice cloning; confirm against the native API.
 * @property referenceSampleRate Sample rate of [referenceAudio] in Hz.
 * @property referenceText Optional transcript of the reference audio.
 * @property numSteps Number of inference steps (model-dependent).
 * @property extra Optional model-specific key/value options.
 */
data class GenerationConfig(
    var silenceScale: Float = 0.2f,
    var speed: Float = 1.0f,
    var sid: Int = 0,
    var referenceAudio: FloatArray? = null,
    var referenceSampleRate: Int = 0,
    var referenceText: String? = null,
    var numSteps: Int = 5,
    var extra: Map<String, String>? = null
)
|
||||
|
||||
/**
 * JNI wrapper around a native sherpa-onnx offline (non-streaming) TTS engine.
 *
 * Models are loaded either from Android assets (when [assetManager] is
 * non-null) or from the file system. The instance owns a native object; call
 * [free] or [release] when done so native memory is reclaimed promptly.
 */
class OfflineTts(
    assetManager: AssetManager? = null,
    var config: OfflineTtsConfig,
) {
    // Handle to the native engine; 0 means "not allocated".
    private var ptr: Long

    init {
        ptr = if (assetManager != null) {
            newFromAsset(assetManager, config)
        } else {
            newFromFile(config)
        }
    }

    /** Sample rate (Hz) of audio produced by the loaded model. */
    fun sampleRate() = getSampleRate(ptr)

    /** Number of speakers supported by the loaded model. */
    fun numSpeakers() = getNumSpeakers(ptr)

    /**
     * Synthesizes [text] with speaker [sid] at the given [speed]
     * (1.0f = default speed).
     */
    fun generate(
        text: String,
        sid: Int = 0,
        speed: Float = 1.0f
    ): GeneratedAudio {
        return toGeneratedAudio(generateImpl(ptr, text = text, sid = sid, speed = speed))
    }

    /**
     * Like [generate], but invokes [callback] with chunks of samples as they
     * are produced. The callback's Int return value is interpreted by the
     * native layer (presumably non-zero to continue — confirm against the JNI
     * implementation).
     */
    fun generateWithCallback(
        text: String,
        sid: Int = 0,
        speed: Float = 1.0f,
        callback: (samples: FloatArray) -> Int
    ): GeneratedAudio {
        return toGeneratedAudio(generateWithCallbackImpl(
            ptr,
            text = text,
            sid = sid,
            speed = speed,
            callback = callback
        ))
    }

    /** Synthesizes [text] using the options in [config]. */
    fun generateWithConfig(
        text: String,
        config: GenerationConfig
    ): GeneratedAudio {
        return toGeneratedAudio(generateWithConfigImpl(ptr, text, config, null))
    }

    /** Like [generateWithConfig], with a streaming sample [callback]. */
    fun generateWithConfigAndCallback(
        text: String,
        config: GenerationConfig,
        callback: (samples: FloatArray) -> Int
    ): GeneratedAudio {
        return toGeneratedAudio(generateWithConfigImpl(ptr, text, config, callback))
    }

    /**
     * Normalizes the native return value: the JNI layer may hand back either
     * a ready [GeneratedAudio] or an Object[] of { float[] samples, int rate }.
     */
    @Suppress("UNCHECKED_CAST")
    private fun toGeneratedAudio(obj: Any): GeneratedAudio {
        return when (obj) {
            is GeneratedAudio -> obj
            is Array<*> -> {
                // Native may return Object[]{ float[] samples, int sampleRate }
                val samples = obj.getOrNull(0) as? FloatArray
                    ?: error("Unexpected native TTS return: element[0] is not FloatArray")
                val sampleRate = (obj.getOrNull(1) as? Number)?.toInt()
                    ?: error("Unexpected native TTS return: element[1] is not Int/Number")
                GeneratedAudio(samples = samples, sampleRate = sampleRate)
            }
            else -> error("Unexpected native TTS return type: ${obj::class.java.name}")
        }
    }

    /** Re-creates the native engine if it was previously freed; no-op otherwise. */
    fun allocate(assetManager: AssetManager? = null) {
        if (ptr == 0L) {
            ptr = if (assetManager != null) {
                newFromAsset(assetManager, config)
            } else {
                newFromFile(config)
            }
        }
    }

    /** Frees the native engine; safe to call more than once. */
    fun free() {
        if (ptr != 0L) {
            delete(ptr)
            ptr = 0
        }
    }

    // Delegates to free() so the cleanup logic lives in one place
    // (it was previously duplicated verbatim here).
    protected fun finalize() = free()

    fun release() = free()

    private external fun newFromAsset(
        assetManager: AssetManager,
        config: OfflineTtsConfig,
    ): Long

    private external fun newFromFile(
        config: OfflineTtsConfig,
    ): Long

    private external fun delete(ptr: Long)
    private external fun getSampleRate(ptr: Long): Int
    private external fun getNumSpeakers(ptr: Long): Int

    // The returned array has two entries:
    //  - the first entry is an 1-D float array containing audio samples.
    //    Each sample is normalized to the range [-1, 1]
    //  - the second entry is the sample rate
    private external fun generateImpl(
        ptr: Long,
        text: String,
        sid: Int = 0,
        speed: Float = 1.0f
    ): Any

    private external fun generateWithCallbackImpl(
        ptr: Long,
        text: String,
        sid: Int = 0,
        speed: Float = 1.0f,
        callback: (samples: FloatArray) -> Int
    ): Any

    private external fun generateWithConfigImpl(
        ptr: Long,
        text: String,
        config: GenerationConfig,
        callback: ((samples: FloatArray) -> Int)?
    ): Any

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
|
||||
// please refer to
|
||||
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html
|
||||
// to download models
|
||||
/**
 * Builds an [OfflineTtsConfig] for one of the supported model families.
 *
 * Exactly one family is selected by the arguments:
 *  - VITS:   set [modelName] (leave [voices] empty)
 *  - Matcha: set [acousticModelName] and [vocoder]
 *  - Kokoro: set [modelName] and [voices]
 *  - Kitten: set [modelName], [voices], and [isKitten] = true
 *
 * Please refer to
 * https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/index.html
 * to download models.
 *
 * @param numThreads Optional override for the thread count; when null a
 *   heuristic is used (4 for Kokoro/Kitten, 2 otherwise).
 * @throws IllegalArgumentException if the model selection is inconsistent.
 */
fun getOfflineTtsConfig(
    modelDir: String,
    modelName: String, // for VITS
    acousticModelName: String, // for Matcha
    vocoder: String, // for Matcha
    voices: String, // for Kokoro or kitten
    lexicon: String,
    dataDir: String,
    dictDir: String, // unused
    ruleFsts: String,
    ruleFars: String,
    numThreads: Int? = null,
    isKitten: Boolean = false
): OfflineTtsConfig {
    // Heuristic: Kokoro/Kitten models (identified by a non-empty voices
    // argument) use more threads than the other families.
    val numberOfThreads = numThreads ?: if (voices.isNotEmpty()) 4 else 2

    // Same checks and messages as before, expressed with idiomatic require().
    require(modelName.isNotEmpty() || acousticModelName.isNotEmpty()) {
        "Please specify a TTS model"
    }
    require(modelName.isEmpty() || acousticModelName.isEmpty()) {
        "Please specify either a VITS or a Matcha model, but not both"
    }
    require(acousticModelName.isEmpty() || vocoder.isNotEmpty()) {
        "Please provide vocoder for Matcha TTS"
    }

    val vits = if (modelName.isNotEmpty() && voices.isEmpty()) {
        OfflineTtsVitsModelConfig(
            model = "$modelDir/$modelName",
            lexicon = "$modelDir/$lexicon",
            tokens = "$modelDir/tokens.txt",
            dataDir = dataDir,
        )
    } else {
        OfflineTtsVitsModelConfig()
    }

    val matcha = if (acousticModelName.isNotEmpty()) {
        OfflineTtsMatchaModelConfig(
            acousticModel = "$modelDir/$acousticModelName",
            vocoder = vocoder,
            lexicon = "$modelDir/$lexicon",
            tokens = "$modelDir/tokens.txt",
            dataDir = dataDir,
        )
    } else {
        OfflineTtsMatchaModelConfig()
    }

    val kokoro = if (voices.isNotEmpty() && !isKitten) {
        OfflineTtsKokoroModelConfig(
            model = "$modelDir/$modelName",
            voices = "$modelDir/$voices",
            tokens = "$modelDir/tokens.txt",
            dataDir = dataDir,
            // An empty lexicon stays empty; a comma-separated list is passed
            // through untouched; a single filename is resolved under modelDir.
            lexicon = when {
                lexicon == "" -> lexicon
                "," in lexicon -> lexicon
                else -> "$modelDir/$lexicon"
            },
        )
    } else {
        OfflineTtsKokoroModelConfig()
    }

    val kitten = if (isKitten) {
        OfflineTtsKittenModelConfig(
            model = "$modelDir/$modelName",
            voices = "$modelDir/$voices",
            tokens = "$modelDir/tokens.txt",
            dataDir = dataDir,
        )
    } else {
        OfflineTtsKittenModelConfig()
    }

    return OfflineTtsConfig(
        model = OfflineTtsModelConfig(
            vits = vits,
            matcha = matcha,
            kokoro = kokoro,
            kitten = kitten,
            numThreads = numberOfThreads,
            debug = true,
            provider = "cpu",
        ),
        ruleFsts = ruleFsts,
        ruleFars = ruleFars,
    )
}
|
||||
149
app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt
Normal file
149
app/src/main/java/com/k2fsa/sherpa/onnx/Vad.kt
Normal file
@@ -0,0 +1,149 @@
|
||||
// Copyright (c) 2023 Xiaomi Corporation
|
||||
package com.k2fsa.sherpa.onnx
|
||||
|
||||
import android.content.res.AssetManager
|
||||
|
||||
/**
 * Configuration for the Silero VAD model.
 *
 * @property model Path to the silero_vad.onnx model.
 * @property threshold Speech-probability threshold in [0, 1].
 * @property minSilenceDuration Minimum silence (seconds) to end a segment —
 *   presumably; confirm against the native docs.
 * @property minSpeechDuration Minimum speech duration in seconds.
 * @property windowSize Analysis window size in samples (512 for Silero).
 * @property maxSpeechDuration Maximum speech segment duration in seconds.
 */
data class SileroVadModelConfig(
    var model: String = "",
    var threshold: Float = 0.5F,
    var minSilenceDuration: Float = 0.25F,
    var minSpeechDuration: Float = 0.25F,
    var windowSize: Int = 512,
    var maxSpeechDuration: Float = 5.0F,
)
|
||||
|
||||
/**
 * Configuration for the ten-vad model.
 *
 * Mirrors [SileroVadModelConfig] except for the default [windowSize]
 * (256 samples for ten-vad vs 512 for Silero).
 *
 * @property model Path to the ten-vad.onnx model.
 * @property threshold Speech-probability threshold in [0, 1].
 * @property minSilenceDuration Minimum silence duration in seconds.
 * @property minSpeechDuration Minimum speech duration in seconds.
 * @property windowSize Analysis window size in samples.
 * @property maxSpeechDuration Maximum speech segment duration in seconds.
 */
data class TenVadModelConfig(
    var model: String = "",
    var threshold: Float = 0.5F,
    var minSilenceDuration: Float = 0.25F,
    var minSpeechDuration: Float = 0.25F,
    var windowSize: Int = 256,
    var maxSpeechDuration: Float = 5.0F,
)
|
||||
|
||||
/**
 * Top-level VAD configuration; populate exactly one of the model sub-configs.
 *
 * @property sileroVadModelConfig Silero VAD settings (used when its model path is set).
 * @property tenVadModelConfig ten-vad settings (used when its model path is set).
 * @property sampleRate Expected input sample rate in Hz.
 * @property numThreads Number of threads for native inference.
 * @property provider Inference provider name, e.g. "cpu".
 * @property debug Whether the native layer emits debug output.
 */
data class VadModelConfig(
    var sileroVadModelConfig: SileroVadModelConfig = SileroVadModelConfig(),
    var tenVadModelConfig: TenVadModelConfig = TenVadModelConfig(),
    var sampleRate: Int = 16000,
    var numThreads: Int = 1,
    var provider: String = "cpu",
    var debug: Boolean = false,
)
|
||||
|
||||
// One detected speech segment: `start` is the offset of the first sample
// (presumably in samples relative to the stream start — confirm against the
// JNI layer) and `samples` holds the segment's audio.
class SpeechSegment(val start: Int, val samples: FloatArray)
|
||||
|
||||
/**
 * JNI wrapper around a native sherpa-onnx voice-activity detector.
 *
 * The model is loaded from Android assets when [assetManager] is non-null,
 * otherwise from the file system. The instance owns a native object; call
 * [release] when done so native memory is reclaimed promptly.
 */
class Vad(
    assetManager: AssetManager? = null,
    var config: VadModelConfig,
) {
    // Handle to the native detector; 0 means "already freed".
    private var ptr: Long

    init {
        // Expression form (single assignment) — consistent with the other
        // JNI wrappers in this package.
        ptr = if (assetManager != null) {
            newFromAsset(assetManager, config)
        } else {
            newFromFile(config)
        }
    }

    protected fun finalize() {
        if (ptr != 0L) {
            delete(ptr)
            ptr = 0
        }
    }

    /** Explicitly frees the native object; safe to call more than once. */
    fun release() = finalize()

    /** Returns the speech probability for [samples] without updating segment state. */
    fun compute(samples: FloatArray): Float = compute(ptr, samples)

    /** Feeds [samples] into the detector's segment pipeline. */
    fun acceptWaveform(samples: FloatArray) = acceptWaveform(ptr, samples)

    /** True when no finished speech segment is queued. */
    fun empty(): Boolean = empty(ptr)

    /** Removes the front segment from the queue. */
    fun pop() = pop(ptr)

    /** Returns the front queued speech segment. */
    fun front(): SpeechSegment = front(ptr)

    /** Clears all queued segments. */
    fun clear() = clear(ptr)

    /** True while speech is currently being detected. */
    fun isSpeechDetected(): Boolean = isSpeechDetected(ptr)

    /** Resets the detector state. */
    fun reset() = reset(ptr)

    /** Flushes any in-progress segment. */
    fun flush() = flush(ptr)

    private external fun delete(ptr: Long)

    private external fun newFromAsset(
        assetManager: AssetManager,
        config: VadModelConfig,
    ): Long

    private external fun newFromFile(
        config: VadModelConfig,
    ): Long

    private external fun acceptWaveform(ptr: Long, samples: FloatArray)
    private external fun compute(ptr: Long, samples: FloatArray): Float

    private external fun empty(ptr: Long): Boolean
    private external fun pop(ptr: Long)
    private external fun clear(ptr: Long)
    private external fun front(ptr: Long): SpeechSegment
    private external fun isSpeechDetected(ptr: Long): Boolean
    private external fun reset(ptr: Long)
    private external fun flush(ptr: Long)

    companion object {
        init {
            System.loadLibrary("sherpa-onnx-jni")
        }
    }
}
|
||||
|
||||
// Please visit
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
|
||||
// to download silero_vad.onnx
|
||||
// and put it inside the assets/
|
||||
// directory
|
||||
//
|
||||
// For ten-vad, please use
|
||||
// https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/ten-vad.onnx
|
||||
//
|
||||
/**
 * Returns a ready-made [VadModelConfig] for a bundled VAD model, or null for
 * an unknown [type].
 *
 * type 0 -> silero_vad.onnx (window 512); type 1 -> ten-vad.onnx (window 256).
 */
fun getVadModelConfig(type: Int): VadModelConfig? = when (type) {
    0 -> VadModelConfig(
        sileroVadModelConfig = SileroVadModelConfig(
            model = "silero_vad.onnx",
            threshold = 0.5F,
            minSilenceDuration = 0.25F,
            minSpeechDuration = 0.25F,
            windowSize = 512,
        ),
        sampleRate = 16000,
        numThreads = 1,
        provider = "cpu",
    )

    1 -> VadModelConfig(
        tenVadModelConfig = TenVadModelConfig(
            model = "ten-vad.onnx",
            threshold = 0.5F,
            minSilenceDuration = 0.25F,
            minSpeechDuration = 0.25F,
            windowSize = 256,
        ),
        sampleRate = 16000,
        numThreads = 1,
        provider = "cpu",
    )

    else -> null
}
|
||||
Reference in New Issue
Block a user