tts_asr_with_video
This commit is contained in:
957
app/src/main/java/com/digital_person/MainActivity.kt.bak
Normal file
957
app/src/main/java/com/digital_person/MainActivity.kt.bak
Normal file
@@ -0,0 +1,957 @@
|
||||
package com.digitalperson
|
||||
|
||||
import android.Manifest
|
||||
import android.content.pm.PackageManager
|
||||
import android.media.AudioAttributes
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioManager
|
||||
import android.media.AudioRecord
|
||||
import android.media.AudioTrack
|
||||
import android.media.MediaRecorder
|
||||
import android.media.audiofx.AcousticEchoCanceler
|
||||
import android.media.audiofx.NoiseSuppressor
|
||||
import android.os.Bundle
|
||||
import android.os.SystemClock
|
||||
import android.text.method.ScrollingMovementMethod
|
||||
import android.util.Log
|
||||
import android.widget.Button
|
||||
import android.widget.TextView
|
||||
import android.widget.Toast
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.digitalperson.cloud.CloudApiManager
|
||||
import com.digitalperson.player.VideoPlayerManager
|
||||
import com.google.android.exoplayer2.ui.PlayerView
|
||||
import com.digitalperson.engine.SenseVoiceEngineRKNN
|
||||
import com.digitalperson.metrics.TraceManager
|
||||
import com.digitalperson.metrics.TraceSession
|
||||
import com.k2fsa.sherpa.onnx.OfflineTts
|
||||
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
|
||||
import com.k2fsa.sherpa.onnx.Vad
|
||||
import com.k2fsa.sherpa.onnx.VadModelConfig
|
||||
import com.k2fsa.sherpa.onnx.getOfflineTtsConfig
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.Job
|
||||
import kotlinx.coroutines.SupervisorJob
|
||||
import kotlinx.coroutines.cancel
|
||||
import kotlinx.coroutines.channels.Channel
|
||||
import kotlinx.coroutines.isActive
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
import java.io.FileOutputStream
|
||||
import java.util.concurrent.LinkedBlockingQueue
|
||||
import java.util.concurrent.atomic.AtomicBoolean
|
||||
import kotlin.math.max
|
||||
|
||||
private const val TAG = "DigitalPerson"
|
||||
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200
|
||||
|
||||
class MainActivity : AppCompatActivity() {
|
||||
|
||||
private lateinit var startButton: Button
|
||||
private lateinit var stopButton: Button
|
||||
private lateinit var textView: TextView
|
||||
|
||||
private lateinit var vad: Vad
|
||||
private var senseVoice: SenseVoiceEngineRKNN? = null
|
||||
private var tts: OfflineTts? = null
|
||||
private var track: AudioTrack? = null
|
||||
|
||||
private var aec: AcousticEchoCanceler? = null
|
||||
private var ns: NoiseSuppressor? = null
|
||||
|
||||
private var audioRecord: AudioRecord? = null
|
||||
private val audioSource = MediaRecorder.AudioSource.MIC
|
||||
private val sampleRateInHz = 16000
|
||||
private val channelConfig = AudioFormat.CHANNEL_IN_MONO
|
||||
private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
|
||||
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
|
||||
|
||||
@Volatile
|
||||
private var isRecording: Boolean = false
|
||||
|
||||
private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
|
||||
private var recordingJob: Job? = null
|
||||
private val nativeLock = Any()
|
||||
|
||||
private lateinit var cloudApiManager: CloudApiManager
|
||||
private var videoPlayerManager: VideoPlayerManager? = null
|
||||
private val segmenter = StreamingTextSegmenter(
|
||||
maxLen = 30,
|
||||
maxWaitMs = 600
|
||||
)
|
||||
|
||||
// Work items for the TTS queue: a text segment to synthesize, or an
// end-of-response marker that drains playback and closes the trace turn.
private sealed class TtsQueueItem {
    data class Segment(val text: String) : TtsQueueItem()
    data object End : TtsQueueItem()
}
|
||||
|
||||
private val ttsQueue = LinkedBlockingQueue<TtsQueueItem>()
|
||||
private val ttsStopped = AtomicBoolean(false)
|
||||
private val ttsWorkerRunning = AtomicBoolean(false)
|
||||
private val ttsPlaying = AtomicBoolean(false)
|
||||
@Volatile private var ttsTotalSamplesWritten: Long = 0
|
||||
|
||||
private var currentTrace: TraceSession? = null
|
||||
|
||||
private var lastUiText: String = ""
|
||||
@Volatile private var llmInFlight: Boolean = false
|
||||
private var enableStreaming = true // 默认启用流式输出
|
||||
|
||||
// ASR 队列和工作器
|
||||
private val asrQueue = Channel<Pair<FloatArray, TraceSession?>>()
|
||||
private val asrWorkerRunning = AtomicBoolean(false)
|
||||
|
||||
/** Closes the activity when the RECORD_AUDIO permission is denied. */
override fun onRequestPermissionsResult(
    requestCode: Int,
    permissions: Array<String>,
    grantResults: IntArray
) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults)
    val granted = requestCode == REQUEST_RECORD_AUDIO_PERMISSION &&
        grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED
    if (granted) return
    Log.e(TAG, "Audio record is disallowed")
    finish()
}
|
||||
|
||||
/**
 * Wires up the views, initializes the native engines (VAD / SenseVoice / TTS)
 * off the main thread, and installs the cloud LLM callbacks.
 *
 * Fixes:
 *  - All UI work done from CloudApiManager callbacks (appendToUi, Toast,
 *    onStopClicked) is now funneled through runOnUiThread. Previously
 *    onLLMStreamingChunkReceived and onError touched views directly even
 *    though onLLMResponseReceived already wrapped its UI work, implying the
 *    callbacks arrive on a background thread.
 *  - llmFirstChunkMarked is reset when a response completes so the
 *    first-chunk latency mark is recorded on every turn, not only the first.
 */
override fun onCreate(savedInstanceState: Bundle?) {
    super.onCreate(savedInstanceState)
    setContentView(R.layout.activity_main)

    // Dual-player manager: two stacked PlayerViews, one "silent", one "speaking".
    try {
        val silentPv = findViewById<PlayerView>(R.id.player_view_silent)
        val speakingPv = findViewById<PlayerView>(R.id.player_view_speaking)
        videoPlayerManager = VideoPlayerManager(this, silentPv, speakingPv)
        // Default: the avatar is not speaking.
        videoPlayerManager?.setSpeaking(false)
    } catch (e: Exception) {
        Log.w(TAG, "PlayerViews not found or init failed: ${e.message}")
    }

    ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)

    startButton = findViewById(R.id.start_button)
    stopButton = findViewById(R.id.stop_button)
    textView = findViewById(R.id.my_text)
    textView.movementMethod = ScrollingMovementMethod()

    startButton.setOnClickListener { onStartClicked() }
    stopButton.setOnClickListener { onStopClicked(userInitiated = true) }

    // Optional streaming-output toggle (the layout may not contain it).
    try {
        val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
        streamingSwitch.isChecked = enableStreaming
        streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
            enableStreaming = isChecked
            cloudApiManager.setEnableStreaming(isChecked)
            Toast.makeText(this, "流式输出已${if (isChecked) "启用" else "禁用"}", Toast.LENGTH_SHORT).show()
        }
    } catch (e: Exception) {
        Log.w(TAG, "Streaming switch not found in layout: ${e.message}")
    }

    // Initialize models and the AudioTrack in the background to avoid an ANR
    // from re-initialization on the UI thread.
    startButton.isEnabled = false
    stopButton.isEnabled = false
    textView.text = "初始化中…"
    ioScope.launch {
        try {
            Log.i(TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)")
            synchronized(nativeLock) {
                initVadModel()
                initSenseVoiceModel()
            }
            withContext(Dispatchers.Main) {
                initTtsAndAudioTrack()
                textView.text = getString(R.string.hint)
                startButton.isEnabled = true
                stopButton.isEnabled = false
            }
        } catch (t: Throwable) {
            Log.e(TAG, "Initialization failed: ${t.message}", t)
            withContext(Dispatchers.Main) {
                textView.text = "初始化失败:${t.javaClass.simpleName}: ${t.message}"
                Toast.makeText(
                    this@MainActivity,
                    "初始化失败(请看 Logcat): ${t.javaClass.simpleName}",
                    Toast.LENGTH_LONG
                ).show()
                startButton.isEnabled = false
                stopButton.isEnabled = false
            }
        }
    }

    cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener {
        // Reset per turn (see onLLMResponseReceived) so the first-chunk
        // latency mark fires for every turn.
        private var llmFirstChunkMarked = false

        override fun onLLMResponseReceived(response: String) {
            currentTrace?.markLlmDone()
            llmInFlight = false
            llmFirstChunkMarked = false

            if (enableStreaming) {
                // Streaming mode: flush whatever remains in the segmenter buffer.
                for (seg in segmenter.flush()) {
                    enqueueTtsSegment(seg)
                }
                // Signal end of this response's TTS queue.
                ttsQueue.offer(TtsQueueItem.End)
            } else {
                runOnUiThread {
                    appendToUi("${response}\n")
                }
                // Non-streaming mode: synthesize the whole response at once.
                enqueueTtsSegment(response)
                ttsQueue.offer(TtsQueueItem.End)
            }
        }

        override fun onLLMStreamingChunkReceived(chunk: String) {
            if (enableStreaming) {
                if (!llmFirstChunkMarked) {
                    llmFirstChunkMarked = true
                    currentTrace?.markLlmFirstChunk()
                }
                // appendToUi mutates a TextView; this callback may arrive on a
                // background thread, so hop to the main thread first.
                runOnUiThread { appendToUi(chunk) }

                val segments = segmenter.processChunk(chunk)
                for (seg in segments) {
                    enqueueTtsSegment(seg)
                }
            }
        }

        override fun onTTSAudioReceived(audioFilePath: String) {
            // unused
        }

        override fun onError(errorMessage: String) {
            llmInFlight = false
            // Toast and onStopClicked both touch views — main thread only.
            runOnUiThread {
                Toast.makeText(this@MainActivity, errorMessage, Toast.LENGTH_LONG).show()
                onStopClicked(userInitiated = false)
            }
        }
    }, applicationContext)

    // Propagate the initial streaming-mode setting.
    cloudApiManager.setEnableStreaming(enableStreaming)
}
|
||||
|
||||
/**
 * Tears down recording, coroutines, native engines, the TTS AudioTrack, and
 * the video players. Each step is best-effort so one failure cannot block the
 * rest of the cleanup.
 *
 * Fix: the AudioTrack created in initTtsAndAudioTrack() was never released
 * (onStopClicked only pauses/flushes it), leaking a native audio output
 * handle; it is now released here.
 */
override fun onDestroy() {
    super.onDestroy()
    onStopClicked(userInitiated = false)
    ioScope.cancel()
    synchronized(nativeLock) {
        try {
            vad.release()
        } catch (_: Throwable) {
        }
        try {
            senseVoice?.deinitialize()
        } catch (_: Throwable) {
        }
    }
    try {
        tts?.release()
    } catch (_: Throwable) {
    }
    // Release the playback track; pause/flush alone keeps native resources alive.
    try {
        track?.release()
    } catch (_: Throwable) {
    }
    track = null
    try {
        videoPlayerManager?.release()
    } catch (_: Throwable) {
    }
}
|
||||
|
||||
/**
 * Starts a capture turn: initializes the microphone, opens a new trace turn,
 * resets the TTS/segmenter state, and launches the sample-processing loop.
 *
 * Fixes:
 *  - ensureAsrWorker() is now started here. It was defined but never called
 *    anywhere, so asrQueue.send() in processSamplesLoop() had no receiver on
 *    the rendezvous channel and would suspend forever.
 *  - audioRecord!! replaced with an explicit null guard.
 */
private fun onStartClicked() {
    if (isRecording) return

    if (!initMicrophone()) {
        Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show()
        return
    }
    // initMicrophone() returned true, so audioRecord should be set; guard anyway.
    val rec = audioRecord
    if (rec == null) {
        Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show()
        return
    }

    // Start a new trace turn
    currentTrace = TraceManager.getInstance().startNewTurn()
    currentTrace?.mark("turn_start")
    llmInFlight = false

    lastUiText = ""
    textView.text = ""

    // Reset TTS pipeline state for the new turn.
    ttsStopped.set(false)
    ttsPlaying.set(false)
    ttsTotalSamplesWritten = 0
    ttsQueue.clear()
    segmenter.reset()

    vad.reset()
    rec.startRecording()
    isRecording = true

    startButton.isEnabled = false
    stopButton.isEnabled = true

    // The ASR worker must be running before the capture loop sends segments.
    ensureAsrWorker()

    recordingJob?.cancel()
    recordingJob = ioScope.launch {
        processSamplesLoop()
    }
}
|
||||
|
||||
/**
 * Stops recording and playback, releasing the microphone and audio effects.
 * [userInitiated] distinguishes a user tap (which also closes the trace turn)
 * from internal cleanup calls.
 */
private fun onStopClicked(userInitiated: Boolean) {
    isRecording = false

    // Release the microphone; each step is best-effort.
    runCatching { audioRecord?.stop() }
    runCatching { audioRecord?.release() }
    audioRecord = null

    recordingJob?.cancel()
    recordingJob = null

    // Tell the TTS worker to stop and discard any pending segments.
    ttsStopped.set(true)
    ttsPlaying.set(false)
    ttsTotalSamplesWritten = 0
    ttsQueue.clear()
    // wake worker if waiting
    ttsQueue.offer(TtsQueueItem.End)

    runCatching {
        track?.pause()
        track?.flush()
    }
    runCatching { aec?.release() }
    runCatching { ns?.release() }
    aec = null
    ns = null
    startButton.isEnabled = true
    stopButton.isEnabled = false

    if (userInitiated) {
        TraceManager.getInstance().endTurn()
        currentTrace = null
    }
}
|
||||
|
||||
/** Loads the Silero VAD model shipped under assets/vad_model/ into [vad]. */
private fun initVadModel() {
    val sileroConfig = SileroVadModelConfig(
        model = "vad_model/silero_vad.onnx",
        threshold = 0.5F,
        minSilenceDuration = 0.25F,
        minSpeechDuration = 0.25F,
        windowSize = 512,
    )
    val config = VadModelConfig(
        sileroVadModelConfig = sileroConfig,
        sampleRate = sampleRateInHz,
        numThreads = 1,
        provider = "cpu",
    )
    vad = Vad(assetManager = application.assets, config = config)
}
|
||||
|
||||
/**
 * Copies the SenseVoice model files into internal storage, logs diagnostics
 * for the native library directory and model files, then loads the RKNN
 * engine into [senseVoice]. Throws IllegalStateException on any load failure.
 */
private fun initSenseVoiceModel() {
    Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")
    // Copy assets/sensevoice_models/* -> filesDir/sensevoice_models/*
    val modelDir = copySenseVoiceAssetsToInternal()
    val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath
    val embeddingPath = File(modelDir, "embedding.npy").absolutePath
    val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath

    // Quick diagnostics: native lib dir contents plus model file presence/size.
    try {
        val libDir = applicationInfo.nativeLibraryDir
        Log.i(TAG, "nativeLibraryDir=$libDir")
        try {
            val names = File(libDir).list()?.joinToString(", ") ?: "(empty)"
            Log.i(TAG, "nativeLibraryDir files: $names")
        } catch (t: Throwable) {
            Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}")
        }
    } catch (_: Throwable) {
    }
    Log.i(TAG, "SenseVoice model paths:")
    Log.i(TAG, " model=$modelPath exists=${File(modelPath).exists()} size=${File(modelPath).length()}")
    Log.i(TAG, " embedding=$embeddingPath exists=${File(embeddingPath).exists()} size=${File(embeddingPath).length()}")
    Log.i(TAG, " bpe=$bpePath exists=${File(bpePath).exists()} size=${File(bpePath).length()}")

    val startedAt = SystemClock.elapsedRealtime()
    val engine = try {
        SenseVoiceEngineRKNN(this)
    } catch (e: UnsatisfiedLinkError) {
        // Most common: libsensevoiceEngine.so not packaged/built, or dependent libs missing
        throw IllegalStateException("Load native libraries failed: ${e.message}", e)
    }

    val loaded = try {
        engine.loadModelDirectly(modelPath, embeddingPath, bpePath)
    } catch (t: Throwable) {
        throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t)
    }

    val elapsed = SystemClock.elapsedRealtime() - startedAt
    Log.i(TAG, "SenseVoice loadModelDirectly ok=$loaded costMs=$elapsed")
    if (!loaded) throw IllegalStateException("SenseVoiceEngineRKNN loadModelDirectly returned false")

    senseVoice = engine
}
|
||||
|
||||
/**
 * Loads the sherpa-onnx VITS Chinese TTS model and prepares a streaming
 * AudioTrack (PCM-float, mono, at the model's sample rate) for playback.
 *
 * Fix: AudioTrack.getMinBufferSize() can return AudioTrack.ERROR (-1) or
 * ERROR_BAD_VALUE (-2); the previous code passed that negative value straight
 * to the AudioTrack constructor, which throws. The result is now validated.
 */
private fun initTtsAndAudioTrack() {
    try {
        // Model directory layout:
        // assets/tts_model/sherpa-onnx-vits-zh-ll/{model.onnx,tokens.txt,lexicon.txt,...}
        val modelDir = "tts_model/sherpa-onnx-vits-zh-ll"
        val modelName = "model.onnx"
        val lexicon = "lexicon.txt"
        val dataDir = ""

        val ttsConfig = getOfflineTtsConfig(
            modelDir = modelDir,
            modelName = modelName,
            acousticModelName = "",
            vocoder = "",
            voices = "",
            lexicon = lexicon,
            dataDir = dataDir,
            dictDir = "",
            // Chinese text-normalization rules (fst files shipped with the model)
            ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst,$modelDir/new_heteronym.fst",
            ruleFars = "",
            numThreads = null,
            isKitten = false
        )
        tts = OfflineTts(assetManager = application.assets, config = ttsConfig)
    } catch (t: Throwable) {
        Log.e(TAG, "Init TTS failed: ${t.message}", t)
        tts = null
        runOnUiThread {
            Toast.makeText(
                this,
                "TTS 初始化失败:请确认 assets/tts_model/sherpa-onnx-vits-zh-ll/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst",
                Toast.LENGTH_LONG
            ).show()
        }
    }

    val t = tts ?: return
    val sr = t.sampleRate()
    val minBuf = AudioTrack.getMinBufferSize(
        sr,
        AudioFormat.CHANNEL_OUT_MONO,
        AudioFormat.ENCODING_PCM_FLOAT
    )
    if (minBuf <= 0) {
        // ERROR / ERROR_BAD_VALUE: the device rejected this format combination.
        Log.e(TAG, "AudioTrack.getMinBufferSize failed: $minBuf (sr=$sr)")
        return
    }
    val attr = AudioAttributes.Builder()
        .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
        .setUsage(AudioAttributes.USAGE_MEDIA)
        .build()
    val format = AudioFormat.Builder()
        .setEncoding(AudioFormat.ENCODING_PCM_FLOAT)
        .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
        .setSampleRate(sr)
        .build()
    track = AudioTrack(
        attr,
        format,
        minBuf,
        AudioTrack.MODE_STREAM,
        AudioManager.AUDIO_SESSION_ID_GENERATE
    )
    track?.play()
}
|
||||
|
||||
/** Returns true when [path] names a readable entry in the app's assets. */
private fun assetExists(path: String): Boolean =
    runCatching { application.assets.open(path).close() }.isSuccess
|
||||
|
||||
/**
 * Ensures the SenseVoice model files exist under filesDir/sensevoice_models/,
 * copying them from assets on first run. Files already present with non-zero
 * size are skipped.
 *
 * Fix: each file is now copied to a temporary sibling and renamed into place
 * only after the copy completes. Previously an interrupted copy left a
 * truncated file whose non-zero length made every later run skip it,
 * permanently corrupting the model directory.
 *
 * @return the destination directory.
 */
private fun copySenseVoiceAssetsToInternal(): File {
    val outDir = File(filesDir, "sensevoice_models")
    if (!outDir.exists()) outDir.mkdirs()

    val files = arrayOf(
        "am.mvn",
        "chn_jpn_yue_eng_ko_spectok.bpe.model",
        "embedding.npy",
        "sense-voice-encoder.rknn"
    )

    for (name in files) {
        val outFile = File(outDir, name)
        if (outFile.exists() && outFile.length() > 0) continue

        val tmpFile = File(outDir, "$name.tmp")
        application.assets.open("sensevoice_models/$name").use { input ->
            FileOutputStream(tmpFile).use { output ->
                input.copyTo(output)
            }
        }
        // Rename only after a complete copy; a crash mid-copy leaves only *.tmp,
        // which the skip-check above never mistakes for a valid model file.
        if (!tmpFile.renameTo(outFile)) {
            tmpFile.delete()
            throw IllegalStateException("Failed to move $tmpFile to $outFile")
        }
    }
    return outDir
}
|
||||
|
||||
/**
 * Creates the AudioRecord (16 kHz mono PCM16) and enables AEC/NS when the
 * device supports them.
 *
 * Fixes:
 *  - AudioRecord construction can succeed while leaving the recorder in
 *    STATE_UNINITIALIZED (mic busy, bad parameters); calling startRecording()
 *    on it later throws IllegalStateException. The state is now verified and
 *    the recorder released on failure.
 *  - getMinBufferSize() error codes are rejected instead of being used as a
 *    buffer size.
 *
 * @return true when the microphone is ready for startRecording().
 */
private fun initMicrophone(): Boolean {
    if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO)
        != PackageManager.PERMISSION_GRANTED
    ) {
        ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
        return false
    }

    val numBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
    if (numBytes <= 0) {
        // ERROR / ERROR_BAD_VALUE from getMinBufferSize.
        Log.e(TAG, "AudioRecord.getMinBufferSize failed: $numBytes")
        return false
    }
    val rec = AudioRecord(
        audioSource,
        sampleRateInHz,
        channelConfig,
        audioFormat,
        numBytes * 2
    )
    if (rec.state != AudioRecord.STATE_INITIALIZED) {
        Log.e(TAG, "AudioRecord failed to initialize (state=${rec.state})")
        rec.release()
        return false
    }
    audioRecord = rec

    val sessionId = rec.audioSessionId
    if (sessionId != 0) {
        if (AcousticEchoCanceler.isAvailable()) {
            aec = AcousticEchoCanceler.create(sessionId)?.apply {
                enabled = true
            }
            Log.i(TAG, "AEC enabled=${aec?.enabled}")
        } else {
            Log.w(TAG, "AEC not available on this device")
        }

        if (NoiseSuppressor.isAvailable()) {
            ns = NoiseSuppressor.create(sessionId)?.apply {
                enabled = true
            }
            Log.i(TAG, "NS enabled=${ns?.enabled}")
        } else {
            Log.w(TAG, "NS not available on this device")
        }
    }
    return true
}
|
||||
|
||||
/**
 * Main capture loop: reads 512-sample windows from the microphone, applies
 * RMS-driven automatic gain, runs Silero VAD on the gained audio, and
 * segments speech with a dual-threshold (hysteresis) state machine.
 * Completed segments go to [asrQueue]; VAD diagnostics are saved to CSV on
 * exit.
 */
private suspend fun processSamplesLoop() {
    // Avoid calling vad.front()/vad.pop() (native queue APIs) since it crashes on some builds.
    // Use vad.compute() and implement a simple VAD segmenter in Kotlin instead.
    val windowSize = 512
    val buffer = ShortArray(windowSize)
    // Dual-threshold hysteresis settings
    val startThreshold = 0.2f // probability needed to enter the speech state
    val endThreshold = 0.15f // probability below which silence is counted
    val minSilenceSamples = (0.5f * sampleRateInHz).toInt()
    val minSpeechSamples = (0.1f * sampleRateInHz).toInt()
    val maxSpeechSamples = (5.0f * sampleRateInHz).toInt()

    // VAD diagnostics collected for offline analysis/plotting
    val vadProbabilities = mutableListOf<Float>()
    val vadTimestamps = mutableListOf<Long>()
    val vadRMSValues = mutableListOf<Float>()
    val vadSmoothedRMSValues = mutableListOf<Float>()

    // Exponential smoothing of the RMS level
    var smoothedRms = 0f
    val alpha = 0.8f // smoothing coefficient

    var inSpeech = false
    var silenceSamples = 0

    var speechBuf = FloatArray(0)
    var speechLen = 0
    var processedSpeechBuf = FloatArray(0)
    var processedSpeechLen = 0

    // Appends one window to both the raw and the gain-processed segment
    // buffers, growing them geometrically and capping at maxSpeechSamples.
    fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) {
        // raw audio
        val needed = speechLen + chunk.size
        if (speechBuf.size < needed) {
            var newCap = maxOf(needed, maxOf(1024, speechBuf.size * 2))
            if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
            val n = FloatArray(newCap)
            if (speechLen > 0) System.arraycopy(speechBuf, 0, n, 0, speechLen)
            speechBuf = n
        }
        val copyN = minOf(chunk.size, max(0, maxSpeechSamples - speechLen))
        if (copyN > 0) {
            System.arraycopy(chunk, 0, speechBuf, speechLen, copyN)
            speechLen += copyN
        }

        // gain-processed audio
        val processedNeeded = processedSpeechLen + processedChunk.size
        if (processedSpeechBuf.size < processedNeeded) {
            var newCap = maxOf(processedNeeded, maxOf(1024, processedSpeechBuf.size * 2))
            if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
            val n = FloatArray(newCap)
            if (processedSpeechLen > 0) System.arraycopy(processedSpeechBuf, 0, n, 0, processedSpeechLen)
            processedSpeechBuf = n
        }
        val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen))
        if (processedCopyN > 0) {
            System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN)
            processedSpeechLen += processedCopyN
        }
    }

    // Closes the current segment: drops it when too short or when TTS/LLM is
    // active (echo guard), otherwise hands it to the ASR queue.
    suspend fun finalizeSegmentIfAny() {
        if (speechLen < minSpeechSamples) {
            speechLen = 0
            processedSpeechLen = 0
            inSpeech = false
            silenceSamples = 0
            return
        }
        // Drop the segment while TTS is playing or an LLM call is in flight,
        // to avoid re-capturing our own synthesized speech (echo).
        if (ttsPlaying.get() || llmInFlight) {
            speechLen = 0
            processedSpeechLen = 0
            inSpeech = false
            silenceSamples = 0
            return
        }
        val originalSeg = speechBuf.copyOf(speechLen)
        val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen)
        speechLen = 0
        processedSpeechLen = 0
        inSpeech = false
        silenceSamples = 0

        // Queue the segment for asynchronous ASR processing.
        // NOTE(review): asrQueue is declared as Channel<Pair<FloatArray, TraceSession?>>
        // but two FloatArrays are sent here — the declaration and this call
        // disagree; confirm which is intended.
        asrQueue.send(Pair(originalSeg, processedSeg))
    }

    while (isRecording && ioScope.coroutineContext.isActive) {
        val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break
        if (ret <= 0) continue
        if (ret != windowSize) continue
        // Convert PCM16 to floats in [-1, 1).
        val chunk = FloatArray(ret) { buffer[it] / 32768.0f }

        // RMS (root mean square) of the current window
        val rms = calculateRMS(chunk)

        // Exponential smoothing (seeded with the first non-zero reading)
        smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms

        // Dynamic gain towards a target RMS of 0.1 (about -20 dB)
        val targetRMS = 0.1f
        var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f

        // Clamp the gain to avoid clipping from over-amplification
        gainFactor = gainFactor.coerceIn(0.1f, 10.0f)

        // Apply the gain, hard-limiting to [-1, 1]
        val processedChunk = FloatArray(chunk.size) {
            val value = chunk[it] * gainFactor
            if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value
        }

        // Run VAD on the gain-processed audio
        val prob = synchronized(nativeLock) { vad.compute(processedChunk) }

        // Record probability, timestamp, raw RMS and smoothed RMS
        vadProbabilities.add(prob)
        vadTimestamps.add(System.currentTimeMillis())
        vadRMSValues.add(rms)
        vadSmoothedRMSValues.add(smoothedRms)

        // Dual-threshold state machine
        if (!inSpeech && prob >= startThreshold) {
            // enter the speech state
            inSpeech = true
            silenceSamples = 0
            appendSpeech(chunk, processedChunk)
        } else if (inSpeech && prob <= endThreshold) {
            // count silence samples
            silenceSamples += ret
            if (silenceSamples >= minSilenceSamples) {
                // leave the speech state
                finalizeSegmentIfAny()
            } else {
                // keep the trailing audio
                appendSpeech(chunk, processedChunk)
            }
        } else if (inSpeech) {
            // still speaking: keep accumulating audio
            appendSpeech(chunk, processedChunk)
            silenceSamples = 0 // reset the silence counter

            if (speechLen >= maxSpeechSamples) {
                finalizeSegmentIfAny()
            }
        }
        // Not in speech and below the start threshold: nothing to do.

        // Time-based forced segmentation (keeps the TTS first packet fast when
        // the LLM emits long text without punctuation).
        val forced = segmenter.maybeForceByTime()
        for (seg in forced) enqueueTtsSegment(seg)
    }

    // flush last partial segment
    finalizeSegmentIfAny()

    // Persist the VAD diagnostics to a CSV file.
    saveVadData(vadTimestamps, vadProbabilities, vadRMSValues, vadSmoothedRMSValues)
}
|
||||
|
||||
/**
 * Writes the collected VAD diagnostics (timestamp, probability, raw RMS,
 * smoothed RMS) as a CSV under filesDir/vad_data/ for offline analysis and
 * plotting. Failures are logged, never thrown.
 */
private fun saveVadData(timestamps: List<Long>, probabilities: List<Float>, rmsValues: List<Float>, smoothedRmsValues: List<Float>) {
    try {
        val vadDataDir = File(filesDir, "vad_data")
        if (!vadDataDir.exists()) {
            vadDataDir.mkdirs()
        }

        // Unique, timestamped file name.
        val outputFile = File(vadDataDir, "vad_data_${System.currentTimeMillis()}.csv")

        FileOutputStream(outputFile).use { fos ->
            // header row
            fos.write("timestamp,probability,rms,smoothed_rms\n".toByteArray())
            for (i in timestamps.indices) {
                fos.write("${timestamps[i]},${probabilities[i]},${rmsValues[i]},${smoothedRmsValues[i]}\n".toByteArray())
            }
        }

        Log.d(TAG, "Saved VAD data to: ${outputFile.absolutePath}")
    } catch (e: Exception) {
        Log.e(TAG, "Error saving VAD data: ${e.message}")
    }
}
|
||||
|
||||
/**
 * Strips SenseVoice meta tokens (e.g. <|zh|>, <|NEUTRAL|>, <|Speech|>,
 * <|woitn|>) and stray angle-bracket characters, then collapses runs of
 * whitespace to single spaces.
 */
private fun removeTokens(text: String): String =
    text.replace(Regex("<\\|[^>]+\\|>"), "")
        .replace(Regex("[>>≥≫]"), "")
        .trim()
        .replace(Regex("\\s+"), " ")
|
||||
|
||||
/**
 * Queues one text segment for synthesis after stripping trailing punctuation
 * (ASCII and fullwidth), then makes sure the TTS worker is running.
 */
private fun enqueueTtsSegment(seg: String) {
    // Drop sentence-final punctuation.
    val trimmed = seg.trimEnd('.', '。', '!', '!', '?', '?', ',', ',', ';', ';', ':', ':')

    currentTrace?.markTtsRequestEnqueued()
    ttsQueue.offer(TtsQueueItem.Segment(trimmed))
    ensureTtsWorker()
}
|
||||
|
||||
/** Launches the TTS worker coroutine unless one is already running. */
private fun ensureTtsWorker() {
    val acquired = ttsWorkerRunning.compareAndSet(false, true)
    if (!acquired) return
    ioScope.launch {
        try {
            runTtsWorker()
        } finally {
            // Allow a fresh worker to start after this one exits.
            ttsWorkerRunning.set(false)
        }
    }
}
|
||||
|
||||
/** Launches the ASR worker coroutine unless one is already running. */
private fun ensureAsrWorker() {
    val acquired = asrWorkerRunning.compareAndSet(false, true)
    if (!acquired) return
    ioScope.launch {
        try {
            runAsrWorker()
        } finally {
            // Allow a fresh worker to start after this one exits.
            asrWorkerRunning.set(false)
        }
    }
}
|
||||
|
||||
/**
 * Drains [ttsQueue] on a background thread: each Segment is synthesized with
 * the offline TTS and its PCM streamed to the AudioTrack; an End item waits
 * for playback to finish, then closes the trace turn and exits. The worker is
 * restarted per turn by ensureTtsWorker().
 */
private fun runTtsWorker() {
    val t = tts ?: return
    val audioTrack = track ?: return

    var firstAudioMarked = false
    var isFirstSegment = true
    while (true) {
        // Blocking take; onStopClicked offers an End item to wake us up.
        val item = ttsQueue.take()
        if (ttsStopped.get()) break

        when (item) {
            is TtsQueueItem.Segment -> {
                ttsPlaying.set(true)
                runOnUiThread { videoPlayerManager?.setSpeaking(true) }
                val trace = currentTrace
                trace?.markTtsSynthesisStart()
                Log.d(TAG, "TTS started: processing segment '${item.text}'")
                runOnUiThread {
                    appendToUi("\n[TTS] 开始合成...\n")
                }

                val startMs = System.currentTimeMillis()
                var firstPcmMarked = false

                // Reset the track before the first segment of a turn so stale
                // buffered audio from the previous turn is not replayed.
                if (isFirstSegment) {
                    try {
                        audioTrack.pause()
                        audioTrack.flush()
                        audioTrack.play()
                    } catch (_: Throwable) {
                    }
                    isFirstSegment = false
                }

                t.generateWithCallback(
                    text = item.text,
                    sid = 2, // speaker id; change here to switch voices
                    speed = 1.0f
                ) { samples ->
                    // NOTE(review): the callback return value appears to be a
                    // continue flag (0 aborts generation, 1 continues) —
                    // confirm against the sherpa-onnx OfflineTts API.
                    if (ttsStopped.get()) return@generateWithCallback 0
                    if (!firstPcmMarked && samples.isNotEmpty()) {
                        firstPcmMarked = true
                        trace?.markTtsFirstPcmReady()
                    }
                    if (!firstAudioMarked && samples.isNotEmpty()) {
                        firstAudioMarked = true
                        trace?.markTtsFirstAudioPlay()
                    }
                    audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING)
                    ttsTotalSamplesWritten += samples.size
                    1
                }

                val ttsMs = System.currentTimeMillis() - startMs
                trace?.addDuration("tts_segment_ms_total", ttsMs)
            }

            TtsQueueItem.End -> {
                // Drop any queued ASR segments: they were likely captured while
                // TTS was playing and would be echoes of our own speech.
                while (asrQueue.tryReceive().isSuccess) { }

                waitForPlaybackComplete(audioTrack)
                val ttsCompleteTime = System.currentTimeMillis()

                // UI update on the main thread
                runOnUiThread {
                    appendToUi("\n[LOG] TTS completed at: ${ttsCompleteTime}\n")
                }

                ttsPlaying.set(false)
                runOnUiThread { videoPlayerManager?.setSpeaking(false) }
                ttsTotalSamplesWritten = 0
                isFirstSegment = true
                currentTrace?.markTtsDone()
                TraceManager.getInstance().endTurn()
                currentTrace = null
                break
            }
        }
    }
}
|
||||
|
||||
/**
 * Blocks until the AudioTrack has played everything written so far (with a
 * duration-based timeout), then waits a short grace period so device-side
 * buffers fully drain.
 *
 * Fix: the trailing 1-second grace sleep used to run unconditionally, even
 * when playback had been stopped via ttsStopped; a stop now returns
 * immediately and no longer stalls teardown.
 */
private fun waitForPlaybackComplete(audioTrack: AudioTrack) {
    val totalSamples = ttsTotalSamplesWritten
    if (totalSamples <= 0) return

    val sampleRate = audioTrack.sampleRate
    // Expected playback time plus 2 s of slack.
    val timeoutMs = (totalSamples * 1000 / sampleRate) + 2000
    val startTime = System.currentTimeMillis()

    while (true) {
        if (ttsStopped.get()) return

        // NOTE: playbackHeadPosition is a 32-bit frame counter that wraps
        // after ~2^31 frames; fine for ordinary utterance lengths.
        val playbackPos = audioTrack.playbackHeadPosition.toLong()
        if (playbackPos >= totalSamples) {
            break
        }

        if (System.currentTimeMillis() - startTime > timeoutMs) {
            Log.w(TAG, "waitForPlaybackComplete timeout, pos=$playbackPos, total=$totalSamples")
            break
        }

        Thread.sleep(20)
    }
    // Grace period so all device-side buffers are emptied.
    Thread.sleep(1000)
}
|
||||
|
||||
/**
 * Consumes speech segments from [asrQueue]: runs SenseVoice ASR, filters the
 * transcript, shows it, and forwards it to the LLM. Only one LLM request is
 * allowed in flight at a time, and segments arriving during TTS playback are
 * discarded (echo guard).
 */
private suspend fun runAsrWorker() {
    while (ioScope.coroutineContext.isActive) {
        // NOTE(review): asrQueue is declared as Channel<Pair<FloatArray, TraceSession?>>
        // yet processSamplesLoop() sends two FloatArrays; the destructured
        // `trace` here may not actually be a TraceSession — confirm the
        // intended channel element type.
        val (seg, trace) = try {
            asrQueue.receive()
        } catch (_: Throwable) {
            break
        }

        // Allow only one LLM request in flight to avoid pile-ups/races;
        // also skip ASR while TTS is playing so we don't transcribe ourselves.
        if (llmInFlight || ttsPlaying.get()) continue

        trace?.markASRStart()
        Log.d(TAG, "ASR started: processing audio segment")
        withContext(Dispatchers.Main) {
            appendToUi("\n[ASR] 开始识别...\n")
        }
        // Native engine is not re-entrant with VAD; share the same lock.
        val raw = synchronized(nativeLock) {
            val e = senseVoice
            if (e == null || !e.isInitialized) "" else e.transcribeBuffer(seg)
        }
        val text = removeTokens(raw)

        // Transcript filtering
        if (text.isBlank()) continue
        // Drop a bare single-character "i" (common ASR artifact).
        if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
            Log.d(TAG, "ASR segment skipped: single 'i'")
            continue
        }
        // Drop implausibly long transcripts (over 50 characters).
        if (text.length > 50) {
            Log.d(TAG, "ASR segment skipped: too long (${text.length} chars)")
            continue
        }

        trace?.markASREnd()

        withContext(Dispatchers.Main) {
            appendToUi("\n\n[ASR] ${text}\n")
        }

        trace?.markRecordingDone()
        trace?.markLlmResponseReceived()

        if (BuildConfig.LLM_API_KEY.isBlank()) {
            withContext(Dispatchers.Main) {
                Toast.makeText(
                    this@MainActivity,
                    "未配置 LLM_API_KEY(在 local.properties 或 gradle.properties 里设置)",
                    Toast.LENGTH_LONG
                ).show()
            }
            continue
        }

        llmInFlight = true
        cloudApiManager.callLLM(text)
    }
}
|
||||
|
||||
/**
 * Appends [s] to the accumulated transcript and displays it. Writes to
 * [textView], so it must be invoked on the main thread.
 */
private fun appendToUi(s: String) {
    val updated = lastUiText + s
    lastUiText = updated
    textView.text = updated
}
|
||||
}
|
||||
@@ -20,6 +20,8 @@ import android.widget.Toast
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.digitalperson.cloud.CloudApiManager
|
||||
import com.digitalperson.player.VideoPlayerManager
|
||||
import com.google.android.exoplayer2.ui.PlayerView
|
||||
import com.digitalperson.engine.SenseVoiceEngineRKNN
|
||||
import com.digitalperson.metrics.TraceManager
|
||||
import com.digitalperson.metrics.TraceSession
|
||||
@@ -33,6 +35,7 @@ import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.Job
|
||||
import kotlinx.coroutines.SupervisorJob
|
||||
import kotlinx.coroutines.cancel
|
||||
import kotlinx.coroutines.channels.Channel
|
||||
import kotlinx.coroutines.isActive
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.withContext
|
||||
@@ -66,6 +69,8 @@ class MainActivity : AppCompatActivity() {
|
||||
private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
|
||||
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
|
||||
|
||||
|
||||
|
||||
@Volatile
|
||||
private var isRecording: Boolean = false
|
||||
|
||||
@@ -74,7 +79,11 @@ class MainActivity : AppCompatActivity() {
|
||||
private val nativeLock = Any()
|
||||
|
||||
private lateinit var cloudApiManager: CloudApiManager
|
||||
private val segmenter = StreamingTextSegmenter()
|
||||
private var videoPlayerManager: VideoPlayerManager? = null
|
||||
private val segmenter = StreamingTextSegmenter(
|
||||
maxLen = 30,
|
||||
maxWaitMs = 600
|
||||
)
|
||||
|
||||
private sealed class TtsQueueItem {
|
||||
data class Segment(val text: String) : TtsQueueItem()
|
||||
@@ -84,11 +93,18 @@ class MainActivity : AppCompatActivity() {
|
||||
private val ttsQueue = LinkedBlockingQueue<TtsQueueItem>()
|
||||
private val ttsStopped = AtomicBoolean(false)
|
||||
private val ttsWorkerRunning = AtomicBoolean(false)
|
||||
private val ttsPlaying = AtomicBoolean(false)
|
||||
@Volatile private var ttsTotalSamplesWritten: Long = 0
|
||||
|
||||
private var currentTrace: TraceSession? = null
|
||||
|
||||
private var lastUiText: String = ""
|
||||
@Volatile private var llmInFlight: Boolean = false
|
||||
private var enableStreaming = false // 默认禁用流式输出
|
||||
|
||||
// ASR 队列和工作器
|
||||
private val asrQueue = Channel<Pair<FloatArray, FloatArray>>(capacity = Channel.UNLIMITED)
|
||||
private val asrWorkerRunning = AtomicBoolean(false)
|
||||
|
||||
override fun onRequestPermissionsResult(
|
||||
requestCode: Int,
|
||||
@@ -97,8 +113,8 @@ class MainActivity : AppCompatActivity() {
|
||||
) {
|
||||
super.onRequestPermissionsResult(requestCode, permissions, grantResults)
|
||||
val ok = requestCode == REQUEST_RECORD_AUDIO_PERMISSION &&
|
||||
grantResults.isNotEmpty() &&
|
||||
grantResults[0] == PackageManager.PERMISSION_GRANTED
|
||||
grantResults.isNotEmpty() &&
|
||||
grantResults[0] == PackageManager.PERMISSION_GRANTED
|
||||
if (!ok) {
|
||||
Log.e(TAG, "Audio record is disallowed")
|
||||
finish()
|
||||
@@ -109,6 +125,17 @@ class MainActivity : AppCompatActivity() {
|
||||
super.onCreate(savedInstanceState)
|
||||
setContentView(R.layout.activity_main)
|
||||
|
||||
// 初始化双播放器管理器(silent 与 speaking 两个叠加的 PlayerView)
|
||||
try {
|
||||
val silentPv = findViewById<PlayerView>(R.id.player_view_silent)
|
||||
val speakingPv = findViewById<PlayerView>(R.id.player_view_speaking)
|
||||
videoPlayerManager = VideoPlayerManager(this, silentPv, speakingPv)
|
||||
// 默认 AI 未说话
|
||||
videoPlayerManager?.setSpeaking(false)
|
||||
} catch (e: Exception) {
|
||||
Log.w(TAG, "PlayerViews not found or init failed: ${e.message}")
|
||||
}
|
||||
|
||||
ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
|
||||
|
||||
startButton = findViewById(R.id.start_button)
|
||||
@@ -119,6 +146,19 @@ class MainActivity : AppCompatActivity() {
|
||||
startButton.setOnClickListener { onStartClicked() }
|
||||
stopButton.setOnClickListener { onStopClicked(userInitiated = true) }
|
||||
|
||||
// 初始化流式输出开关
|
||||
try {
|
||||
val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
|
||||
streamingSwitch.isChecked = enableStreaming
|
||||
streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
|
||||
enableStreaming = isChecked
|
||||
cloudApiManager.setEnableStreaming(isChecked)
|
||||
Toast.makeText(this, "流式输出已${if (isChecked) "启用" else "禁用"}", Toast.LENGTH_SHORT).show()
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
Log.w(TAG, "Streaming switch not found in layout: ${e.message}")
|
||||
}
|
||||
|
||||
// 避免 UI 线程重初始化导致 ANR:在后台初始化模型与 AudioTrack
|
||||
startButton.isEnabled = false
|
||||
stopButton.isEnabled = false
|
||||
@@ -151,30 +191,45 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
}
|
||||
|
||||
cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener {
|
||||
cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener {
|
||||
private var llmFirstChunkMarked = false
|
||||
|
||||
override fun onLLMResponseReceived(response: String) {
|
||||
currentTrace?.markLlmDone()
|
||||
llmInFlight = false
|
||||
// flush remaining buffer into TTS
|
||||
for (seg in segmenter.flush()) {
|
||||
enqueueTtsSegment(seg)
|
||||
|
||||
// 根据流式输出模式处理响应
|
||||
if (enableStreaming) {
|
||||
// 启用流式输出时,刷新剩余缓冲区
|
||||
for (seg in segmenter.flush()) {
|
||||
enqueueTtsSegment(seg)
|
||||
}
|
||||
// 发送队列结束信号
|
||||
ttsQueue.offer(TtsQueueItem.End)
|
||||
} else {
|
||||
runOnUiThread {
|
||||
appendToUi("${response}\n")
|
||||
}
|
||||
// 禁用流式输出时,直接使用整段文本进行TTS
|
||||
enqueueTtsSegment(response)
|
||||
// 发送队列结束信号
|
||||
ttsQueue.offer(TtsQueueItem.End)
|
||||
}
|
||||
// signal queue end (no more segments after this)
|
||||
ttsQueue.offer(TtsQueueItem.End)
|
||||
}
|
||||
|
||||
override fun onLLMStreamingChunkReceived(chunk: String) {
|
||||
if (!llmFirstChunkMarked) {
|
||||
llmFirstChunkMarked = true
|
||||
currentTrace?.markLlmFirstChunk()
|
||||
}
|
||||
appendToUi(chunk)
|
||||
// 启用流式输出时,处理流式chunk
|
||||
if (enableStreaming) {
|
||||
if (!llmFirstChunkMarked) {
|
||||
llmFirstChunkMarked = true
|
||||
currentTrace?.markLlmFirstChunk()
|
||||
}
|
||||
appendToUi(chunk)
|
||||
|
||||
val segments = segmenter.processChunk(chunk)
|
||||
for (seg in segments) {
|
||||
enqueueTtsSegment(seg)
|
||||
val segments = segmenter.processChunk(chunk)
|
||||
for (seg in segments) {
|
||||
enqueueTtsSegment(seg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -187,7 +242,14 @@ class MainActivity : AppCompatActivity() {
|
||||
Toast.makeText(this@MainActivity, errorMessage, Toast.LENGTH_LONG).show()
|
||||
onStopClicked(userInitiated = false)
|
||||
}
|
||||
})
|
||||
}, applicationContext)
|
||||
|
||||
// 设置流式输出模式
|
||||
cloudApiManager.setEnableStreaming(enableStreaming)
|
||||
|
||||
// 预先启动ASR worker
|
||||
Log.d(TAG, "Pre-starting ASR worker")
|
||||
ensureAsrWorker()
|
||||
}
|
||||
|
||||
override fun onDestroy() {
|
||||
@@ -208,10 +270,18 @@ class MainActivity : AppCompatActivity() {
|
||||
tts?.release()
|
||||
} catch (_: Throwable) {
|
||||
}
|
||||
try {
|
||||
videoPlayerManager?.release()
|
||||
} catch (_: Throwable) {
|
||||
}
|
||||
}
|
||||
|
||||
private fun onStartClicked() {
|
||||
if (isRecording) return
|
||||
Log.d(TAG, "onStartClicked called")
|
||||
if (isRecording) {
|
||||
Log.d(TAG, "Already recording, returning")
|
||||
return
|
||||
}
|
||||
|
||||
if (!initMicrophone()) {
|
||||
Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show()
|
||||
@@ -227,6 +297,8 @@ class MainActivity : AppCompatActivity() {
|
||||
textView.text = ""
|
||||
|
||||
ttsStopped.set(false)
|
||||
ttsPlaying.set(false)
|
||||
ttsTotalSamplesWritten = 0
|
||||
ttsQueue.clear()
|
||||
segmenter.reset()
|
||||
|
||||
@@ -237,10 +309,12 @@ class MainActivity : AppCompatActivity() {
|
||||
startButton.isEnabled = false
|
||||
stopButton.isEnabled = true
|
||||
|
||||
Log.d(TAG, "Starting processSamplesLoop coroutine")
|
||||
recordingJob?.cancel()
|
||||
recordingJob = ioScope.launch {
|
||||
processSamplesLoop()
|
||||
}
|
||||
Log.d(TAG, "onStartClicked completed")
|
||||
}
|
||||
|
||||
private fun onStopClicked(userInitiated: Boolean) {
|
||||
@@ -259,6 +333,8 @@ class MainActivity : AppCompatActivity() {
|
||||
recordingJob = null
|
||||
|
||||
ttsStopped.set(true)
|
||||
ttsPlaying.set(false)
|
||||
ttsTotalSamplesWritten = 0
|
||||
ttsQueue.clear()
|
||||
// wake worker if waiting
|
||||
ttsQueue.offer(TtsQueueItem.End)
|
||||
@@ -480,22 +556,43 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
|
||||
private suspend fun processSamplesLoop() {
|
||||
Log.d(TAG, "processSamplesLoop started")
|
||||
// Avoid calling vad.front()/vad.pop() (native queue APIs) since it crashes on some builds.
|
||||
// Use vad.compute() and implement a simple VAD segmenter in Kotlin instead.
|
||||
val windowSize = 512
|
||||
val buffer = ShortArray(windowSize)
|
||||
val threshold = 0.5f
|
||||
val minSilenceSamples = (0.25f * sampleRateInHz).toInt()
|
||||
val minSpeechSamples = (0.25f * sampleRateInHz).toInt()
|
||||
// 双阈值设置
|
||||
val startThreshold = 0.2f // 进入语音的阈值
|
||||
val endThreshold = 0.15f // 退出语音的阈值
|
||||
val minSilenceSamples = (0.5f * sampleRateInHz).toInt()
|
||||
val minSpeechSamples = (0.1f * sampleRateInHz).toInt()
|
||||
val maxSpeechSamples = (5.0f * sampleRateInHz).toInt()
|
||||
|
||||
Log.d(TAG, "VAD thresholds: start=$startThreshold, end=$endThreshold, minSilenceSamples=$minSilenceSamples, minSpeechSamples=$minSpeechSamples")
|
||||
|
||||
// VAD 概率数据记录
|
||||
val vadProbabilities = mutableListOf<Float>()
|
||||
val vadTimestamps = mutableListOf<Long>()
|
||||
val vadRMSValues = mutableListOf<Float>()
|
||||
val vadSmoothedRMSValues = mutableListOf<Float>()
|
||||
|
||||
// 指数平滑相关变量
|
||||
var smoothedRms = 0f
|
||||
val alpha = 0.8f // 平滑系数
|
||||
|
||||
var inSpeech = false
|
||||
var silenceSamples = 0
|
||||
|
||||
var speechBuf = FloatArray(0)
|
||||
var speechLen = 0
|
||||
var processedSpeechBuf = FloatArray(0)
|
||||
var processedSpeechLen = 0
|
||||
|
||||
fun appendSpeech(chunk: FloatArray) {
|
||||
var loopCount = 0
|
||||
var vadComputeCount = 0
|
||||
|
||||
fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) {
|
||||
// 保存原始音频
|
||||
val needed = speechLen + chunk.size
|
||||
if (speechBuf.size < needed) {
|
||||
var newCap = maxOf(needed, maxOf(1024, speechBuf.size * 2))
|
||||
@@ -509,85 +606,152 @@ class MainActivity : AppCompatActivity() {
|
||||
System.arraycopy(chunk, 0, speechBuf, speechLen, copyN)
|
||||
speechLen += copyN
|
||||
}
|
||||
|
||||
// 保存增益后的音频
|
||||
val processedNeeded = processedSpeechLen + processedChunk.size
|
||||
if (processedSpeechBuf.size < processedNeeded) {
|
||||
var newCap = maxOf(processedNeeded, maxOf(1024, processedSpeechBuf.size * 2))
|
||||
if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
|
||||
val n = FloatArray(newCap)
|
||||
if (processedSpeechLen > 0) System.arraycopy(processedSpeechBuf, 0, n, 0, processedSpeechLen)
|
||||
processedSpeechBuf = n
|
||||
}
|
||||
val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen))
|
||||
if (processedCopyN > 0) {
|
||||
System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN)
|
||||
processedSpeechLen += processedCopyN
|
||||
}
|
||||
}
|
||||
|
||||
suspend fun finalizeSegmentIfAny() {
|
||||
Log.d(TAG, "finalizeSegmentIfAny called: speechLen=$speechLen, minSpeechSamples=$minSpeechSamples, ttsPlaying=${ttsPlaying.get()}, llmInFlight=$llmInFlight")
|
||||
|
||||
if (speechLen < minSpeechSamples) {
|
||||
Log.d(TAG, "finalizeSegmentIfAny: speech too short, discarding")
|
||||
speechLen = 0
|
||||
processedSpeechLen = 0
|
||||
inSpeech = false
|
||||
silenceSamples = 0
|
||||
return
|
||||
}
|
||||
|
||||
val seg = speechBuf.copyOf(speechLen)
|
||||
// ✅ 新增:如果 TTS 正在播放或 LLM 请求中,丢弃此段(避免回声)
|
||||
if (ttsPlaying.get() || llmInFlight) {
|
||||
Log.d(TAG, "finalizeSegmentIfAny: TTS playing or LLM in flight, discarding")
|
||||
speechLen = 0
|
||||
processedSpeechLen = 0
|
||||
inSpeech = false
|
||||
silenceSamples = 0
|
||||
return
|
||||
}
|
||||
val originalSeg = speechBuf.copyOf(speechLen)
|
||||
val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen)
|
||||
speechLen = 0
|
||||
processedSpeechLen = 0
|
||||
inSpeech = false
|
||||
silenceSamples = 0
|
||||
|
||||
// 每次只允许一个 LLM 请求在飞,避免堆积导致卡死/竞态
|
||||
if (llmInFlight) return
|
||||
|
||||
val trace = currentTrace
|
||||
trace?.markASRStart()
|
||||
val raw = synchronized(nativeLock) {
|
||||
val e = senseVoice
|
||||
if (e == null || !e.isInitialized) "" else e.transcribeBuffer(seg)
|
||||
}
|
||||
val text = removeTokens(raw)
|
||||
if (text.isBlank()) return
|
||||
trace?.markASREnd()
|
||||
if (text.isBlank()) return
|
||||
|
||||
withContext(Dispatchers.Main) {
|
||||
appendToUi("\n\n[ASR] ${text}\n")
|
||||
}
|
||||
|
||||
trace?.markRecordingDone()
|
||||
trace?.markLlmResponseReceived()
|
||||
|
||||
if (BuildConfig.LLM_API_KEY.isBlank()) {
|
||||
withContext(Dispatchers.Main) {
|
||||
Toast.makeText(
|
||||
this@MainActivity,
|
||||
"未配置 LLM_API_KEY(在 local.properties 或 gradle.properties 里设置)",
|
||||
Toast.LENGTH_LONG
|
||||
).show()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
llmInFlight = true
|
||||
cloudApiManager.callLLM(text)
|
||||
// 将语音段加入 ASR 处理队列,异步处理
|
||||
Log.d(TAG, "Sending audio segment to ASR queue, size: ${processedSeg.size}")
|
||||
asrQueue.send(Pair(originalSeg, processedSeg))
|
||||
Log.d(TAG, "Calling ensureAsrWorker")
|
||||
ensureAsrWorker()
|
||||
}
|
||||
|
||||
while (isRecording && ioScope.coroutineContext.isActive) {
|
||||
loopCount++
|
||||
if (loopCount % 100 == 0) {
|
||||
Log.d(TAG, "processSamplesLoop running, loopCount=$loopCount, ttsPlaying=${ttsPlaying.get()}")
|
||||
}
|
||||
// 如果TTS正在播放,跳过VAD处理,避免检测到回声
|
||||
if (ttsPlaying.get()) {
|
||||
// 如果正在语音状态,立即结束它
|
||||
if (inSpeech) {
|
||||
Log.d(TAG, "TTS playing, resetting VAD state")
|
||||
inSpeech = false
|
||||
silenceSamples = 0
|
||||
speechLen = 0
|
||||
processedSpeechLen = 0
|
||||
}
|
||||
// 读取并丢弃音频数据,保持录音状态
|
||||
val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break
|
||||
if (ret <= 0) continue
|
||||
continue
|
||||
}
|
||||
|
||||
val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break
|
||||
if (ret <= 0) continue
|
||||
if (ret != windowSize) continue
|
||||
|
||||
// 在 processSamplesLoop 方法中
|
||||
val chunk = FloatArray(ret) { buffer[it] / 32768.0f }
|
||||
val prob = synchronized(nativeLock) { vad.compute(chunk) }
|
||||
|
||||
if (prob >= threshold) {
|
||||
if (!inSpeech) {
|
||||
inSpeech = true
|
||||
silenceSamples = 0
|
||||
// 计算当前音频的RMS值(均方根)
|
||||
val rms = calculateRMS(chunk)
|
||||
|
||||
// 应用指数平滑
|
||||
smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms
|
||||
|
||||
// 动态调整增益因子,目标RMS设为0.1(约-20dB)
|
||||
val targetRMS = 0.1f
|
||||
var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f
|
||||
|
||||
// 设置增益的上下限,避免过度增益导致削波
|
||||
gainFactor = gainFactor.coerceIn(0.1f, 10.0f)
|
||||
|
||||
// 应用增益因子
|
||||
val processedChunk = FloatArray(chunk.size) {
|
||||
val value = chunk[it] * gainFactor
|
||||
// 限制音量范围,避免削波
|
||||
if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value
|
||||
}
|
||||
|
||||
// 使用处理后的音频数据
|
||||
val prob = synchronized(nativeLock) { vad.compute(processedChunk) }
|
||||
vadComputeCount++
|
||||
|
||||
// 记录VAD概率、时间戳、原始RMS值和平滑后的RMS值
|
||||
vadProbabilities.add(prob)
|
||||
vadTimestamps.add(System.currentTimeMillis())
|
||||
vadRMSValues.add(rms)
|
||||
vadSmoothedRMSValues.add(smoothedRms)
|
||||
|
||||
// 每100次循环输出一次VAD概率
|
||||
if (vadComputeCount % 100 == 0) {
|
||||
Log.d(TAG, "VAD prob=$prob, inSpeech=$inSpeech, rms=$rms, smoothedRms=$smoothedRms")
|
||||
}
|
||||
|
||||
// 双阈值状态机逻辑
|
||||
if (!inSpeech && prob >= startThreshold) {
|
||||
// 进入语音状态
|
||||
inSpeech = true
|
||||
silenceSamples = 0
|
||||
appendSpeech(chunk, processedChunk)
|
||||
Log.d(TAG, "VAD: Entered speech state, prob=$prob, speechLen=$speechLen")
|
||||
} else if (inSpeech && prob <= endThreshold) {
|
||||
// 开始计数静音样本
|
||||
silenceSamples += ret
|
||||
if (silenceSamples >= minSilenceSamples) {
|
||||
// 退出语音状态
|
||||
Log.d(TAG, "VAD: Exiting speech state, prob=$prob, silenceSamples=$silenceSamples, speechLen=$speechLen")
|
||||
finalizeSegmentIfAny()
|
||||
} else {
|
||||
// 保留尾音
|
||||
appendSpeech(chunk, processedChunk)
|
||||
}
|
||||
appendSpeech(chunk)
|
||||
} else if (inSpeech) {
|
||||
// 语音过程中,持续添加音频
|
||||
appendSpeech(chunk, processedChunk)
|
||||
silenceSamples = 0 // 重置静音计数
|
||||
|
||||
if (speechLen >= maxSpeechSamples) {
|
||||
Log.d(TAG, "VAD: Max speech length reached, finalizing segment")
|
||||
finalizeSegmentIfAny()
|
||||
}
|
||||
} else {
|
||||
if (inSpeech) {
|
||||
silenceSamples += ret
|
||||
if (silenceSamples >= minSilenceSamples) {
|
||||
finalizeSegmentIfAny()
|
||||
} else {
|
||||
// keep a bit of trailing silence to avoid chopping
|
||||
appendSpeech(chunk)
|
||||
}
|
||||
}
|
||||
}
|
||||
// 非语音状态且概率低于开始阈值,不做处理
|
||||
|
||||
// 每1000次循环输出一次VAD状态
|
||||
if (loopCount % 1000 == 0) {
|
||||
Log.d(TAG, "VAD status: inSpeech=$inSpeech, prob=$prob, speechLen=$speechLen")
|
||||
}
|
||||
|
||||
// 时间兜底切段(避免长时间无标点导致首包太慢)
|
||||
@@ -597,6 +761,58 @@ class MainActivity : AppCompatActivity() {
|
||||
|
||||
// flush last partial segment
|
||||
finalizeSegmentIfAny()
|
||||
|
||||
// 保存VAD数据到文件
|
||||
saveVadData(vadTimestamps, vadProbabilities, vadRMSValues, vadSmoothedRMSValues)
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算音频数据的均方根(RMS)值,用于动态调整增益
|
||||
*/
|
||||
private fun calculateRMS(samples: FloatArray): Float {
|
||||
if (samples.isEmpty()) return 0.0f
|
||||
|
||||
var sumSquared = 0.0f
|
||||
for (sample in samples) {
|
||||
sumSquared += sample * sample
|
||||
}
|
||||
|
||||
val meanSquared = sumSquared / samples.size
|
||||
return kotlin.math.sqrt(meanSquared)
|
||||
}
|
||||
|
||||
/**
|
||||
* 保存VAD数据到文件,方便后续分析和绘图
|
||||
*/
|
||||
private fun saveVadData(timestamps: List<Long>, probabilities: List<Float>, rmsValues: List<Float>, smoothedRmsValues: List<Float>) {
|
||||
try {
|
||||
// 创建保存目录
|
||||
val vadDataDir = File(filesDir, "vad_data")
|
||||
if (!vadDataDir.exists()) {
|
||||
vadDataDir.mkdirs()
|
||||
}
|
||||
|
||||
// 生成唯一的文件名
|
||||
val timestamp = System.currentTimeMillis()
|
||||
val fileName = "vad_data_${timestamp}.csv"
|
||||
val outputFile = File(vadDataDir, fileName)
|
||||
|
||||
// 写入数据
|
||||
FileOutputStream(outputFile).use { fos ->
|
||||
// 写入表头
|
||||
fos.write("timestamp,probability,rms,smoothed_rms\n".toByteArray())
|
||||
|
||||
// 写入数据行
|
||||
for (i in timestamps.indices) {
|
||||
val line = "${timestamps[i]},${probabilities[i]},${rmsValues[i]},${smoothedRmsValues[i]}\n"
|
||||
fos.write(line.toByteArray())
|
||||
}
|
||||
}
|
||||
|
||||
Log.d(TAG, "Saved VAD data to: ${outputFile.absolutePath}")
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "Error saving VAD data: ${e.message}")
|
||||
}
|
||||
}
|
||||
|
||||
private fun removeTokens(text: String): String {
|
||||
@@ -608,8 +824,11 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
|
||||
private fun enqueueTtsSegment(seg: String) {
|
||||
// 移除句末的标点符号
|
||||
val cleanedSeg = seg.trimEnd('.', '。', '!', '!', '?', '?', ',', ',', ';', ';', ':', ':')
|
||||
|
||||
currentTrace?.markTtsRequestEnqueued()
|
||||
ttsQueue.offer(TtsQueueItem.Segment(seg))
|
||||
ttsQueue.offer(TtsQueueItem.Segment(cleanedSeg))
|
||||
ensureTtsWorker()
|
||||
}
|
||||
|
||||
@@ -624,34 +843,60 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
}
|
||||
|
||||
private fun ensureAsrWorker() {
|
||||
Log.d(TAG, "ensureAsrWorker called, asrWorkerRunning=${asrWorkerRunning.get()}")
|
||||
if (!asrWorkerRunning.compareAndSet(false, true)) {
|
||||
Log.d(TAG, "ASR worker already running, returning")
|
||||
return
|
||||
}
|
||||
Log.d(TAG, "Starting ASR worker coroutine")
|
||||
ioScope.launch {
|
||||
try {
|
||||
runAsrWorker()
|
||||
} finally {
|
||||
Log.d(TAG, "ASR worker coroutine finished")
|
||||
asrWorkerRunning.set(false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private fun runTtsWorker() {
|
||||
val t = tts ?: return
|
||||
val audioTrack = track ?: return
|
||||
|
||||
var firstAudioMarked = false
|
||||
var isFirstSegment = true
|
||||
while (true) {
|
||||
val item = ttsQueue.take()
|
||||
if (ttsStopped.get()) break
|
||||
|
||||
when (item) {
|
||||
is TtsQueueItem.Segment -> {
|
||||
ttsPlaying.set(true)
|
||||
runOnUiThread { videoPlayerManager?.setSpeaking(true) }
|
||||
val trace = currentTrace
|
||||
trace?.markTtsSynthesisStart()
|
||||
Log.d(TAG, "TTS started: processing segment '${item.text}'")
|
||||
runOnUiThread {
|
||||
appendToUi("\n[TTS] 开始合成...\n")
|
||||
}
|
||||
|
||||
val startMs = System.currentTimeMillis()
|
||||
var firstPcmMarked = false
|
||||
|
||||
// flush to reduce latency between segments
|
||||
try {
|
||||
audioTrack.pause()
|
||||
audioTrack.flush()
|
||||
audioTrack.play()
|
||||
} catch (_: Throwable) {
|
||||
if (isFirstSegment) {
|
||||
try {
|
||||
audioTrack.pause()
|
||||
audioTrack.flush()
|
||||
audioTrack.play()
|
||||
} catch (_: Throwable) {
|
||||
}
|
||||
isFirstSegment = false
|
||||
}
|
||||
|
||||
t.generateWithCallback(
|
||||
text = item.text,
|
||||
sid = 0,
|
||||
sid = 2, // 这里可以修改说话人
|
||||
speed = 1.0f
|
||||
) { samples ->
|
||||
if (ttsStopped.get()) return@generateWithCallback 0
|
||||
@@ -664,6 +909,7 @@ class MainActivity : AppCompatActivity() {
|
||||
trace?.markTtsFirstAudioPlay()
|
||||
}
|
||||
audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING)
|
||||
ttsTotalSamplesWritten += samples.size
|
||||
1
|
||||
}
|
||||
|
||||
@@ -672,6 +918,21 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
|
||||
TtsQueueItem.End -> {
|
||||
// 清空 ASR 队列,丢弃所有未处理的段(这些可能是 TTS 播放期间的回声)
|
||||
while (asrQueue.tryReceive().isSuccess) { }
|
||||
|
||||
waitForPlaybackComplete(audioTrack)
|
||||
val ttsCompleteTime = System.currentTimeMillis()
|
||||
|
||||
// 在主线程更新UI
|
||||
runOnUiThread {
|
||||
appendToUi("\n[LOG] TTS completed at: ${ttsCompleteTime}\n")
|
||||
}
|
||||
|
||||
ttsPlaying.set(false)
|
||||
runOnUiThread { videoPlayerManager?.setSpeaking(false) }
|
||||
ttsTotalSamplesWritten = 0
|
||||
isFirstSegment = true
|
||||
currentTrace?.markTtsDone()
|
||||
TraceManager.getInstance().endTurn()
|
||||
currentTrace = null
|
||||
@@ -681,9 +942,257 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
}
|
||||
|
||||
private fun waitForPlaybackComplete(audioTrack: AudioTrack) {
|
||||
val totalSamples = ttsTotalSamplesWritten
|
||||
if (totalSamples <= 0) return
|
||||
|
||||
val sampleRate = audioTrack.sampleRate
|
||||
val timeoutMs = (totalSamples * 1000 / sampleRate) + 2000
|
||||
val startTime = System.currentTimeMillis()
|
||||
|
||||
while (true) {
|
||||
if (ttsStopped.get()) break
|
||||
|
||||
val playbackPos = audioTrack.playbackHeadPosition.toLong()
|
||||
if (playbackPos >= totalSamples) {
|
||||
break
|
||||
}
|
||||
|
||||
if (System.currentTimeMillis() - startTime > timeoutMs) {
|
||||
Log.w(TAG, "waitForPlaybackComplete timeout, pos=$playbackPos, total=$totalSamples")
|
||||
break
|
||||
}
|
||||
|
||||
Thread.sleep(20)
|
||||
}
|
||||
// 直接等待 1000ms,确保所有缓冲区清空
|
||||
Thread.sleep(1000)
|
||||
}
|
||||
|
||||
private suspend fun runAsrWorker() {
|
||||
Log.d(TAG, "ASR worker started")
|
||||
try {
|
||||
while (ioScope.coroutineContext.isActive) {
|
||||
val (originalSeg, processedSeg) = try {
|
||||
Log.d(TAG, "ASR worker waiting for audio segment")
|
||||
asrQueue.receive()
|
||||
} catch (e: Throwable) {
|
||||
Log.e(TAG, "ASR worker receive failed: ${e.message}")
|
||||
break
|
||||
}
|
||||
|
||||
Log.d(TAG, "ASR worker received audio segment, size=${processedSeg.size}")
|
||||
|
||||
// 每次只允许一个 LLM 请求在飞,避免堆积导致卡死/竞态
|
||||
// TTS 播放期间不做 ASR,避免识别到 TTS 播放的声音
|
||||
if (llmInFlight || ttsPlaying.get()) {
|
||||
Log.d(TAG, "ASR worker skipping segment: llmInFlight=$llmInFlight, ttsPlaying=${ttsPlaying.get()}")
|
||||
continue
|
||||
}
|
||||
|
||||
val trace = currentTrace
|
||||
trace?.markASRStart()
|
||||
Log.d(TAG, "ASR started: processing audio segment")
|
||||
withContext(Dispatchers.Main) {
|
||||
appendToUi("\n[ASR] 开始识别...\n")
|
||||
}
|
||||
|
||||
// 保存ASR音频用于调试
|
||||
saveAsrAudio(originalSeg, processedSeg)
|
||||
|
||||
val raw = synchronized(nativeLock) {
|
||||
val e = senseVoice
|
||||
if (e == null || !e.isInitialized) {
|
||||
Log.e(TAG, "ASR failed: SenseVoice engine not initialized")
|
||||
""
|
||||
} else {
|
||||
try {
|
||||
e.transcribeBuffer(processedSeg)
|
||||
} catch (e: Throwable) {
|
||||
Log.e(TAG, "ASR transcribe failed: ${e.message}")
|
||||
""
|
||||
}
|
||||
}
|
||||
}
|
||||
Log.d(TAG, "ASR raw result: $raw")
|
||||
val text = removeTokens(raw)
|
||||
|
||||
// 添加过滤逻辑
|
||||
if (text.isBlank()) {
|
||||
Log.d(TAG, "ASR segment skipped: blank text")
|
||||
continue
|
||||
}
|
||||
// 过滤英文单字符"i"
|
||||
if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
|
||||
Log.d(TAG, "ASR segment skipped: single 'i'")
|
||||
continue
|
||||
}
|
||||
// 过滤超过50个字符的长文本
|
||||
if (text.length > 50) {
|
||||
Log.d(TAG, "ASR segment skipped: too long (${text.length} chars)")
|
||||
continue
|
||||
}
|
||||
|
||||
trace?.markASREnd()
|
||||
|
||||
withContext(Dispatchers.Main) {
|
||||
appendToUi("\n\n[ASR] ${text}\n")
|
||||
}
|
||||
|
||||
trace?.markRecordingDone()
|
||||
trace?.markLlmResponseReceived()
|
||||
|
||||
if (BuildConfig.LLM_API_KEY.isBlank()) {
|
||||
withContext(Dispatchers.Main) {
|
||||
Toast.makeText(
|
||||
this@MainActivity,
|
||||
"未配置 LLM_API_KEY(在 local.properties 或 gradle.properties 里设置)",
|
||||
Toast.LENGTH_LONG
|
||||
).show()
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
llmInFlight = true
|
||||
Log.d(TAG, "Calling LLM with text: $text")
|
||||
cloudApiManager.callLLM(text)
|
||||
}
|
||||
} catch (e: Throwable) {
|
||||
Log.e(TAG, "ASR worker error: ${e.message}", e)
|
||||
} finally {
|
||||
Log.d(TAG, "ASR worker exiting")
|
||||
}
|
||||
}
|
||||
|
||||
private fun appendToUi(s: String) {
|
||||
lastUiText += s
|
||||
textView.text = lastUiText
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 保存ASR音频用于调试
|
||||
*/
|
||||
private fun saveAsrAudio(originalAudio: FloatArray, processedAudio: FloatArray) {
|
||||
try {
|
||||
// 创建保存目录
|
||||
val asrAudioDir = File(filesDir, "asr_audio")
|
||||
if (!asrAudioDir.exists()) {
|
||||
asrAudioDir.mkdirs()
|
||||
}
|
||||
|
||||
// 生成唯一的文件名
|
||||
val timestamp = System.currentTimeMillis()
|
||||
|
||||
// 保存原始音频
|
||||
val originalFile = File(asrAudioDir, "asr_${timestamp}_original.wav")
|
||||
saveFloatArrayAsWav(originalFile, originalAudio, sampleRateInHz)
|
||||
Log.d(TAG, "Saved original ASR audio to: ${originalFile.absolutePath}")
|
||||
|
||||
// 保存处理后的音频(增益后)
|
||||
val processedFile = File(asrAudioDir, "asr_${timestamp}_processed.wav")
|
||||
saveFloatArrayAsWav(processedFile, processedAudio, sampleRateInHz)
|
||||
Log.d(TAG, "Saved processed ASR audio to: ${processedFile.absolutePath}")
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "Error saving ASR audio: ${e.message}")
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 将FloatArray保存为WAV文件
|
||||
*/
|
||||
private fun saveFloatArrayAsWav(file: File, samples: FloatArray, sampleRate: Int) {
|
||||
FileOutputStream(file).use { fos ->
|
||||
// WAV文件头
|
||||
val header = ByteArray(44)
|
||||
|
||||
// RIFF标识
|
||||
header[0] = 'R'.code.toByte()
|
||||
header[1] = 'I'.code.toByte()
|
||||
header[2] = 'F'.code.toByte()
|
||||
header[3] = 'F'.code.toByte()
|
||||
|
||||
// 文件大小(不包括RIFF标识和文件大小字段本身)
|
||||
val fileSize = 36 + samples.size * 2
|
||||
intToByteArray(fileSize, header, 4)
|
||||
|
||||
// WAVE标识
|
||||
header[8] = 'W'.code.toByte()
|
||||
header[9] = 'A'.code.toByte()
|
||||
header[10] = 'V'.code.toByte()
|
||||
header[11] = 'E'.code.toByte()
|
||||
|
||||
// fmt标识
|
||||
header[12] = 'f'.code.toByte()
|
||||
header[13] = 'm'.code.toByte()
|
||||
header[14] = 't'.code.toByte()
|
||||
header[15] = ' '.code.toByte()
|
||||
|
||||
// 子块大小
|
||||
intToByteArray(16, header, 16)
|
||||
|
||||
// 音频格式(1 = PCM)
|
||||
shortToByteArray(1, header, 20)
|
||||
|
||||
// 声道数(1 = 单声道)
|
||||
shortToByteArray(1, header, 22)
|
||||
|
||||
// 采样率
|
||||
intToByteArray(sampleRate, header, 24)
|
||||
|
||||
// 字节率 = 采样率 * 声道数 * 位深度 / 8
|
||||
val byteRate = sampleRate * 1 * 16 / 8
|
||||
intToByteArray(byteRate, header, 28)
|
||||
|
||||
// 块对齐 = 声道数 * 位深度 / 8
|
||||
val blockAlign = 1 * 16 / 8
|
||||
shortToByteArray(blockAlign.toShort(), header, 32)
|
||||
|
||||
// 位深度(16位)
|
||||
shortToByteArray(16, header, 34)
|
||||
|
||||
// data标识
|
||||
header[36] = 'd'.code.toByte()
|
||||
header[37] = 'a'.code.toByte()
|
||||
header[38] = 't'.code.toByte()
|
||||
header[39] = 'a'.code.toByte()
|
||||
|
||||
// 数据大小
|
||||
val dataSize = samples.size * 2
|
||||
intToByteArray(dataSize, header, 40)
|
||||
|
||||
// 写入文件头
|
||||
fos.write(header)
|
||||
|
||||
// 写入音频数据(转换为16位PCM)
|
||||
for (sample in samples) {
|
||||
// 确保样本在[-1, 1]范围内
|
||||
val clampedSample = sample.coerceIn(-1.0f, 1.0f)
|
||||
// 转换为16位整数
|
||||
val shortSample = (clampedSample * 32767.0f).toInt().toShort()
|
||||
// 写入小端序
|
||||
val bytes = ByteArray(2)
|
||||
bytes[0] = (shortSample.toInt() and 0xFF).toByte()
|
||||
bytes[1] = (shortSample.toInt() shr 8 and 0xFF).toByte()
|
||||
fos.write(bytes)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 将int转换为小端序字节数组
|
||||
*/
|
||||
private fun intToByteArray(value: Int, dest: ByteArray, offset: Int) {
|
||||
dest[offset] = (value and 0xFF).toByte()
|
||||
dest[offset + 1] = (value shr 8 and 0xFF).toByte()
|
||||
dest[offset + 2] = (value shr 16 and 0xFF).toByte()
|
||||
dest[offset + 3] = (value shr 24 and 0xFF).toByte()
|
||||
}
|
||||
|
||||
/**
|
||||
* 将short转换为小端序字节数组
|
||||
*/
|
||||
private fun shortToByteArray(value: Short, dest: ByteArray, offset: Int) {
|
||||
dest[offset] = (value.toInt() and 0xFF).toByte()
|
||||
dest[offset + 1] = (value.toInt() shr 8 and 0xFF).toByte()
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,12 @@
|
||||
package com.digitalperson.cloud;
|
||||
|
||||
import android.content.Context;
|
||||
import android.os.Handler;
|
||||
import android.os.Looper;
|
||||
import android.util.Log;
|
||||
|
||||
import com.digitalperson.BuildConfig;
|
||||
import com.digitalperson.R;
|
||||
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONException;
|
||||
@@ -30,6 +32,7 @@ public class CloudApiManager {
|
||||
private CloudApiListener mListener;
|
||||
private Handler mMainHandler; // 用于在主线程执行UI更新
|
||||
private JSONArray mConversationHistory; // 存储对话历史
|
||||
private boolean mEnableStreaming = true; // 默认启用流式输出
|
||||
|
||||
public interface CloudApiListener {
|
||||
void onLLMResponseReceived(String response);
|
||||
@@ -38,10 +41,37 @@ public class CloudApiManager {
|
||||
void onError(String errorMessage);
|
||||
}
|
||||
|
||||
public CloudApiManager(CloudApiListener listener) {
|
||||
public CloudApiManager(CloudApiListener listener, Context context) {
|
||||
this.mListener = listener;
|
||||
this.mMainHandler = new Handler(Looper.getMainLooper()); // 初始化主线程Handler
|
||||
this.mConversationHistory = new JSONArray(); // 初始化对话历史
|
||||
|
||||
// 添加 system message,要求回答简洁
|
||||
try {
|
||||
JSONObject systemMessage = new JSONObject();
|
||||
systemMessage.put("role", "system");
|
||||
String systemPrompt = context.getString(R.string.system_prompt);
|
||||
systemMessage.put("content", systemPrompt);
|
||||
mConversationHistory.put(systemMessage);
|
||||
} catch (JSONException e) {
|
||||
Log.e(TAG, "Failed to add system message: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置是否启用流式输出
|
||||
* @param enableStreaming true: 启用流式输出,false: 禁用流式输出(整段输出)
|
||||
*/
|
||||
public void setEnableStreaming(boolean enableStreaming) {
|
||||
this.mEnableStreaming = enableStreaming;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取当前是否启用流式输出
|
||||
* @return true: 启用流式输出,false: 禁用流式输出(整段输出)
|
||||
*/
|
||||
public boolean isEnableStreaming() {
|
||||
return mEnableStreaming;
|
||||
}
|
||||
|
||||
public void callLLM(String userInput) {
|
||||
@@ -64,7 +94,7 @@ public class CloudApiManager {
|
||||
JSONObject requestBody = new JSONObject();
|
||||
requestBody.put("model", LLM_MODEL);
|
||||
requestBody.put("messages", mConversationHistory);
|
||||
requestBody.put("stream", true); // 启用流式响应
|
||||
requestBody.put("stream", mEnableStreaming); // 根据配置决定是否启用流式响应
|
||||
|
||||
String jsonBody = requestBody.toString();
|
||||
|
||||
@@ -84,47 +114,74 @@ public class CloudApiManager {
|
||||
Log.d(TAG, "LLM Response Code: " + responseCode);
|
||||
|
||||
if (responseCode == 200) {
|
||||
// 逐行读取流式响应
|
||||
try (BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
Log.d(TAG, "LLM Streaming Line: " + line);
|
||||
|
||||
// 处理SSE格式的响应
|
||||
if (line.startsWith("data: ")) {
|
||||
String dataPart = line.substring(6);
|
||||
if (dataPart.equals("[DONE]")) {
|
||||
// 流式响应结束
|
||||
break;
|
||||
}
|
||||
if (mEnableStreaming) {
|
||||
// 逐行读取流式响应
|
||||
try (BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
Log.d(TAG, "LLM Streaming Line: " + line);
|
||||
|
||||
try {
|
||||
// 解析JSON
|
||||
JSONObject chunkObj = new JSONObject(dataPart);
|
||||
JSONArray choices = chunkObj.getJSONArray("choices");
|
||||
if (choices.length() > 0) {
|
||||
JSONObject choice = choices.getJSONObject(0);
|
||||
JSONObject delta = choice.getJSONObject("delta");
|
||||
|
||||
if (delta.has("content")) {
|
||||
String chunkContent = delta.getString("content");
|
||||
accumulatedContent.append(chunkContent);
|
||||
// 处理SSE格式的响应
|
||||
if (line.startsWith("data: ")) {
|
||||
String dataPart = line.substring(6);
|
||||
if (dataPart.equals("[DONE]")) {
|
||||
// 流式响应结束
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
// 解析JSON
|
||||
JSONObject chunkObj = new JSONObject(dataPart);
|
||||
JSONArray choices = chunkObj.getJSONArray("choices");
|
||||
if (choices.length() > 0) {
|
||||
JSONObject choice = choices.getJSONObject(0);
|
||||
JSONObject delta = choice.getJSONObject("delta");
|
||||
|
||||
// 发送流式chunk到监听器
|
||||
if (mListener != null) {
|
||||
mMainHandler.post(() -> {
|
||||
mListener.onLLMStreamingChunkReceived(chunkContent);
|
||||
});
|
||||
if (delta.has("content")) {
|
||||
String chunkContent = delta.getString("content");
|
||||
accumulatedContent.append(chunkContent);
|
||||
|
||||
// 发送流式chunk到监听器
|
||||
if (mListener != null) {
|
||||
mMainHandler.post(() -> {
|
||||
mListener.onLLMStreamingChunkReceived(chunkContent);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (JSONException e) {
|
||||
Log.e(TAG, "Failed to parse streaming chunk: " + e.getMessage());
|
||||
}
|
||||
} catch (JSONException e) {
|
||||
Log.e(TAG, "Failed to parse streaming chunk: " + e.getMessage());
|
||||
}
|
||||
|
||||
fullResponse.append(line).append("\n");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 读取完整响应
|
||||
try (BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
fullResponse.append(line);
|
||||
}
|
||||
}
|
||||
|
||||
// 解析完整JSON响应
|
||||
try {
|
||||
JSONObject responseObj = new JSONObject(fullResponse.toString());
|
||||
JSONArray choices = responseObj.getJSONArray("choices");
|
||||
if (choices.length() > 0) {
|
||||
JSONObject choice = choices.getJSONObject(0);
|
||||
JSONObject message = choice.getJSONObject("message");
|
||||
if (message.has("content")) {
|
||||
String content = message.getString("content");
|
||||
accumulatedContent.append(content);
|
||||
}
|
||||
}
|
||||
|
||||
fullResponse.append(line).append("\n");
|
||||
} catch (JSONException e) {
|
||||
Log.e(TAG, "Failed to parse full response: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -38,13 +38,19 @@ public class TraceSession {
|
||||
long newValue = (currentValue != null) ? currentValue + deltaMs : deltaMs;
|
||||
if (currentValue == null) {
|
||||
// 如果键不存在,尝试添加
|
||||
if (durations.putIfAbsent(name, newValue) == null) {
|
||||
break;
|
||||
synchronized (durations) {
|
||||
if (!durations.containsKey(name)) {
|
||||
durations.put(name, newValue);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 如果键存在,尝试更新
|
||||
if (durations.replace(name, currentValue, newValue)) {
|
||||
break;
|
||||
synchronized (durations) {
|
||||
if (durations.containsKey(name) && durations.get(name).equals(currentValue)) {
|
||||
durations.put(name, newValue);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
package com.digitalperson.player
|
||||
|
||||
import android.content.Context
|
||||
import android.net.Uri
|
||||
import android.view.View
|
||||
import com.digitalperson.R
|
||||
import com.google.android.exoplayer2.ExoPlayer
|
||||
import com.google.android.exoplayer2.MediaItem
|
||||
import com.google.android.exoplayer2.Player
|
||||
import com.google.android.exoplayer2.ui.PlayerView
|
||||
|
||||
class VideoPlayerManager(
|
||||
private val context: Context,
|
||||
private val silentView: PlayerView,
|
||||
private val speakingView: PlayerView
|
||||
) {
|
||||
private var playerSilent: ExoPlayer? = null
|
||||
private var playerSpeaking: ExoPlayer? = null
|
||||
private var currentState: Boolean = false
|
||||
private var transitionDuration = 300L // 淡入淡出时长
|
||||
|
||||
init {
|
||||
// 确保初始 alpha
|
||||
silentView.alpha = 1f
|
||||
speakingView.alpha = 0f
|
||||
initPlayers()
|
||||
}
|
||||
|
||||
private fun uriForRaw(resId: Int): Uri = Uri.parse("android.resource://${context.packageName}/$resId")
|
||||
|
||||
private fun initPlayers() {
|
||||
playerSilent = ExoPlayer.Builder(context).build().apply {
|
||||
repeatMode = Player.REPEAT_MODE_ONE
|
||||
playWhenReady = true
|
||||
setMediaItem(MediaItem.fromUri(uriForRaw(R.raw.silent)))
|
||||
prepare()
|
||||
}
|
||||
|
||||
playerSpeaking = ExoPlayer.Builder(context).build().apply {
|
||||
repeatMode = Player.REPEAT_MODE_ONE
|
||||
playWhenReady = true
|
||||
setMediaItem(MediaItem.fromUri(uriForRaw(R.raw.speak_no_voice)))
|
||||
prepare()
|
||||
}
|
||||
|
||||
// 绑定到各自的 PlayerView
|
||||
silentView.player = playerSilent
|
||||
speakingView.player = playerSpeaking
|
||||
|
||||
// 静音视频音频输出(通常不需要声音)
|
||||
playerSilent?.volume = 0f
|
||||
playerSpeaking?.volume = 0f
|
||||
|
||||
// 启动播放(prepare 后自动播放)
|
||||
playerSilent?.play()
|
||||
playerSpeaking?.play()
|
||||
|
||||
// 确保初始 alpha 状态(防止 Surface/Texture 的 race)
|
||||
silentView.alpha = 1f
|
||||
speakingView.alpha = 0f
|
||||
currentState = false
|
||||
}
|
||||
|
||||
/**
|
||||
* 切换到说话状态:speaking=true 播放 speakingView(alpha 1),silentView 渐隐
|
||||
*/
|
||||
fun setSpeaking(speaking: Boolean) {
|
||||
if (speaking == currentState) return
|
||||
currentState = speaking
|
||||
|
||||
// 同步位置:以 silent 为主(也可以反向)
|
||||
syncPositions()
|
||||
|
||||
val fadeInView = if (speaking) speakingView else silentView
|
||||
val fadeOutView = if (speaking) silentView else speakingView
|
||||
|
||||
// 执行淡入淡出
|
||||
fadeOutView.animate().alpha(0f).setDuration(transitionDuration).start()
|
||||
fadeInView.visibility = View.VISIBLE
|
||||
fadeInView.animate().alpha(1f).setDuration(transitionDuration).start()
|
||||
}
|
||||
|
||||
private fun syncPositions() {
|
||||
// 以 silent 为主:将 speaking 同步到 silent 的位置
|
||||
try {
|
||||
val pos = playerSilent?.currentPosition ?: 0L
|
||||
playerSpeaking?.seekTo(pos)
|
||||
} catch (_: Throwable) {}
|
||||
}
|
||||
|
||||
fun release() {
|
||||
try { silentView.player = null } catch (_: Throwable) {}
|
||||
try { speakingView.player = null } catch (_: Throwable) {}
|
||||
try { playerSilent?.release() } catch (_: Throwable) {}
|
||||
try { playerSpeaking?.release() } catch (_: Throwable) {}
|
||||
playerSilent = null
|
||||
playerSpeaking = null
|
||||
}
|
||||
}
|
||||
@@ -4,8 +4,39 @@
|
||||
xmlns:tools="http://schemas.android.com/tools"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent"
|
||||
android:background="#606060"
|
||||
tools:context="com.digitalperson.MainActivity">
|
||||
|
||||
<!-- 双播放器容器:两个重叠的 PlayerView(silent 在下面,speaking 在上面,初始 alpha=0) -->
|
||||
<FrameLayout
|
||||
android:id="@+id/video_container"
|
||||
android:layout_width="0dp"
|
||||
android:layout_height="0dp"
|
||||
app:layout_constraintTop_toTopOf="parent"
|
||||
app:layout_constraintBottom_toBottomOf="parent"
|
||||
app:layout_constraintStart_toStartOf="parent"
|
||||
app:layout_constraintEnd_toEndOf="parent">
|
||||
|
||||
<com.google.android.exoplayer2.ui.PlayerView
|
||||
android:id="@+id/player_view_silent"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent"
|
||||
app:use_controller="false"
|
||||
app:resize_mode="fill"
|
||||
app:surface_type="texture_view"
|
||||
android:alpha="1" />
|
||||
|
||||
<com.google.android.exoplayer2.ui.PlayerView
|
||||
android:id="@+id/player_view_speaking"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent"
|
||||
app:use_controller="false"
|
||||
app:resize_mode="fill"
|
||||
app:surface_type="texture_view"
|
||||
android:alpha="0" />
|
||||
|
||||
</FrameLayout>
|
||||
|
||||
<TextView
|
||||
android:id="@+id/my_text"
|
||||
android:layout_width="0dp"
|
||||
@@ -14,10 +45,37 @@
|
||||
android:scrollbars="vertical"
|
||||
android:text="@string/hint"
|
||||
android:textIsSelectable="true"
|
||||
app:layout_constraintBottom_toTopOf="@+id/button_row"
|
||||
app:layout_constraintBottom_toTopOf="@+id/streaming_switch_row"
|
||||
app:layout_constraintEnd_toEndOf="parent"
|
||||
app:layout_constraintStart_toStartOf="parent"
|
||||
app:layout_constraintTop_toTopOf="parent" />
|
||||
app:layout_constraintTop_toTopOf="parent"
|
||||
android:background="@android:color/transparent"
|
||||
/>
|
||||
|
||||
<LinearLayout
|
||||
android:id="@+id/streaming_switch_row"
|
||||
android:layout_width="0dp"
|
||||
android:layout_height="wrap_content"
|
||||
android:gravity="center_vertical"
|
||||
android:orientation="horizontal"
|
||||
android:padding="16dp"
|
||||
app:layout_constraintBottom_toTopOf="@+id/button_row"
|
||||
app:layout_constraintEnd_toEndOf="parent"
|
||||
app:layout_constraintStart_toStartOf="parent">
|
||||
|
||||
<TextView
|
||||
android:layout_width="wrap_content"
|
||||
android:layout_height="wrap_content"
|
||||
android:text="流式输出"
|
||||
android:textSize="16sp"
|
||||
android:layout_marginEnd="16dp"/>
|
||||
|
||||
<Switch
|
||||
android:id="@+id/streaming_switch"
|
||||
android:layout_width="wrap_content"
|
||||
android:layout_height="wrap_content"
|
||||
android:checked="false"/>
|
||||
</LinearLayout>
|
||||
|
||||
<LinearLayout
|
||||
android:id="@+id/button_row"
|
||||
|
||||
BIN
app/src/main/res/raw/silent.mp4
Normal file
BIN
app/src/main/res/raw/silent.mp4
Normal file
Binary file not shown.
BIN
app/src/main/res/raw/speak_no_voice.mp4
Normal file
BIN
app/src/main/res/raw/speak_no_voice.mp4
Normal file
Binary file not shown.
@@ -3,4 +3,5 @@
|
||||
<string name="start">开始</string>
|
||||
<string name="stop">结束</string>
|
||||
<string name="hint">点击“开始”说话;识别后会请求大模型并用 TTS 播放回复。</string>
|
||||
<string name="system_prompt">你是一名小学女老师,喜欢回答学生的各种问题,请简洁但温柔地回答,每个回答不超过30字。</string>
|
||||
</resources>
|
||||
|
||||
Reference in New Issue
Block a user