package com.digitalperson
import android.Manifest
import android.content.pm.PackageManager
import android.os.Bundle
import android.util.Log
import android.widget.Toast
import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat
import com.digitalperson.asr.AsrManager
import com.digitalperson.audio.AudioProcessor
import com.digitalperson.cloud.CloudApiManager
import com.digitalperson.config.AppConfig
import com.digitalperson.metrics.TraceManager
import com.digitalperson.metrics.TraceSession
import com.digitalperson.tts.TtsManager
import com.digitalperson.ui.Live2DUiManager
import com.digitalperson.vad.VadManager
import kotlin.coroutines.coroutineContext
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.Job
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.cancel
import kotlinx.coroutines.isActive
import kotlinx.coroutines.launch
import kotlinx.coroutines.withContext
||
class Live2DChatActivity : AppCompatActivity() {

    // --- Pipeline components; created in onCreate, heavy model init runs on ioScope ---
    private lateinit var uiManager: Live2DUiManager
    private lateinit var vadManager: VadManager
    private lateinit var asrManager: AsrManager
    private lateinit var ttsManager: TtsManager
    private lateinit var audioProcessor: AudioProcessor

    // Runtime permissions needed for microphone capture.
    private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)

    // Written on the UI thread, read from the capture coroutine on Dispatchers.IO.
    @Volatile
    private var isRecording: Boolean = false

    // Samples accumulated while the hold-to-speak button is pressed.
    // NOTE(review): appended from the capture coroutine and read/cleared from the
    // job launched in onRecordButtonUp; a plain MutableList is not thread-safe —
    // confirm the two jobs can never run concurrently.
    private val holdToSpeakAudioBuffer = mutableListOf<Float>()
    private val HOLD_TO_SPEAK_MIN_SAMPLES = 16000 // 1 second of audio data (implies a 16 kHz sample rate — confirm)

    // Background scope for recording/ASR work; SupervisorJob so one failed child
    // does not cancel its siblings.
    private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
    private var recordingJob: Job? = null
    // Serialises native VAD/ASR model init (onCreate) against release (onDestroy).
    private val nativeLock = Any()

    private lateinit var cloudApiManager: CloudApiManager
    // Splits streamed LLM text into TTS-sized segments.
    private val segmenter = StreamingTextSegmenter(
        maxLen = AppConfig.Tts.MAX_LEN,
        maxWaitMs = AppConfig.Tts.MAX_WAIT_MS
    )

    // Per-turn latency trace; null between turns.
    private var currentTrace: TraceSession? = null
    // True from the moment the LLM is called until its response (or an error) arrives.
    @Volatile private var llmInFlight: Boolean = false
    // Streaming-output toggle, mirrored into CloudApiManager.
    private var enableStreaming = false
|
||
override fun onRequestPermissionsResult(
|
||
requestCode: Int,
|
||
permissions: Array<String>,
|
||
grantResults: IntArray
|
||
) {
|
||
super.onRequestPermissionsResult(requestCode, permissions, grantResults)
|
||
val ok = requestCode == AppConfig.REQUEST_RECORD_AUDIO_PERMISSION &&
|
||
grantResults.isNotEmpty() &&
|
||
grantResults[0] == PackageManager.PERMISSION_GRANTED
|
||
if (!ok) {
|
||
Log.e(AppConfig.TAG, "Audio record is disallowed")
|
||
finish()
|
||
}
|
||
}
|
||
|
||
    /**
     * Builds the whole voice-chat pipeline:
     * UI -> microphone -> VAD -> ASR -> cloud LLM -> text segmenter -> TTS.
     * Heavy model initialisation is done on [ioScope]; buttons stay disabled until it finishes.
     */
    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        setContentView(R.layout.activity_live2d_chat)

        uiManager = Live2DUiManager(this)
        uiManager.initViews(
            textViewId = R.id.my_text,
            scrollViewId = R.id.scroll_view,
            startButtonId = R.id.start_button,
            stopButtonId = R.id.stop_button,
            recordButtonId = R.id.record_button,
            traditionalButtonsId = R.id.traditional_buttons,
            silentPlayerViewId = 0, // 0 = not present in this layout (presumably replaced by the Live2D view — confirm)
            speakingPlayerViewId = 0,
            live2dViewId = R.id.live2d_view
        )

        // Choose the interaction style from configuration.
        uiManager.setUseHoldToSpeak(AppConfig.USE_HOLD_TO_SPEAK)

        if (AppConfig.USE_HOLD_TO_SPEAK) {
            uiManager.setRecordButtonTouchListener { isDown ->
                if (isDown) {
                    // Button pressed: start recording.
                    onRecordButtonDown()
                } else {
                    // Button released: stop recording.
                    onRecordButtonUp()
                }
            }
        } else {
            uiManager.setStartButtonListener { onStartClicked() }
            uiManager.setStopButtonListener { onStopClicked(userInitiated = true) }
        }

        ActivityCompat.requestPermissions(this, permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)

        // The streaming switch is optional in the layout; its absence is tolerated.
        try {
            val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
            streamingSwitch.isChecked = enableStreaming
            streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
                // NOTE(review): cloudApiManager is lateinit and only assigned further
                // down in onCreate; this is safe only because the listener cannot fire
                // before onCreate completes on the main thread — confirm.
                enableStreaming = isChecked
                cloudApiManager.setEnableStreaming(isChecked)
                uiManager.showToast("流式输出已${if (isChecked) "启用" else "禁用"}")
            }
        } catch (e: Exception) {
            Log.w(AppConfig.TAG, "Streaming switch not found in layout: ${e.message}")
        }

        // Keep controls disabled until the models have loaded.
        if (AppConfig.USE_HOLD_TO_SPEAK) {
            uiManager.setButtonsEnabled(recordEnabled = false)
        } else {
            uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
        }
        uiManager.setText("初始化中…")

        audioProcessor = AudioProcessor(this)
        ttsManager = TtsManager(this)
        ttsManager.setCallback(createTtsCallback())

        asrManager = AsrManager(this)
        asrManager.setAudioProcessor(audioProcessor)
        asrManager.setCallback(createAsrCallback())

        vadManager = VadManager(this)
        vadManager.setCallback(createVadCallback())

        // Load the VAD/ASR/TTS models off the main thread, then re-enable the UI.
        ioScope.launch {
            try {
                Log.i(AppConfig.TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)")
                // nativeLock serialises native model init against release in onDestroy.
                synchronized(nativeLock) {
                    vadManager.initVadModel()
                    asrManager.initSenseVoiceModel()
                }
                val ttsOk = ttsManager.initTtsAndAudioTrack()
                withContext(Dispatchers.Main) {
                    if (!ttsOk) {
                        uiManager.showToast(
                            "TTS 初始化失败:请确认 assets/${AppConfig.Tts.MODEL_DIR}/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst",
                            Toast.LENGTH_LONG
                        )
                    }
                    uiManager.setText(getString(R.string.hint))
                    if (AppConfig.USE_HOLD_TO_SPEAK) {
                        uiManager.setButtonsEnabled(recordEnabled = true)
                    } else {
                        uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
                    }
                }
            } catch (t: Throwable) {
                Log.e(AppConfig.TAG, "Initialization failed: ${t.message}", t)
                withContext(Dispatchers.Main) {
                    uiManager.setText("初始化失败:${t.javaClass.simpleName}: ${t.message}")
                    uiManager.showToast("初始化失败(请看 Logcat): ${t.javaClass.simpleName}", Toast.LENGTH_LONG)
                    if (AppConfig.USE_HOLD_TO_SPEAK) {
                        uiManager.setButtonsEnabled(recordEnabled = false)
                    } else {
                        uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
                    }
                }
            }
        }

        cloudApiManager = CloudApiManager(createCloudApiListener(), applicationContext)
        cloudApiManager.setEnableStreaming(enableStreaming)

        // The ASR worker consumes the audio-segment queue for the activity's whole lifetime.
        Log.d(AppConfig.TAG, "Pre-starting ASR worker")
        ioScope.launch {
            asrManager.runAsrWorker()
        }
    }
||
|
||
    /** Builds the ASR callback: mirrors recognition progress into the UI and
     * hands final transcripts to the cloud LLM. */
    private fun createAsrCallback() = object : AsrManager.AsrCallback {
        // The ASR worker picked a segment up and started decoding.
        override fun onAsrStarted() {
            currentTrace?.markASRStart()
            runOnUiThread {
                uiManager.appendToUi("\n[ASR] 开始识别...\n")
            }
        }

        // Final transcript for one audio segment.
        override fun onAsrResult(text: String) {
            currentTrace?.markASREnd()
            runOnUiThread {
                uiManager.appendToUi("\n\n[ASR] ${text}\n")
            }
            // NOTE(review): markRecordingDone() after markASREnd(), and
            // markLlmResponseReceived() before the LLM has even been called, look
            // out of order for a latency trace — confirm against TraceSession's
            // expected mark sequence.
            currentTrace?.markRecordingDone()
            currentTrace?.markLlmResponseReceived()
        }

        override fun onAsrSkipped(reason: String) {
            Log.d(AppConfig.TAG, "ASR segment skipped: $reason")
        }

        // Skip segments captured while our own TTS output is audible.
        override fun shouldSkipAsr(): Boolean = ttsManager.isPlaying()

        override fun isLlmInFlight(): Boolean = llmInFlight

        // The ASR worker forwards the transcript to the cloud LLM exactly once.
        override fun onLlmCalled(text: String) {
            llmInFlight = true
            Log.d(AppConfig.TAG, "Calling LLM with text: $text")
            cloudApiManager.callLLM(text)
        }
    }
||
|
||
private fun createVadCallback() = object : VadManager.VadCallback {
|
||
override fun onSpeechSegmentReady(originalAudio: FloatArray, processedAudio: FloatArray) {
|
||
Log.d(AppConfig.TAG, "Sending audio segment to ASR queue, size: ${processedAudio.size}")
|
||
asrManager.enqueueAudioSegment(originalAudio, processedAudio)
|
||
}
|
||
|
||
override fun shouldSkipProcessing(): Boolean = ttsManager.isPlaying() || llmInFlight
|
||
}
|
||
|
||
private fun createCloudApiListener() = object : CloudApiManager.CloudApiListener {
|
||
private var llmFirstChunkMarked = false
|
||
|
||
override fun onLLMResponseReceived(response: String) {
|
||
currentTrace?.markLlmDone()
|
||
llmInFlight = false
|
||
|
||
if (enableStreaming) {
|
||
for (seg in segmenter.flush()) {
|
||
ttsManager.enqueueSegment(seg)
|
||
}
|
||
ttsManager.enqueueEnd()
|
||
} else {
|
||
val previousMood = com.digitalperson.mood.MoodManager.getCurrentMood()
|
||
val (filteredText, mood) = com.digitalperson.mood.MoodManager.extractAndFilterMood(response)
|
||
android.util.Log.d(com.digitalperson.config.AppConfig.TAG, "Final mood: $mood, filtered text: $filteredText")
|
||
|
||
if (mood != previousMood) {
|
||
uiManager.setMood(mood)
|
||
}
|
||
|
||
runOnUiThread {
|
||
uiManager.appendToUi("${filteredText}\n")
|
||
}
|
||
ttsManager.enqueueSegment(filteredText)
|
||
ttsManager.enqueueEnd()
|
||
}
|
||
}
|
||
|
||
override fun onLLMStreamingChunkReceived(chunk: String) {
|
||
if (enableStreaming) {
|
||
if (!llmFirstChunkMarked) {
|
||
llmFirstChunkMarked = true
|
||
currentTrace?.markLlmFirstChunk()
|
||
}
|
||
|
||
val previousMood = com.digitalperson.mood.MoodManager.getCurrentMood()
|
||
val (filteredText, mood) = com.digitalperson.mood.MoodManager.extractAndFilterMood(chunk)
|
||
if (mood != previousMood) {
|
||
android.util.Log.d(com.digitalperson.config.AppConfig.TAG, "Mood changed to: $mood")
|
||
// 设置Live2D人物的心情
|
||
uiManager.setMood(mood)
|
||
}
|
||
|
||
uiManager.appendToUi(filteredText)
|
||
|
||
val segments = segmenter.processChunk(filteredText)
|
||
for (seg in segments) {
|
||
ttsManager.enqueueSegment(seg)
|
||
}
|
||
}
|
||
}
|
||
|
||
override fun onTTSAudioReceived(audioFilePath: String) {}
|
||
|
||
override fun onError(errorMessage: String) {
|
||
llmInFlight = false
|
||
uiManager.showToast(errorMessage, Toast.LENGTH_LONG)
|
||
onStopClicked(userInitiated = false)
|
||
}
|
||
}
|
||
|
||
private fun createTtsCallback() = object : TtsManager.TtsCallback {
|
||
override fun onTtsStarted(text: String) {
|
||
runOnUiThread {
|
||
uiManager.appendToUi("\n[TTS] 开始合成...\n")
|
||
}
|
||
}
|
||
|
||
override fun onTtsCompleted() {
|
||
runOnUiThread {
|
||
uiManager.appendToUi("\n[LOG] TTS completed at: ${System.currentTimeMillis()}\n")
|
||
}
|
||
}
|
||
|
||
override fun onTtsSegmentCompleted(durationMs: Long) {}
|
||
|
||
override fun isTtsStopped(): Boolean = !isRecording
|
||
|
||
override fun onClearAsrQueue() {
|
||
asrManager.clearQueue()
|
||
}
|
||
|
||
override fun onSetSpeaking(speaking: Boolean) {
|
||
uiManager.setSpeaking(speaking)
|
||
}
|
||
|
||
override fun getCurrentTrace(): TraceSession? = currentTrace
|
||
|
||
override fun onTraceMarkTtsRequestEnqueued() {
|
||
currentTrace?.markTtsRequestEnqueued()
|
||
}
|
||
|
||
override fun onTraceMarkTtsSynthesisStart() {
|
||
currentTrace?.markTtsSynthesisStart()
|
||
}
|
||
|
||
override fun onTraceMarkTtsFirstPcmReady() {
|
||
currentTrace?.markTtsFirstPcmReady()
|
||
}
|
||
|
||
override fun onTraceMarkTtsFirstAudioPlay() {
|
||
currentTrace?.markTtsFirstAudioPlay()
|
||
}
|
||
|
||
override fun onTraceMarkTtsDone() {
|
||
currentTrace?.markTtsDone()
|
||
}
|
||
|
||
override fun onTraceAddDuration(name: String, value: Long) {
|
||
currentTrace?.addDuration(name, value)
|
||
}
|
||
|
||
override fun onEndTurn() {
|
||
TraceManager.getInstance().endTurn()
|
||
currentTrace = null
|
||
}
|
||
}
|
||
|
||
override fun onDestroy() {
|
||
super.onDestroy()
|
||
onStopClicked(userInitiated = false)
|
||
ioScope.cancel()
|
||
synchronized(nativeLock) {
|
||
try { vadManager.release() } catch (_: Throwable) {}
|
||
try { asrManager.release() } catch (_: Throwable) {}
|
||
}
|
||
try { ttsManager.release() } catch (_: Throwable) {}
|
||
try { uiManager.release() } catch (_: Throwable) {}
|
||
try { audioProcessor.release() } catch (_: Throwable) {}
|
||
}
|
||
|
||
    /** Forwards lifecycle resume to the Live2D UI/renderer. */
    override fun onResume() {
        super.onResume()
        uiManager.onResume()
    }
||
|
||
    /** Pauses the Live2D UI/renderer before the framework's own pause handling. */
    override fun onPause() {
        uiManager.onPause()
        super.onPause()
    }
||
|
||
    /** Start button (traditional mode): begins a new turn and the VAD-driven capture loop. */
    private fun onStartClicked() {
        Log.d(AppConfig.TAG, "onStartClicked called")
        if (isRecording) {
            Log.d(AppConfig.TAG, "Already recording, returning")
            return
        }

        if (!audioProcessor.initMicrophone(permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)) {
            uiManager.showToast("麦克风初始化失败/无权限")
            return
        }

        // Fresh latency trace for this turn.
        currentTrace = TraceManager.getInstance().startNewTurn()
        currentTrace?.mark("turn_start")
        llmInFlight = false

        uiManager.clearText()

        // Reset downstream state before audio starts flowing.
        ttsManager.reset()
        ttsManager.setCurrentTrace(currentTrace)
        segmenter.reset()

        vadManager.reset()
        audioProcessor.startRecording()
        isRecording = true

        uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = true)

        Log.d(AppConfig.TAG, "Starting processSamplesLoop coroutine")
        // Replace any stale capture job before launching a new one.
        recordingJob?.cancel()
        recordingJob = ioScope.launch {
            processSamplesLoop()
        }
        Log.d(AppConfig.TAG, "onStartClicked completed")
    }
||
|
||
    /** Record button pressed (hold-to-speak): barge-in over TTS if needed and start capture. */
    private fun onRecordButtonDown() {
        Log.d(AppConfig.TAG, "onRecordButtonDown called")
        if (isRecording) {
            Log.d(AppConfig.TAG, "Already recording, returning")
            return
        }

        // If TTS is currently playing, interrupt it for the new turn (barge-in).
        val interrupted = ttsManager.interruptForNewTurn()
        if (interrupted) {
            uiManager.appendToUi("\n[LOG] 已打断TTS播放\n")
        }

        if (!audioProcessor.initMicrophone(permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)) {
            uiManager.showToast("麦克风初始化失败/无权限")
            return
        }

        // Fresh latency trace for this turn.
        currentTrace = TraceManager.getInstance().startNewTurn()
        currentTrace?.mark("turn_start")
        llmInFlight = false

        uiManager.clearText()

        // interruptForNewTurn() already prepared TTS state for next turn.
        // Keep reset() only for non-interrupt entry points.
        ttsManager.setCurrentTrace(currentTrace)
        segmenter.reset()

        // Play the hold-to-speak Live2D motion.
        uiManager.startSpecificMotion("hold_to_speak")

        holdToSpeakAudioBuffer.clear()
        audioProcessor.startRecording()
        isRecording = true

        Log.d(AppConfig.TAG, "Starting processSamplesLoop coroutine")
        // Replace any stale capture job before launching a new one.
        recordingJob?.cancel()
        recordingJob = ioScope.launch {
            processSamplesLoop()
        }
        Log.d(AppConfig.TAG, "onRecordButtonDown completed")
    }
||
|
||
    /** Record button released (hold-to-speak): stop capture and hand the buffered audio to ASR. */
    private fun onRecordButtonUp() {
        Log.d(AppConfig.TAG, "onRecordButtonUp called")
        if (!isRecording) {
            Log.d(AppConfig.TAG, "Not recording, returning")
            return
        }

        isRecording = false
        audioProcessor.stopRecording()

        recordingJob?.cancel()
        recordingJob = ioScope.launch {
            // Drain any remaining audio the capture loop had not collected yet.
            // NOTE(review): the cancelled loop job may still be appending to
            // holdToSpeakAudioBuffer when this job starts — confirm the list cannot
            // be mutated concurrently.
            val audioData = audioProcessor.getRecordedData()
            holdToSpeakAudioBuffer.addAll(audioData.toList())

            if (holdToSpeakAudioBuffer.size >= HOLD_TO_SPEAK_MIN_SAMPLES) {
                val finalAudio = holdToSpeakAudioBuffer.toFloatArray()
                // No VAD in this mode; the raw buffer doubles as the "processed" audio.
                asrManager.enqueueAudioSegment(finalAudio, finalAudio)
            } else {
                // NOTE(review): showToast is invoked from this IO coroutine — confirm
                // Live2DUiManager marshals it onto the main thread.
                uiManager.showToast("录音时间太短,请长按至少1秒")
            }
            holdToSpeakAudioBuffer.clear()
        }
        Log.d(AppConfig.TAG, "onRecordButtonUp completed")
    }
||
|
||
private fun onStopClicked(userInitiated: Boolean) {
|
||
isRecording = false
|
||
audioProcessor.stopRecording()
|
||
|
||
recordingJob?.cancel()
|
||
recordingJob = null
|
||
|
||
ttsManager.stop()
|
||
|
||
if (AppConfig.USE_HOLD_TO_SPEAK) {
|
||
uiManager.setButtonsEnabled(recordEnabled = true)
|
||
} else {
|
||
uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
|
||
}
|
||
|
||
if (userInitiated) {
|
||
TraceManager.getInstance().endTurn()
|
||
currentTrace = null
|
||
}
|
||
}
|
||
|
||
private suspend fun processSamplesLoop() {
|
||
Log.d(AppConfig.TAG, "processSamplesLoop started")
|
||
|
||
if (AppConfig.USE_HOLD_TO_SPEAK) {
|
||
// 按住说话模式:累积音频数据到一定长度后再发送给ASR
|
||
while (isRecording && ioScope.coroutineContext.isActive) {
|
||
val audioData = audioProcessor.getAudioData()
|
||
if (audioData.isNotEmpty()) {
|
||
holdToSpeakAudioBuffer.addAll(audioData.toList())
|
||
}
|
||
// 避免CPU占用过高
|
||
kotlinx.coroutines.delay(10)
|
||
}
|
||
} else {
|
||
// 传统模式:使用VAD
|
||
val windowSize = AppConfig.WINDOW_SIZE
|
||
val buffer = ShortArray(windowSize)
|
||
var loopCount = 0
|
||
|
||
while (isRecording && ioScope.coroutineContext.isActive) {
|
||
loopCount++
|
||
if (loopCount % 100 == 0) {
|
||
Log.d(AppConfig.TAG, "processSamplesLoop running, loopCount=$loopCount, ttsPlaying=${ttsManager.isPlaying()}")
|
||
}
|
||
|
||
if (ttsManager.isPlaying()) {
|
||
if (vadManager.isInSpeech()) {
|
||
Log.d(AppConfig.TAG, "TTS playing, resetting VAD state")
|
||
vadManager.clearState()
|
||
}
|
||
val ret = audioProcessor.readAudio(buffer)
|
||
if (ret <= 0) continue
|
||
continue
|
||
}
|
||
|
||
val ret = audioProcessor.readAudio(buffer)
|
||
if (ret <= 0) continue
|
||
if (ret != windowSize) continue
|
||
|
||
val chunk = audioProcessor.convertShortToFloat(buffer)
|
||
val processedChunk = audioProcessor.applyGain(chunk)
|
||
|
||
val result = vadManager.processAudioChunk(chunk, processedChunk)
|
||
|
||
if (vadManager.vadComputeCount % 100 == 0) {
|
||
Log.d(AppConfig.TAG, "VAD result: $result, inSpeech=${vadManager.isInSpeech()}")
|
||
}
|
||
|
||
if (loopCount % 1000 == 0) {
|
||
Log.d(AppConfig.TAG, "VAD status: inSpeech=${vadManager.isInSpeech()}, speechLen=${vadManager.getSpeechLength()}")
|
||
}
|
||
|
||
val forced = segmenter.maybeForceByTime()
|
||
for (seg in forced) ttsManager.enqueueSegment(seg)
|
||
}
|
||
|
||
vadManager.forceFinalize()
|
||
}
|
||
Log.d(AppConfig.TAG, "processSamplesLoop stopped")
|
||
}
|
||
} |