Files
digital_person/app/src/main/java/com/digitalperson/Live2DChatActivity.kt
gcw_4spBpAfv 1701ecfb7f push2talk
2026-03-02 17:18:18 +08:00

556 lines
20 KiB
Kotlin
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package com.digitalperson
import android.Manifest
import android.content.pm.PackageManager
import android.os.Bundle
import android.util.Log
import android.widget.Toast
import androidx.appcompat.app.AppCompatActivity
import androidx.core.app.ActivityCompat
import com.digitalperson.asr.AsrManager
import com.digitalperson.audio.AudioProcessor
import com.digitalperson.cloud.CloudApiManager
import com.digitalperson.config.AppConfig
import com.digitalperson.metrics.TraceManager
import com.digitalperson.metrics.TraceSession
import com.digitalperson.tts.TtsManager
import com.digitalperson.ui.Live2DUiManager
import com.digitalperson.vad.VadManager
import kotlinx.coroutines.CoroutineScope
import kotlinx.coroutines.cancel
import kotlinx.coroutines.currentCoroutineContext
import kotlinx.coroutines.Dispatchers
import kotlinx.coroutines.isActive
import kotlinx.coroutines.Job
import kotlinx.coroutines.launch
import kotlinx.coroutines.SupervisorJob
import kotlinx.coroutines.withContext
/**
 * Voice-chat activity driving a Live2D avatar:
 * microphone capture -> VAD -> ASR (SenseVoice) -> cloud LLM -> streaming
 * sentence segmentation -> on-device TTS playback.
 *
 * Two interaction styles are selected by AppConfig.USE_HOLD_TO_SPEAK:
 * push-to-talk via a record button, or continuous VAD with start/stop buttons.
 */
class Live2DChatActivity : AppCompatActivity() {
// Facade over the Live2D view, chat text area, and control buttons.
private lateinit var uiManager: Live2DUiManager
// Voice-activity detection over fixed audio windows (traditional mode).
private lateinit var vadManager: VadManager
// Speech-recognition worker fed by VAD segments or hold-to-speak buffers.
private lateinit var asrManager: AsrManager
// Local text-to-speech synthesis and playback.
private lateinit var ttsManager: TtsManager
// Microphone access plus sample conversion/gain utilities.
private lateinit var audioProcessor: AudioProcessor
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
// Written on the UI thread, read from the capture coroutine.
@Volatile
private var isRecording: Boolean = false
// Accumulates samples while the record button is held (hold-to-speak mode).
private val holdToSpeakAudioBuffer = mutableListOf<Float>()
private val HOLD_TO_SPEAK_MIN_SAMPLES = 16000 // one second of audio data, assuming a 16 kHz sample rate
private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
// Currently running capture loop, if any; cancelled when a new turn starts.
private var recordingJob: Job? = null
// Serializes native VAD/ASR init (onCreate) against release (onDestroy).
private val nativeLock = Any()
private lateinit var cloudApiManager: CloudApiManager
// Splits streamed LLM text into TTS-sized sentence segments.
private val segmenter = StreamingTextSegmenter(
maxLen = AppConfig.Tts.MAX_LEN,
maxWaitMs = AppConfig.Tts.MAX_WAIT_MS
)
// Per-turn latency trace; null between turns.
private var currentTrace: TraceSession? = null
// True from the moment a transcript is sent to the LLM until its reply (or error).
@Volatile private var llmInFlight: Boolean = false
// Whether LLM responses stream chunk-by-chunk; toggled by the optional UI switch.
private var enableStreaming = false
/**
 * Handles the RECORD_AUDIO permission result. The whole activity is useless
 * without microphone access, so a denial closes it immediately.
 */
override fun onRequestPermissionsResult(
    requestCode: Int,
    permissions: Array<String>,
    grantResults: IntArray
) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults)
    val granted = requestCode == AppConfig.REQUEST_RECORD_AUDIO_PERMISSION &&
        grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED
    if (granted) return
    Log.e(AppConfig.TAG, "Audio record is disallowed")
    finish()
}
/**
 * Builds the UI, requests the mic permission, and initializes the audio
 * pipeline (VAD + ASR + TTS) on a background coroutine.
 *
 * Order matters here: callbacks are attached before model init is launched,
 * and the ASR worker is pre-started so segments queued later are consumed
 * as soon as they arrive.
 */
override fun onCreate(savedInstanceState: Bundle?) {
super.onCreate(savedInstanceState)
setContentView(R.layout.activity_live2d_chat)
uiManager = Live2DUiManager(this)
uiManager.initViews(
textViewId = R.id.my_text,
scrollViewId = R.id.scroll_view,
startButtonId = R.id.start_button,
stopButtonId = R.id.stop_button,
recordButtonId = R.id.record_button,
traditionalButtonsId = R.id.traditional_buttons,
// 0 = this layout has no dedicated silent/speaking player views.
silentPlayerViewId = 0,
speakingPlayerViewId = 0,
live2dViewId = R.id.live2d_view
)
// Choose the interaction style (hold-to-speak vs. start/stop buttons) from config.
uiManager.setUseHoldToSpeak(AppConfig.USE_HOLD_TO_SPEAK)
if (AppConfig.USE_HOLD_TO_SPEAK) {
uiManager.setRecordButtonTouchListener { isDown ->
if (isDown) {
// Button pressed: start recording.
onRecordButtonDown()
} else {
// Button released: stop recording.
onRecordButtonUp()
}
}
} else {
uiManager.setStartButtonListener { onStartClicked() }
uiManager.setStopButtonListener { onStopClicked(userInitiated = true) }
}
ActivityCompat.requestPermissions(this, permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)
// Optional streaming toggle; layouts without R.id.streaming_switch just log a warning.
try {
val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
streamingSwitch.isChecked = enableStreaming
streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
// Safe to touch cloudApiManager here: this listener can only fire after
// onCreate() has finished, and cloudApiManager is assigned further below.
enableStreaming = isChecked
cloudApiManager.setEnableStreaming(isChecked)
uiManager.showToast("流式输出已${if (isChecked) "启用" else "禁用"}")
}
} catch (e: Exception) {
Log.w(AppConfig.TAG, "Streaming switch not found in layout: ${e.message}")
}
// Keep controls disabled until background model initialization completes.
if (AppConfig.USE_HOLD_TO_SPEAK) {
uiManager.setButtonsEnabled(recordEnabled = false)
} else {
uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
}
uiManager.setText("初始化中…")
audioProcessor = AudioProcessor(this)
ttsManager = TtsManager(this)
ttsManager.setCallback(createTtsCallback())
asrManager = AsrManager(this)
asrManager.setAudioProcessor(audioProcessor)
asrManager.setCallback(createAsrCallback())
vadManager = VadManager(this)
vadManager.setCallback(createVadCallback())
// Heavy model loading runs off the main thread; nativeLock serializes native
// VAD/ASR init against release() in onDestroy().
ioScope.launch {
try {
Log.i(AppConfig.TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)")
synchronized(nativeLock) {
vadManager.initVadModel()
asrManager.initSenseVoiceModel()
}
val ttsOk = ttsManager.initTtsAndAudioTrack()
withContext(Dispatchers.Main) {
if (!ttsOk) {
uiManager.showToast(
"TTS 初始化失败:请确认 assets/${AppConfig.Tts.MODEL_DIR}/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst",
Toast.LENGTH_LONG
)
}
uiManager.setText(getString(R.string.hint))
// Re-enable the controls matching the configured interaction style.
if (AppConfig.USE_HOLD_TO_SPEAK) {
uiManager.setButtonsEnabled(recordEnabled = true)
} else {
uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
}
}
} catch (t: Throwable) {
Log.e(AppConfig.TAG, "Initialization failed: ${t.message}", t)
withContext(Dispatchers.Main) {
uiManager.setText("初始化失败:${t.javaClass.simpleName}: ${t.message}")
uiManager.showToast("初始化失败(请看 Logcat: ${t.javaClass.simpleName}", Toast.LENGTH_LONG)
// Leave controls disabled: the pipeline is unusable after a failed init.
if (AppConfig.USE_HOLD_TO_SPEAK) {
uiManager.setButtonsEnabled(recordEnabled = false)
} else {
uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = false)
}
}
}
}
cloudApiManager = CloudApiManager(createCloudApiListener(), applicationContext)
cloudApiManager.setEnableStreaming(enableStreaming)
// Pre-start the ASR consumer so queued segments are processed as they arrive.
Log.d(AppConfig.TAG, "Pre-starting ASR worker")
ioScope.launch {
asrManager.runAsrWorker()
}
}
/**
 * Builds the ASR callback: reports recognition progress to the UI, and hands
 * the final transcript to the cloud LLM via onLlmCalled().
 */
private fun createAsrCallback() = object : AsrManager.AsrCallback {
override fun onAsrStarted() {
currentTrace?.markASRStart()
runOnUiThread {
uiManager.appendToUi("\n[ASR] 开始识别...\n")
}
}
override fun onAsrResult(text: String) {
currentTrace?.markASREnd()
runOnUiThread {
uiManager.appendToUi("\n\n[ASR] ${text}\n")
}
currentTrace?.markRecordingDone()
// NOTE(review): markLlmResponseReceived() is recorded here, at ASR-result
// time and before the LLM is actually invoked in onLlmCalled() — confirm
// this trace point is intentional and not misplaced.
currentTrace?.markLlmResponseReceived()
}
override fun onAsrSkipped(reason: String) {
Log.d(AppConfig.TAG, "ASR segment skipped: $reason")
}
// Drop segments while TTS is speaking (avoids transcribing our own playback).
override fun shouldSkipAsr(): Boolean = ttsManager.isPlaying()
// Lets AsrManager avoid firing a second LLM call while one is outstanding.
override fun isLlmInFlight(): Boolean = llmInFlight
override fun onLlmCalled(text: String) {
llmInFlight = true
Log.d(AppConfig.TAG, "Calling LLM with text: $text")
cloudApiManager.callLLM(text)
}
}
/** Builds the VAD callback: forwards finished speech segments into the ASR queue. */
private fun createVadCallback(): VadManager.VadCallback = object : VadManager.VadCallback {
    override fun onSpeechSegmentReady(originalAudio: FloatArray, processedAudio: FloatArray) {
        Log.d(AppConfig.TAG, "Sending audio segment to ASR queue, size: ${processedAudio.size}")
        asrManager.enqueueAudioSegment(originalAudio, processedAudio)
    }

    // Suppress VAD work while TTS is speaking or an LLM request is outstanding.
    override fun shouldSkipProcessing(): Boolean {
        val ttsBusy = ttsManager.isPlaying()
        return ttsBusy || llmInFlight
    }
}
/**
 * Builds the CloudApiManager listener that routes LLM output to the UI and TTS.
 *
 * Streaming mode feeds chunks through the sentence segmenter as they arrive;
 * non-streaming mode handles the complete response in one shot. In both paths
 * mood tags are stripped from the text and applied to the Live2D character.
 */
private fun createCloudApiListener() = object : CloudApiManager.CloudApiListener {
    // Whether the CURRENT turn's first streamed chunk has been trace-marked.
    private var llmFirstChunkMarked = false

    override fun onLLMResponseReceived(response: String) {
        currentTrace?.markLlmDone()
        llmInFlight = false
        // Bug fix: this listener is created once in onCreate() and outlives the
        // turn, so the flag must be re-armed at end-of-turn; otherwise
        // markLlmFirstChunk() only ever fired on the first turn of the session.
        llmFirstChunkMarked = false
        if (enableStreaming) {
            // End of stream: flush whatever the segmenter is still holding.
            for (seg in segmenter.flush()) {
                ttsManager.enqueueSegment(seg)
            }
            ttsManager.enqueueEnd()
        } else {
            val previousMood = com.digitalperson.mood.MoodManager.getCurrentMood()
            val (filteredText, mood) = com.digitalperson.mood.MoodManager.extractAndFilterMood(response)
            android.util.Log.d(com.digitalperson.config.AppConfig.TAG, "Final mood: $mood, filtered text: $filteredText")
            if (mood != previousMood) {
                uiManager.setMood(mood)
            }
            runOnUiThread {
                uiManager.appendToUi("${filteredText}\n")
            }
            ttsManager.enqueueSegment(filteredText)
            ttsManager.enqueueEnd()
        }
    }

    override fun onLLMStreamingChunkReceived(chunk: String) {
        if (enableStreaming) {
            if (!llmFirstChunkMarked) {
                llmFirstChunkMarked = true
                currentTrace?.markLlmFirstChunk()
            }
            val previousMood = com.digitalperson.mood.MoodManager.getCurrentMood()
            val (filteredText, mood) = com.digitalperson.mood.MoodManager.extractAndFilterMood(chunk)
            if (mood != previousMood) {
                android.util.Log.d(com.digitalperson.config.AppConfig.TAG, "Mood changed to: $mood")
                // Drive the Live2D character's mood from the tag found in the text.
                uiManager.setMood(mood)
            }
            // NOTE(review): appendToUi is called directly here but is wrapped in
            // runOnUiThread on the non-streaming path — verify it is main-safe.
            uiManager.appendToUi(filteredText)
            val segments = segmenter.processChunk(filteredText)
            for (seg in segments) {
                ttsManager.enqueueSegment(seg)
            }
        }
    }

    override fun onTTSAudioReceived(audioFilePath: String) {}

    override fun onError(errorMessage: String) {
        llmInFlight = false
        llmFirstChunkMarked = false // re-arm the first-chunk mark after a failed turn
        uiManager.showToast(errorMessage, Toast.LENGTH_LONG)
        onStopClicked(userInitiated = false)
    }
}
/**
 * Builds the TTS callback: UI notifications, ASR-queue hygiene, and per-turn
 * trace bookkeeping delegated from TtsManager.
 */
private fun createTtsCallback() = object : TtsManager.TtsCallback {
override fun onTtsStarted(text: String) {
runOnUiThread {
uiManager.appendToUi("\n[TTS] 开始合成...\n")
}
}
override fun onTtsCompleted() {
runOnUiThread {
uiManager.appendToUi("\n[LOG] TTS completed at: ${System.currentTimeMillis()}\n")
}
}
override fun onTtsSegmentCompleted(durationMs: Long) {}
// NOTE(review): "stopped" is defined as !isRecording, yet in hold-to-speak
// mode isRecording is false while TTS plays back — confirm TtsManager
// interprets this flag the way both interaction modes expect.
override fun isTtsStopped(): Boolean = !isRecording
// Drop any ASR segments queued while we were speaking (echo of our own voice).
override fun onClearAsrQueue() {
asrManager.clearQueue()
}
override fun onSetSpeaking(speaking: Boolean) {
uiManager.setSpeaking(speaking)
}
// --- Trace plumbing: TtsManager reports latency milestones through these. ---
override fun getCurrentTrace(): TraceSession? = currentTrace
override fun onTraceMarkTtsRequestEnqueued() {
currentTrace?.markTtsRequestEnqueued()
}
override fun onTraceMarkTtsSynthesisStart() {
currentTrace?.markTtsSynthesisStart()
}
override fun onTraceMarkTtsFirstPcmReady() {
currentTrace?.markTtsFirstPcmReady()
}
override fun onTraceMarkTtsFirstAudioPlay() {
currentTrace?.markTtsFirstAudioPlay()
}
override fun onTraceMarkTtsDone() {
currentTrace?.markTtsDone()
}
override fun onTraceAddDuration(name: String, value: Long) {
currentTrace?.addDuration(name, value)
}
// End of turn: close the trace session and clear the per-turn reference.
override fun onEndTurn() {
TraceManager.getInstance().endTurn()
currentTrace = null
}
}
/**
 * Tears down recording, coroutines, native models, TTS, and UI resources.
 *
 * Fix: cleanup now runs BEFORE super.onDestroy() (mirroring this file's
 * onPause()), so the activity is still fully valid while managers release
 * views and listeners.
 */
override fun onDestroy() {
    // Stop capture/playback first so no worker touches a manager mid-release.
    onStopClicked(userInitiated = false)
    ioScope.cancel()
    // Native VAD/ASR handles share nativeLock with the init path in onCreate().
    synchronized(nativeLock) {
        try { vadManager.release() } catch (_: Throwable) {}
        try { asrManager.release() } catch (_: Throwable) {}
    }
    try { ttsManager.release() } catch (_: Throwable) {}
    try { uiManager.release() } catch (_: Throwable) {}
    try { audioProcessor.release() } catch (_: Throwable) {}
    super.onDestroy()
}
/** Forwards the resume event to the Live2D renderer/UI manager. */
override fun onResume() {
super.onResume()
uiManager.onResume()
}
/** Pauses the Live2D renderer before the framework's own pause handling runs. */
override fun onPause() {
uiManager.onPause()
super.onPause()
}
/**
 * Traditional (VAD) mode: begins a new capture turn — opens the mic, resets
 * per-turn state (trace, TTS, segmenter, VAD), and launches the sample loop.
 */
private fun onStartClicked() {
    Log.d(AppConfig.TAG, "onStartClicked called")
    if (isRecording) {
        Log.d(AppConfig.TAG, "Already recording, returning")
        return
    }
    val micReady = audioProcessor.initMicrophone(permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)
    if (!micReady) {
        uiManager.showToast("麦克风初始化失败/无权限")
        return
    }
    // Open a fresh trace turn and reset all per-turn state.
    currentTrace = TraceManager.getInstance().startNewTurn()
    currentTrace?.mark("turn_start")
    llmInFlight = false
    uiManager.clearText()
    ttsManager.reset()
    ttsManager.setCurrentTrace(currentTrace)
    segmenter.reset()
    vadManager.reset()
    audioProcessor.startRecording()
    isRecording = true
    uiManager.setButtonsEnabled(startEnabled = false, stopEnabled = true)
    Log.d(AppConfig.TAG, "Starting processSamplesLoop coroutine")
    // Replace any stale capture loop with a fresh one.
    recordingJob?.cancel()
    recordingJob = ioScope.launch { processSamplesLoop() }
    Log.d(AppConfig.TAG, "onStartClicked completed")
}
/**
 * Hold-to-speak press handler: interrupts any ongoing TTS playback (barge-in),
 * resets per-turn state, and starts buffering microphone audio.
 */
private fun onRecordButtonDown() {
    Log.d(AppConfig.TAG, "onRecordButtonDown called")
    if (isRecording) {
        Log.d(AppConfig.TAG, "Already recording, returning")
        return
    }
    // Barge-in: a new press cuts off whatever TTS is still saying.
    if (ttsManager.interruptForNewTurn()) {
        uiManager.appendToUi("\n[LOG] 已打断TTS播放\n")
    }
    if (!audioProcessor.initMicrophone(permissions, AppConfig.REQUEST_RECORD_AUDIO_PERMISSION)) {
        uiManager.showToast("麦克风初始化失败/无权限")
        return
    }
    // Fresh trace turn plus per-turn state reset.
    currentTrace = TraceManager.getInstance().startNewTurn()
    currentTrace?.mark("turn_start")
    llmInFlight = false
    uiManager.clearText()
    // interruptForNewTurn() already prepared TTS state for the next turn, so
    // ttsManager.reset() is intentionally NOT called on this entry point.
    ttsManager.setCurrentTrace(currentTrace)
    segmenter.reset()
    // Play the dedicated hold-to-speak Live2D motion.
    uiManager.startSpecificMotion("hold_to_speak")
    holdToSpeakAudioBuffer.clear()
    audioProcessor.startRecording()
    isRecording = true
    Log.d(AppConfig.TAG, "Starting processSamplesLoop coroutine")
    recordingJob?.cancel()
    recordingJob = ioScope.launch { processSamplesLoop() }
    Log.d(AppConfig.TAG, "onRecordButtonDown completed")
}
/**
 * Hold-to-speak release handler: stops capture, drains the remaining recorded
 * samples, and submits the whole utterance to ASR if it is long enough.
 */
private fun onRecordButtonUp() {
Log.d(AppConfig.TAG, "onRecordButtonUp called")
if (!isRecording) {
Log.d(AppConfig.TAG, "Not recording, returning")
return
}
isRecording = false
audioProcessor.stopRecording()
// NOTE(review): cancel() does not guarantee the capture loop has exited before
// the new job below touches holdToSpeakAudioBuffer, and the buffer is not
// synchronized — confirm the old job cannot still be appending concurrently.
recordingJob?.cancel()
recordingJob = ioScope.launch {
// Drain whatever the recorder buffered after the capture loop's last poll.
val audioData = audioProcessor.getRecordedData()
holdToSpeakAudioBuffer.addAll(audioData.toList())
if (holdToSpeakAudioBuffer.size >= HOLD_TO_SPEAK_MIN_SAMPLES) {
// The same array is passed as both original and processed audio here.
val finalAudio = holdToSpeakAudioBuffer.toFloatArray()
asrManager.enqueueAudioSegment(finalAudio, finalAudio)
} else {
// NOTE(review): showToast is invoked from an IO coroutine — verify it
// posts to the main thread internally.
uiManager.showToast("录音时间太短请长按至少1秒")
}
holdToSpeakAudioBuffer.clear()
}
Log.d(AppConfig.TAG, "onRecordButtonUp completed")
}
/**
 * Stops capture and playback and restores the idle button state.
 *
 * @param userInitiated true when the user pressed Stop (also ends the trace
 *   turn); false for programmatic teardown (errors, onDestroy).
 */
private fun onStopClicked(userInitiated: Boolean) {
    isRecording = false
    audioProcessor.stopRecording()
    recordingJob?.cancel()
    recordingJob = null
    ttsManager.stop()
    when {
        AppConfig.USE_HOLD_TO_SPEAK -> uiManager.setButtonsEnabled(recordEnabled = true)
        else -> uiManager.setButtonsEnabled(startEnabled = true, stopEnabled = false)
    }
    if (userInitiated) {
        TraceManager.getInstance().endTurn()
        currentTrace = null
    }
}
/**
 * Capture loop: pulls audio off the recorder until recording stops or this
 * coroutine is cancelled.
 *
 * Hold-to-speak mode only accumulates samples (ASR is fed on button release);
 * traditional mode runs VAD over fixed windows and enqueues speech segments.
 *
 * Bug fix: the loops previously checked ioScope.coroutineContext.isActive, but
 * ioScope uses a SupervisorJob that stays active when recordingJob (a child)
 * is cancelled — so recordingJob?.cancel() was never observed here. Checking
 * THIS coroutine's context makes cancellation take effect promptly.
 */
private suspend fun processSamplesLoop() {
    Log.d(AppConfig.TAG, "processSamplesLoop started")
    if (AppConfig.USE_HOLD_TO_SPEAK) {
        while (isRecording && currentCoroutineContext().isActive) {
            val audioData = audioProcessor.getAudioData()
            if (audioData.isNotEmpty()) {
                holdToSpeakAudioBuffer.addAll(audioData.toList())
            }
            kotlinx.coroutines.delay(10) // avoid a hot spin on the audio queue
        }
    } else {
        val windowSize = AppConfig.WINDOW_SIZE
        val buffer = ShortArray(windowSize)
        var loopCount = 0
        while (isRecording && currentCoroutineContext().isActive) {
            loopCount++
            if (loopCount % 100 == 0) {
                Log.d(AppConfig.TAG, "processSamplesLoop running, loopCount=$loopCount, ttsPlaying=${ttsManager.isPlaying()}")
            }
            if (ttsManager.isPlaying()) {
                // While TTS speaks, keep draining the mic so the recorder queue
                // does not back up, but discard the audio and clear VAD state.
                if (vadManager.isInSpeech()) {
                    Log.d(AppConfig.TAG, "TTS playing, resetting VAD state")
                    vadManager.clearState()
                }
                audioProcessor.readAudio(buffer) // result ignored: audio is dropped either way
                continue
            }
            val ret = audioProcessor.readAudio(buffer)
            // Skip empty/short reads: VAD needs complete windows.
            if (ret != windowSize) continue
            val chunk = audioProcessor.convertShortToFloat(buffer)
            val processedChunk = audioProcessor.applyGain(chunk)
            val result = vadManager.processAudioChunk(chunk, processedChunk)
            if (vadManager.vadComputeCount % 100 == 0) {
                Log.d(AppConfig.TAG, "VAD result: $result, inSpeech=${vadManager.isInSpeech()}")
            }
            if (loopCount % 1000 == 0) {
                Log.d(AppConfig.TAG, "VAD status: inSpeech=${vadManager.isInSpeech()}, speechLen=${vadManager.getSpeechLength()}")
            }
            // Flush any segmenter text that has waited past its time budget.
            val forced = segmenter.maybeForceByTime()
            for (seg in forced) ttsManager.enqueueSegment(seg)
        }
        // Emit any trailing speech segment still held by the VAD.
        vadManager.forceFinalize()
    }
    Log.d(AppConfig.TAG, "processSamplesLoop stopped")
}
}