tts_asr_with_video
This commit is contained in:
957
app/src/main/java/com/digital_person/MainActivity.kt.bak
Normal file
957
app/src/main/java/com/digital_person/MainActivity.kt.bak
Normal file
@@ -0,0 +1,957 @@
|
||||
package com.digitalperson
|
||||
|
||||
import android.Manifest
|
||||
import android.content.pm.PackageManager
|
||||
import android.media.AudioAttributes
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioManager
|
||||
import android.media.AudioRecord
|
||||
import android.media.AudioTrack
|
||||
import android.media.MediaRecorder
|
||||
import android.media.audiofx.AcousticEchoCanceler
|
||||
import android.media.audiofx.NoiseSuppressor
|
||||
import android.os.Bundle
|
||||
import android.os.SystemClock
|
||||
import android.text.method.ScrollingMovementMethod
|
||||
import android.util.Log
|
||||
import android.widget.Button
|
||||
import android.widget.TextView
|
||||
import android.widget.Toast
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.digitalperson.cloud.CloudApiManager
|
||||
import com.digitalperson.player.VideoPlayerManager
|
||||
import com.google.android.exoplayer2.ui.PlayerView
|
||||
import com.digitalperson.engine.SenseVoiceEngineRKNN
|
||||
import com.digitalperson.metrics.TraceManager
|
||||
import com.digitalperson.metrics.TraceSession
|
||||
import com.k2fsa.sherpa.onnx.OfflineTts
|
||||
import com.k2fsa.sherpa.onnx.SileroVadModelConfig
|
||||
import com.k2fsa.sherpa.onnx.Vad
|
||||
import com.k2fsa.sherpa.onnx.VadModelConfig
|
||||
import com.k2fsa.sherpa.onnx.getOfflineTtsConfig
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.Job
|
||||
import kotlinx.coroutines.SupervisorJob
|
||||
import kotlinx.coroutines.cancel
|
||||
import kotlinx.coroutines.channels.Channel
|
||||
import kotlinx.coroutines.isActive
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
import java.io.FileOutputStream
|
||||
import java.util.concurrent.LinkedBlockingQueue
|
||||
import java.util.concurrent.atomic.AtomicBoolean
|
||||
import kotlin.math.max
|
||||
|
||||
private const val TAG = "DigitalPerson"
|
||||
private const val REQUEST_RECORD_AUDIO_PERMISSION = 200
|
||||
|
||||
class MainActivity : AppCompatActivity() {
|
||||
|
||||
private lateinit var startButton: Button
|
||||
private lateinit var stopButton: Button
|
||||
private lateinit var textView: TextView
|
||||
|
||||
private lateinit var vad: Vad
|
||||
private var senseVoice: SenseVoiceEngineRKNN? = null
|
||||
private var tts: OfflineTts? = null
|
||||
private var track: AudioTrack? = null
|
||||
|
||||
private var aec: AcousticEchoCanceler? = null
|
||||
private var ns: NoiseSuppressor? = null
|
||||
|
||||
private var audioRecord: AudioRecord? = null
|
||||
private val audioSource = MediaRecorder.AudioSource.MIC
|
||||
private val sampleRateInHz = 16000
|
||||
private val channelConfig = AudioFormat.CHANNEL_IN_MONO
|
||||
private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
|
||||
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
|
||||
|
||||
@Volatile
|
||||
private var isRecording: Boolean = false
|
||||
|
||||
private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO)
|
||||
private var recordingJob: Job? = null
|
||||
private val nativeLock = Any()
|
||||
|
||||
private lateinit var cloudApiManager: CloudApiManager
|
||||
private var videoPlayerManager: VideoPlayerManager? = null
|
||||
private val segmenter = StreamingTextSegmenter(
|
||||
maxLen = 30,
|
||||
maxWaitMs = 600
|
||||
)
|
||||
|
||||
// Work items for the TTS queue: a text segment to synthesize, or an
// end-of-response marker that drains playback and closes the trace turn.
private sealed class TtsQueueItem {
    data class Segment(val text: String) : TtsQueueItem()
    data object End : TtsQueueItem()
}
|
||||
|
||||
private val ttsQueue = LinkedBlockingQueue<TtsQueueItem>()
|
||||
private val ttsStopped = AtomicBoolean(false)
|
||||
private val ttsWorkerRunning = AtomicBoolean(false)
|
||||
private val ttsPlaying = AtomicBoolean(false)
|
||||
@Volatile private var ttsTotalSamplesWritten: Long = 0
|
||||
|
||||
private var currentTrace: TraceSession? = null
|
||||
|
||||
private var lastUiText: String = ""
|
||||
@Volatile private var llmInFlight: Boolean = false
|
||||
private var enableStreaming = true // 默认启用流式输出
|
||||
|
||||
// ASR 队列和工作器
|
||||
private val asrQueue = Channel<Pair<FloatArray, TraceSession?>>()
|
||||
private val asrWorkerRunning = AtomicBoolean(false)
|
||||
|
||||
/** Closes the activity when the RECORD_AUDIO permission is denied. */
override fun onRequestPermissionsResult(
    requestCode: Int,
    permissions: Array<String>,
    grantResults: IntArray
) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults)
    val granted = requestCode == REQUEST_RECORD_AUDIO_PERMISSION &&
        grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED
    if (granted) return
    Log.e(TAG, "Audio record is disallowed")
    finish()
}
|
||||
|
||||
/**
 * Wires up the views, initializes the native engines (VAD / SenseVoice / TTS)
 * off the main thread, and installs the cloud LLM callbacks.
 *
 * Fixes:
 *  - All UI work done from CloudApiManager callbacks (appendToUi, Toast,
 *    onStopClicked) is now funneled through runOnUiThread. Previously
 *    onLLMStreamingChunkReceived and onError touched views directly even
 *    though onLLMResponseReceived already wrapped its UI work, implying the
 *    callbacks arrive on a background thread.
 *  - llmFirstChunkMarked is reset when a response completes so the
 *    first-chunk latency mark is recorded on every turn, not only the first.
 */
override fun onCreate(savedInstanceState: Bundle?) {
    super.onCreate(savedInstanceState)
    setContentView(R.layout.activity_main)

    // Dual-player manager: two stacked PlayerViews, one "silent", one "speaking".
    try {
        val silentPv = findViewById<PlayerView>(R.id.player_view_silent)
        val speakingPv = findViewById<PlayerView>(R.id.player_view_speaking)
        videoPlayerManager = VideoPlayerManager(this, silentPv, speakingPv)
        // Default: the avatar is not speaking.
        videoPlayerManager?.setSpeaking(false)
    } catch (e: Exception) {
        Log.w(TAG, "PlayerViews not found or init failed: ${e.message}")
    }

    ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)

    startButton = findViewById(R.id.start_button)
    stopButton = findViewById(R.id.stop_button)
    textView = findViewById(R.id.my_text)
    textView.movementMethod = ScrollingMovementMethod()

    startButton.setOnClickListener { onStartClicked() }
    stopButton.setOnClickListener { onStopClicked(userInitiated = true) }

    // Optional streaming-output toggle (the layout may not contain it).
    try {
        val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
        streamingSwitch.isChecked = enableStreaming
        streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
            enableStreaming = isChecked
            cloudApiManager.setEnableStreaming(isChecked)
            Toast.makeText(this, "流式输出已${if (isChecked) "启用" else "禁用"}", Toast.LENGTH_SHORT).show()
        }
    } catch (e: Exception) {
        Log.w(TAG, "Streaming switch not found in layout: ${e.message}")
    }

    // Initialize models and the AudioTrack in the background to avoid an ANR
    // from re-initialization on the UI thread.
    startButton.isEnabled = false
    stopButton.isEnabled = false
    textView.text = "初始化中…"
    ioScope.launch {
        try {
            Log.i(TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)")
            synchronized(nativeLock) {
                initVadModel()
                initSenseVoiceModel()
            }
            withContext(Dispatchers.Main) {
                initTtsAndAudioTrack()
                textView.text = getString(R.string.hint)
                startButton.isEnabled = true
                stopButton.isEnabled = false
            }
        } catch (t: Throwable) {
            Log.e(TAG, "Initialization failed: ${t.message}", t)
            withContext(Dispatchers.Main) {
                textView.text = "初始化失败:${t.javaClass.simpleName}: ${t.message}"
                Toast.makeText(
                    this@MainActivity,
                    "初始化失败(请看 Logcat): ${t.javaClass.simpleName}",
                    Toast.LENGTH_LONG
                ).show()
                startButton.isEnabled = false
                stopButton.isEnabled = false
            }
        }
    }

    cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener {
        // Reset per turn (see onLLMResponseReceived) so the first-chunk
        // latency mark fires for every turn.
        private var llmFirstChunkMarked = false

        override fun onLLMResponseReceived(response: String) {
            currentTrace?.markLlmDone()
            llmInFlight = false
            llmFirstChunkMarked = false

            if (enableStreaming) {
                // Streaming mode: flush whatever remains in the segmenter buffer.
                for (seg in segmenter.flush()) {
                    enqueueTtsSegment(seg)
                }
                // Signal end of this response's TTS queue.
                ttsQueue.offer(TtsQueueItem.End)
            } else {
                runOnUiThread {
                    appendToUi("${response}\n")
                }
                // Non-streaming mode: synthesize the whole response at once.
                enqueueTtsSegment(response)
                ttsQueue.offer(TtsQueueItem.End)
            }
        }

        override fun onLLMStreamingChunkReceived(chunk: String) {
            if (enableStreaming) {
                if (!llmFirstChunkMarked) {
                    llmFirstChunkMarked = true
                    currentTrace?.markLlmFirstChunk()
                }
                // appendToUi mutates a TextView; this callback may arrive on a
                // background thread, so hop to the main thread first.
                runOnUiThread { appendToUi(chunk) }

                val segments = segmenter.processChunk(chunk)
                for (seg in segments) {
                    enqueueTtsSegment(seg)
                }
            }
        }

        override fun onTTSAudioReceived(audioFilePath: String) {
            // unused
        }

        override fun onError(errorMessage: String) {
            llmInFlight = false
            // Toast and onStopClicked both touch views — main thread only.
            runOnUiThread {
                Toast.makeText(this@MainActivity, errorMessage, Toast.LENGTH_LONG).show()
                onStopClicked(userInitiated = false)
            }
        }
    }, applicationContext)

    // Propagate the initial streaming-mode setting.
    cloudApiManager.setEnableStreaming(enableStreaming)
}
|
||||
|
||||
/**
 * Tears down recording, coroutines, native engines, the TTS AudioTrack, and
 * the video players. Each step is best-effort so one failure cannot block the
 * rest of the cleanup.
 *
 * Fix: the AudioTrack created in initTtsAndAudioTrack() was never released
 * (onStopClicked only pauses/flushes it), leaking a native audio output
 * handle; it is now released here.
 */
override fun onDestroy() {
    super.onDestroy()
    onStopClicked(userInitiated = false)
    ioScope.cancel()
    synchronized(nativeLock) {
        try {
            vad.release()
        } catch (_: Throwable) {
        }
        try {
            senseVoice?.deinitialize()
        } catch (_: Throwable) {
        }
    }
    try {
        tts?.release()
    } catch (_: Throwable) {
    }
    // Release the playback track; pause/flush alone keeps native resources alive.
    try {
        track?.release()
    } catch (_: Throwable) {
    }
    track = null
    try {
        videoPlayerManager?.release()
    } catch (_: Throwable) {
    }
}
|
||||
|
||||
/**
 * Starts a capture turn: initializes the microphone, opens a new trace turn,
 * resets the TTS/segmenter state, and launches the sample-processing loop.
 *
 * Fixes:
 *  - ensureAsrWorker() is now started here. It was defined but never called
 *    anywhere, so asrQueue.send() in processSamplesLoop() had no receiver on
 *    the rendezvous channel and would suspend forever.
 *  - audioRecord!! replaced with an explicit null guard.
 */
private fun onStartClicked() {
    if (isRecording) return

    if (!initMicrophone()) {
        Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show()
        return
    }
    // initMicrophone() returned true, so audioRecord should be set; guard anyway.
    val rec = audioRecord
    if (rec == null) {
        Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show()
        return
    }

    // Start a new trace turn
    currentTrace = TraceManager.getInstance().startNewTurn()
    currentTrace?.mark("turn_start")
    llmInFlight = false

    lastUiText = ""
    textView.text = ""

    // Reset TTS pipeline state for the new turn.
    ttsStopped.set(false)
    ttsPlaying.set(false)
    ttsTotalSamplesWritten = 0
    ttsQueue.clear()
    segmenter.reset()

    vad.reset()
    rec.startRecording()
    isRecording = true

    startButton.isEnabled = false
    stopButton.isEnabled = true

    // The ASR worker must be running before the capture loop sends segments.
    ensureAsrWorker()

    recordingJob?.cancel()
    recordingJob = ioScope.launch {
        processSamplesLoop()
    }
}
|
||||
|
||||
/**
 * Stops recording and playback, releasing the microphone and audio effects.
 * [userInitiated] distinguishes a user tap (which also closes the trace turn)
 * from internal cleanup calls.
 */
private fun onStopClicked(userInitiated: Boolean) {
    isRecording = false

    // Release the microphone; each step is best-effort.
    runCatching { audioRecord?.stop() }
    runCatching { audioRecord?.release() }
    audioRecord = null

    recordingJob?.cancel()
    recordingJob = null

    // Tell the TTS worker to stop and discard any pending segments.
    ttsStopped.set(true)
    ttsPlaying.set(false)
    ttsTotalSamplesWritten = 0
    ttsQueue.clear()
    // wake worker if waiting
    ttsQueue.offer(TtsQueueItem.End)

    runCatching {
        track?.pause()
        track?.flush()
    }
    runCatching { aec?.release() }
    runCatching { ns?.release() }
    aec = null
    ns = null
    startButton.isEnabled = true
    stopButton.isEnabled = false

    if (userInitiated) {
        TraceManager.getInstance().endTurn()
        currentTrace = null
    }
}
|
||||
|
||||
/** Loads the Silero VAD model shipped under assets/vad_model/ into [vad]. */
private fun initVadModel() {
    val sileroConfig = SileroVadModelConfig(
        model = "vad_model/silero_vad.onnx",
        threshold = 0.5F,
        minSilenceDuration = 0.25F,
        minSpeechDuration = 0.25F,
        windowSize = 512,
    )
    val config = VadModelConfig(
        sileroVadModelConfig = sileroConfig,
        sampleRate = sampleRateInHz,
        numThreads = 1,
        provider = "cpu",
    )
    vad = Vad(assetManager = application.assets, config = config)
}
|
||||
|
||||
/**
 * Copies the SenseVoice model files into internal storage, logs diagnostics
 * for the native library directory and model files, then loads the RKNN
 * engine into [senseVoice]. Throws IllegalStateException on any load failure.
 */
private fun initSenseVoiceModel() {
    Log.i(TAG, "ASR: init SenseVoice RKNN (scheme A)")
    // Copy assets/sensevoice_models/* -> filesDir/sensevoice_models/*
    val modelDir = copySenseVoiceAssetsToInternal()
    val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath
    val embeddingPath = File(modelDir, "embedding.npy").absolutePath
    val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath

    // Quick diagnostics: native lib dir contents plus model file presence/size.
    try {
        val libDir = applicationInfo.nativeLibraryDir
        Log.i(TAG, "nativeLibraryDir=$libDir")
        try {
            val names = File(libDir).list()?.joinToString(", ") ?: "(empty)"
            Log.i(TAG, "nativeLibraryDir files: $names")
        } catch (t: Throwable) {
            Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}")
        }
    } catch (_: Throwable) {
    }
    Log.i(TAG, "SenseVoice model paths:")
    Log.i(TAG, " model=$modelPath exists=${File(modelPath).exists()} size=${File(modelPath).length()}")
    Log.i(TAG, " embedding=$embeddingPath exists=${File(embeddingPath).exists()} size=${File(embeddingPath).length()}")
    Log.i(TAG, " bpe=$bpePath exists=${File(bpePath).exists()} size=${File(bpePath).length()}")

    val startedAt = SystemClock.elapsedRealtime()
    val engine = try {
        SenseVoiceEngineRKNN(this)
    } catch (e: UnsatisfiedLinkError) {
        // Most common: libsensevoiceEngine.so not packaged/built, or dependent libs missing
        throw IllegalStateException("Load native libraries failed: ${e.message}", e)
    }

    val loaded = try {
        engine.loadModelDirectly(modelPath, embeddingPath, bpePath)
    } catch (t: Throwable) {
        throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t)
    }

    val elapsed = SystemClock.elapsedRealtime() - startedAt
    Log.i(TAG, "SenseVoice loadModelDirectly ok=$loaded costMs=$elapsed")
    if (!loaded) throw IllegalStateException("SenseVoiceEngineRKNN loadModelDirectly returned false")

    senseVoice = engine
}
|
||||
|
||||
/**
 * Loads the sherpa-onnx VITS Chinese TTS model and prepares a streaming
 * AudioTrack (PCM-float, mono, at the model's sample rate) for playback.
 *
 * Fix: AudioTrack.getMinBufferSize() can return AudioTrack.ERROR (-1) or
 * ERROR_BAD_VALUE (-2); the previous code passed that negative value straight
 * to the AudioTrack constructor, which throws. The result is now validated.
 */
private fun initTtsAndAudioTrack() {
    try {
        // Model directory layout:
        // assets/tts_model/sherpa-onnx-vits-zh-ll/{model.onnx,tokens.txt,lexicon.txt,...}
        val modelDir = "tts_model/sherpa-onnx-vits-zh-ll"
        val modelName = "model.onnx"
        val lexicon = "lexicon.txt"
        val dataDir = ""

        val ttsConfig = getOfflineTtsConfig(
            modelDir = modelDir,
            modelName = modelName,
            acousticModelName = "",
            vocoder = "",
            voices = "",
            lexicon = lexicon,
            dataDir = dataDir,
            dictDir = "",
            // Chinese text-normalization rules (fst files shipped with the model)
            ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst,$modelDir/new_heteronym.fst",
            ruleFars = "",
            numThreads = null,
            isKitten = false
        )
        tts = OfflineTts(assetManager = application.assets, config = ttsConfig)
    } catch (t: Throwable) {
        Log.e(TAG, "Init TTS failed: ${t.message}", t)
        tts = null
        runOnUiThread {
            Toast.makeText(
                this,
                "TTS 初始化失败:请确认 assets/tts_model/sherpa-onnx-vits-zh-ll/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst",
                Toast.LENGTH_LONG
            ).show()
        }
    }

    val t = tts ?: return
    val sr = t.sampleRate()
    val minBuf = AudioTrack.getMinBufferSize(
        sr,
        AudioFormat.CHANNEL_OUT_MONO,
        AudioFormat.ENCODING_PCM_FLOAT
    )
    if (minBuf <= 0) {
        // ERROR / ERROR_BAD_VALUE: the device rejected this format combination.
        Log.e(TAG, "AudioTrack.getMinBufferSize failed: $minBuf (sr=$sr)")
        return
    }
    val attr = AudioAttributes.Builder()
        .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
        .setUsage(AudioAttributes.USAGE_MEDIA)
        .build()
    val format = AudioFormat.Builder()
        .setEncoding(AudioFormat.ENCODING_PCM_FLOAT)
        .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
        .setSampleRate(sr)
        .build()
    track = AudioTrack(
        attr,
        format,
        minBuf,
        AudioTrack.MODE_STREAM,
        AudioManager.AUDIO_SESSION_ID_GENERATE
    )
    track?.play()
}
|
||||
|
||||
/** Returns true when [path] names a readable entry in the app's assets. */
private fun assetExists(path: String): Boolean =
    runCatching { application.assets.open(path).close() }.isSuccess
|
||||
|
||||
/**
 * Ensures the SenseVoice model files exist under filesDir/sensevoice_models/,
 * copying them from assets on first run. Files already present with non-zero
 * size are skipped.
 *
 * Fix: each file is now copied to a temporary sibling and renamed into place
 * only after the copy completes. Previously an interrupted copy left a
 * truncated file whose non-zero length made every later run skip it,
 * permanently corrupting the model directory.
 *
 * @return the destination directory.
 */
private fun copySenseVoiceAssetsToInternal(): File {
    val outDir = File(filesDir, "sensevoice_models")
    if (!outDir.exists()) outDir.mkdirs()

    val files = arrayOf(
        "am.mvn",
        "chn_jpn_yue_eng_ko_spectok.bpe.model",
        "embedding.npy",
        "sense-voice-encoder.rknn"
    )

    for (name in files) {
        val outFile = File(outDir, name)
        if (outFile.exists() && outFile.length() > 0) continue

        val tmpFile = File(outDir, "$name.tmp")
        application.assets.open("sensevoice_models/$name").use { input ->
            FileOutputStream(tmpFile).use { output ->
                input.copyTo(output)
            }
        }
        // Rename only after a complete copy; a crash mid-copy leaves only *.tmp,
        // which the skip-check above never mistakes for a valid model file.
        if (!tmpFile.renameTo(outFile)) {
            tmpFile.delete()
            throw IllegalStateException("Failed to move $tmpFile to $outFile")
        }
    }
    return outDir
}
|
||||
|
||||
/**
 * Creates the AudioRecord (16 kHz mono PCM16) and enables AEC/NS when the
 * device supports them.
 *
 * Fixes:
 *  - AudioRecord construction can succeed while leaving the recorder in
 *    STATE_UNINITIALIZED (mic busy, bad parameters); calling startRecording()
 *    on it later throws IllegalStateException. The state is now verified and
 *    the recorder released on failure.
 *  - getMinBufferSize() error codes are rejected instead of being used as a
 *    buffer size.
 *
 * @return true when the microphone is ready for startRecording().
 */
private fun initMicrophone(): Boolean {
    if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO)
        != PackageManager.PERMISSION_GRANTED
    ) {
        ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
        return false
    }

    val numBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
    if (numBytes <= 0) {
        // ERROR / ERROR_BAD_VALUE from getMinBufferSize.
        Log.e(TAG, "AudioRecord.getMinBufferSize failed: $numBytes")
        return false
    }
    val rec = AudioRecord(
        audioSource,
        sampleRateInHz,
        channelConfig,
        audioFormat,
        numBytes * 2
    )
    if (rec.state != AudioRecord.STATE_INITIALIZED) {
        Log.e(TAG, "AudioRecord failed to initialize (state=${rec.state})")
        rec.release()
        return false
    }
    audioRecord = rec

    val sessionId = rec.audioSessionId
    if (sessionId != 0) {
        if (AcousticEchoCanceler.isAvailable()) {
            aec = AcousticEchoCanceler.create(sessionId)?.apply {
                enabled = true
            }
            Log.i(TAG, "AEC enabled=${aec?.enabled}")
        } else {
            Log.w(TAG, "AEC not available on this device")
        }

        if (NoiseSuppressor.isAvailable()) {
            ns = NoiseSuppressor.create(sessionId)?.apply {
                enabled = true
            }
            Log.i(TAG, "NS enabled=${ns?.enabled}")
        } else {
            Log.w(TAG, "NS not available on this device")
        }
    }
    return true
}
|
||||
|
||||
/**
 * Main capture loop: reads 512-sample windows from the microphone, applies
 * RMS-driven automatic gain, runs Silero VAD on the gained audio, and
 * segments speech with a dual-threshold (hysteresis) state machine.
 * Completed segments go to [asrQueue]; VAD diagnostics are saved to CSV on
 * exit.
 */
private suspend fun processSamplesLoop() {
    // Avoid calling vad.front()/vad.pop() (native queue APIs) since it crashes on some builds.
    // Use vad.compute() and implement a simple VAD segmenter in Kotlin instead.
    val windowSize = 512
    val buffer = ShortArray(windowSize)
    // Dual-threshold hysteresis settings
    val startThreshold = 0.2f // probability needed to enter the speech state
    val endThreshold = 0.15f // probability below which silence is counted
    val minSilenceSamples = (0.5f * sampleRateInHz).toInt()
    val minSpeechSamples = (0.1f * sampleRateInHz).toInt()
    val maxSpeechSamples = (5.0f * sampleRateInHz).toInt()

    // VAD diagnostics collected for offline analysis/plotting
    val vadProbabilities = mutableListOf<Float>()
    val vadTimestamps = mutableListOf<Long>()
    val vadRMSValues = mutableListOf<Float>()
    val vadSmoothedRMSValues = mutableListOf<Float>()

    // Exponential smoothing of the RMS level
    var smoothedRms = 0f
    val alpha = 0.8f // smoothing coefficient

    var inSpeech = false
    var silenceSamples = 0

    var speechBuf = FloatArray(0)
    var speechLen = 0
    var processedSpeechBuf = FloatArray(0)
    var processedSpeechLen = 0

    // Appends one window to both the raw and the gain-processed segment
    // buffers, growing them geometrically and capping at maxSpeechSamples.
    fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) {
        // raw audio
        val needed = speechLen + chunk.size
        if (speechBuf.size < needed) {
            var newCap = maxOf(needed, maxOf(1024, speechBuf.size * 2))
            if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
            val n = FloatArray(newCap)
            if (speechLen > 0) System.arraycopy(speechBuf, 0, n, 0, speechLen)
            speechBuf = n
        }
        val copyN = minOf(chunk.size, max(0, maxSpeechSamples - speechLen))
        if (copyN > 0) {
            System.arraycopy(chunk, 0, speechBuf, speechLen, copyN)
            speechLen += copyN
        }

        // gain-processed audio
        val processedNeeded = processedSpeechLen + processedChunk.size
        if (processedSpeechBuf.size < processedNeeded) {
            var newCap = maxOf(processedNeeded, maxOf(1024, processedSpeechBuf.size * 2))
            if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
            val n = FloatArray(newCap)
            if (processedSpeechLen > 0) System.arraycopy(processedSpeechBuf, 0, n, 0, processedSpeechLen)
            processedSpeechBuf = n
        }
        val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen))
        if (processedCopyN > 0) {
            System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN)
            processedSpeechLen += processedCopyN
        }
    }

    // Closes the current segment: drops it when too short or when TTS/LLM is
    // active (echo guard), otherwise hands it to the ASR queue.
    suspend fun finalizeSegmentIfAny() {
        if (speechLen < minSpeechSamples) {
            speechLen = 0
            processedSpeechLen = 0
            inSpeech = false
            silenceSamples = 0
            return
        }
        // Drop the segment while TTS is playing or an LLM call is in flight,
        // to avoid re-capturing our own synthesized speech (echo).
        if (ttsPlaying.get() || llmInFlight) {
            speechLen = 0
            processedSpeechLen = 0
            inSpeech = false
            silenceSamples = 0
            return
        }
        val originalSeg = speechBuf.copyOf(speechLen)
        val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen)
        speechLen = 0
        processedSpeechLen = 0
        inSpeech = false
        silenceSamples = 0

        // Queue the segment for asynchronous ASR processing.
        // NOTE(review): asrQueue is declared as Channel<Pair<FloatArray, TraceSession?>>
        // but two FloatArrays are sent here — the declaration and this call
        // disagree; confirm which is intended.
        asrQueue.send(Pair(originalSeg, processedSeg))
    }

    while (isRecording && ioScope.coroutineContext.isActive) {
        val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break
        if (ret <= 0) continue
        if (ret != windowSize) continue
        // Convert PCM16 to floats in [-1, 1).
        val chunk = FloatArray(ret) { buffer[it] / 32768.0f }

        // RMS (root mean square) of the current window
        val rms = calculateRMS(chunk)

        // Exponential smoothing (seeded with the first non-zero reading)
        smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms

        // Dynamic gain towards a target RMS of 0.1 (about -20 dB)
        val targetRMS = 0.1f
        var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f

        // Clamp the gain to avoid clipping from over-amplification
        gainFactor = gainFactor.coerceIn(0.1f, 10.0f)

        // Apply the gain, hard-limiting to [-1, 1]
        val processedChunk = FloatArray(chunk.size) {
            val value = chunk[it] * gainFactor
            if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value
        }

        // Run VAD on the gain-processed audio
        val prob = synchronized(nativeLock) { vad.compute(processedChunk) }

        // Record probability, timestamp, raw RMS and smoothed RMS
        vadProbabilities.add(prob)
        vadTimestamps.add(System.currentTimeMillis())
        vadRMSValues.add(rms)
        vadSmoothedRMSValues.add(smoothedRms)

        // Dual-threshold state machine
        if (!inSpeech && prob >= startThreshold) {
            // enter the speech state
            inSpeech = true
            silenceSamples = 0
            appendSpeech(chunk, processedChunk)
        } else if (inSpeech && prob <= endThreshold) {
            // count silence samples
            silenceSamples += ret
            if (silenceSamples >= minSilenceSamples) {
                // leave the speech state
                finalizeSegmentIfAny()
            } else {
                // keep the trailing audio
                appendSpeech(chunk, processedChunk)
            }
        } else if (inSpeech) {
            // still speaking: keep accumulating audio
            appendSpeech(chunk, processedChunk)
            silenceSamples = 0 // reset the silence counter

            if (speechLen >= maxSpeechSamples) {
                finalizeSegmentIfAny()
            }
        }
        // Not in speech and below the start threshold: nothing to do.

        // Time-based forced segmentation (keeps the TTS first packet fast when
        // the LLM emits long text without punctuation).
        val forced = segmenter.maybeForceByTime()
        for (seg in forced) enqueueTtsSegment(seg)
    }

    // flush last partial segment
    finalizeSegmentIfAny()

    // Persist the VAD diagnostics to a CSV file.
    saveVadData(vadTimestamps, vadProbabilities, vadRMSValues, vadSmoothedRMSValues)
}
|
||||
|
||||
/**
 * Writes the collected VAD diagnostics (timestamp, probability, raw RMS,
 * smoothed RMS) as a CSV under filesDir/vad_data/ for offline analysis and
 * plotting. Failures are logged, never thrown.
 */
private fun saveVadData(timestamps: List<Long>, probabilities: List<Float>, rmsValues: List<Float>, smoothedRmsValues: List<Float>) {
    try {
        val vadDataDir = File(filesDir, "vad_data")
        if (!vadDataDir.exists()) {
            vadDataDir.mkdirs()
        }

        // Unique, timestamped file name.
        val outputFile = File(vadDataDir, "vad_data_${System.currentTimeMillis()}.csv")

        FileOutputStream(outputFile).use { fos ->
            // header row
            fos.write("timestamp,probability,rms,smoothed_rms\n".toByteArray())
            for (i in timestamps.indices) {
                fos.write("${timestamps[i]},${probabilities[i]},${rmsValues[i]},${smoothedRmsValues[i]}\n".toByteArray())
            }
        }

        Log.d(TAG, "Saved VAD data to: ${outputFile.absolutePath}")
    } catch (e: Exception) {
        Log.e(TAG, "Error saving VAD data: ${e.message}")
    }
}
|
||||
|
||||
/**
 * Strips SenseVoice meta tokens (e.g. <|zh|>, <|NEUTRAL|>, <|Speech|>,
 * <|woitn|>) and stray angle-bracket characters, then collapses runs of
 * whitespace to single spaces.
 */
private fun removeTokens(text: String): String =
    text.replace(Regex("<\\|[^>]+\\|>"), "")
        .replace(Regex("[>>≥≫]"), "")
        .trim()
        .replace(Regex("\\s+"), " ")
|
||||
|
||||
/**
 * Queues one text segment for synthesis after stripping trailing punctuation
 * (ASCII and fullwidth), then makes sure the TTS worker is running.
 */
private fun enqueueTtsSegment(seg: String) {
    // Drop sentence-final punctuation.
    val trimmed = seg.trimEnd('.', '。', '!', '!', '?', '?', ',', ',', ';', ';', ':', ':')

    currentTrace?.markTtsRequestEnqueued()
    ttsQueue.offer(TtsQueueItem.Segment(trimmed))
    ensureTtsWorker()
}
|
||||
|
||||
/** Launches the TTS worker coroutine unless one is already running. */
private fun ensureTtsWorker() {
    val acquired = ttsWorkerRunning.compareAndSet(false, true)
    if (!acquired) return
    ioScope.launch {
        try {
            runTtsWorker()
        } finally {
            // Allow a fresh worker to start after this one exits.
            ttsWorkerRunning.set(false)
        }
    }
}
|
||||
|
||||
/** Launches the ASR worker coroutine unless one is already running. */
private fun ensureAsrWorker() {
    val acquired = asrWorkerRunning.compareAndSet(false, true)
    if (!acquired) return
    ioScope.launch {
        try {
            runAsrWorker()
        } finally {
            // Allow a fresh worker to start after this one exits.
            asrWorkerRunning.set(false)
        }
    }
}
|
||||
|
||||
/**
 * Drains [ttsQueue] on a background thread: each Segment is synthesized with
 * the offline TTS and its PCM streamed to the AudioTrack; an End item waits
 * for playback to finish, then closes the trace turn and exits. The worker is
 * restarted per turn by ensureTtsWorker().
 */
private fun runTtsWorker() {
    val t = tts ?: return
    val audioTrack = track ?: return

    var firstAudioMarked = false
    var isFirstSegment = true
    while (true) {
        // Blocking take; onStopClicked offers an End item to wake us up.
        val item = ttsQueue.take()
        if (ttsStopped.get()) break

        when (item) {
            is TtsQueueItem.Segment -> {
                ttsPlaying.set(true)
                runOnUiThread { videoPlayerManager?.setSpeaking(true) }
                val trace = currentTrace
                trace?.markTtsSynthesisStart()
                Log.d(TAG, "TTS started: processing segment '${item.text}'")
                runOnUiThread {
                    appendToUi("\n[TTS] 开始合成...\n")
                }

                val startMs = System.currentTimeMillis()
                var firstPcmMarked = false

                // Reset the track before the first segment of a turn so stale
                // buffered audio from the previous turn is not replayed.
                if (isFirstSegment) {
                    try {
                        audioTrack.pause()
                        audioTrack.flush()
                        audioTrack.play()
                    } catch (_: Throwable) {
                    }
                    isFirstSegment = false
                }

                t.generateWithCallback(
                    text = item.text,
                    sid = 2, // speaker id; change here to switch voices
                    speed = 1.0f
                ) { samples ->
                    // NOTE(review): the callback return value appears to be a
                    // continue flag (0 aborts generation, 1 continues) —
                    // confirm against the sherpa-onnx OfflineTts API.
                    if (ttsStopped.get()) return@generateWithCallback 0
                    if (!firstPcmMarked && samples.isNotEmpty()) {
                        firstPcmMarked = true
                        trace?.markTtsFirstPcmReady()
                    }
                    if (!firstAudioMarked && samples.isNotEmpty()) {
                        firstAudioMarked = true
                        trace?.markTtsFirstAudioPlay()
                    }
                    audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING)
                    ttsTotalSamplesWritten += samples.size
                    1
                }

                val ttsMs = System.currentTimeMillis() - startMs
                trace?.addDuration("tts_segment_ms_total", ttsMs)
            }

            TtsQueueItem.End -> {
                // Drop any queued ASR segments: they were likely captured while
                // TTS was playing and would be echoes of our own speech.
                while (asrQueue.tryReceive().isSuccess) { }

                waitForPlaybackComplete(audioTrack)
                val ttsCompleteTime = System.currentTimeMillis()

                // UI update on the main thread
                runOnUiThread {
                    appendToUi("\n[LOG] TTS completed at: ${ttsCompleteTime}\n")
                }

                ttsPlaying.set(false)
                runOnUiThread { videoPlayerManager?.setSpeaking(false) }
                ttsTotalSamplesWritten = 0
                isFirstSegment = true
                currentTrace?.markTtsDone()
                TraceManager.getInstance().endTurn()
                currentTrace = null
                break
            }
        }
    }
}
|
||||
|
||||
/**
 * Blocks until the AudioTrack has played everything written so far (with a
 * duration-based timeout), then waits a short grace period so device-side
 * buffers fully drain.
 *
 * Fix: the trailing 1-second grace sleep used to run unconditionally, even
 * when playback had been stopped via ttsStopped; a stop now returns
 * immediately and no longer stalls teardown.
 */
private fun waitForPlaybackComplete(audioTrack: AudioTrack) {
    val totalSamples = ttsTotalSamplesWritten
    if (totalSamples <= 0) return

    val sampleRate = audioTrack.sampleRate
    // Expected playback time plus 2 s of slack.
    val timeoutMs = (totalSamples * 1000 / sampleRate) + 2000
    val startTime = System.currentTimeMillis()

    while (true) {
        if (ttsStopped.get()) return

        // NOTE: playbackHeadPosition is a 32-bit frame counter that wraps
        // after ~2^31 frames; fine for ordinary utterance lengths.
        val playbackPos = audioTrack.playbackHeadPosition.toLong()
        if (playbackPos >= totalSamples) {
            break
        }

        if (System.currentTimeMillis() - startTime > timeoutMs) {
            Log.w(TAG, "waitForPlaybackComplete timeout, pos=$playbackPos, total=$totalSamples")
            break
        }

        Thread.sleep(20)
    }
    // Grace period so all device-side buffers are emptied.
    Thread.sleep(1000)
}
|
||||
|
||||
/**
 * Consumes speech segments from [asrQueue]: runs SenseVoice ASR, filters the
 * transcript, shows it, and forwards it to the LLM. Only one LLM request is
 * allowed in flight at a time, and segments arriving during TTS playback are
 * discarded (echo guard).
 */
private suspend fun runAsrWorker() {
    while (ioScope.coroutineContext.isActive) {
        // NOTE(review): asrQueue is declared as Channel<Pair<FloatArray, TraceSession?>>
        // yet processSamplesLoop() sends two FloatArrays; the destructured
        // `trace` here may not actually be a TraceSession — confirm the
        // intended channel element type.
        val (seg, trace) = try {
            asrQueue.receive()
        } catch (_: Throwable) {
            break
        }

        // Allow only one LLM request in flight to avoid pile-ups/races;
        // also skip ASR while TTS is playing so we don't transcribe ourselves.
        if (llmInFlight || ttsPlaying.get()) continue

        trace?.markASRStart()
        Log.d(TAG, "ASR started: processing audio segment")
        withContext(Dispatchers.Main) {
            appendToUi("\n[ASR] 开始识别...\n")
        }
        // Native engine is not re-entrant with VAD; share the same lock.
        val raw = synchronized(nativeLock) {
            val e = senseVoice
            if (e == null || !e.isInitialized) "" else e.transcribeBuffer(seg)
        }
        val text = removeTokens(raw)

        // Transcript filtering
        if (text.isBlank()) continue
        // Drop a bare single-character "i" (common ASR artifact).
        if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
            Log.d(TAG, "ASR segment skipped: single 'i'")
            continue
        }
        // Drop implausibly long transcripts (over 50 characters).
        if (text.length > 50) {
            Log.d(TAG, "ASR segment skipped: too long (${text.length} chars)")
            continue
        }

        trace?.markASREnd()

        withContext(Dispatchers.Main) {
            appendToUi("\n\n[ASR] ${text}\n")
        }

        trace?.markRecordingDone()
        trace?.markLlmResponseReceived()

        if (BuildConfig.LLM_API_KEY.isBlank()) {
            withContext(Dispatchers.Main) {
                Toast.makeText(
                    this@MainActivity,
                    "未配置 LLM_API_KEY(在 local.properties 或 gradle.properties 里设置)",
                    Toast.LENGTH_LONG
                ).show()
            }
            continue
        }

        llmInFlight = true
        cloudApiManager.callLLM(text)
    }
}
|
||||
|
||||
/**
 * Appends [s] to the accumulated transcript and displays it. Writes to
 * [textView], so it must be invoked on the main thread.
 */
private fun appendToUi(s: String) {
    val updated = lastUiText + s
    lastUiText = updated
    textView.text = updated
}
|
||||
}
|
||||
@@ -20,6 +20,8 @@ import android.widget.Toast
|
||||
import androidx.appcompat.app.AppCompatActivity
|
||||
import androidx.core.app.ActivityCompat
|
||||
import com.digitalperson.cloud.CloudApiManager
|
||||
import com.digitalperson.player.VideoPlayerManager
|
||||
import com.google.android.exoplayer2.ui.PlayerView
|
||||
import com.digitalperson.engine.SenseVoiceEngineRKNN
|
||||
import com.digitalperson.metrics.TraceManager
|
||||
import com.digitalperson.metrics.TraceSession
|
||||
@@ -33,6 +35,7 @@ import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.Job
|
||||
import kotlinx.coroutines.SupervisorJob
|
||||
import kotlinx.coroutines.cancel
|
||||
import kotlinx.coroutines.channels.Channel
|
||||
import kotlinx.coroutines.isActive
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.withContext
|
||||
@@ -66,6 +69,8 @@ class MainActivity : AppCompatActivity() {
|
||||
private val audioFormat = AudioFormat.ENCODING_PCM_16BIT
|
||||
private val permissions: Array<String> = arrayOf(Manifest.permission.RECORD_AUDIO)
|
||||
|
||||
|
||||
|
||||
@Volatile
|
||||
private var isRecording: Boolean = false
|
||||
|
||||
@@ -74,7 +79,11 @@ class MainActivity : AppCompatActivity() {
|
||||
private val nativeLock = Any()
|
||||
|
||||
private lateinit var cloudApiManager: CloudApiManager
|
||||
private val segmenter = StreamingTextSegmenter()
|
||||
private var videoPlayerManager: VideoPlayerManager? = null
|
||||
private val segmenter = StreamingTextSegmenter(
|
||||
maxLen = 30,
|
||||
maxWaitMs = 600
|
||||
)
|
||||
|
||||
private sealed class TtsQueueItem {
|
||||
data class Segment(val text: String) : TtsQueueItem()
|
||||
@@ -84,11 +93,18 @@ class MainActivity : AppCompatActivity() {
|
||||
private val ttsQueue = LinkedBlockingQueue<TtsQueueItem>()
|
||||
private val ttsStopped = AtomicBoolean(false)
|
||||
private val ttsWorkerRunning = AtomicBoolean(false)
|
||||
private val ttsPlaying = AtomicBoolean(false)
|
||||
@Volatile private var ttsTotalSamplesWritten: Long = 0
|
||||
|
||||
private var currentTrace: TraceSession? = null
|
||||
|
||||
private var lastUiText: String = ""
|
||||
@Volatile private var llmInFlight: Boolean = false
|
||||
private var enableStreaming = false // 默认禁用流式输出
|
||||
|
||||
// ASR 队列和工作器
|
||||
private val asrQueue = Channel<Pair<FloatArray, FloatArray>>(capacity = Channel.UNLIMITED)
|
||||
private val asrWorkerRunning = AtomicBoolean(false)
|
||||
|
||||
override fun onRequestPermissionsResult(
|
||||
requestCode: Int,
|
||||
@@ -97,8 +113,8 @@ class MainActivity : AppCompatActivity() {
|
||||
) {
|
||||
super.onRequestPermissionsResult(requestCode, permissions, grantResults)
|
||||
val ok = requestCode == REQUEST_RECORD_AUDIO_PERMISSION &&
|
||||
grantResults.isNotEmpty() &&
|
||||
grantResults[0] == PackageManager.PERMISSION_GRANTED
|
||||
grantResults.isNotEmpty() &&
|
||||
grantResults[0] == PackageManager.PERMISSION_GRANTED
|
||||
if (!ok) {
|
||||
Log.e(TAG, "Audio record is disallowed")
|
||||
finish()
|
||||
@@ -109,6 +125,17 @@ class MainActivity : AppCompatActivity() {
|
||||
super.onCreate(savedInstanceState)
|
||||
setContentView(R.layout.activity_main)
|
||||
|
||||
// 初始化双播放器管理器(silent 与 speaking 两个叠加的 PlayerView)
|
||||
try {
|
||||
val silentPv = findViewById<PlayerView>(R.id.player_view_silent)
|
||||
val speakingPv = findViewById<PlayerView>(R.id.player_view_speaking)
|
||||
videoPlayerManager = VideoPlayerManager(this, silentPv, speakingPv)
|
||||
// 默认 AI 未说话
|
||||
videoPlayerManager?.setSpeaking(false)
|
||||
} catch (e: Exception) {
|
||||
Log.w(TAG, "PlayerViews not found or init failed: ${e.message}")
|
||||
}
|
||||
|
||||
ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
|
||||
|
||||
startButton = findViewById(R.id.start_button)
|
||||
@@ -119,6 +146,19 @@ class MainActivity : AppCompatActivity() {
|
||||
startButton.setOnClickListener { onStartClicked() }
|
||||
stopButton.setOnClickListener { onStopClicked(userInitiated = true) }
|
||||
|
||||
// 初始化流式输出开关
|
||||
try {
|
||||
val streamingSwitch = findViewById<android.widget.Switch>(R.id.streaming_switch)
|
||||
streamingSwitch.isChecked = enableStreaming
|
||||
streamingSwitch.setOnCheckedChangeListener { _, isChecked ->
|
||||
enableStreaming = isChecked
|
||||
cloudApiManager.setEnableStreaming(isChecked)
|
||||
Toast.makeText(this, "流式输出已${if (isChecked) "启用" else "禁用"}", Toast.LENGTH_SHORT).show()
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
Log.w(TAG, "Streaming switch not found in layout: ${e.message}")
|
||||
}
|
||||
|
||||
// 避免 UI 线程重初始化导致 ANR:在后台初始化模型与 AudioTrack
|
||||
startButton.isEnabled = false
|
||||
stopButton.isEnabled = false
|
||||
@@ -151,30 +191,45 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
}
|
||||
|
||||
cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener {
|
||||
cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener {
|
||||
private var llmFirstChunkMarked = false
|
||||
|
||||
override fun onLLMResponseReceived(response: String) {
|
||||
currentTrace?.markLlmDone()
|
||||
llmInFlight = false
|
||||
// flush remaining buffer into TTS
|
||||
for (seg in segmenter.flush()) {
|
||||
enqueueTtsSegment(seg)
|
||||
|
||||
// 根据流式输出模式处理响应
|
||||
if (enableStreaming) {
|
||||
// 启用流式输出时,刷新剩余缓冲区
|
||||
for (seg in segmenter.flush()) {
|
||||
enqueueTtsSegment(seg)
|
||||
}
|
||||
// 发送队列结束信号
|
||||
ttsQueue.offer(TtsQueueItem.End)
|
||||
} else {
|
||||
runOnUiThread {
|
||||
appendToUi("${response}\n")
|
||||
}
|
||||
// 禁用流式输出时,直接使用整段文本进行TTS
|
||||
enqueueTtsSegment(response)
|
||||
// 发送队列结束信号
|
||||
ttsQueue.offer(TtsQueueItem.End)
|
||||
}
|
||||
// signal queue end (no more segments after this)
|
||||
ttsQueue.offer(TtsQueueItem.End)
|
||||
}
|
||||
|
||||
override fun onLLMStreamingChunkReceived(chunk: String) {
|
||||
if (!llmFirstChunkMarked) {
|
||||
llmFirstChunkMarked = true
|
||||
currentTrace?.markLlmFirstChunk()
|
||||
}
|
||||
appendToUi(chunk)
|
||||
// 启用流式输出时,处理流式chunk
|
||||
if (enableStreaming) {
|
||||
if (!llmFirstChunkMarked) {
|
||||
llmFirstChunkMarked = true
|
||||
currentTrace?.markLlmFirstChunk()
|
||||
}
|
||||
appendToUi(chunk)
|
||||
|
||||
val segments = segmenter.processChunk(chunk)
|
||||
for (seg in segments) {
|
||||
enqueueTtsSegment(seg)
|
||||
val segments = segmenter.processChunk(chunk)
|
||||
for (seg in segments) {
|
||||
enqueueTtsSegment(seg)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -187,7 +242,14 @@ class MainActivity : AppCompatActivity() {
|
||||
Toast.makeText(this@MainActivity, errorMessage, Toast.LENGTH_LONG).show()
|
||||
onStopClicked(userInitiated = false)
|
||||
}
|
||||
})
|
||||
}, applicationContext)
|
||||
|
||||
// 设置流式输出模式
|
||||
cloudApiManager.setEnableStreaming(enableStreaming)
|
||||
|
||||
// 预先启动ASR worker
|
||||
Log.d(TAG, "Pre-starting ASR worker")
|
||||
ensureAsrWorker()
|
||||
}
|
||||
|
||||
override fun onDestroy() {
|
||||
@@ -208,10 +270,18 @@ class MainActivity : AppCompatActivity() {
|
||||
tts?.release()
|
||||
} catch (_: Throwable) {
|
||||
}
|
||||
try {
|
||||
videoPlayerManager?.release()
|
||||
} catch (_: Throwable) {
|
||||
}
|
||||
}
|
||||
|
||||
private fun onStartClicked() {
|
||||
if (isRecording) return
|
||||
Log.d(TAG, "onStartClicked called")
|
||||
if (isRecording) {
|
||||
Log.d(TAG, "Already recording, returning")
|
||||
return
|
||||
}
|
||||
|
||||
if (!initMicrophone()) {
|
||||
Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show()
|
||||
@@ -227,6 +297,8 @@ class MainActivity : AppCompatActivity() {
|
||||
textView.text = ""
|
||||
|
||||
ttsStopped.set(false)
|
||||
ttsPlaying.set(false)
|
||||
ttsTotalSamplesWritten = 0
|
||||
ttsQueue.clear()
|
||||
segmenter.reset()
|
||||
|
||||
@@ -237,10 +309,12 @@ class MainActivity : AppCompatActivity() {
|
||||
startButton.isEnabled = false
|
||||
stopButton.isEnabled = true
|
||||
|
||||
Log.d(TAG, "Starting processSamplesLoop coroutine")
|
||||
recordingJob?.cancel()
|
||||
recordingJob = ioScope.launch {
|
||||
processSamplesLoop()
|
||||
}
|
||||
Log.d(TAG, "onStartClicked completed")
|
||||
}
|
||||
|
||||
private fun onStopClicked(userInitiated: Boolean) {
|
||||
@@ -259,6 +333,8 @@ class MainActivity : AppCompatActivity() {
|
||||
recordingJob = null
|
||||
|
||||
ttsStopped.set(true)
|
||||
ttsPlaying.set(false)
|
||||
ttsTotalSamplesWritten = 0
|
||||
ttsQueue.clear()
|
||||
// wake worker if waiting
|
||||
ttsQueue.offer(TtsQueueItem.End)
|
||||
@@ -480,22 +556,43 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
|
||||
private suspend fun processSamplesLoop() {
|
||||
Log.d(TAG, "processSamplesLoop started")
|
||||
// Avoid calling vad.front()/vad.pop() (native queue APIs) since it crashes on some builds.
|
||||
// Use vad.compute() and implement a simple VAD segmenter in Kotlin instead.
|
||||
val windowSize = 512
|
||||
val buffer = ShortArray(windowSize)
|
||||
val threshold = 0.5f
|
||||
val minSilenceSamples = (0.25f * sampleRateInHz).toInt()
|
||||
val minSpeechSamples = (0.25f * sampleRateInHz).toInt()
|
||||
// 双阈值设置
|
||||
val startThreshold = 0.2f // 进入语音的阈值
|
||||
val endThreshold = 0.15f // 退出语音的阈值
|
||||
val minSilenceSamples = (0.5f * sampleRateInHz).toInt()
|
||||
val minSpeechSamples = (0.1f * sampleRateInHz).toInt()
|
||||
val maxSpeechSamples = (5.0f * sampleRateInHz).toInt()
|
||||
|
||||
Log.d(TAG, "VAD thresholds: start=$startThreshold, end=$endThreshold, minSilenceSamples=$minSilenceSamples, minSpeechSamples=$minSpeechSamples")
|
||||
|
||||
// VAD 概率数据记录
|
||||
val vadProbabilities = mutableListOf<Float>()
|
||||
val vadTimestamps = mutableListOf<Long>()
|
||||
val vadRMSValues = mutableListOf<Float>()
|
||||
val vadSmoothedRMSValues = mutableListOf<Float>()
|
||||
|
||||
// 指数平滑相关变量
|
||||
var smoothedRms = 0f
|
||||
val alpha = 0.8f // 平滑系数
|
||||
|
||||
var inSpeech = false
|
||||
var silenceSamples = 0
|
||||
|
||||
var speechBuf = FloatArray(0)
|
||||
var speechLen = 0
|
||||
var processedSpeechBuf = FloatArray(0)
|
||||
var processedSpeechLen = 0
|
||||
|
||||
fun appendSpeech(chunk: FloatArray) {
|
||||
var loopCount = 0
|
||||
var vadComputeCount = 0
|
||||
|
||||
fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) {
|
||||
// 保存原始音频
|
||||
val needed = speechLen + chunk.size
|
||||
if (speechBuf.size < needed) {
|
||||
var newCap = maxOf(needed, maxOf(1024, speechBuf.size * 2))
|
||||
@@ -509,85 +606,152 @@ class MainActivity : AppCompatActivity() {
|
||||
System.arraycopy(chunk, 0, speechBuf, speechLen, copyN)
|
||||
speechLen += copyN
|
||||
}
|
||||
|
||||
// 保存增益后的音频
|
||||
val processedNeeded = processedSpeechLen + processedChunk.size
|
||||
if (processedSpeechBuf.size < processedNeeded) {
|
||||
var newCap = maxOf(processedNeeded, maxOf(1024, processedSpeechBuf.size * 2))
|
||||
if (newCap > maxSpeechSamples) newCap = maxSpeechSamples
|
||||
val n = FloatArray(newCap)
|
||||
if (processedSpeechLen > 0) System.arraycopy(processedSpeechBuf, 0, n, 0, processedSpeechLen)
|
||||
processedSpeechBuf = n
|
||||
}
|
||||
val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen))
|
||||
if (processedCopyN > 0) {
|
||||
System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN)
|
||||
processedSpeechLen += processedCopyN
|
||||
}
|
||||
}
|
||||
|
||||
suspend fun finalizeSegmentIfAny() {
|
||||
Log.d(TAG, "finalizeSegmentIfAny called: speechLen=$speechLen, minSpeechSamples=$minSpeechSamples, ttsPlaying=${ttsPlaying.get()}, llmInFlight=$llmInFlight")
|
||||
|
||||
if (speechLen < minSpeechSamples) {
|
||||
Log.d(TAG, "finalizeSegmentIfAny: speech too short, discarding")
|
||||
speechLen = 0
|
||||
processedSpeechLen = 0
|
||||
inSpeech = false
|
||||
silenceSamples = 0
|
||||
return
|
||||
}
|
||||
|
||||
val seg = speechBuf.copyOf(speechLen)
|
||||
// ✅ 新增:如果 TTS 正在播放或 LLM 请求中,丢弃此段(避免回声)
|
||||
if (ttsPlaying.get() || llmInFlight) {
|
||||
Log.d(TAG, "finalizeSegmentIfAny: TTS playing or LLM in flight, discarding")
|
||||
speechLen = 0
|
||||
processedSpeechLen = 0
|
||||
inSpeech = false
|
||||
silenceSamples = 0
|
||||
return
|
||||
}
|
||||
val originalSeg = speechBuf.copyOf(speechLen)
|
||||
val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen)
|
||||
speechLen = 0
|
||||
processedSpeechLen = 0
|
||||
inSpeech = false
|
||||
silenceSamples = 0
|
||||
|
||||
// 每次只允许一个 LLM 请求在飞,避免堆积导致卡死/竞态
|
||||
if (llmInFlight) return
|
||||
|
||||
val trace = currentTrace
|
||||
trace?.markASRStart()
|
||||
val raw = synchronized(nativeLock) {
|
||||
val e = senseVoice
|
||||
if (e == null || !e.isInitialized) "" else e.transcribeBuffer(seg)
|
||||
}
|
||||
val text = removeTokens(raw)
|
||||
if (text.isBlank()) return
|
||||
trace?.markASREnd()
|
||||
if (text.isBlank()) return
|
||||
|
||||
withContext(Dispatchers.Main) {
|
||||
appendToUi("\n\n[ASR] ${text}\n")
|
||||
}
|
||||
|
||||
trace?.markRecordingDone()
|
||||
trace?.markLlmResponseReceived()
|
||||
|
||||
if (BuildConfig.LLM_API_KEY.isBlank()) {
|
||||
withContext(Dispatchers.Main) {
|
||||
Toast.makeText(
|
||||
this@MainActivity,
|
||||
"未配置 LLM_API_KEY(在 local.properties 或 gradle.properties 里设置)",
|
||||
Toast.LENGTH_LONG
|
||||
).show()
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
llmInFlight = true
|
||||
cloudApiManager.callLLM(text)
|
||||
// 将语音段加入 ASR 处理队列,异步处理
|
||||
Log.d(TAG, "Sending audio segment to ASR queue, size: ${processedSeg.size}")
|
||||
asrQueue.send(Pair(originalSeg, processedSeg))
|
||||
Log.d(TAG, "Calling ensureAsrWorker")
|
||||
ensureAsrWorker()
|
||||
}
|
||||
|
||||
while (isRecording && ioScope.coroutineContext.isActive) {
|
||||
loopCount++
|
||||
if (loopCount % 100 == 0) {
|
||||
Log.d(TAG, "processSamplesLoop running, loopCount=$loopCount, ttsPlaying=${ttsPlaying.get()}")
|
||||
}
|
||||
// 如果TTS正在播放,跳过VAD处理,避免检测到回声
|
||||
if (ttsPlaying.get()) {
|
||||
// 如果正在语音状态,立即结束它
|
||||
if (inSpeech) {
|
||||
Log.d(TAG, "TTS playing, resetting VAD state")
|
||||
inSpeech = false
|
||||
silenceSamples = 0
|
||||
speechLen = 0
|
||||
processedSpeechLen = 0
|
||||
}
|
||||
// 读取并丢弃音频数据,保持录音状态
|
||||
val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break
|
||||
if (ret <= 0) continue
|
||||
continue
|
||||
}
|
||||
|
||||
val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break
|
||||
if (ret <= 0) continue
|
||||
if (ret != windowSize) continue
|
||||
|
||||
// 在 processSamplesLoop 方法中
|
||||
val chunk = FloatArray(ret) { buffer[it] / 32768.0f }
|
||||
val prob = synchronized(nativeLock) { vad.compute(chunk) }
|
||||
|
||||
if (prob >= threshold) {
|
||||
if (!inSpeech) {
|
||||
inSpeech = true
|
||||
silenceSamples = 0
|
||||
// 计算当前音频的RMS值(均方根)
|
||||
val rms = calculateRMS(chunk)
|
||||
|
||||
// 应用指数平滑
|
||||
smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms
|
||||
|
||||
// 动态调整增益因子,目标RMS设为0.1(约-20dB)
|
||||
val targetRMS = 0.1f
|
||||
var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f
|
||||
|
||||
// 设置增益的上下限,避免过度增益导致削波
|
||||
gainFactor = gainFactor.coerceIn(0.1f, 10.0f)
|
||||
|
||||
// 应用增益因子
|
||||
val processedChunk = FloatArray(chunk.size) {
|
||||
val value = chunk[it] * gainFactor
|
||||
// 限制音量范围,避免削波
|
||||
if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value
|
||||
}
|
||||
|
||||
// 使用处理后的音频数据
|
||||
val prob = synchronized(nativeLock) { vad.compute(processedChunk) }
|
||||
vadComputeCount++
|
||||
|
||||
// 记录VAD概率、时间戳、原始RMS值和平滑后的RMS值
|
||||
vadProbabilities.add(prob)
|
||||
vadTimestamps.add(System.currentTimeMillis())
|
||||
vadRMSValues.add(rms)
|
||||
vadSmoothedRMSValues.add(smoothedRms)
|
||||
|
||||
// 每100次循环输出一次VAD概率
|
||||
if (vadComputeCount % 100 == 0) {
|
||||
Log.d(TAG, "VAD prob=$prob, inSpeech=$inSpeech, rms=$rms, smoothedRms=$smoothedRms")
|
||||
}
|
||||
|
||||
// 双阈值状态机逻辑
|
||||
if (!inSpeech && prob >= startThreshold) {
|
||||
// 进入语音状态
|
||||
inSpeech = true
|
||||
silenceSamples = 0
|
||||
appendSpeech(chunk, processedChunk)
|
||||
Log.d(TAG, "VAD: Entered speech state, prob=$prob, speechLen=$speechLen")
|
||||
} else if (inSpeech && prob <= endThreshold) {
|
||||
// 开始计数静音样本
|
||||
silenceSamples += ret
|
||||
if (silenceSamples >= minSilenceSamples) {
|
||||
// 退出语音状态
|
||||
Log.d(TAG, "VAD: Exiting speech state, prob=$prob, silenceSamples=$silenceSamples, speechLen=$speechLen")
|
||||
finalizeSegmentIfAny()
|
||||
} else {
|
||||
// 保留尾音
|
||||
appendSpeech(chunk, processedChunk)
|
||||
}
|
||||
appendSpeech(chunk)
|
||||
} else if (inSpeech) {
|
||||
// 语音过程中,持续添加音频
|
||||
appendSpeech(chunk, processedChunk)
|
||||
silenceSamples = 0 // 重置静音计数
|
||||
|
||||
if (speechLen >= maxSpeechSamples) {
|
||||
Log.d(TAG, "VAD: Max speech length reached, finalizing segment")
|
||||
finalizeSegmentIfAny()
|
||||
}
|
||||
} else {
|
||||
if (inSpeech) {
|
||||
silenceSamples += ret
|
||||
if (silenceSamples >= minSilenceSamples) {
|
||||
finalizeSegmentIfAny()
|
||||
} else {
|
||||
// keep a bit of trailing silence to avoid chopping
|
||||
appendSpeech(chunk)
|
||||
}
|
||||
}
|
||||
}
|
||||
// 非语音状态且概率低于开始阈值,不做处理
|
||||
|
||||
// 每1000次循环输出一次VAD状态
|
||||
if (loopCount % 1000 == 0) {
|
||||
Log.d(TAG, "VAD status: inSpeech=$inSpeech, prob=$prob, speechLen=$speechLen")
|
||||
}
|
||||
|
||||
// 时间兜底切段(避免长时间无标点导致首包太慢)
|
||||
@@ -597,6 +761,58 @@ class MainActivity : AppCompatActivity() {
|
||||
|
||||
// flush last partial segment
|
||||
finalizeSegmentIfAny()
|
||||
|
||||
// 保存VAD数据到文件
|
||||
saveVadData(vadTimestamps, vadProbabilities, vadRMSValues, vadSmoothedRMSValues)
|
||||
}
|
||||
|
||||
/**
|
||||
* 计算音频数据的均方根(RMS)值,用于动态调整增益
|
||||
*/
|
||||
private fun calculateRMS(samples: FloatArray): Float {
|
||||
if (samples.isEmpty()) return 0.0f
|
||||
|
||||
var sumSquared = 0.0f
|
||||
for (sample in samples) {
|
||||
sumSquared += sample * sample
|
||||
}
|
||||
|
||||
val meanSquared = sumSquared / samples.size
|
||||
return kotlin.math.sqrt(meanSquared)
|
||||
}
|
||||
|
||||
/**
|
||||
* 保存VAD数据到文件,方便后续分析和绘图
|
||||
*/
|
||||
private fun saveVadData(timestamps: List<Long>, probabilities: List<Float>, rmsValues: List<Float>, smoothedRmsValues: List<Float>) {
|
||||
try {
|
||||
// 创建保存目录
|
||||
val vadDataDir = File(filesDir, "vad_data")
|
||||
if (!vadDataDir.exists()) {
|
||||
vadDataDir.mkdirs()
|
||||
}
|
||||
|
||||
// 生成唯一的文件名
|
||||
val timestamp = System.currentTimeMillis()
|
||||
val fileName = "vad_data_${timestamp}.csv"
|
||||
val outputFile = File(vadDataDir, fileName)
|
||||
|
||||
// 写入数据
|
||||
FileOutputStream(outputFile).use { fos ->
|
||||
// 写入表头
|
||||
fos.write("timestamp,probability,rms,smoothed_rms\n".toByteArray())
|
||||
|
||||
// 写入数据行
|
||||
for (i in timestamps.indices) {
|
||||
val line = "${timestamps[i]},${probabilities[i]},${rmsValues[i]},${smoothedRmsValues[i]}\n"
|
||||
fos.write(line.toByteArray())
|
||||
}
|
||||
}
|
||||
|
||||
Log.d(TAG, "Saved VAD data to: ${outputFile.absolutePath}")
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "Error saving VAD data: ${e.message}")
|
||||
}
|
||||
}
|
||||
|
||||
private fun removeTokens(text: String): String {
|
||||
@@ -608,8 +824,11 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
|
||||
private fun enqueueTtsSegment(seg: String) {
|
||||
// 移除句末的标点符号
|
||||
val cleanedSeg = seg.trimEnd('.', '。', '!', '!', '?', '?', ',', ',', ';', ';', ':', ':')
|
||||
|
||||
currentTrace?.markTtsRequestEnqueued()
|
||||
ttsQueue.offer(TtsQueueItem.Segment(seg))
|
||||
ttsQueue.offer(TtsQueueItem.Segment(cleanedSeg))
|
||||
ensureTtsWorker()
|
||||
}
|
||||
|
||||
@@ -624,34 +843,60 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
}
|
||||
|
||||
private fun ensureAsrWorker() {
|
||||
Log.d(TAG, "ensureAsrWorker called, asrWorkerRunning=${asrWorkerRunning.get()}")
|
||||
if (!asrWorkerRunning.compareAndSet(false, true)) {
|
||||
Log.d(TAG, "ASR worker already running, returning")
|
||||
return
|
||||
}
|
||||
Log.d(TAG, "Starting ASR worker coroutine")
|
||||
ioScope.launch {
|
||||
try {
|
||||
runAsrWorker()
|
||||
} finally {
|
||||
Log.d(TAG, "ASR worker coroutine finished")
|
||||
asrWorkerRunning.set(false)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private fun runTtsWorker() {
|
||||
val t = tts ?: return
|
||||
val audioTrack = track ?: return
|
||||
|
||||
var firstAudioMarked = false
|
||||
var isFirstSegment = true
|
||||
while (true) {
|
||||
val item = ttsQueue.take()
|
||||
if (ttsStopped.get()) break
|
||||
|
||||
when (item) {
|
||||
is TtsQueueItem.Segment -> {
|
||||
ttsPlaying.set(true)
|
||||
runOnUiThread { videoPlayerManager?.setSpeaking(true) }
|
||||
val trace = currentTrace
|
||||
trace?.markTtsSynthesisStart()
|
||||
Log.d(TAG, "TTS started: processing segment '${item.text}'")
|
||||
runOnUiThread {
|
||||
appendToUi("\n[TTS] 开始合成...\n")
|
||||
}
|
||||
|
||||
val startMs = System.currentTimeMillis()
|
||||
var firstPcmMarked = false
|
||||
|
||||
// flush to reduce latency between segments
|
||||
try {
|
||||
audioTrack.pause()
|
||||
audioTrack.flush()
|
||||
audioTrack.play()
|
||||
} catch (_: Throwable) {
|
||||
if (isFirstSegment) {
|
||||
try {
|
||||
audioTrack.pause()
|
||||
audioTrack.flush()
|
||||
audioTrack.play()
|
||||
} catch (_: Throwable) {
|
||||
}
|
||||
isFirstSegment = false
|
||||
}
|
||||
|
||||
t.generateWithCallback(
|
||||
text = item.text,
|
||||
sid = 0,
|
||||
sid = 2, // 这里可以修改说话人
|
||||
speed = 1.0f
|
||||
) { samples ->
|
||||
if (ttsStopped.get()) return@generateWithCallback 0
|
||||
@@ -664,6 +909,7 @@ class MainActivity : AppCompatActivity() {
|
||||
trace?.markTtsFirstAudioPlay()
|
||||
}
|
||||
audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING)
|
||||
ttsTotalSamplesWritten += samples.size
|
||||
1
|
||||
}
|
||||
|
||||
@@ -672,6 +918,21 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
|
||||
TtsQueueItem.End -> {
|
||||
// 清空 ASR 队列,丢弃所有未处理的段(这些可能是 TTS 播放期间的回声)
|
||||
while (asrQueue.tryReceive().isSuccess) { }
|
||||
|
||||
waitForPlaybackComplete(audioTrack)
|
||||
val ttsCompleteTime = System.currentTimeMillis()
|
||||
|
||||
// 在主线程更新UI
|
||||
runOnUiThread {
|
||||
appendToUi("\n[LOG] TTS completed at: ${ttsCompleteTime}\n")
|
||||
}
|
||||
|
||||
ttsPlaying.set(false)
|
||||
runOnUiThread { videoPlayerManager?.setSpeaking(false) }
|
||||
ttsTotalSamplesWritten = 0
|
||||
isFirstSegment = true
|
||||
currentTrace?.markTtsDone()
|
||||
TraceManager.getInstance().endTurn()
|
||||
currentTrace = null
|
||||
@@ -681,9 +942,257 @@ class MainActivity : AppCompatActivity() {
|
||||
}
|
||||
}
|
||||
|
||||
private fun waitForPlaybackComplete(audioTrack: AudioTrack) {
|
||||
val totalSamples = ttsTotalSamplesWritten
|
||||
if (totalSamples <= 0) return
|
||||
|
||||
val sampleRate = audioTrack.sampleRate
|
||||
val timeoutMs = (totalSamples * 1000 / sampleRate) + 2000
|
||||
val startTime = System.currentTimeMillis()
|
||||
|
||||
while (true) {
|
||||
if (ttsStopped.get()) break
|
||||
|
||||
val playbackPos = audioTrack.playbackHeadPosition.toLong()
|
||||
if (playbackPos >= totalSamples) {
|
||||
break
|
||||
}
|
||||
|
||||
if (System.currentTimeMillis() - startTime > timeoutMs) {
|
||||
Log.w(TAG, "waitForPlaybackComplete timeout, pos=$playbackPos, total=$totalSamples")
|
||||
break
|
||||
}
|
||||
|
||||
Thread.sleep(20)
|
||||
}
|
||||
// 直接等待 1000ms,确保所有缓冲区清空
|
||||
Thread.sleep(1000)
|
||||
}
|
||||
|
||||
private suspend fun runAsrWorker() {
|
||||
Log.d(TAG, "ASR worker started")
|
||||
try {
|
||||
while (ioScope.coroutineContext.isActive) {
|
||||
val (originalSeg, processedSeg) = try {
|
||||
Log.d(TAG, "ASR worker waiting for audio segment")
|
||||
asrQueue.receive()
|
||||
} catch (e: Throwable) {
|
||||
Log.e(TAG, "ASR worker receive failed: ${e.message}")
|
||||
break
|
||||
}
|
||||
|
||||
Log.d(TAG, "ASR worker received audio segment, size=${processedSeg.size}")
|
||||
|
||||
// 每次只允许一个 LLM 请求在飞,避免堆积导致卡死/竞态
|
||||
// TTS 播放期间不做 ASR,避免识别到 TTS 播放的声音
|
||||
if (llmInFlight || ttsPlaying.get()) {
|
||||
Log.d(TAG, "ASR worker skipping segment: llmInFlight=$llmInFlight, ttsPlaying=${ttsPlaying.get()}")
|
||||
continue
|
||||
}
|
||||
|
||||
val trace = currentTrace
|
||||
trace?.markASRStart()
|
||||
Log.d(TAG, "ASR started: processing audio segment")
|
||||
withContext(Dispatchers.Main) {
|
||||
appendToUi("\n[ASR] 开始识别...\n")
|
||||
}
|
||||
|
||||
// 保存ASR音频用于调试
|
||||
saveAsrAudio(originalSeg, processedSeg)
|
||||
|
||||
val raw = synchronized(nativeLock) {
|
||||
val e = senseVoice
|
||||
if (e == null || !e.isInitialized) {
|
||||
Log.e(TAG, "ASR failed: SenseVoice engine not initialized")
|
||||
""
|
||||
} else {
|
||||
try {
|
||||
e.transcribeBuffer(processedSeg)
|
||||
} catch (e: Throwable) {
|
||||
Log.e(TAG, "ASR transcribe failed: ${e.message}")
|
||||
""
|
||||
}
|
||||
}
|
||||
}
|
||||
Log.d(TAG, "ASR raw result: $raw")
|
||||
val text = removeTokens(raw)
|
||||
|
||||
// 添加过滤逻辑
|
||||
if (text.isBlank()) {
|
||||
Log.d(TAG, "ASR segment skipped: blank text")
|
||||
continue
|
||||
}
|
||||
// 过滤英文单字符"i"
|
||||
if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
|
||||
Log.d(TAG, "ASR segment skipped: single 'i'")
|
||||
continue
|
||||
}
|
||||
// 过滤超过50个字符的长文本
|
||||
if (text.length > 50) {
|
||||
Log.d(TAG, "ASR segment skipped: too long (${text.length} chars)")
|
||||
continue
|
||||
}
|
||||
|
||||
trace?.markASREnd()
|
||||
|
||||
withContext(Dispatchers.Main) {
|
||||
appendToUi("\n\n[ASR] ${text}\n")
|
||||
}
|
||||
|
||||
trace?.markRecordingDone()
|
||||
trace?.markLlmResponseReceived()
|
||||
|
||||
if (BuildConfig.LLM_API_KEY.isBlank()) {
|
||||
withContext(Dispatchers.Main) {
|
||||
Toast.makeText(
|
||||
this@MainActivity,
|
||||
"未配置 LLM_API_KEY(在 local.properties 或 gradle.properties 里设置)",
|
||||
Toast.LENGTH_LONG
|
||||
).show()
|
||||
}
|
||||
continue
|
||||
}
|
||||
|
||||
llmInFlight = true
|
||||
Log.d(TAG, "Calling LLM with text: $text")
|
||||
cloudApiManager.callLLM(text)
|
||||
}
|
||||
} catch (e: Throwable) {
|
||||
Log.e(TAG, "ASR worker error: ${e.message}", e)
|
||||
} finally {
|
||||
Log.d(TAG, "ASR worker exiting")
|
||||
}
|
||||
}
|
||||
|
||||
private fun appendToUi(s: String) {
|
||||
lastUiText += s
|
||||
textView.text = lastUiText
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 保存ASR音频用于调试
|
||||
*/
|
||||
private fun saveAsrAudio(originalAudio: FloatArray, processedAudio: FloatArray) {
|
||||
try {
|
||||
// 创建保存目录
|
||||
val asrAudioDir = File(filesDir, "asr_audio")
|
||||
if (!asrAudioDir.exists()) {
|
||||
asrAudioDir.mkdirs()
|
||||
}
|
||||
|
||||
// 生成唯一的文件名
|
||||
val timestamp = System.currentTimeMillis()
|
||||
|
||||
// 保存原始音频
|
||||
val originalFile = File(asrAudioDir, "asr_${timestamp}_original.wav")
|
||||
saveFloatArrayAsWav(originalFile, originalAudio, sampleRateInHz)
|
||||
Log.d(TAG, "Saved original ASR audio to: ${originalFile.absolutePath}")
|
||||
|
||||
// 保存处理后的音频(增益后)
|
||||
val processedFile = File(asrAudioDir, "asr_${timestamp}_processed.wav")
|
||||
saveFloatArrayAsWav(processedFile, processedAudio, sampleRateInHz)
|
||||
Log.d(TAG, "Saved processed ASR audio to: ${processedFile.absolutePath}")
|
||||
} catch (e: Exception) {
|
||||
Log.e(TAG, "Error saving ASR audio: ${e.message}")
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 将FloatArray保存为WAV文件
|
||||
*/
|
||||
private fun saveFloatArrayAsWav(file: File, samples: FloatArray, sampleRate: Int) {
|
||||
FileOutputStream(file).use { fos ->
|
||||
// WAV文件头
|
||||
val header = ByteArray(44)
|
||||
|
||||
// RIFF标识
|
||||
header[0] = 'R'.code.toByte()
|
||||
header[1] = 'I'.code.toByte()
|
||||
header[2] = 'F'.code.toByte()
|
||||
header[3] = 'F'.code.toByte()
|
||||
|
||||
// 文件大小(不包括RIFF标识和文件大小字段本身)
|
||||
val fileSize = 36 + samples.size * 2
|
||||
intToByteArray(fileSize, header, 4)
|
||||
|
||||
// WAVE标识
|
||||
header[8] = 'W'.code.toByte()
|
||||
header[9] = 'A'.code.toByte()
|
||||
header[10] = 'V'.code.toByte()
|
||||
header[11] = 'E'.code.toByte()
|
||||
|
||||
// fmt标识
|
||||
header[12] = 'f'.code.toByte()
|
||||
header[13] = 'm'.code.toByte()
|
||||
header[14] = 't'.code.toByte()
|
||||
header[15] = ' '.code.toByte()
|
||||
|
||||
// 子块大小
|
||||
intToByteArray(16, header, 16)
|
||||
|
||||
// 音频格式(1 = PCM)
|
||||
shortToByteArray(1, header, 20)
|
||||
|
||||
// 声道数(1 = 单声道)
|
||||
shortToByteArray(1, header, 22)
|
||||
|
||||
// 采样率
|
||||
intToByteArray(sampleRate, header, 24)
|
||||
|
||||
// 字节率 = 采样率 * 声道数 * 位深度 / 8
|
||||
val byteRate = sampleRate * 1 * 16 / 8
|
||||
intToByteArray(byteRate, header, 28)
|
||||
|
||||
// 块对齐 = 声道数 * 位深度 / 8
|
||||
val blockAlign = 1 * 16 / 8
|
||||
shortToByteArray(blockAlign.toShort(), header, 32)
|
||||
|
||||
// 位深度(16位)
|
||||
shortToByteArray(16, header, 34)
|
||||
|
||||
// data标识
|
||||
header[36] = 'd'.code.toByte()
|
||||
header[37] = 'a'.code.toByte()
|
||||
header[38] = 't'.code.toByte()
|
||||
header[39] = 'a'.code.toByte()
|
||||
|
||||
// 数据大小
|
||||
val dataSize = samples.size * 2
|
||||
intToByteArray(dataSize, header, 40)
|
||||
|
||||
// 写入文件头
|
||||
fos.write(header)
|
||||
|
||||
// 写入音频数据(转换为16位PCM)
|
||||
for (sample in samples) {
|
||||
// 确保样本在[-1, 1]范围内
|
||||
val clampedSample = sample.coerceIn(-1.0f, 1.0f)
|
||||
// 转换为16位整数
|
||||
val shortSample = (clampedSample * 32767.0f).toInt().toShort()
|
||||
// 写入小端序
|
||||
val bytes = ByteArray(2)
|
||||
bytes[0] = (shortSample.toInt() and 0xFF).toByte()
|
||||
bytes[1] = (shortSample.toInt() shr 8 and 0xFF).toByte()
|
||||
fos.write(bytes)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 将int转换为小端序字节数组
|
||||
*/
|
||||
private fun intToByteArray(value: Int, dest: ByteArray, offset: Int) {
|
||||
dest[offset] = (value and 0xFF).toByte()
|
||||
dest[offset + 1] = (value shr 8 and 0xFF).toByte()
|
||||
dest[offset + 2] = (value shr 16 and 0xFF).toByte()
|
||||
dest[offset + 3] = (value shr 24 and 0xFF).toByte()
|
||||
}
|
||||
|
||||
/**
|
||||
* 将short转换为小端序字节数组
|
||||
*/
|
||||
private fun shortToByteArray(value: Short, dest: ByteArray, offset: Int) {
|
||||
dest[offset] = (value.toInt() and 0xFF).toByte()
|
||||
dest[offset + 1] = (value.toInt() shr 8 and 0xFF).toByte()
|
||||
}
|
||||
}
|
||||
@@ -1,10 +1,12 @@
|
||||
package com.digitalperson.cloud;
|
||||
|
||||
import android.content.Context;
|
||||
import android.os.Handler;
|
||||
import android.os.Looper;
|
||||
import android.util.Log;
|
||||
|
||||
import com.digitalperson.BuildConfig;
|
||||
import com.digitalperson.R;
|
||||
|
||||
import org.json.JSONArray;
|
||||
import org.json.JSONException;
|
||||
@@ -30,6 +32,7 @@ public class CloudApiManager {
|
||||
private CloudApiListener mListener;
|
||||
private Handler mMainHandler; // 用于在主线程执行UI更新
|
||||
private JSONArray mConversationHistory; // 存储对话历史
|
||||
private boolean mEnableStreaming = true; // 默认启用流式输出
|
||||
|
||||
public interface CloudApiListener {
|
||||
void onLLMResponseReceived(String response);
|
||||
@@ -38,10 +41,37 @@ public class CloudApiManager {
|
||||
void onError(String errorMessage);
|
||||
}
|
||||
|
||||
public CloudApiManager(CloudApiListener listener) {
|
||||
public CloudApiManager(CloudApiListener listener, Context context) {
|
||||
this.mListener = listener;
|
||||
this.mMainHandler = new Handler(Looper.getMainLooper()); // 初始化主线程Handler
|
||||
this.mConversationHistory = new JSONArray(); // 初始化对话历史
|
||||
|
||||
// 添加 system message,要求回答简洁
|
||||
try {
|
||||
JSONObject systemMessage = new JSONObject();
|
||||
systemMessage.put("role", "system");
|
||||
String systemPrompt = context.getString(R.string.system_prompt);
|
||||
systemMessage.put("content", systemPrompt);
|
||||
mConversationHistory.put(systemMessage);
|
||||
} catch (JSONException e) {
|
||||
Log.e(TAG, "Failed to add system message: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* 设置是否启用流式输出
|
||||
* @param enableStreaming true: 启用流式输出,false: 禁用流式输出(整段输出)
|
||||
*/
|
||||
public void setEnableStreaming(boolean enableStreaming) {
|
||||
this.mEnableStreaming = enableStreaming;
|
||||
}
|
||||
|
||||
/**
|
||||
* 获取当前是否启用流式输出
|
||||
* @return true: 启用流式输出,false: 禁用流式输出(整段输出)
|
||||
*/
|
||||
public boolean isEnableStreaming() {
|
||||
return mEnableStreaming;
|
||||
}
|
||||
|
||||
public void callLLM(String userInput) {
|
||||
@@ -64,7 +94,7 @@ public class CloudApiManager {
|
||||
JSONObject requestBody = new JSONObject();
|
||||
requestBody.put("model", LLM_MODEL);
|
||||
requestBody.put("messages", mConversationHistory);
|
||||
requestBody.put("stream", true); // 启用流式响应
|
||||
requestBody.put("stream", mEnableStreaming); // 根据配置决定是否启用流式响应
|
||||
|
||||
String jsonBody = requestBody.toString();
|
||||
|
||||
@@ -84,47 +114,74 @@ public class CloudApiManager {
|
||||
Log.d(TAG, "LLM Response Code: " + responseCode);
|
||||
|
||||
if (responseCode == 200) {
|
||||
// 逐行读取流式响应
|
||||
try (BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
Log.d(TAG, "LLM Streaming Line: " + line);
|
||||
|
||||
// 处理SSE格式的响应
|
||||
if (line.startsWith("data: ")) {
|
||||
String dataPart = line.substring(6);
|
||||
if (dataPart.equals("[DONE]")) {
|
||||
// 流式响应结束
|
||||
break;
|
||||
}
|
||||
if (mEnableStreaming) {
|
||||
// 逐行读取流式响应
|
||||
try (BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
Log.d(TAG, "LLM Streaming Line: " + line);
|
||||
|
||||
try {
|
||||
// 解析JSON
|
||||
JSONObject chunkObj = new JSONObject(dataPart);
|
||||
JSONArray choices = chunkObj.getJSONArray("choices");
|
||||
if (choices.length() > 0) {
|
||||
JSONObject choice = choices.getJSONObject(0);
|
||||
JSONObject delta = choice.getJSONObject("delta");
|
||||
|
||||
if (delta.has("content")) {
|
||||
String chunkContent = delta.getString("content");
|
||||
accumulatedContent.append(chunkContent);
|
||||
// 处理SSE格式的响应
|
||||
if (line.startsWith("data: ")) {
|
||||
String dataPart = line.substring(6);
|
||||
if (dataPart.equals("[DONE]")) {
|
||||
// 流式响应结束
|
||||
break;
|
||||
}
|
||||
|
||||
try {
|
||||
// 解析JSON
|
||||
JSONObject chunkObj = new JSONObject(dataPart);
|
||||
JSONArray choices = chunkObj.getJSONArray("choices");
|
||||
if (choices.length() > 0) {
|
||||
JSONObject choice = choices.getJSONObject(0);
|
||||
JSONObject delta = choice.getJSONObject("delta");
|
||||
|
||||
// 发送流式chunk到监听器
|
||||
if (mListener != null) {
|
||||
mMainHandler.post(() -> {
|
||||
mListener.onLLMStreamingChunkReceived(chunkContent);
|
||||
});
|
||||
if (delta.has("content")) {
|
||||
String chunkContent = delta.getString("content");
|
||||
accumulatedContent.append(chunkContent);
|
||||
|
||||
// 发送流式chunk到监听器
|
||||
if (mListener != null) {
|
||||
mMainHandler.post(() -> {
|
||||
mListener.onLLMStreamingChunkReceived(chunkContent);
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (JSONException e) {
|
||||
Log.e(TAG, "Failed to parse streaming chunk: " + e.getMessage());
|
||||
}
|
||||
} catch (JSONException e) {
|
||||
Log.e(TAG, "Failed to parse streaming chunk: " + e.getMessage());
|
||||
}
|
||||
|
||||
fullResponse.append(line).append("\n");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 读取完整响应
|
||||
try (BufferedReader br = new BufferedReader(
|
||||
new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) {
|
||||
String line;
|
||||
while ((line = br.readLine()) != null) {
|
||||
fullResponse.append(line);
|
||||
}
|
||||
}
|
||||
|
||||
// 解析完整JSON响应
|
||||
try {
|
||||
JSONObject responseObj = new JSONObject(fullResponse.toString());
|
||||
JSONArray choices = responseObj.getJSONArray("choices");
|
||||
if (choices.length() > 0) {
|
||||
JSONObject choice = choices.getJSONObject(0);
|
||||
JSONObject message = choice.getJSONObject("message");
|
||||
if (message.has("content")) {
|
||||
String content = message.getString("content");
|
||||
accumulatedContent.append(content);
|
||||
}
|
||||
}
|
||||
|
||||
fullResponse.append(line).append("\n");
|
||||
} catch (JSONException e) {
|
||||
Log.e(TAG, "Failed to parse full response: " + e.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -38,13 +38,19 @@ public class TraceSession {
|
||||
long newValue = (currentValue != null) ? currentValue + deltaMs : deltaMs;
|
||||
if (currentValue == null) {
|
||||
// 如果键不存在,尝试添加
|
||||
if (durations.putIfAbsent(name, newValue) == null) {
|
||||
break;
|
||||
synchronized (durations) {
|
||||
if (!durations.containsKey(name)) {
|
||||
durations.put(name, newValue);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// 如果键存在,尝试更新
|
||||
if (durations.replace(name, currentValue, newValue)) {
|
||||
break;
|
||||
synchronized (durations) {
|
||||
if (durations.containsKey(name) && durations.get(name).equals(currentValue)) {
|
||||
durations.put(name, newValue);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,99 @@
|
||||
package com.digitalperson.player
|
||||
|
||||
import android.content.Context
|
||||
import android.net.Uri
|
||||
import android.view.View
|
||||
import com.digitalperson.R
|
||||
import com.google.android.exoplayer2.ExoPlayer
|
||||
import com.google.android.exoplayer2.MediaItem
|
||||
import com.google.android.exoplayer2.Player
|
||||
import com.google.android.exoplayer2.ui.PlayerView
|
||||
|
||||
class VideoPlayerManager(
|
||||
private val context: Context,
|
||||
private val silentView: PlayerView,
|
||||
private val speakingView: PlayerView
|
||||
) {
|
||||
private var playerSilent: ExoPlayer? = null
|
||||
private var playerSpeaking: ExoPlayer? = null
|
||||
private var currentState: Boolean = false
|
||||
private var transitionDuration = 300L // 淡入淡出时长
|
||||
|
||||
init {
|
||||
// 确保初始 alpha
|
||||
silentView.alpha = 1f
|
||||
speakingView.alpha = 0f
|
||||
initPlayers()
|
||||
}
|
||||
|
||||
private fun uriForRaw(resId: Int): Uri = Uri.parse("android.resource://${context.packageName}/$resId")
|
||||
|
||||
private fun initPlayers() {
|
||||
playerSilent = ExoPlayer.Builder(context).build().apply {
|
||||
repeatMode = Player.REPEAT_MODE_ONE
|
||||
playWhenReady = true
|
||||
setMediaItem(MediaItem.fromUri(uriForRaw(R.raw.silent)))
|
||||
prepare()
|
||||
}
|
||||
|
||||
playerSpeaking = ExoPlayer.Builder(context).build().apply {
|
||||
repeatMode = Player.REPEAT_MODE_ONE
|
||||
playWhenReady = true
|
||||
setMediaItem(MediaItem.fromUri(uriForRaw(R.raw.speak_no_voice)))
|
||||
prepare()
|
||||
}
|
||||
|
||||
// 绑定到各自的 PlayerView
|
||||
silentView.player = playerSilent
|
||||
speakingView.player = playerSpeaking
|
||||
|
||||
// 静音视频音频输出(通常不需要声音)
|
||||
playerSilent?.volume = 0f
|
||||
playerSpeaking?.volume = 0f
|
||||
|
||||
// 启动播放(prepare 后自动播放)
|
||||
playerSilent?.play()
|
||||
playerSpeaking?.play()
|
||||
|
||||
// 确保初始 alpha 状态(防止 Surface/Texture 的 race)
|
||||
silentView.alpha = 1f
|
||||
speakingView.alpha = 0f
|
||||
currentState = false
|
||||
}
|
||||
|
||||
/**
|
||||
* 切换到说话状态:speaking=true 播放 speakingView(alpha 1),silentView 渐隐
|
||||
*/
|
||||
fun setSpeaking(speaking: Boolean) {
|
||||
if (speaking == currentState) return
|
||||
currentState = speaking
|
||||
|
||||
// 同步位置:以 silent 为主(也可以反向)
|
||||
syncPositions()
|
||||
|
||||
val fadeInView = if (speaking) speakingView else silentView
|
||||
val fadeOutView = if (speaking) silentView else speakingView
|
||||
|
||||
// 执行淡入淡出
|
||||
fadeOutView.animate().alpha(0f).setDuration(transitionDuration).start()
|
||||
fadeInView.visibility = View.VISIBLE
|
||||
fadeInView.animate().alpha(1f).setDuration(transitionDuration).start()
|
||||
}
|
||||
|
||||
private fun syncPositions() {
|
||||
// 以 silent 为主:将 speaking 同步到 silent 的位置
|
||||
try {
|
||||
val pos = playerSilent?.currentPosition ?: 0L
|
||||
playerSpeaking?.seekTo(pos)
|
||||
} catch (_: Throwable) {}
|
||||
}
|
||||
|
||||
fun release() {
|
||||
try { silentView.player = null } catch (_: Throwable) {}
|
||||
try { speakingView.player = null } catch (_: Throwable) {}
|
||||
try { playerSilent?.release() } catch (_: Throwable) {}
|
||||
try { playerSpeaking?.release() } catch (_: Throwable) {}
|
||||
playerSilent = null
|
||||
playerSpeaking = null
|
||||
}
|
||||
}
|
||||
@@ -4,8 +4,39 @@
|
||||
xmlns:tools="http://schemas.android.com/tools"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent"
|
||||
android:background="#606060"
|
||||
tools:context="com.digitalperson.MainActivity">
|
||||
|
||||
<!-- 双播放器容器:两个重叠的 PlayerView(silent 在下面,speaking 在上面,初始 alpha=0) -->
|
||||
<FrameLayout
|
||||
android:id="@+id/video_container"
|
||||
android:layout_width="0dp"
|
||||
android:layout_height="0dp"
|
||||
app:layout_constraintTop_toTopOf="parent"
|
||||
app:layout_constraintBottom_toBottomOf="parent"
|
||||
app:layout_constraintStart_toStartOf="parent"
|
||||
app:layout_constraintEnd_toEndOf="parent">
|
||||
|
||||
<com.google.android.exoplayer2.ui.PlayerView
|
||||
android:id="@+id/player_view_silent"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent"
|
||||
app:use_controller="false"
|
||||
app:resize_mode="fill"
|
||||
app:surface_type="texture_view"
|
||||
android:alpha="1" />
|
||||
|
||||
<com.google.android.exoplayer2.ui.PlayerView
|
||||
android:id="@+id/player_view_speaking"
|
||||
android:layout_width="match_parent"
|
||||
android:layout_height="match_parent"
|
||||
app:use_controller="false"
|
||||
app:resize_mode="fill"
|
||||
app:surface_type="texture_view"
|
||||
android:alpha="0" />
|
||||
|
||||
</FrameLayout>
|
||||
|
||||
<TextView
|
||||
android:id="@+id/my_text"
|
||||
android:layout_width="0dp"
|
||||
@@ -14,10 +45,37 @@
|
||||
android:scrollbars="vertical"
|
||||
android:text="@string/hint"
|
||||
android:textIsSelectable="true"
|
||||
app:layout_constraintBottom_toTopOf="@+id/button_row"
|
||||
app:layout_constraintBottom_toTopOf="@+id/streaming_switch_row"
|
||||
app:layout_constraintEnd_toEndOf="parent"
|
||||
app:layout_constraintStart_toStartOf="parent"
|
||||
app:layout_constraintTop_toTopOf="parent" />
|
||||
app:layout_constraintTop_toTopOf="parent"
|
||||
android:background="@android:color/transparent"
|
||||
/>
|
||||
|
||||
<LinearLayout
|
||||
android:id="@+id/streaming_switch_row"
|
||||
android:layout_width="0dp"
|
||||
android:layout_height="wrap_content"
|
||||
android:gravity="center_vertical"
|
||||
android:orientation="horizontal"
|
||||
android:padding="16dp"
|
||||
app:layout_constraintBottom_toTopOf="@+id/button_row"
|
||||
app:layout_constraintEnd_toEndOf="parent"
|
||||
app:layout_constraintStart_toStartOf="parent">
|
||||
|
||||
<TextView
|
||||
android:layout_width="wrap_content"
|
||||
android:layout_height="wrap_content"
|
||||
android:text="流式输出"
|
||||
android:textSize="16sp"
|
||||
android:layout_marginEnd="16dp"/>
|
||||
|
||||
<Switch
|
||||
android:id="@+id/streaming_switch"
|
||||
android:layout_width="wrap_content"
|
||||
android:layout_height="wrap_content"
|
||||
android:checked="false"/>
|
||||
</LinearLayout>
|
||||
|
||||
<LinearLayout
|
||||
android:id="@+id/button_row"
|
||||
|
||||
BIN
app/src/main/res/raw/silent.mp4
Normal file
BIN
app/src/main/res/raw/silent.mp4
Normal file
Binary file not shown.
BIN
app/src/main/res/raw/speak_no_voice.mp4
Normal file
BIN
app/src/main/res/raw/speak_no_voice.mp4
Normal file
Binary file not shown.
@@ -3,4 +3,5 @@
|
||||
<string name="start">开始</string>
|
||||
<string name="stop">结束</string>
|
||||
<string name="hint">点击“开始”说话;识别后会请求大模型并用 TTS 播放回复。</string>
|
||||
<string name="system_prompt">你是一名小学女老师,喜欢回答学生的各种问题,请简洁但温柔地回答,每个回答不超过30字。</string>
|
||||
</resources>
|
||||
|
||||
Reference in New Issue
Block a user