diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..c5f3f6b --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "java.configuration.updateBuildConfiguration": "interactive" +} \ No newline at end of file diff --git a/app/build.gradle b/app/build.gradle index 1f38bfa..3ca1143 100644 --- a/app/build.gradle +++ b/app/build.gradle @@ -70,4 +70,6 @@ dependencies { testImplementation 'junit:junit:4.13.2' androidTestImplementation 'androidx.test.ext:junit:1.1.5' androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.1' + // ExoPlayer for video playback (used to show silent / speaking videos) + implementation 'com.google.android.exoplayer:exoplayer:2.18.6' } diff --git a/app/src/main/java/com/digital_person/MainActivity.kt.bak b/app/src/main/java/com/digital_person/MainActivity.kt.bak new file mode 100644 index 0000000..0ea208d --- /dev/null +++ b/app/src/main/java/com/digital_person/MainActivity.kt.bak @@ -0,0 +1,957 @@ +package com.digitalperson + +import android.Manifest +import android.content.pm.PackageManager +import android.media.AudioAttributes +import android.media.AudioFormat +import android.media.AudioManager +import android.media.AudioRecord +import android.media.AudioTrack +import android.media.MediaRecorder +import android.media.audiofx.AcousticEchoCanceler +import android.media.audiofx.NoiseSuppressor +import android.os.Bundle +import android.os.SystemClock +import android.text.method.ScrollingMovementMethod +import android.util.Log +import android.widget.Button +import android.widget.TextView +import android.widget.Toast +import androidx.appcompat.app.AppCompatActivity +import androidx.core.app.ActivityCompat +import com.digitalperson.cloud.CloudApiManager +import com.digitalperson.player.VideoPlayerManager +import com.google.android.exoplayer2.ui.PlayerView +import com.digitalperson.engine.SenseVoiceEngineRKNN +import com.digitalperson.metrics.TraceManager +import 
com.digitalperson.metrics.TraceSession +import com.k2fsa.sherpa.onnx.OfflineTts +import com.k2fsa.sherpa.onnx.SileroVadModelConfig +import com.k2fsa.sherpa.onnx.Vad +import com.k2fsa.sherpa.onnx.VadModelConfig +import com.k2fsa.sherpa.onnx.getOfflineTtsConfig +import kotlinx.coroutines.CoroutineScope +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.Job +import kotlinx.coroutines.SupervisorJob +import kotlinx.coroutines.cancel +import kotlinx.coroutines.channels.Channel +import kotlinx.coroutines.isActive +import kotlinx.coroutines.launch +import kotlinx.coroutines.withContext +import java.io.File +import java.io.FileOutputStream +import java.util.concurrent.LinkedBlockingQueue +import java.util.concurrent.atomic.AtomicBoolean +import kotlin.math.max + +private const val TAG = "DigitalPerson" +private const val REQUEST_RECORD_AUDIO_PERMISSION = 200 + +class MainActivity : AppCompatActivity() { + + private lateinit var startButton: Button + private lateinit var stopButton: Button + private lateinit var textView: TextView + + private lateinit var vad: Vad + private var senseVoice: SenseVoiceEngineRKNN? = null + private var tts: OfflineTts? = null + private var track: AudioTrack? = null + + private var aec: AcousticEchoCanceler? = null + private var ns: NoiseSuppressor? = null + + private var audioRecord: AudioRecord? = null + private val audioSource = MediaRecorder.AudioSource.MIC + private val sampleRateInHz = 16000 + private val channelConfig = AudioFormat.CHANNEL_IN_MONO + private val audioFormat = AudioFormat.ENCODING_PCM_16BIT + private val permissions: Array = arrayOf(Manifest.permission.RECORD_AUDIO) + + @Volatile + private var isRecording: Boolean = false + + private val ioScope = CoroutineScope(SupervisorJob() + Dispatchers.IO) + private var recordingJob: Job? = null + private val nativeLock = Any() + + private lateinit var cloudApiManager: CloudApiManager + private var videoPlayerManager: VideoPlayerManager? 
/**
 * Handles the result of the RECORD_AUDIO permission request issued by this activity.
 *
 * The activity cannot work without the microphone, so it closes itself when the
 * permission is denied (an empty [grantResults] is treated as a denial).
 * Callbacks for any other request code are ignored instead of being treated as a denial.
 *
 * @param requestCode  code that was passed to `requestPermissions`
 * @param permissions  the permissions that were requested
 * @param grantResults grant state for each entry of [permissions]
 */
override fun onRequestPermissionsResult(
    requestCode: Int,
    permissions: Array<String>,
    grantResults: IntArray
) {
    super.onRequestPermissionsResult(requestCode, permissions, grantResults)

    // Only react to the request this activity issued; previously any unrelated
    // request code made `ok` false and closed the activity.
    if (requestCode != REQUEST_RECORD_AUDIO_PERMISSION) return

    val granted = grantResults.firstOrNull() == PackageManager.PERMISSION_GRANTED
    if (!granted) {
        Log.e(TAG, "Audio record is disallowed")
        finish()
    }
}
{ + super.onCreate(savedInstanceState) + setContentView(R.layout.activity_main) + + // 初始化双播放器管理器(silent 与 speaking 两个叠加的 PlayerView) + try { + val silentPv = findViewById(R.id.player_view_silent) + val speakingPv = findViewById(R.id.player_view_speaking) + videoPlayerManager = VideoPlayerManager(this, silentPv, speakingPv) + // 默认 AI 未说话 + videoPlayerManager?.setSpeaking(false) + } catch (e: Exception) { + Log.w(TAG, "PlayerViews not found or init failed: ${e.message}") + } + + ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION) + + startButton = findViewById(R.id.start_button) + stopButton = findViewById(R.id.stop_button) + textView = findViewById(R.id.my_text) + textView.movementMethod = ScrollingMovementMethod() + + startButton.setOnClickListener { onStartClicked() } + stopButton.setOnClickListener { onStopClicked(userInitiated = true) } + + // 初始化流式输出开关 + try { + val streamingSwitch = findViewById(R.id.streaming_switch) + streamingSwitch.isChecked = enableStreaming + streamingSwitch.setOnCheckedChangeListener { _, isChecked -> + enableStreaming = isChecked + cloudApiManager.setEnableStreaming(isChecked) + Toast.makeText(this, "流式输出已${if (isChecked) "启用" else "禁用"}", Toast.LENGTH_SHORT).show() + } + } catch (e: Exception) { + Log.w(TAG, "Streaming switch not found in layout: ${e.message}") + } + + // 避免 UI 线程重初始化导致 ANR:在后台初始化模型与 AudioTrack + startButton.isEnabled = false + stopButton.isEnabled = false + textView.text = "初始化中…" + ioScope.launch { + try { + Log.i(TAG, "Init VAD + SenseVoice(RKNN) + TTS (background)") + synchronized(nativeLock) { + initVadModel() + initSenseVoiceModel() + } + withContext(Dispatchers.Main) { + initTtsAndAudioTrack() + textView.text = getString(R.string.hint) + startButton.isEnabled = true + stopButton.isEnabled = false + } + } catch (t: Throwable) { + Log.e(TAG, "Initialization failed: ${t.message}", t) + withContext(Dispatchers.Main) { + textView.text = "初始化失败:${t.javaClass.simpleName}: 
/**
 * Releases everything the activity owns: stops recording/playback, cancels the
 * background scope, and frees the native VAD / ASR / TTS engines, the AudioTrack
 * and the video players.
 *
 * Every release is best-effort so that one failure cannot prevent the remaining
 * resources from being freed; failures are logged instead of silently dropped.
 */
override fun onDestroy() {
    super.onDestroy()
    onStopClicked(userInitiated = false)
    ioScope.cancel()

    fun releaseQuietly(what: String, action: () -> Unit) {
        try {
            action()
        } catch (t: Throwable) {
            Log.w(TAG, "Failed to release $what: ${t.message}")
        }
    }

    // Native engines are guarded by nativeLock everywhere else, so release them under it too.
    synchronized(nativeLock) {
        // vad is lateinit and stays unset when background initialization failed.
        if (::vad.isInitialized) releaseQuietly("VAD") { vad.release() }
        releaseQuietly("SenseVoice") { senseVoice?.deinitialize() }
        senseVoice = null
    }

    releaseQuietly("TTS") { tts?.release() }
    tts = null

    // The AudioTrack created in initTtsAndAudioTrack() was never released before.
    releaseQuietly("AudioTrack") { track?.release() }
    track = null

    releaseQuietly("VideoPlayerManager") { videoPlayerManager?.release() }
    videoPlayerManager = null
}
/**
 * Stops capture and playback and puts the UI back into its idle state.
 *
 * @param userInitiated true when the user pressed Stop; only then is the current
 *                      trace turn ended and cleared.
 */
private fun onStopClicked(userInitiated: Boolean) {
    isRecording = false

    // Microphone teardown: stop and release are attempted independently.
    runCatching { audioRecord?.stop() }
    runCatching { audioRecord?.release() }
    audioRecord = null

    recordingJob?.cancel()
    recordingJob = null

    // Reset TTS bookkeeping and drop everything that is still queued.
    ttsStopped.set(true)
    ttsPlaying.set(false)
    ttsTotalSamplesWritten = 0
    ttsQueue.clear()
    // A worker blocked in take() needs one item to wake up and notice ttsStopped.
    ttsQueue.offer(TtsQueueItem.End)

    // Silence the output; flush is skipped when pause fails.
    runCatching {
        track?.pause()
        track?.flush()
    }

    // Audio effects were attached to the recorder session that was just released.
    runCatching { aec?.release() }
    runCatching { ns?.release() }
    aec = null
    ns = null

    startButton.isEnabled = true
    stopButton.isEnabled = false

    if (!userInitiated) return
    TraceManager.getInstance().endTurn()
    currentTrace = null
}
modelDir = copySenseVoiceAssetsToInternal() + val modelPath = File(modelDir, "sense-voice-encoder.rknn").absolutePath + val embeddingPath = File(modelDir, "embedding.npy").absolutePath + val bpePath = File(modelDir, "chn_jpn_yue_eng_ko_spectok.bpe.model").absolutePath + + // Print quick diagnostics for native libs + model files + try { + val libDir = applicationInfo.nativeLibraryDir + Log.i(TAG, "nativeLibraryDir=$libDir") + try { + val names = File(libDir).list()?.joinToString(", ") ?: "(empty)" + Log.i(TAG, "nativeLibraryDir files: $names") + } catch (t: Throwable) { + Log.w(TAG, "Failed to list nativeLibraryDir: ${t.message}") + } + } catch (_: Throwable) { + } + Log.i(TAG, "SenseVoice model paths:") + Log.i(TAG, " model=$modelPath exists=${File(modelPath).exists()} size=${File(modelPath).length()}") + Log.i(TAG, " embedding=$embeddingPath exists=${File(embeddingPath).exists()} size=${File(embeddingPath).length()}") + Log.i(TAG, " bpe=$bpePath exists=${File(bpePath).exists()} size=${File(bpePath).length()}") + + val t0 = SystemClock.elapsedRealtime() + val engine = try { + SenseVoiceEngineRKNN(this) + } catch (e: UnsatisfiedLinkError) { + // Most common: libsensevoiceEngine.so not packaged/built, or dependent libs missing + throw IllegalStateException("Load native libraries failed: ${e.message}", e) + } + + val ok = try { + engine.loadModelDirectly(modelPath, embeddingPath, bpePath) + } catch (t: Throwable) { + throw IllegalStateException("SenseVoice loadModelDirectly crashed: ${t.message}", t) + } + + val dt = SystemClock.elapsedRealtime() - t0 + Log.i(TAG, "SenseVoice loadModelDirectly ok=$ok costMs=$dt") + if (!ok) throw IllegalStateException("SenseVoiceEngineRKNN loadModelDirectly returned false") + + senseVoice = engine + } + + private fun initTtsAndAudioTrack() { + try { + // 你放入的 sherpa-onnx VITS 中文模型目录: + // assets/tts_model/sherpa-onnx-vits-zh-ll/{model.onnx,tokens.txt,lexicon.txt,...} + val modelDir = "tts_model/sherpa-onnx-vits-zh-ll" + val modelName 
= "model.onnx" + val lexicon = "lexicon.txt" + val dataDir = "" + + val ttsConfig = getOfflineTtsConfig( + modelDir = modelDir, + modelName = modelName, + acousticModelName = "", + vocoder = "", + voices = "", + lexicon = lexicon, + dataDir = dataDir, + dictDir = "", + // 中文规范化规则(目录里已有这些 fst) + ruleFsts = "$modelDir/phone.fst,$modelDir/date.fst,$modelDir/number.fst,$modelDir/new_heteronym.fst", + ruleFars = "", + numThreads = null, + isKitten = false + ) + tts = OfflineTts(assetManager = application.assets, config = ttsConfig) + } catch (t: Throwable) { + Log.e(TAG, "Init TTS failed: ${t.message}", t) + tts = null + runOnUiThread { + Toast.makeText( + this, + "TTS 初始化失败:请确认 assets/tts_model/sherpa-onnx-vits-zh-ll/ 下有 model.onnx、tokens.txt、lexicon.txt 以及 phone/date/number/new_heteronym.fst", + Toast.LENGTH_LONG + ).show() + } + } + + val t = tts ?: return + val sr = t.sampleRate() + val bufLength = AudioTrack.getMinBufferSize( + sr, + AudioFormat.CHANNEL_OUT_MONO, + AudioFormat.ENCODING_PCM_FLOAT + ) + val attr = AudioAttributes.Builder() + .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH) + .setUsage(AudioAttributes.USAGE_MEDIA) + .build() + val format = AudioFormat.Builder() + .setEncoding(AudioFormat.ENCODING_PCM_FLOAT) + .setChannelMask(AudioFormat.CHANNEL_OUT_MONO) + .setSampleRate(sr) + .build() + track = AudioTrack( + attr, + format, + bufLength, + AudioTrack.MODE_STREAM, + AudioManager.AUDIO_SESSION_ID_GENERATE + ) + track?.play() + } + + private fun assetExists(path: String): Boolean { + return try { + application.assets.open(path).close() + true + } catch (_: Throwable) { + false + } + } + + private fun copySenseVoiceAssetsToInternal(): File { + val outDir = File(filesDir, "sensevoice_models") + if (!outDir.exists()) outDir.mkdirs() + + val files = arrayOf( + "am.mvn", + "chn_jpn_yue_eng_ko_spectok.bpe.model", + "embedding.npy", + "sense-voice-encoder.rknn" + ) + + for (name in files) { + val assetPath = "sensevoice_models/$name" + val outFile = 
/**
 * Creates the [AudioRecord] used for capture and attaches echo cancellation and
 * noise suppression to its audio session when the device offers them.
 *
 * @return true when the recorder is ready to start; false when the RECORD_AUDIO
 *         permission is missing (a request is issued), when the platform rejects
 *         the audio parameters, or when the recorder could not be initialized.
 */
private fun initMicrophone(): Boolean {
    if (ActivityCompat.checkSelfPermission(this, Manifest.permission.RECORD_AUDIO)
        != PackageManager.PERMISSION_GRANTED
    ) {
        ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION)
        return false
    }

    // getMinBufferSize reports failure with a negative error code; passing that on
    // to the AudioRecord constructor would throw.
    val minBytes = AudioRecord.getMinBufferSize(sampleRateInHz, channelConfig, audioFormat)
    if (minBytes <= 0) {
        Log.e(TAG, "AudioRecord.getMinBufferSize failed: $minBytes")
        return false
    }

    val recorder = AudioRecord(
        audioSource,
        sampleRateInHz,
        channelConfig,
        audioFormat,
        minBytes * 2
    )
    // The constructor does not throw when the device is unavailable; the state must be
    // checked, otherwise the caller's startRecording() fails with IllegalStateException.
    if (recorder.state != AudioRecord.STATE_INITIALIZED) {
        Log.e(TAG, "AudioRecord failed to initialize (state=${recorder.state})")
        recorder.release()
        return false
    }
    audioRecord = recorder

    val sessionId = recorder.audioSessionId
    if (sessionId != 0) {
        if (AcousticEchoCanceler.isAvailable()) {
            aec = AcousticEchoCanceler.create(sessionId)?.apply {
                enabled = true
            }
            Log.i(TAG, "AEC enabled=${aec?.enabled}")
        } else {
            Log.w(TAG, "AEC not available on this device")
        }

        if (NoiseSuppressor.isAvailable()) {
            ns = NoiseSuppressor.create(sessionId)?.apply {
                enabled = true
            }
            Log.i(TAG, "NS enabled=${ns?.enabled}")
        } else {
            Log.w(TAG, "NS not available on this device")
        }
    }
    return true
}
+ val windowSize = 512 + val buffer = ShortArray(windowSize) + // 双阈值设置 + val startThreshold = 0.2f // 进入语音的阈值 + val endThreshold = 0.15f // 退出语音的阈值 + val minSilenceSamples = (0.5f * sampleRateInHz).toInt() + val minSpeechSamples = (0.1f * sampleRateInHz).toInt() + val maxSpeechSamples = (5.0f * sampleRateInHz).toInt() + + // VAD 概率数据记录 + val vadProbabilities = mutableListOf() + val vadTimestamps = mutableListOf() + val vadRMSValues = mutableListOf() + val vadSmoothedRMSValues = mutableListOf() + + // 指数平滑相关变量 + var smoothedRms = 0f + val alpha = 0.8f // 平滑系数 + + var inSpeech = false + var silenceSamples = 0 + + var speechBuf = FloatArray(0) + var speechLen = 0 + var processedSpeechBuf = FloatArray(0) + var processedSpeechLen = 0 + + fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) { + // 保存原始音频 + val needed = speechLen + chunk.size + if (speechBuf.size < needed) { + var newCap = maxOf(needed, maxOf(1024, speechBuf.size * 2)) + if (newCap > maxSpeechSamples) newCap = maxSpeechSamples + val n = FloatArray(newCap) + if (speechLen > 0) System.arraycopy(speechBuf, 0, n, 0, speechLen) + speechBuf = n + } + val copyN = minOf(chunk.size, max(0, maxSpeechSamples - speechLen)) + if (copyN > 0) { + System.arraycopy(chunk, 0, speechBuf, speechLen, copyN) + speechLen += copyN + } + + // 保存增益后的音频 + val processedNeeded = processedSpeechLen + processedChunk.size + if (processedSpeechBuf.size < processedNeeded) { + var newCap = maxOf(processedNeeded, maxOf(1024, processedSpeechBuf.size * 2)) + if (newCap > maxSpeechSamples) newCap = maxSpeechSamples + val n = FloatArray(newCap) + if (processedSpeechLen > 0) System.arraycopy(processedSpeechBuf, 0, n, 0, processedSpeechLen) + processedSpeechBuf = n + } + val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen)) + if (processedCopyN > 0) { + System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN) + processedSpeechLen += processedCopyN + } + } + + 
suspend fun finalizeSegmentIfAny() { + if (speechLen < minSpeechSamples) { + speechLen = 0 + processedSpeechLen = 0 + inSpeech = false + silenceSamples = 0 + return + } + // ✅ 新增:如果 TTS 正在播放或 LLM 请求中,丢弃此段(避免回声) + if (ttsPlaying.get() || llmInFlight) { + speechLen = 0 + processedSpeechLen = 0 + inSpeech = false + silenceSamples = 0 + return + } + val originalSeg = speechBuf.copyOf(speechLen) + val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen) + speechLen = 0 + processedSpeechLen = 0 + inSpeech = false + silenceSamples = 0 + + // 将语音段加入 ASR 处理队列,异步处理 + asrQueue.send(Pair(originalSeg, processedSeg)) + } + + while (isRecording && ioScope.coroutineContext.isActive) { + val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break + if (ret <= 0) continue + if (ret != windowSize) continue + // 在 processSamplesLoop 方法中 + val chunk = FloatArray(ret) { buffer[it] / 32768.0f } + + // 计算当前音频的RMS值(均方根) + val rms = calculateRMS(chunk) + + // 应用指数平滑 + smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms + + // 动态调整增益因子,目标RMS设为0.1(约-20dB) + val targetRMS = 0.1f + var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f + + // 设置增益的上下限,避免过度增益导致削波 + gainFactor = gainFactor.coerceIn(0.1f, 10.0f) + + // 应用增益因子 + val processedChunk = FloatArray(chunk.size) { + val value = chunk[it] * gainFactor + // 限制音量范围,避免削波 + if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value + } + + // 使用处理后的音频数据 + val prob = synchronized(nativeLock) { vad.compute(processedChunk) } + + // 记录VAD概率、时间戳、原始RMS值和平滑后的RMS值 + vadProbabilities.add(prob) + vadTimestamps.add(System.currentTimeMillis()) + vadRMSValues.add(rms) + vadSmoothedRMSValues.add(smoothedRms) + + // 双阈值状态机逻辑 + if (!inSpeech && prob >= startThreshold) { + // 进入语音状态 + inSpeech = true + silenceSamples = 0 + appendSpeech(chunk, processedChunk) + } else if (inSpeech && prob <= endThreshold) { + // 开始计数静音样本 + silenceSamples += ret + if (silenceSamples >= minSilenceSamples) { + // 
/**
 * Cleans a raw recognizer string: drops `<|...|>` marker tokens, removes stray
 * `>`-like characters, trims the ends and collapses whitespace runs to one space.
 */
private fun removeTokens(text: String): String =
    text
        .replace(Regex("<\\|[^>]+\\|>"), "")   // tokens such as <|zh|>, <|NEUTRAL|>, <|Speech|>
        .replace(Regex("[>>≥≫]"), "")          // leftover closing brackets
        .trim()
        .replace(Regex("\\s+"), " ")
/**
 * Starts the TTS worker on [ioScope] unless one is already running.
 * The running flag is claimed atomically and cleared again when the worker returns or throws.
 */
private fun ensureTtsWorker() {
    val claimed = ttsWorkerRunning.compareAndSet(false, true)
    if (claimed) {
        ioScope.launch {
            try {
                runTtsWorker()
            } finally {
                ttsWorkerRunning.set(false)
            }
        }
    }
}
/**
 * Consumes speech segments from [asrQueue], transcribes them with SenseVoice and
 * forwards accepted text to the LLM.
 *
 * Each queue item is a pair of (original samples, gain-processed samples) as sent by
 * the recording loop. Only the original samples are transcribed. Segments are dropped
 * while an LLM request is in flight or TTS is playing, and when the recognized text is
 * blank, a lone "i", or longer than 50 characters.
 */
private suspend fun runAsrWorker() {
    while (ioScope.coroutineContext.isActive) {
        // The second element is the gain-processed audio, not a trace object; it was
        // previously bound to a variable named `trace` and used as a TraceSession.
        val (seg, _) = try {
            asrQueue.receive()
        } catch (_: Throwable) {
            // Closed channel or cancellation: stop the worker.
            break
        }

        // Allow only one LLM request at a time, and skip audio captured while TTS is
        // playing so the assistant's own voice is not recognized.
        if (llmInFlight || ttsPlaying.get()) continue

        // Snapshot the trace of the current turn; it may be cleared concurrently.
        val trace = currentTrace

        trace?.markASRStart()
        Log.d(TAG, "ASR started: processing audio segment")
        withContext(Dispatchers.Main) {
            appendToUi("\n[ASR] 开始识别...\n")
        }
        val raw = synchronized(nativeLock) {
            val e = senseVoice
            if (e == null || !e.isInitialized) "" else e.transcribeBuffer(seg)
        }
        val text = removeTokens(raw)

        // Filter out results that should not trigger an LLM request.
        if (text.isBlank()) continue
        if (text.length == 1 && text[0].equals('i', ignoreCase = true)) {
            Log.d(TAG, "ASR segment skipped: single 'i'")
            continue
        }
        if (text.length > 50) {
            Log.d(TAG, "ASR segment skipped: too long (${text.length} chars)")
            continue
        }

        trace?.markASREnd()

        withContext(Dispatchers.Main) {
            appendToUi("\n\n[ASR] ${text}\n")
        }

        trace?.markRecordingDone()
        // NOTE(review): this mark fires before the LLM is called; kept as in the
        // original, confirm the intended meaning against TraceSession.
        trace?.markLlmResponseReceived()

        if (BuildConfig.LLM_API_KEY.isBlank()) {
            withContext(Dispatchers.Main) {
                Toast.makeText(
                    this@MainActivity,
                    "未配置 LLM_API_KEY(在 local.properties 或 gradle.properties 里设置)",
                    Toast.LENGTH_LONG
                ).show()
            }
            continue
        }

        llmInFlight = true
        cloudApiManager.callLLM(text)
    }
}
private lateinit var cloudApiManager: CloudApiManager - private val segmenter = StreamingTextSegmenter() + private var videoPlayerManager: VideoPlayerManager? = null + private val segmenter = StreamingTextSegmenter( + maxLen = 30, + maxWaitMs = 600 + ) private sealed class TtsQueueItem { data class Segment(val text: String) : TtsQueueItem() @@ -84,11 +93,18 @@ class MainActivity : AppCompatActivity() { private val ttsQueue = LinkedBlockingQueue() private val ttsStopped = AtomicBoolean(false) private val ttsWorkerRunning = AtomicBoolean(false) + private val ttsPlaying = AtomicBoolean(false) + @Volatile private var ttsTotalSamplesWritten: Long = 0 private var currentTrace: TraceSession? = null private var lastUiText: String = "" @Volatile private var llmInFlight: Boolean = false + private var enableStreaming = false // 默认禁用流式输出 + + // ASR 队列和工作器 + private val asrQueue = Channel>(capacity = Channel.UNLIMITED) + private val asrWorkerRunning = AtomicBoolean(false) override fun onRequestPermissionsResult( requestCode: Int, @@ -97,8 +113,8 @@ class MainActivity : AppCompatActivity() { ) { super.onRequestPermissionsResult(requestCode, permissions, grantResults) val ok = requestCode == REQUEST_RECORD_AUDIO_PERMISSION && - grantResults.isNotEmpty() && - grantResults[0] == PackageManager.PERMISSION_GRANTED + grantResults.isNotEmpty() && + grantResults[0] == PackageManager.PERMISSION_GRANTED if (!ok) { Log.e(TAG, "Audio record is disallowed") finish() @@ -109,6 +125,17 @@ class MainActivity : AppCompatActivity() { super.onCreate(savedInstanceState) setContentView(R.layout.activity_main) + // 初始化双播放器管理器(silent 与 speaking 两个叠加的 PlayerView) + try { + val silentPv = findViewById(R.id.player_view_silent) + val speakingPv = findViewById(R.id.player_view_speaking) + videoPlayerManager = VideoPlayerManager(this, silentPv, speakingPv) + // 默认 AI 未说话 + videoPlayerManager?.setSpeaking(false) + } catch (e: Exception) { + Log.w(TAG, "PlayerViews not found or init failed: ${e.message}") + } 
+ ActivityCompat.requestPermissions(this, permissions, REQUEST_RECORD_AUDIO_PERMISSION) startButton = findViewById(R.id.start_button) @@ -119,6 +146,19 @@ class MainActivity : AppCompatActivity() { startButton.setOnClickListener { onStartClicked() } stopButton.setOnClickListener { onStopClicked(userInitiated = true) } + // 初始化流式输出开关 + try { + val streamingSwitch = findViewById(R.id.streaming_switch) + streamingSwitch.isChecked = enableStreaming + streamingSwitch.setOnCheckedChangeListener { _, isChecked -> + enableStreaming = isChecked + cloudApiManager.setEnableStreaming(isChecked) + Toast.makeText(this, "流式输出已${if (isChecked) "启用" else "禁用"}", Toast.LENGTH_SHORT).show() + } + } catch (e: Exception) { + Log.w(TAG, "Streaming switch not found in layout: ${e.message}") + } + // 避免 UI 线程重初始化导致 ANR:在后台初始化模型与 AudioTrack startButton.isEnabled = false stopButton.isEnabled = false @@ -151,30 +191,45 @@ class MainActivity : AppCompatActivity() { } } - cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener { + cloudApiManager = CloudApiManager(object : CloudApiManager.CloudApiListener { private var llmFirstChunkMarked = false override fun onLLMResponseReceived(response: String) { currentTrace?.markLlmDone() llmInFlight = false - // flush remaining buffer into TTS - for (seg in segmenter.flush()) { - enqueueTtsSegment(seg) + + // 根据流式输出模式处理响应 + if (enableStreaming) { + // 启用流式输出时,刷新剩余缓冲区 + for (seg in segmenter.flush()) { + enqueueTtsSegment(seg) + } + // 发送队列结束信号 + ttsQueue.offer(TtsQueueItem.End) + } else { + runOnUiThread { + appendToUi("${response}\n") + } + // 禁用流式输出时,直接使用整段文本进行TTS + enqueueTtsSegment(response) + // 发送队列结束信号 + ttsQueue.offer(TtsQueueItem.End) } - // signal queue end (no more segments after this) - ttsQueue.offer(TtsQueueItem.End) } override fun onLLMStreamingChunkReceived(chunk: String) { - if (!llmFirstChunkMarked) { - llmFirstChunkMarked = true - currentTrace?.markLlmFirstChunk() - } - appendToUi(chunk) + // 启用流式输出时,处理流式chunk + if 
(enableStreaming) { + if (!llmFirstChunkMarked) { + llmFirstChunkMarked = true + currentTrace?.markLlmFirstChunk() + } + appendToUi(chunk) - val segments = segmenter.processChunk(chunk) - for (seg in segments) { - enqueueTtsSegment(seg) + val segments = segmenter.processChunk(chunk) + for (seg in segments) { + enqueueTtsSegment(seg) + } } } @@ -187,7 +242,14 @@ class MainActivity : AppCompatActivity() { Toast.makeText(this@MainActivity, errorMessage, Toast.LENGTH_LONG).show() onStopClicked(userInitiated = false) } - }) + }, applicationContext) + + // 设置流式输出模式 + cloudApiManager.setEnableStreaming(enableStreaming) + + // 预先启动ASR worker + Log.d(TAG, "Pre-starting ASR worker") + ensureAsrWorker() } override fun onDestroy() { @@ -208,10 +270,18 @@ class MainActivity : AppCompatActivity() { tts?.release() } catch (_: Throwable) { } + try { + videoPlayerManager?.release() + } catch (_: Throwable) { + } } private fun onStartClicked() { - if (isRecording) return + Log.d(TAG, "onStartClicked called") + if (isRecording) { + Log.d(TAG, "Already recording, returning") + return + } if (!initMicrophone()) { Toast.makeText(this, "麦克风初始化失败/无权限", Toast.LENGTH_SHORT).show() @@ -227,6 +297,8 @@ class MainActivity : AppCompatActivity() { textView.text = "" ttsStopped.set(false) + ttsPlaying.set(false) + ttsTotalSamplesWritten = 0 ttsQueue.clear() segmenter.reset() @@ -237,10 +309,12 @@ class MainActivity : AppCompatActivity() { startButton.isEnabled = false stopButton.isEnabled = true + Log.d(TAG, "Starting processSamplesLoop coroutine") recordingJob?.cancel() recordingJob = ioScope.launch { processSamplesLoop() } + Log.d(TAG, "onStartClicked completed") } private fun onStopClicked(userInitiated: Boolean) { @@ -259,6 +333,8 @@ class MainActivity : AppCompatActivity() { recordingJob = null ttsStopped.set(true) + ttsPlaying.set(false) + ttsTotalSamplesWritten = 0 ttsQueue.clear() // wake worker if waiting ttsQueue.offer(TtsQueueItem.End) @@ -480,22 +556,43 @@ class MainActivity : 
AppCompatActivity() { } private suspend fun processSamplesLoop() { + Log.d(TAG, "processSamplesLoop started") // Avoid calling vad.front()/vad.pop() (native queue APIs) since it crashes on some builds. // Use vad.compute() and implement a simple VAD segmenter in Kotlin instead. val windowSize = 512 val buffer = ShortArray(windowSize) - val threshold = 0.5f - val minSilenceSamples = (0.25f * sampleRateInHz).toInt() - val minSpeechSamples = (0.25f * sampleRateInHz).toInt() + // 双阈值设置 + val startThreshold = 0.2f // 进入语音的阈值 + val endThreshold = 0.15f // 退出语音的阈值 + val minSilenceSamples = (0.5f * sampleRateInHz).toInt() + val minSpeechSamples = (0.1f * sampleRateInHz).toInt() val maxSpeechSamples = (5.0f * sampleRateInHz).toInt() + + Log.d(TAG, "VAD thresholds: start=$startThreshold, end=$endThreshold, minSilenceSamples=$minSilenceSamples, minSpeechSamples=$minSpeechSamples") + + // VAD 概率数据记录 + val vadProbabilities = mutableListOf() + val vadTimestamps = mutableListOf() + val vadRMSValues = mutableListOf() + val vadSmoothedRMSValues = mutableListOf() + + // 指数平滑相关变量 + var smoothedRms = 0f + val alpha = 0.8f // 平滑系数 var inSpeech = false var silenceSamples = 0 var speechBuf = FloatArray(0) var speechLen = 0 + var processedSpeechBuf = FloatArray(0) + var processedSpeechLen = 0 - fun appendSpeech(chunk: FloatArray) { + var loopCount = 0 + var vadComputeCount = 0 + + fun appendSpeech(chunk: FloatArray, processedChunk: FloatArray) { + // 保存原始音频 val needed = speechLen + chunk.size if (speechBuf.size < needed) { var newCap = maxOf(needed, maxOf(1024, speechBuf.size * 2)) @@ -509,85 +606,152 @@ class MainActivity : AppCompatActivity() { System.arraycopy(chunk, 0, speechBuf, speechLen, copyN) speechLen += copyN } + + // 保存增益后的音频 + val processedNeeded = processedSpeechLen + processedChunk.size + if (processedSpeechBuf.size < processedNeeded) { + var newCap = maxOf(processedNeeded, maxOf(1024, processedSpeechBuf.size * 2)) + if (newCap > maxSpeechSamples) newCap = maxSpeechSamples 
+ val n = FloatArray(newCap) + if (processedSpeechLen > 0) System.arraycopy(processedSpeechBuf, 0, n, 0, processedSpeechLen) + processedSpeechBuf = n + } + val processedCopyN = minOf(processedChunk.size, max(0, maxSpeechSamples - processedSpeechLen)) + if (processedCopyN > 0) { + System.arraycopy(processedChunk, 0, processedSpeechBuf, processedSpeechLen, processedCopyN) + processedSpeechLen += processedCopyN + } } suspend fun finalizeSegmentIfAny() { + Log.d(TAG, "finalizeSegmentIfAny called: speechLen=$speechLen, minSpeechSamples=$minSpeechSamples, ttsPlaying=${ttsPlaying.get()}, llmInFlight=$llmInFlight") + if (speechLen < minSpeechSamples) { + Log.d(TAG, "finalizeSegmentIfAny: speech too short, discarding") speechLen = 0 + processedSpeechLen = 0 inSpeech = false silenceSamples = 0 return } - - val seg = speechBuf.copyOf(speechLen) + // ✅ 新增:如果 TTS 正在播放或 LLM 请求中,丢弃此段(避免回声) + if (ttsPlaying.get() || llmInFlight) { + Log.d(TAG, "finalizeSegmentIfAny: TTS playing or LLM in flight, discarding") + speechLen = 0 + processedSpeechLen = 0 + inSpeech = false + silenceSamples = 0 + return + } + val originalSeg = speechBuf.copyOf(speechLen) + val processedSeg = processedSpeechBuf.copyOf(processedSpeechLen) speechLen = 0 + processedSpeechLen = 0 inSpeech = false silenceSamples = 0 - // 每次只允许一个 LLM 请求在飞,避免堆积导致卡死/竞态 - if (llmInFlight) return - - val trace = currentTrace - trace?.markASRStart() - val raw = synchronized(nativeLock) { - val e = senseVoice - if (e == null || !e.isInitialized) "" else e.transcribeBuffer(seg) - } - val text = removeTokens(raw) - if (text.isBlank()) return - trace?.markASREnd() - if (text.isBlank()) return - - withContext(Dispatchers.Main) { - appendToUi("\n\n[ASR] ${text}\n") - } - - trace?.markRecordingDone() - trace?.markLlmResponseReceived() - - if (BuildConfig.LLM_API_KEY.isBlank()) { - withContext(Dispatchers.Main) { - Toast.makeText( - this@MainActivity, - "未配置 LLM_API_KEY(在 local.properties 或 gradle.properties 里设置)", - Toast.LENGTH_LONG - 
).show() - } - return - } - - llmInFlight = true - cloudApiManager.callLLM(text) + // 将语音段加入 ASR 处理队列,异步处理 + Log.d(TAG, "Sending audio segment to ASR queue, size: ${processedSeg.size}") + asrQueue.send(Pair(originalSeg, processedSeg)) + Log.d(TAG, "Calling ensureAsrWorker") + ensureAsrWorker() } while (isRecording && ioScope.coroutineContext.isActive) { + loopCount++ + if (loopCount % 100 == 0) { + Log.d(TAG, "processSamplesLoop running, loopCount=$loopCount, ttsPlaying=${ttsPlaying.get()}") + } + // 如果TTS正在播放,跳过VAD处理,避免检测到回声 + if (ttsPlaying.get()) { + // 如果正在语音状态,立即结束它 + if (inSpeech) { + Log.d(TAG, "TTS playing, resetting VAD state") + inSpeech = false + silenceSamples = 0 + speechLen = 0 + processedSpeechLen = 0 + } + // 读取并丢弃音频数据,保持录音状态 + val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break + if (ret <= 0) continue + continue + } + val ret = audioRecord?.read(buffer, 0, buffer.size) ?: break if (ret <= 0) continue if (ret != windowSize) continue - + // 在 processSamplesLoop 方法中 val chunk = FloatArray(ret) { buffer[it] / 32768.0f } - val prob = synchronized(nativeLock) { vad.compute(chunk) } - if (prob >= threshold) { - if (!inSpeech) { - inSpeech = true - silenceSamples = 0 + // 计算当前音频的RMS值(均方根) + val rms = calculateRMS(chunk) + + // 应用指数平滑 + smoothedRms = if (smoothedRms == 0f) rms else alpha * rms + (1 - alpha) * smoothedRms + + // 动态调整增益因子,目标RMS设为0.1(约-20dB) + val targetRMS = 0.1f + var gainFactor = if (smoothedRms > 0) targetRMS / smoothedRms else 3.0f + + // 设置增益的上下限,避免过度增益导致削波 + gainFactor = gainFactor.coerceIn(0.1f, 10.0f) + + // 应用增益因子 + val processedChunk = FloatArray(chunk.size) { + val value = chunk[it] * gainFactor + // 限制音量范围,避免削波 + if (value > 1.0f) 1.0f else if (value < -1.0f) -1.0f else value + } + + // 使用处理后的音频数据 + val prob = synchronized(nativeLock) { vad.compute(processedChunk) } + vadComputeCount++ + + // 记录VAD概率、时间戳、原始RMS值和平滑后的RMS值 + vadProbabilities.add(prob) + vadTimestamps.add(System.currentTimeMillis()) + vadRMSValues.add(rms) + 
vadSmoothedRMSValues.add(smoothedRms) + + // 每100次循环输出一次VAD概率 + if (vadComputeCount % 100 == 0) { + Log.d(TAG, "VAD prob=$prob, inSpeech=$inSpeech, rms=$rms, smoothedRms=$smoothedRms") + } + + // 双阈值状态机逻辑 + if (!inSpeech && prob >= startThreshold) { + // 进入语音状态 + inSpeech = true + silenceSamples = 0 + appendSpeech(chunk, processedChunk) + Log.d(TAG, "VAD: Entered speech state, prob=$prob, speechLen=$speechLen") + } else if (inSpeech && prob <= endThreshold) { + // 开始计数静音样本 + silenceSamples += ret + if (silenceSamples >= minSilenceSamples) { + // 退出语音状态 + Log.d(TAG, "VAD: Exiting speech state, prob=$prob, silenceSamples=$silenceSamples, speechLen=$speechLen") + finalizeSegmentIfAny() + } else { + // 保留尾音 + appendSpeech(chunk, processedChunk) } - appendSpeech(chunk) + } else if (inSpeech) { + // 语音过程中,持续添加音频 + appendSpeech(chunk, processedChunk) + silenceSamples = 0 // 重置静音计数 if (speechLen >= maxSpeechSamples) { + Log.d(TAG, "VAD: Max speech length reached, finalizing segment") finalizeSegmentIfAny() } - } else { - if (inSpeech) { - silenceSamples += ret - if (silenceSamples >= minSilenceSamples) { - finalizeSegmentIfAny() - } else { - // keep a bit of trailing silence to avoid chopping - appendSpeech(chunk) - } - } + } + // 非语音状态且概率低于开始阈值,不做处理 + + // 每1000次循环输出一次VAD状态 + if (loopCount % 1000 == 0) { + Log.d(TAG, "VAD status: inSpeech=$inSpeech, prob=$prob, speechLen=$speechLen") } // 时间兜底切段(避免长时间无标点导致首包太慢) @@ -597,6 +761,58 @@ class MainActivity : AppCompatActivity() { // flush last partial segment finalizeSegmentIfAny() + + // 保存VAD数据到文件 + saveVadData(vadTimestamps, vadProbabilities, vadRMSValues, vadSmoothedRMSValues) + } + + /** + * 计算音频数据的均方根(RMS)值,用于动态调整增益 + */ + private fun calculateRMS(samples: FloatArray): Float { + if (samples.isEmpty()) return 0.0f + + var sumSquared = 0.0f + for (sample in samples) { + sumSquared += sample * sample + } + + val meanSquared = sumSquared / samples.size + return kotlin.math.sqrt(meanSquared) + } + + /** + * 
保存VAD数据到文件,方便后续分析和绘图 + */ + private fun saveVadData(timestamps: List, probabilities: List, rmsValues: List, smoothedRmsValues: List) { + try { + // 创建保存目录 + val vadDataDir = File(filesDir, "vad_data") + if (!vadDataDir.exists()) { + vadDataDir.mkdirs() + } + + // 生成唯一的文件名 + val timestamp = System.currentTimeMillis() + val fileName = "vad_data_${timestamp}.csv" + val outputFile = File(vadDataDir, fileName) + + // 写入数据 + FileOutputStream(outputFile).use { fos -> + // 写入表头 + fos.write("timestamp,probability,rms,smoothed_rms\n".toByteArray()) + + // 写入数据行 + for (i in timestamps.indices) { + val line = "${timestamps[i]},${probabilities[i]},${rmsValues[i]},${smoothedRmsValues[i]}\n" + fos.write(line.toByteArray()) + } + } + + Log.d(TAG, "Saved VAD data to: ${outputFile.absolutePath}") + } catch (e: Exception) { + Log.e(TAG, "Error saving VAD data: ${e.message}") + } } private fun removeTokens(text: String): String { @@ -608,8 +824,11 @@ class MainActivity : AppCompatActivity() { } private fun enqueueTtsSegment(seg: String) { + // 移除句末的标点符号 + val cleanedSeg = seg.trimEnd('.', '。', '!', '!', '?', '?', ',', ',', ';', ';', ':', ':') + currentTrace?.markTtsRequestEnqueued() - ttsQueue.offer(TtsQueueItem.Segment(seg)) + ttsQueue.offer(TtsQueueItem.Segment(cleanedSeg)) ensureTtsWorker() } @@ -624,34 +843,60 @@ class MainActivity : AppCompatActivity() { } } + private fun ensureAsrWorker() { + Log.d(TAG, "ensureAsrWorker called, asrWorkerRunning=${asrWorkerRunning.get()}") + if (!asrWorkerRunning.compareAndSet(false, true)) { + Log.d(TAG, "ASR worker already running, returning") + return + } + Log.d(TAG, "Starting ASR worker coroutine") + ioScope.launch { + try { + runAsrWorker() + } finally { + Log.d(TAG, "ASR worker coroutine finished") + asrWorkerRunning.set(false) + } + } + } + private fun runTtsWorker() { val t = tts ?: return val audioTrack = track ?: return var firstAudioMarked = false + var isFirstSegment = true while (true) { val item = ttsQueue.take() if 
(ttsStopped.get()) break when (item) { is TtsQueueItem.Segment -> { + ttsPlaying.set(true) + runOnUiThread { videoPlayerManager?.setSpeaking(true) } val trace = currentTrace trace?.markTtsSynthesisStart() + Log.d(TAG, "TTS started: processing segment '${item.text}'") + runOnUiThread { + appendToUi("\n[TTS] 开始合成...\n") + } val startMs = System.currentTimeMillis() var firstPcmMarked = false - // flush to reduce latency between segments - try { - audioTrack.pause() - audioTrack.flush() - audioTrack.play() - } catch (_: Throwable) { + if (isFirstSegment) { + try { + audioTrack.pause() + audioTrack.flush() + audioTrack.play() + } catch (_: Throwable) { + } + isFirstSegment = false } t.generateWithCallback( text = item.text, - sid = 0, + sid = 2, // 这里可以修改说话人 speed = 1.0f ) { samples -> if (ttsStopped.get()) return@generateWithCallback 0 @@ -664,6 +909,7 @@ class MainActivity : AppCompatActivity() { trace?.markTtsFirstAudioPlay() } audioTrack.write(samples, 0, samples.size, AudioTrack.WRITE_BLOCKING) + ttsTotalSamplesWritten += samples.size 1 } @@ -672,6 +918,21 @@ class MainActivity : AppCompatActivity() { } TtsQueueItem.End -> { + // 清空 ASR 队列,丢弃所有未处理的段(这些可能是 TTS 播放期间的回声) + while (asrQueue.tryReceive().isSuccess) { } + + waitForPlaybackComplete(audioTrack) + val ttsCompleteTime = System.currentTimeMillis() + + // 在主线程更新UI + runOnUiThread { + appendToUi("\n[LOG] TTS completed at: ${ttsCompleteTime}\n") + } + + ttsPlaying.set(false) + runOnUiThread { videoPlayerManager?.setSpeaking(false) } + ttsTotalSamplesWritten = 0 + isFirstSegment = true currentTrace?.markTtsDone() TraceManager.getInstance().endTurn() currentTrace = null @@ -681,9 +942,257 @@ class MainActivity : AppCompatActivity() { } } + private fun waitForPlaybackComplete(audioTrack: AudioTrack) { + val totalSamples = ttsTotalSamplesWritten + if (totalSamples <= 0) return + + val sampleRate = audioTrack.sampleRate + val timeoutMs = (totalSamples * 1000 / sampleRate) + 2000 + val startTime = 
System.currentTimeMillis() + + while (true) { + if (ttsStopped.get()) break + + val playbackPos = audioTrack.playbackHeadPosition.toLong() + if (playbackPos >= totalSamples) { + break + } + + if (System.currentTimeMillis() - startTime > timeoutMs) { + Log.w(TAG, "waitForPlaybackComplete timeout, pos=$playbackPos, total=$totalSamples") + break + } + + Thread.sleep(20) + } + // 直接等待 1000ms,确保所有缓冲区清空 + Thread.sleep(1000) + } + + private suspend fun runAsrWorker() { + Log.d(TAG, "ASR worker started") + try { + while (ioScope.coroutineContext.isActive) { + val (originalSeg, processedSeg) = try { + Log.d(TAG, "ASR worker waiting for audio segment") + asrQueue.receive() + } catch (e: Throwable) { + Log.e(TAG, "ASR worker receive failed: ${e.message}") + break + } + + Log.d(TAG, "ASR worker received audio segment, size=${processedSeg.size}") + + // 每次只允许一个 LLM 请求在飞,避免堆积导致卡死/竞态 + // TTS 播放期间不做 ASR,避免识别到 TTS 播放的声音 + if (llmInFlight || ttsPlaying.get()) { + Log.d(TAG, "ASR worker skipping segment: llmInFlight=$llmInFlight, ttsPlaying=${ttsPlaying.get()}") + continue + } + + val trace = currentTrace + trace?.markASRStart() + Log.d(TAG, "ASR started: processing audio segment") + withContext(Dispatchers.Main) { + appendToUi("\n[ASR] 开始识别...\n") + } + + // 保存ASR音频用于调试 + saveAsrAudio(originalSeg, processedSeg) + + val raw = synchronized(nativeLock) { + val e = senseVoice + if (e == null || !e.isInitialized) { + Log.e(TAG, "ASR failed: SenseVoice engine not initialized") + "" + } else { + try { + e.transcribeBuffer(processedSeg) + } catch (e: Throwable) { + Log.e(TAG, "ASR transcribe failed: ${e.message}") + "" + } + } + } + Log.d(TAG, "ASR raw result: $raw") + val text = removeTokens(raw) + + // 添加过滤逻辑 + if (text.isBlank()) { + Log.d(TAG, "ASR segment skipped: blank text") + continue + } + // 过滤英文单字符"i" + if (text.length == 1 && text[0].equals('i', ignoreCase = true)) { + Log.d(TAG, "ASR segment skipped: single 'i'") + continue + } + // 过滤超过50个字符的长文本 + if (text.length > 50) { + 
Log.d(TAG, "ASR segment skipped: too long (${text.length} chars)") + continue + } + + trace?.markASREnd() + + withContext(Dispatchers.Main) { + appendToUi("\n\n[ASR] ${text}\n") + } + + trace?.markRecordingDone() + trace?.markLlmResponseReceived() + + if (BuildConfig.LLM_API_KEY.isBlank()) { + withContext(Dispatchers.Main) { + Toast.makeText( + this@MainActivity, + "未配置 LLM_API_KEY(在 local.properties 或 gradle.properties 里设置)", + Toast.LENGTH_LONG + ).show() + } + continue + } + + llmInFlight = true + Log.d(TAG, "Calling LLM with text: $text") + cloudApiManager.callLLM(text) + } + } catch (e: Throwable) { + Log.e(TAG, "ASR worker error: ${e.message}", e) + } finally { + Log.d(TAG, "ASR worker exiting") + } + } + private fun appendToUi(s: String) { lastUiText += s textView.text = lastUiText } -} + /** + * 保存ASR音频用于调试 + */ + private fun saveAsrAudio(originalAudio: FloatArray, processedAudio: FloatArray) { + try { + // 创建保存目录 + val asrAudioDir = File(filesDir, "asr_audio") + if (!asrAudioDir.exists()) { + asrAudioDir.mkdirs() + } + + // 生成唯一的文件名 + val timestamp = System.currentTimeMillis() + + // 保存原始音频 + val originalFile = File(asrAudioDir, "asr_${timestamp}_original.wav") + saveFloatArrayAsWav(originalFile, originalAudio, sampleRateInHz) + Log.d(TAG, "Saved original ASR audio to: ${originalFile.absolutePath}") + + // 保存处理后的音频(增益后) + val processedFile = File(asrAudioDir, "asr_${timestamp}_processed.wav") + saveFloatArrayAsWav(processedFile, processedAudio, sampleRateInHz) + Log.d(TAG, "Saved processed ASR audio to: ${processedFile.absolutePath}") + } catch (e: Exception) { + Log.e(TAG, "Error saving ASR audio: ${e.message}") + } + } + + /** + * 将FloatArray保存为WAV文件 + */ + private fun saveFloatArrayAsWav(file: File, samples: FloatArray, sampleRate: Int) { + FileOutputStream(file).use { fos -> + // WAV文件头 + val header = ByteArray(44) + + // RIFF标识 + header[0] = 'R'.code.toByte() + header[1] = 'I'.code.toByte() + header[2] = 'F'.code.toByte() + header[3] = 
'F'.code.toByte() + + // 文件大小(不包括RIFF标识和文件大小字段本身) + val fileSize = 36 + samples.size * 2 + intToByteArray(fileSize, header, 4) + + // WAVE标识 + header[8] = 'W'.code.toByte() + header[9] = 'A'.code.toByte() + header[10] = 'V'.code.toByte() + header[11] = 'E'.code.toByte() + + // fmt标识 + header[12] = 'f'.code.toByte() + header[13] = 'm'.code.toByte() + header[14] = 't'.code.toByte() + header[15] = ' '.code.toByte() + + // 子块大小 + intToByteArray(16, header, 16) + + // 音频格式(1 = PCM) + shortToByteArray(1, header, 20) + + // 声道数(1 = 单声道) + shortToByteArray(1, header, 22) + + // 采样率 + intToByteArray(sampleRate, header, 24) + + // 字节率 = 采样率 * 声道数 * 位深度 / 8 + val byteRate = sampleRate * 1 * 16 / 8 + intToByteArray(byteRate, header, 28) + + // 块对齐 = 声道数 * 位深度 / 8 + val blockAlign = 1 * 16 / 8 + shortToByteArray(blockAlign.toShort(), header, 32) + + // 位深度(16位) + shortToByteArray(16, header, 34) + + // data标识 + header[36] = 'd'.code.toByte() + header[37] = 'a'.code.toByte() + header[38] = 't'.code.toByte() + header[39] = 'a'.code.toByte() + + // 数据大小 + val dataSize = samples.size * 2 + intToByteArray(dataSize, header, 40) + + // 写入文件头 + fos.write(header) + + // 写入音频数据(转换为16位PCM) + for (sample in samples) { + // 确保样本在[-1, 1]范围内 + val clampedSample = sample.coerceIn(-1.0f, 1.0f) + // 转换为16位整数 + val shortSample = (clampedSample * 32767.0f).toInt().toShort() + // 写入小端序 + val bytes = ByteArray(2) + bytes[0] = (shortSample.toInt() and 0xFF).toByte() + bytes[1] = (shortSample.toInt() shr 8 and 0xFF).toByte() + fos.write(bytes) + } + } + } + + /** + * 将int转换为小端序字节数组 + */ + private fun intToByteArray(value: Int, dest: ByteArray, offset: Int) { + dest[offset] = (value and 0xFF).toByte() + dest[offset + 1] = (value shr 8 and 0xFF).toByte() + dest[offset + 2] = (value shr 16 and 0xFF).toByte() + dest[offset + 3] = (value shr 24 and 0xFF).toByte() + } + + /** + * 将short转换为小端序字节数组 + */ + private fun shortToByteArray(value: Short, dest: ByteArray, offset: Int) { + dest[offset] = 
(value.toInt() and 0xFF).toByte() + dest[offset + 1] = (value.toInt() shr 8 and 0xFF).toByte() + } +} \ No newline at end of file diff --git a/app/src/main/java/com/digitalperson/cloud/CloudApiManager.java b/app/src/main/java/com/digitalperson/cloud/CloudApiManager.java index ce10e48..104aef1 100644 --- a/app/src/main/java/com/digitalperson/cloud/CloudApiManager.java +++ b/app/src/main/java/com/digitalperson/cloud/CloudApiManager.java @@ -1,10 +1,12 @@ package com.digitalperson.cloud; +import android.content.Context; import android.os.Handler; import android.os.Looper; import android.util.Log; import com.digitalperson.BuildConfig; +import com.digitalperson.R; import org.json.JSONArray; import org.json.JSONException; @@ -30,6 +32,7 @@ public class CloudApiManager { private CloudApiListener mListener; private Handler mMainHandler; // 用于在主线程执行UI更新 private JSONArray mConversationHistory; // 存储对话历史 + private boolean mEnableStreaming = true; // 默认启用流式输出 public interface CloudApiListener { void onLLMResponseReceived(String response); @@ -38,10 +41,37 @@ public class CloudApiManager { void onError(String errorMessage); } - public CloudApiManager(CloudApiListener listener) { + public CloudApiManager(CloudApiListener listener, Context context) { this.mListener = listener; this.mMainHandler = new Handler(Looper.getMainLooper()); // 初始化主线程Handler this.mConversationHistory = new JSONArray(); // 初始化对话历史 + + // 添加 system message,要求回答简洁 + try { + JSONObject systemMessage = new JSONObject(); + systemMessage.put("role", "system"); + String systemPrompt = context.getString(R.string.system_prompt); + systemMessage.put("content", systemPrompt); + mConversationHistory.put(systemMessage); + } catch (JSONException e) { + Log.e(TAG, "Failed to add system message: " + e.getMessage()); + } + } + + /** + * 设置是否启用流式输出 + * @param enableStreaming true: 启用流式输出,false: 禁用流式输出(整段输出) + */ + public void setEnableStreaming(boolean enableStreaming) { + this.mEnableStreaming = enableStreaming; + } + + 
/** + * 获取当前是否启用流式输出 + * @return true: 启用流式输出,false: 禁用流式输出(整段输出) + */ + public boolean isEnableStreaming() { + return mEnableStreaming; } public void callLLM(String userInput) { @@ -64,7 +94,7 @@ public class CloudApiManager { JSONObject requestBody = new JSONObject(); requestBody.put("model", LLM_MODEL); requestBody.put("messages", mConversationHistory); - requestBody.put("stream", true); // 启用流式响应 + requestBody.put("stream", mEnableStreaming); // 根据配置决定是否启用流式响应 String jsonBody = requestBody.toString(); @@ -84,47 +114,74 @@ public class CloudApiManager { Log.d(TAG, "LLM Response Code: " + responseCode); if (responseCode == 200) { - // 逐行读取流式响应 - try (BufferedReader br = new BufferedReader( - new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) { - String line; - while ((line = br.readLine()) != null) { - Log.d(TAG, "LLM Streaming Line: " + line); - - // 处理SSE格式的响应 - if (line.startsWith("data: ")) { - String dataPart = line.substring(6); - if (dataPart.equals("[DONE]")) { - // 流式响应结束 - break; - } + if (mEnableStreaming) { + // 逐行读取流式响应 + try (BufferedReader br = new BufferedReader( + new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) { + String line; + while ((line = br.readLine()) != null) { + Log.d(TAG, "LLM Streaming Line: " + line); - try { - // 解析JSON - JSONObject chunkObj = new JSONObject(dataPart); - JSONArray choices = chunkObj.getJSONArray("choices"); - if (choices.length() > 0) { - JSONObject choice = choices.getJSONObject(0); - JSONObject delta = choice.getJSONObject("delta"); - - if (delta.has("content")) { - String chunkContent = delta.getString("content"); - accumulatedContent.append(chunkContent); + // 处理SSE格式的响应 + if (line.startsWith("data: ")) { + String dataPart = line.substring(6); + if (dataPart.equals("[DONE]")) { + // 流式响应结束 + break; + } + + try { + // 解析JSON + JSONObject chunkObj = new JSONObject(dataPart); + JSONArray choices = chunkObj.getJSONArray("choices"); + if (choices.length() > 0) { + JSONObject 
choice = choices.getJSONObject(0); + JSONObject delta = choice.getJSONObject("delta"); - // 发送流式chunk到监听器 - if (mListener != null) { - mMainHandler.post(() -> { - mListener.onLLMStreamingChunkReceived(chunkContent); - }); + if (delta.has("content")) { + String chunkContent = delta.getString("content"); + accumulatedContent.append(chunkContent); + + // 发送流式chunk到监听器 + if (mListener != null) { + mMainHandler.post(() -> { + mListener.onLLMStreamingChunkReceived(chunkContent); + }); + } } } + } catch (JSONException e) { + Log.e(TAG, "Failed to parse streaming chunk: " + e.getMessage()); } - } catch (JSONException e) { - Log.e(TAG, "Failed to parse streaming chunk: " + e.getMessage()); + } + + fullResponse.append(line).append("\n"); + } + } + } else { + // 读取完整响应 + try (BufferedReader br = new BufferedReader( + new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8))) { + String line; + while ((line = br.readLine()) != null) { + fullResponse.append(line); + } + } + + // 解析完整JSON响应 + try { + JSONObject responseObj = new JSONObject(fullResponse.toString()); + JSONArray choices = responseObj.getJSONArray("choices"); + if (choices.length() > 0) { + JSONObject choice = choices.getJSONObject(0); + JSONObject message = choice.getJSONObject("message"); + if (message.has("content")) { + String content = message.getString("content"); + accumulatedContent.append(content); } } - - fullResponse.append(line).append("\n"); + } catch (JSONException e) { + Log.e(TAG, "Failed to parse full response: " + e.getMessage()); } } diff --git a/app/src/main/java/com/digitalperson/metrics/TraceSession.java b/app/src/main/java/com/digitalperson/metrics/TraceSession.java index 776ed53..2b7e1c9 100644 --- a/app/src/main/java/com/digitalperson/metrics/TraceSession.java +++ b/app/src/main/java/com/digitalperson/metrics/TraceSession.java @@ -38,13 +38,19 @@ public class TraceSession { long newValue = (currentValue != null) ? 
package com.digitalperson.player

import android.content.Context
import android.net.Uri
import android.util.Log
import android.view.View
import com.digitalperson.R
import com.google.android.exoplayer2.ExoPlayer
import com.google.android.exoplayer2.MediaItem
import com.google.android.exoplayer2.Player
import com.google.android.exoplayer2.ui.PlayerView

/**
 * Cross-fades between two looping, muted videos: an idle ("silent") clip and a
 * "speaking" clip. Each clip is played by its own [ExoPlayer] and rendered into
 * its own [PlayerView]; switching state only changes the views' alpha, so both
 * players keep running and the incoming clip is already decoding when it fades in.
 *
 * NOTE(review): ExoPlayer instances and view animations are expected to be driven
 * from a single application thread (normally the main thread). This class does not
 * enforce that — confirm that callers of [setSpeaking] and [release] comply.
 *
 * @param context used to build the players and to resolve the package name for
 *   raw-resource URIs.
 * @param silentView view that renders the idle clip; starts fully opaque.
 * @param speakingView view that renders the speaking clip; starts fully transparent.
 * @param transitionDurationMs length of the cross-fade in milliseconds.
 */
class VideoPlayerManager @JvmOverloads constructor(
    private val context: Context,
    private val silentView: PlayerView,
    private val speakingView: PlayerView,
    private val transitionDurationMs: Long = 300L
) {
    private var playerSilent: ExoPlayer? = null
    private var playerSpeaking: ExoPlayer? = null

    /** `true` while the speaking clip is the one being shown (or faded in). */
    private var isSpeaking = false

    /** Set once [release] has run; later calls become no-ops. */
    private var released = false

    init {
        initPlayers()
    }

    /** Builds the `android.resource://` URI for a raw resource of this app. */
    private fun uriForRaw(resId: Int): Uri =
        Uri.parse("android.resource://${context.packageName}/$resId")

    /**
     * Creates a muted player that loops the given raw resource and starts playing
     * as soon as it is prepared.
     */
    private fun buildLoopingPlayer(rawResId: Int): ExoPlayer =
        ExoPlayer.Builder(context).build().apply {
            // Mute before preparing so no audio can leak out of the clip.
            volume = 0f
            repeatMode = Player.REPEAT_MODE_ONE
            setMediaItem(MediaItem.fromUri(uriForRaw(rawResId)))
            // playWhenReady = true already starts playback once prepared,
            // so no separate play() call is needed.
            playWhenReady = true
            prepare()
        }

    /** Creates both players, binds them to their views and sets the idle state. */
    private fun initPlayers() {
        val silent = buildLoopingPlayer(R.raw.silent)
        val speaking = buildLoopingPlayer(R.raw.speak_no_voice)
        playerSilent = silent
        playerSpeaking = speaking

        silentView.player = silent
        speakingView.player = speaking

        // Initial visual state: idle clip visible, speaking clip hidden.
        silentView.alpha = 1f
        speakingView.alpha = 0f
        isSpeaking = false
    }

    /**
     * Switches the visible clip with a cross-fade.
     *
     * @param speaking `true` fades the speaking clip in and the idle clip out;
     *   `false` does the opposite. Calls that do not change the current state,
     *   or that arrive after [release], are ignored.
     */
    fun setSpeaking(speaking: Boolean) {
        if (released || speaking == isSpeaking) return
        isSpeaking = speaking

        val incomingView = if (speaking) speakingView else silentView
        val outgoingView = if (speaking) silentView else speakingView
        val incomingPlayer = if (speaking) playerSpeaking else playerSilent
        val outgoingPlayer = if (speaking) playerSilent else playerSpeaking

        // Seek the clip that is about to appear, never the one still on screen,
        // so the fading-out video does not visibly jump.
        alignPlayback(from = outgoingPlayer, to = incomingPlayer)

        outgoingView.animate().alpha(0f).setDuration(transitionDurationMs).start()
        incomingView.visibility = View.VISIBLE
        incomingView.animate().alpha(1f).setDuration(transitionDurationMs).start()
    }

    /**
     * Moves [to] to the playback position of [from] so both clips show the same
     * moment during the cross-fade. Failures are logged and otherwise ignored:
     * a missed sync only costs visual continuity.
     */
    private fun alignPlayback(from: ExoPlayer?, to: ExoPlayer?) {
        if (from == null || to == null) return
        try {
            val position = from.currentPosition
            // duration is negative (C.TIME_UNSET) until the media is prepared;
            // only wrap when it is known, so a longer source clip maps into range.
            val targetDuration = to.duration
            val target = if (targetDuration > 0) position % targetDuration else position
            to.seekTo(target)
        } catch (e: Exception) {
            Log.w(TAG, "Failed to align video positions", e)
        }
    }

    /**
     * Stops pending fades, detaches the players from their views and releases
     * them. Every step is best-effort: a failure is logged and the remaining
     * steps still run. Safe to call more than once.
     */
    fun release() {
        if (released) return
        released = true

        bestEffort("cancel silent fade") { silentView.animate().cancel() }
        bestEffort("cancel speaking fade") { speakingView.animate().cancel() }
        bestEffort("detach silent view") { silentView.player = null }
        bestEffort("detach speaking view") { speakingView.player = null }
        bestEffort("release silent player") { playerSilent?.release() }
        bestEffort("release speaking player") { playerSpeaking?.release() }

        playerSilent = null
        playerSpeaking = null
    }

    /** Runs [action], logging (instead of propagating) any [Exception] it throws. */
    private inline fun bestEffort(step: String, action: () -> Unit) {
        try {
            action()
        } catch (e: Exception) {
            Log.w(TAG, "release: $step failed", e)
        }
    }

    private companion object {
        const val TAG = "VideoPlayerManager"
    }
}
More details, visit # http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects diff --git a/settings.gradle b/settings.gradle index e4cc0a3..9a7e8df 100644 --- a/settings.gradle +++ b/settings.gradle @@ -8,6 +8,9 @@ pluginManagement { dependencyResolutionManagement { repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS) repositories { + maven { url 'https://maven.aliyun.com/repository/central' } + maven { url 'https://maven.aliyun.com/repository/google' } + maven { url 'https://maven.aliyun.com/repository/gradle-plugin' } google() mavenCentral() }