Files
xiaozhi-esp32/main/audio/audio_service.cc
Xiaoxia 2b025c4ea6 Enhance audio processing and wake word detection (#1739)
* Enhance audio processing and wake word detection

- Set task priority in Application::Run to improve responsiveness.
- Log detected wake words with their state in HandleWakeWordDetectedEvent.
- Streamline audio feeding in AudioService to handle both wake word and audio processor events.
- Implement input buffering in AfeAudioProcessor, AfeWakeWord, CustomWakeWord, and EspWakeWord to manage audio data more efficiently.
- Clear input buffers on stop to prevent residual data issues.

* Refactor audio processing to enhance thread safety and state management

- Implement early return checks in Feed methods of AfeAudioProcessor, AfeWakeWord, CustomWakeWord, and EspWakeWord to prevent processing when not running.
- Introduce std::atomic for running state in CustomWakeWord and EspWakeWord to ensure thread-safe access.
- Consolidate input buffer management with mutex locks to avoid race conditions during Stop and Feed operations.

* Refactor listening mode handling and wake word detection configuration

- Replace direct mode setting logic with a new GetDefaultListeningMode method for improved clarity and maintainability.
- Update HandleToggleChatEvent, HandleWakeWordDetectedEvent, and ContinueWakeWordInvoke to utilize the new method for determining listening mode.
- Introduce Kconfig option WAKE_WORD_DETECTION_IN_LISTENING to enable or disable wake word detection during listening mode, enhancing configurability.
2026-02-04 14:28:21 +08:00

805 lines
31 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "audio_service.h"
#include <esp_log.h>
#include <cstring>
// Builds an esp_ae_rate_cvt_cfg_t compound literal describing a 16-bit sample
// rate converter from _src_rate to _dest_rate with _channel channels.
// Complexity is fixed at 2 and the speed-oriented performance type is selected.
#define RATE_CVT_CFG(_src_rate, _dest_rate, _channel) \
(esp_ae_rate_cvt_cfg_t) \
{ \
.src_rate = (uint32_t)(_src_rate), \
.dest_rate = (uint32_t)(_dest_rate), \
.channel = (uint8_t)(_channel), \
.bits_per_sample = ESP_AUDIO_BIT16, \
.complexity = 2, \
.perf_type = ESP_AE_RATE_CVT_PERF_TYPE_SPEED, \
}
// Builds an esp_opus_dec_cfg_t compound literal for a mono, non-self-delimited
// Opus decoder running at _sample_rate. _frame_duration_ms is translated to the
// decoder's frame-duration enum through AS_OPUS_GET_FRAME_DRU_ENUM.
#define OPUS_DEC_CFG(_sample_rate, _frame_duration_ms) \
(esp_opus_dec_cfg_t) \
{ \
.sample_rate = (uint32_t)(_sample_rate), \
.channel = ESP_AUDIO_MONO, \
.frame_duration = (esp_opus_dec_frame_duration_t)AS_OPUS_GET_FRAME_DRU_ENUM(_frame_duration_ms), \
.self_delimited = false, \
}
#if CONFIG_USE_AUDIO_PROCESSOR
#include "processors/afe_audio_processor.h"
#else
#include "processors/no_audio_processor.h"
#endif
#if CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4
#include "wake_words/afe_wake_word.h"
#include "wake_words/custom_wake_word.h"
#else
#include "wake_words/esp_wake_word.h"
#endif
#define TAG "AudioService"
/// Constructs the service and creates the FreeRTOS event group whose bits
/// (testing / wake word / audio processor "running") drive the input task.
AudioService::AudioService() {
    this->event_group_ = xEventGroupCreate();
}
/// Releases the native resources owned by the service: the power-management
/// timer, the event group, the Opus encoder/decoder and both resamplers.
AudioService::~AudioService() {
    // The timer callback dereferences `this`, so the timer has to be stopped and
    // deleted before the object goes away (it was previously leaked).
    // NOTE(review): assumes audio_power_timer_ is null-initialized in the class
    // declaration, like the other handles checked below - confirm in the header.
    if (audio_power_timer_ != nullptr) {
        esp_timer_stop(audio_power_timer_);  // returns an error if not running; ignored on purpose
        esp_timer_delete(audio_power_timer_);
        audio_power_timer_ = nullptr;
    }
    if (event_group_ != nullptr) {
        vEventGroupDelete(event_group_);
    }
    if (opus_encoder_ != nullptr) {
        esp_opus_enc_close(opus_encoder_);
    }
    if (opus_decoder_ != nullptr) {
        esp_opus_dec_close(opus_decoder_);
    }
    if (input_resampler_ != nullptr) {
        esp_ae_rate_cvt_close(input_resampler_);
    }
    if (output_resampler_ != nullptr) {
        esp_ae_rate_cvt_close(output_resampler_);
    }
}
/// Binds the service to `codec`, starts the codec and builds the processing
/// chain: Opus decoder (at the codec output rate), Opus encoder, an optional
/// input resampler to 16 kHz, the audio processor with its callbacks, and the
/// (not yet started) power-management timer.
///
/// @param codec Audio codec used for capture and playback; stored in codec_.
void AudioService::Initialize(AudioCodec* codec) {
    codec_ = codec;
    codec_->Start();

    // --- Opus decoder ---
    esp_opus_dec_cfg_t decoder_config = OPUS_DEC_CFG(codec->output_sample_rate(), OPUS_FRAME_DURATION_MS);
    auto ret = esp_opus_dec_open(&decoder_config, sizeof(esp_opus_dec_cfg_t), &opus_decoder_);
    if (opus_decoder_ != nullptr) {
        decoder_sample_rate_ = codec->output_sample_rate();
        decoder_duration_ms_ = OPUS_FRAME_DURATION_MS;
        // Samples per decoded frame.
        decoder_frame_size_ = decoder_sample_rate_ / 1000 * OPUS_FRAME_DURATION_MS;
    } else {
        ESP_LOGE(TAG, "Failed to create audio decoder, error code: %d", ret);
    }

    // --- Opus encoder ---
    esp_opus_enc_config_t encoder_config = AS_OPUS_ENC_CONFIG();
    ret = esp_opus_enc_open(&encoder_config, sizeof(esp_opus_enc_config_t), &opus_encoder_);
    if (opus_encoder_ != nullptr) {
        encoder_sample_rate_ = 16000;
        encoder_duration_ms_ = OPUS_FRAME_DURATION_MS;
        esp_opus_enc_get_frame_size(opus_encoder_, &encoder_frame_size_, &encoder_outbuf_size_);
        // The value just read is divided by the sample width so that the member
        // can be compared against pcm.size() (a sample count) when encoding.
        encoder_frame_size_ /= sizeof(int16_t);
    } else {
        ESP_LOGE(TAG, "Failed to create audio encoder, error code: %d", ret);
    }

    // --- Input resampler, only when the codec does not capture at 16 kHz ---
    if (codec->input_sample_rate() != 16000) {
        esp_ae_rate_cvt_cfg_t resampler_config = RATE_CVT_CFG(
            codec->input_sample_rate(), ESP_AUDIO_SAMPLE_RATE_16K, codec->input_channels());
        auto resampler_ret = esp_ae_rate_cvt_open(&resampler_config, &input_resampler_);
        if (input_resampler_ == nullptr) {
            ESP_LOGE(TAG, "Failed to create input resampler, error code: %d", resampler_ret);
        }
    }

    // --- Audio processor ---
#if CONFIG_USE_AUDIO_PROCESSOR
    audio_processor_ = std::make_unique<AfeAudioProcessor>();
#else
    audio_processor_ = std::make_unique<NoAudioProcessor>();
#endif

    // Processed PCM is queued for encoding and sending.
    audio_processor_->OnOutput([this](std::vector<int16_t>&& data) {
        PushTaskToEncodeQueue(kAudioTaskTypeEncodeToSendQueue, std::move(data));
    });

    // Cache the VAD state and forward it to the registered callback, if any.
    audio_processor_->OnVadStateChange([this](bool speaking) {
        voice_detected_ = speaking;
        if (callbacks_.on_vad_change) {
            callbacks_.on_vad_change(speaking);
        }
    });

    // --- Power-management timer (created here, started in Start()) ---
    esp_timer_create_args_t power_timer_args = {
        .callback = [](void* arg) {
            static_cast<AudioService*>(arg)->CheckAndUpdateAudioPowerState();
        },
        .arg = this,
        .dispatch_method = ESP_TIMER_TASK,
        .name = "audio_power_timer",
        .skip_unhandled_events = true,
    };
    esp_timer_create(&power_timer_args, &audio_power_timer_);
}
void AudioService::Start() {
service_stopped_ = false;
xEventGroupClearBits(event_group_, AS_EVENT_AUDIO_TESTING_RUNNING | AS_EVENT_WAKE_WORD_RUNNING | AS_EVENT_AUDIO_PROCESSOR_RUNNING);
esp_timer_start_periodic(audio_power_timer_, 1000000);
#if CONFIG_USE_AUDIO_PROCESSOR
/* Start the audio input task */
xTaskCreatePinnedToCore([](void* arg) {
AudioService* audio_service = (AudioService*)arg;
audio_service->AudioInputTask();
vTaskDelete(NULL);
}, "audio_input", 2048 * 3, this, 8, &audio_input_task_handle_, 0);
/* Start the audio output task */
xTaskCreate([](void* arg) {
AudioService* audio_service = (AudioService*)arg;
audio_service->AudioOutputTask();
vTaskDelete(NULL);
}, "audio_output", 2048 * 2, this, 4, &audio_output_task_handle_);
#else
/* Start the audio input task */
xTaskCreate([](void* arg) {
AudioService* audio_service = (AudioService*)arg;
audio_service->AudioInputTask();
vTaskDelete(NULL);
}, "audio_input", 2048 * 2, this, 8, &audio_input_task_handle_);
/* Start the audio output task */
xTaskCreate([](void* arg) {
AudioService* audio_service = (AudioService*)arg;
audio_service->AudioOutputTask();
vTaskDelete(NULL);
}, "audio_output", 2048, this, 4, &audio_output_task_handle_);
#endif
/* Start the opus codec task */
xTaskCreate([](void* arg) {
AudioService* audio_service = (AudioService*)arg;
audio_service->OpusCodecTask();
vTaskDelete(NULL);
}, "opus_codec", 2048 * 12, this, 2, &opus_codec_task_handle_);
}
void AudioService::Stop() {
esp_timer_stop(audio_power_timer_);
service_stopped_ = true;
xEventGroupSetBits(event_group_, AS_EVENT_AUDIO_TESTING_RUNNING |
AS_EVENT_WAKE_WORD_RUNNING |
AS_EVENT_AUDIO_PROCESSOR_RUNNING);
std::lock_guard<std::mutex> lock(audio_queue_mutex_);
audio_encode_queue_.clear();
audio_decode_queue_.clear();
audio_playback_queue_.clear();
audio_testing_queue_.clear();
audio_queue_cv_.notify_all();
}
bool AudioService::ReadAudioData(std::vector<int16_t>& data, int sample_rate, int samples) {
if (!codec_->input_enabled()) {
esp_timer_stop(audio_power_timer_);
esp_timer_start_periodic(audio_power_timer_, AUDIO_POWER_CHECK_INTERVAL_MS * 1000);
codec_->EnableInput(true);
}
if (codec_->input_sample_rate() != sample_rate) {
data.resize(samples * codec_->input_sample_rate() / sample_rate * codec_->input_channels());
if (!codec_->InputData(data)) {
return false;
}
if (input_resampler_ != nullptr) {
std::lock_guard<std::mutex> lock(input_resampler_mutex_);
uint32_t in_sample_num = data.size() / codec_->input_channels();
uint32_t output_samples = 0;
esp_ae_rate_cvt_get_max_out_sample_num(input_resampler_, in_sample_num, &output_samples);
auto resampled = std::vector<int16_t>(output_samples * codec_->input_channels());
uint32_t actual_output = output_samples;
esp_ae_rate_cvt_process(input_resampler_, (esp_ae_sample_t)data.data(), in_sample_num,
(esp_ae_sample_t)resampled.data(), &actual_output);
resampled.resize(actual_output * codec_->input_channels());
data = std::move(resampled);
}
} else {
data.resize(samples * codec_->input_channels());
if (!codec_->InputData(data)) {
return false;
}
}
/* Update the last input time */
last_input_time_ = std::chrono::steady_clock::now();
debug_statistics_.input_count++;
#if CONFIG_USE_AUDIO_DEBUGGER
// 音频调试:发送原始音频数据
if (audio_debugger_ == nullptr) {
audio_debugger_ = std::make_unique<AudioDebugger>();
}
audio_debugger_->Feed(data);
#endif
return true;
}
void AudioService::AudioInputTask() {
while (true) {
EventBits_t bits = xEventGroupWaitBits(event_group_, AS_EVENT_AUDIO_TESTING_RUNNING |
AS_EVENT_WAKE_WORD_RUNNING | AS_EVENT_AUDIO_PROCESSOR_RUNNING,
pdFALSE, pdFALSE, portMAX_DELAY);
if (service_stopped_) {
break;
}
if (audio_input_need_warmup_) {
audio_input_need_warmup_ = false;
vTaskDelay(pdMS_TO_TICKS(120));
continue;
}
/* Used for audio testing in NetworkConfiguring mode by clicking the BOOT button */
if (bits & AS_EVENT_AUDIO_TESTING_RUNNING) {
if (audio_testing_queue_.size() >= AUDIO_TESTING_MAX_DURATION_MS / OPUS_FRAME_DURATION_MS) {
ESP_LOGW(TAG, "Audio testing queue is full, stopping audio testing");
EnableAudioTesting(false);
continue;
}
std::vector<int16_t> data;
int samples = OPUS_FRAME_DURATION_MS * 16000 / 1000;
if (ReadAudioData(data, 16000, samples)) {
// If input channels is 2, we need to fetch the left channel data
if (codec_->input_channels() == 2) {
auto mono_data = std::vector<int16_t>(data.size() / 2);
for (size_t i = 0, j = 0; i < mono_data.size(); ++i, j += 2) {
mono_data[i] = data[j];
}
data = std::move(mono_data);
}
PushTaskToEncodeQueue(kAudioTaskTypeEncodeToTestingQueue, std::move(data));
continue;
}
}
/* Feed the wake word and/or audio processor */
if (bits & (AS_EVENT_WAKE_WORD_RUNNING | AS_EVENT_AUDIO_PROCESSOR_RUNNING)) {
int samples = 160; // 10ms
std::vector<int16_t> data;
if (ReadAudioData(data, 16000, samples)) {
if (bits & AS_EVENT_WAKE_WORD_RUNNING) {
wake_word_->Feed(data);
}
if (bits & AS_EVENT_AUDIO_PROCESSOR_RUNNING) {
audio_processor_->Feed(std::move(data));
}
continue;
}
}
ESP_LOGE(TAG, "Should not be here, bits: %lx", bits);
break;
}
ESP_LOGW(TAG, "Audio input task stopped");
}
/// Playback worker loop. Waits for a decoded task in the playback queue (or
/// for the service to stop), writes its PCM to the codec - enabling the output
/// and restarting the power timer first if needed - and records the activity.
/// With CONFIG_USE_SERVER_AEC, positive task timestamps are appended to
/// timestamp_queue_ after playback.
void AudioService::AudioOutputTask() {
    for (;;) {
        std::unique_lock<std::mutex> queue_lock(audio_queue_mutex_);
        audio_queue_cv_.wait(queue_lock, [this]() {
            return service_stopped_ || !audio_playback_queue_.empty();
        });
        if (service_stopped_) {
            break;
        }

        // Take the next task and let producers know a slot became free.
        auto playback = std::move(audio_playback_queue_.front());
        audio_playback_queue_.pop_front();
        audio_queue_cv_.notify_all();
        queue_lock.unlock();

        // Power the output path back up on demand.
        if (!codec_->output_enabled()) {
            esp_timer_stop(audio_power_timer_);
            esp_timer_start_periodic(audio_power_timer_, AUDIO_POWER_CHECK_INTERVAL_MS * 1000);
            codec_->EnableOutput(true);
        }
        codec_->OutputData(playback->pcm);

        /* Record activity for the power-management timer and statistics */
        last_output_time_ = std::chrono::steady_clock::now();
        debug_statistics_.playback_count++;

#if CONFIG_USE_SERVER_AEC
        /* Record the timestamp for server AEC */
        if (playback->timestamp > 0) {
            // Re-acquired here; released when queue_lock goes out of scope.
            queue_lock.lock();
            timestamp_queue_.push_back(playback->timestamp);
        }
#endif
    }

    ESP_LOGW(TAG, "Audio output task stopped");
}
/// Codec worker loop. Sleeps until there is work that can make progress
/// (something to decode with room in the playback queue, or something to
/// encode with room in the send queue) or the service is stopped.
///
/// Decode path: pops one packet, reconfigures the decoder for the packet's
/// sample rate / frame duration if needed, decodes it, resamples to the codec
/// output rate when they differ, and queues the PCM for playback.
/// Encode path: pops one PCM task, encodes it, and pushes the packet to the
/// send queue (notifying `on_send_queue_available`) or to the testing queue,
/// depending on the task type.
///
/// audio_queue_mutex_ is released while the codec work runs and re-acquired
/// before queues are touched again.
void AudioService::OpusCodecTask() {
    while (true) {
        std::unique_lock<std::mutex> lock(audio_queue_mutex_);
        audio_queue_cv_.wait(lock, [this]() {
            return service_stopped_ ||
                (!audio_encode_queue_.empty() && audio_send_queue_.size() < MAX_SEND_PACKETS_IN_QUEUE) ||
                (!audio_decode_queue_.empty() && audio_playback_queue_.size() < MAX_PLAYBACK_TASKS_IN_QUEUE);
        });
        if (service_stopped_) {
            break;
        }

        /* Decode the audio from decode queue */
        if (!audio_decode_queue_.empty() && audio_playback_queue_.size() < MAX_PLAYBACK_TASKS_IN_QUEUE) {
            auto packet = std::move(audio_decode_queue_.front());
            audio_decode_queue_.pop_front();
            audio_queue_cv_.notify_all();
            lock.unlock();

            auto task = std::make_unique<AudioTask>();
            task->type = kAudioTaskTypeDecodeToPlaybackQueue;
            task->timestamp = packet->timestamp;

            SetDecodeSampleRate(packet->sample_rate, packet->frame_duration);
            if (opus_decoder_ != nullptr) {
                task->pcm.resize(decoder_frame_size_);
                esp_audio_dec_in_raw_t raw = {
                    .buffer = (uint8_t *)(packet->payload.data()),
                    .len = (uint32_t)(packet->payload.size()),
                    .consumed = 0,
                    .frame_recover = ESP_AUDIO_DEC_RECOVERY_NONE,
                };
                esp_audio_dec_out_frame_t out_frame = {
                    .buffer = (uint8_t *)(task->pcm.data()),
                    .len = (uint32_t)(task->pcm.size() * sizeof(int16_t)),
                    .decoded_size = 0,
                };
                esp_audio_dec_info_t dec_info = {};

                // decoder_mutex_ serializes decoding with ResetDecoder().
                std::unique_lock<std::mutex> decoder_lock(decoder_mutex_);
                auto ret = esp_opus_dec_decode(opus_decoder_, &raw, &out_frame, &dec_info);
                decoder_lock.unlock();

                if (ret == ESP_AUDIO_ERR_OK) {
                    // Shrink to the number of samples actually decoded.
                    task->pcm.resize(out_frame.decoded_size / sizeof(int16_t));
                    if (decoder_sample_rate_ != codec_->output_sample_rate() && output_resampler_ != nullptr) {
                        uint32_t target_size = 0;
                        esp_ae_rate_cvt_get_max_out_sample_num(output_resampler_, task->pcm.size(), &target_size);
                        std::vector<int16_t> resampled(target_size);
                        uint32_t actual_output = target_size;
                        esp_ae_rate_cvt_process(output_resampler_, (esp_ae_sample_t)task->pcm.data(), task->pcm.size(),
                            (esp_ae_sample_t)resampled.data(), &actual_output);
                        resampled.resize(actual_output);
                        task->pcm = std::move(resampled);
                    }
                    lock.lock();
                    audio_playback_queue_.push_back(std::move(task));
                    audio_queue_cv_.notify_all();
                } else {
                    ESP_LOGE(TAG, "Failed to decode audio after resize, error code: %d", ret);
                    lock.lock();
                }
            } else {
                ESP_LOGE(TAG, "Audio decoder is not configured");
                lock.lock();
            }
            // Counted exactly once per packet taken from the decode queue. The
            // success branch used to increment this a second time, which made
            // the statistic read twice the real number of decoded packets.
            debug_statistics_.decode_count++;
        }

        /* Encode the audio to send queue */
        if (!audio_encode_queue_.empty() && audio_send_queue_.size() < MAX_SEND_PACKETS_IN_QUEUE) {
            auto task = std::move(audio_encode_queue_.front());
            audio_encode_queue_.pop_front();
            audio_queue_cv_.notify_all();
            lock.unlock();

            auto packet = std::make_unique<AudioStreamPacket>();
            packet->frame_duration = OPUS_FRAME_DURATION_MS;
            packet->sample_rate = 16000;
            packet->timestamp = task->timestamp;

            // The encoder only accepts exactly one frame of PCM.
            if (opus_encoder_ != nullptr && task->pcm.size() == encoder_frame_size_) {
                std::vector<uint8_t> buf(encoder_outbuf_size_);
                esp_audio_enc_in_frame_t in = {
                    .buffer = (uint8_t *)(task->pcm.data()),
                    .len = (uint32_t)(encoder_frame_size_ * sizeof(int16_t)),
                };
                esp_audio_enc_out_frame_t out = {
                    .buffer = buf.data(),
                    .len = (uint32_t)encoder_outbuf_size_,
                    .encoded_bytes = 0,
                };
                auto ret = esp_opus_enc_process(opus_encoder_, &in, &out);
                if (ret == ESP_AUDIO_ERR_OK) {
                    packet->payload.assign(buf.data(), buf.data() + out.encoded_bytes);

                    if (task->type == kAudioTaskTypeEncodeToSendQueue) {
                        {
                            // `lock` is released at this point, so taking the
                            // queue mutex again through a second guard is safe.
                            std::lock_guard<std::mutex> lock2(audio_queue_mutex_);
                            audio_send_queue_.push_back(std::move(packet));
                        }
                        // Invoked without holding the queue mutex.
                        if (callbacks_.on_send_queue_available) {
                            callbacks_.on_send_queue_available();
                        }
                    } else if (task->type == kAudioTaskTypeEncodeToTestingQueue) {
                        std::lock_guard<std::mutex> lock2(audio_queue_mutex_);
                        audio_testing_queue_.push_back(std::move(packet));
                    }
                    debug_statistics_.encode_count++;
                } else {
                    ESP_LOGE(TAG, "Failed to encode audio, error code: %d", ret);
                }
            } else {
                ESP_LOGE(TAG, "Failed to encode audio: encoder not configured or invalid frame size (got %u, expected %u)",
                    task->pcm.size(), encoder_frame_size_);
            }
            lock.lock();
        }
    }

    ESP_LOGW(TAG, "Opus codec task stopped");
}
/// Makes sure an Opus decoder configured for `sample_rate` / `frame_duration`
/// exists, recreating it when the parameters changed or when no decoder is
/// currently open, and (re)creates the output resampler when the decoder rate
/// differs from the codec output rate.
///
/// @param sample_rate    Sample rate of the incoming Opus stream, in Hz.
/// @param frame_duration Frame duration of the incoming Opus stream, in ms.
void AudioService::SetDecodeSampleRate(int sample_rate, int frame_duration) {
    // Fast path only when a decoder with these parameters really exists. If a
    // previous reopen failed, opus_decoder_ is null while the cached parameters
    // still describe the old decoder; returning early in that state would leave
    // the service without a decoder for good.
    if (opus_decoder_ != nullptr &&
        decoder_sample_rate_ == sample_rate && decoder_duration_ms_ == frame_duration) {
        return;
    }

    {
        // Hold decoder_mutex_ across close AND reopen so ResetDecoder() never
        // observes the handle while it is being replaced.
        std::lock_guard<std::mutex> decoder_lock(decoder_mutex_);
        if (opus_decoder_ != nullptr) {
            esp_opus_dec_close(opus_decoder_);
            opus_decoder_ = nullptr;
        }

        esp_opus_dec_cfg_t opus_dec_cfg = OPUS_DEC_CFG(sample_rate, frame_duration);
        auto ret = esp_opus_dec_open(&opus_dec_cfg, sizeof(esp_opus_dec_cfg_t), &opus_decoder_);
        if (opus_decoder_ == nullptr) {
            ESP_LOGE(TAG, "Failed to create audio decoder, error code: %d", ret);
            return;
        }

        decoder_sample_rate_ = sample_rate;
        decoder_duration_ms_ = frame_duration;
        // Samples per decoded frame.
        decoder_frame_size_ = decoder_sample_rate_ / 1000 * frame_duration;
    }

    auto codec = Board::GetInstance().GetAudioCodec();
    if (decoder_sample_rate_ != codec->output_sample_rate()) {
        ESP_LOGI(TAG, "Resampling audio from %d to %d", decoder_sample_rate_, codec->output_sample_rate());
        // Drop any resampler built for a previous decoder rate.
        if (output_resampler_ != nullptr) {
            esp_ae_rate_cvt_close(output_resampler_);
            output_resampler_ = nullptr;
        }
        esp_ae_rate_cvt_cfg_t output_resampler_cfg = RATE_CVT_CFG(
            decoder_sample_rate_, codec->output_sample_rate(), ESP_AUDIO_MONO);
        auto resampler_ret = esp_ae_rate_cvt_open(&output_resampler_cfg, &output_resampler_);
        if (output_resampler_ == nullptr) {
            ESP_LOGE(TAG, "Failed to create output resampler, error code: %d", resampler_ret);
        }
    }
}
/// Wraps `pcm` in an AudioTask of the given type and appends it to the encode
/// queue, blocking while the queue is at MAX_ENCODE_TASKS_IN_QUEUE.
/// Tasks destined for the send queue consume one entry from timestamp_queue_
/// (when available); the value is attached unless that queue has overflowed.
///
/// @param type Destination of the encoded result.
/// @param pcm  PCM samples; moved into the task.
void AudioService::PushTaskToEncodeQueue(AudioTaskType type, std::vector<int16_t>&& pcm) {
    auto encode_task = std::make_unique<AudioTask>();
    encode_task->type = type;
    encode_task->pcm = std::move(pcm);

    std::unique_lock<std::mutex> queue_lock(audio_queue_mutex_);

    const bool wants_timestamp = (type == kAudioTaskTypeEncodeToSendQueue);
    if (wants_timestamp && !timestamp_queue_.empty()) {
        if (timestamp_queue_.size() > MAX_TIMESTAMPS_IN_QUEUE) {
            // Too many pending timestamps: discard the oldest without using it.
            ESP_LOGW(TAG, "Timestamp queue (%u) is full, dropping timestamp", timestamp_queue_.size());
        } else {
            encode_task->timestamp = timestamp_queue_.front();
        }
        timestamp_queue_.pop_front();
    }

    // Back-pressure: wait for the codec task to drain the queue.
    audio_queue_cv_.wait(queue_lock, [this]() {
        return audio_encode_queue_.size() < MAX_ENCODE_TASKS_IN_QUEUE;
    });
    audio_encode_queue_.push_back(std::move(encode_task));
    audio_queue_cv_.notify_all();
}
/// Appends an encoded packet to the decode queue.
///
/// @param packet Packet to enqueue; ownership is transferred.
/// @param wait   When the queue is full: true blocks until there is room,
///               false gives up immediately.
/// @return true if the packet was queued, false if the queue was full and
///         `wait` was false.
bool AudioService::PushPacketToDecodeQueue(std::unique_ptr<AudioStreamPacket> packet, bool wait) {
    std::unique_lock<std::mutex> queue_lock(audio_queue_mutex_);

    const auto has_room = [this]() {
        return audio_decode_queue_.size() < MAX_DECODE_PACKETS_IN_QUEUE;
    };
    if (!has_room()) {
        if (!wait) {
            return false;
        }
        audio_queue_cv_.wait(queue_lock, has_room);
    }

    audio_decode_queue_.push_back(std::move(packet));
    audio_queue_cv_.notify_all();
    return true;
}
/// Removes and returns the oldest packet of the send queue.
///
/// @return The packet, or nullptr when the send queue is empty.
std::unique_ptr<AudioStreamPacket> AudioService::PopPacketFromSendQueue() {
    std::unique_ptr<AudioStreamPacket> next_packet;  // stays null if nothing is queued

    std::lock_guard<std::mutex> guard(audio_queue_mutex_);
    if (!audio_send_queue_.empty()) {
        next_packet = std::move(audio_send_queue_.front());
        audio_send_queue_.pop_front();
        // A slot was freed: let the codec task continue encoding.
        audio_queue_cv_.notify_all();
    }
    return next_packet;
}
/// Asks the wake word engine to encode its wake word data; does nothing when
/// no wake word engine is configured.
void AudioService::EncodeWakeWord() {
    if (!wake_word_) {
        return;
    }
    wake_word_->EncodeWakeWordData();
}
/// Returns the last wake word reported by the wake word engine.
///
/// @return Reference to the engine's last detected wake word, or to an empty
///         string when no wake word engine is configured (wake_word_ can be
///         null after SetModelsList finds no matching model).
const std::string& AudioService::GetLastWakeWord() const {
    // A reference must outlive the call, hence the function-local static.
    static const std::string kNoWakeWord;
    if (!wake_word_) {
        return kNoWakeWord;
    }
    return wake_word_->GetLastDetectedWakeWord();
}
/// Fetches the next Opus-encoded wake word packet from the wake word engine.
///
/// @return A packet whose payload was filled by GetWakeWordOpus, or nullptr
///         when the engine has nothing to return or no engine is configured.
std::unique_ptr<AudioStreamPacket> AudioService::PopWakeWordPacket() {
    // wake_word_ is null when SetModelsList found no wake word model.
    if (!wake_word_) {
        return nullptr;
    }
    auto packet = std::make_unique<AudioStreamPacket>();
    if (wake_word_->GetWakeWordOpus(packet->payload)) {
        return packet;
    }
    return nullptr;
}
/// Starts or stops wake word detection. Does nothing when no wake word engine
/// is configured. On the first enable the engine is initialized lazily; if
/// that fails, detection stays disabled.
///
/// @param enable true to start feeding the wake word engine, false to stop.
void AudioService::EnableWakeWordDetection(bool enable) {
    if (!wake_word_) {
        return;
    }

    ESP_LOGD(TAG, "%s wake word detection", enable ? "Enabling" : "Disabling");

    if (!enable) {
        wake_word_->Stop();
        xEventGroupClearBits(event_group_, AS_EVENT_WAKE_WORD_RUNNING);
        return;
    }

    // Lazy one-time initialization.
    if (!wake_word_initialized_) {
        if (!wake_word_->Initialize(codec_, models_list_)) {
            ESP_LOGE(TAG, "Failed to initialize wake word");
            return;
        }
        wake_word_initialized_ = true;
    }

    // Reset input resampler to clear cached data from previous mode (e.g. AudioProcessor)
    // This prevents buffer overflow when switching between different feed sizes
    {
        std::lock_guard<std::mutex> guard(input_resampler_mutex_);
        if (input_resampler_ != nullptr) {
            esp_ae_rate_cvt_reset(input_resampler_);
        }
    }

    wake_word_->Start();
    xEventGroupSetBits(event_group_, AS_EVENT_WAKE_WORD_RUNNING);
}
/// Starts or stops the audio processor. On the first enable the processor is
/// initialized lazily. Enabling also resets the decoder and queues, requests
/// an input warm-up pause, and resets the input resampler.
///
/// @param enable true to start feeding the audio processor, false to stop.
void AudioService::EnableVoiceProcessing(bool enable) {
    ESP_LOGD(TAG, "%s voice processing", enable ? "Enabling" : "Disabling");

    if (!enable) {
        audio_processor_->Stop();
        xEventGroupClearBits(event_group_, AS_EVENT_AUDIO_PROCESSOR_RUNNING);
        return;
    }

    // Lazy one-time initialization.
    if (!audio_processor_initialized_) {
        audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS, models_list_);
        audio_processor_initialized_ = true;
    }

    /* We should make sure no audio is playing */
    ResetDecoder();
    // Makes AudioInputTask pause once before it resumes reading.
    audio_input_need_warmup_ = true;

    // Reset input resampler to clear cached data from previous mode (e.g. WakeWord)
    // This prevents buffer overflow when switching between different feed sizes
    {
        std::lock_guard<std::mutex> guard(input_resampler_mutex_);
        if (input_resampler_ != nullptr) {
            esp_ae_rate_cvt_reset(input_resampler_);
        }
    }

    audio_processor_->Start();
    xEventGroupSetBits(event_group_, AS_EVENT_AUDIO_PROCESSOR_RUNNING);
}
/// Starts or stops the audio loopback test. While enabled, the input task
/// records encoded packets into the testing queue; disabling moves everything
/// recorded so far into the decode queue so it gets played back.
///
/// @param enable true to start recording, false to stop and play back.
void AudioService::EnableAudioTesting(bool enable) {
    ESP_LOGI(TAG, "%s audio testing", enable ? "Enabling" : "Disabling");
    if (enable) {
        xEventGroupSetBits(event_group_, AS_EVENT_AUDIO_TESTING_RUNNING);
    } else {
        xEventGroupClearBits(event_group_, AS_EVENT_AUDIO_TESTING_RUNNING);
        /* Move audio_testing_queue_ into audio_decode_queue_ (replaces its content) */
        std::lock_guard<std::mutex> lock(audio_queue_mutex_);
        audio_decode_queue_ = std::move(audio_testing_queue_);
        // A moved-from container is only guaranteed to be valid, not empty.
        // The next recording and IsIdle() both depend on this queue being empty.
        audio_testing_queue_.clear();
        audio_queue_cv_.notify_all();
    }
}
/// Turns device-side AEC on or off in the audio processor, initializing the
/// processor first if that has not happened yet.
///
/// @param enable Requested device AEC state, forwarded to the processor.
void AudioService::EnableDeviceAec(bool enable) {
    const char* action = enable ? "Enabling" : "Disabling";
    ESP_LOGI(TAG, "%s device AEC", action);

    const bool needs_init = !audio_processor_initialized_;
    if (needs_init) {
        audio_processor_->Initialize(codec_, OPUS_FRAME_DURATION_MS, models_list_);
        audio_processor_initialized_ = true;
    }

    audio_processor_->EnableDeviceAec(enable);
}
/// Stores a copy of the given callback set; the stored callbacks are invoked
/// by the service for VAD changes, wake word detection and send-queue events.
///
/// @param callbacks Callback set to copy into the service.
void AudioService::SetCallbacks(AudioServiceCallbacks& callbacks) {
    this->callbacks_ = callbacks;
}
void AudioService::PlaySound(const std::string_view& ogg) {
if (!codec_->output_enabled()) {
esp_timer_stop(audio_power_timer_);
esp_timer_start_periodic(audio_power_timer_, AUDIO_POWER_CHECK_INTERVAL_MS * 1000);
codec_->EnableOutput(true);
}
const uint8_t* buf = reinterpret_cast<const uint8_t*>(ogg.data());
size_t size = ogg.size();
size_t offset = 0;
auto find_page = [&](size_t start)->size_t {
for (size_t i = start; i + 4 <= size; ++i) {
if (buf[i] == 'O' && buf[i+1] == 'g' && buf[i+2] == 'g' && buf[i+3] == 'S') return i;
}
return static_cast<size_t>(-1);
};
bool seen_head = false;
bool seen_tags = false;
int sample_rate = 16000; // 默认值
while (true) {
size_t pos = find_page(offset);
if (pos == static_cast<size_t>(-1)) break;
offset = pos;
if (offset + 27 > size) break;
const uint8_t* page = buf + offset;
uint8_t page_segments = page[26];
size_t seg_table_off = offset + 27;
if (seg_table_off + page_segments > size) break;
size_t body_size = 0;
for (size_t i = 0; i < page_segments; ++i) body_size += page[27 + i];
size_t body_off = seg_table_off + page_segments;
if (body_off + body_size > size) break;
// Parse packets using lacing
size_t cur = body_off;
size_t seg_idx = 0;
while (seg_idx < page_segments) {
size_t pkt_len = 0;
size_t pkt_start = cur;
bool continued = false;
do {
uint8_t l = page[27 + seg_idx++];
pkt_len += l;
cur += l;
continued = (l == 255);
} while (continued && seg_idx < page_segments);
if (pkt_len == 0) continue;
const uint8_t* pkt_ptr = buf + pkt_start;
if (!seen_head) {
// 解析OpusHead包
if (pkt_len >= 19 && std::memcmp(pkt_ptr, "OpusHead", 8) == 0) {
seen_head = true;
// OpusHead结构[0-7] "OpusHead", [8] version, [9] channel_count, [10-11] pre_skip
// [12-15] input_sample_rate, [16-17] output_gain, [18] mapping_family
if (pkt_len >= 12) {
uint8_t version = pkt_ptr[8];
uint8_t channel_count = pkt_ptr[9];
if (pkt_len >= 16) {
// 读取输入采样率 (little-endian)
sample_rate = pkt_ptr[12] | (pkt_ptr[13] << 8) |
(pkt_ptr[14] << 16) | (pkt_ptr[15] << 24);
ESP_LOGI(TAG, "OpusHead: version=%d, channels=%d, sample_rate=%d",
version, channel_count, sample_rate);
}
}
}
continue;
}
if (!seen_tags) {
// Expect OpusTags in second packet
if (pkt_len >= 8 && std::memcmp(pkt_ptr, "OpusTags", 8) == 0) {
seen_tags = true;
}
continue;
}
// Audio packet (Opus)
auto packet = std::make_unique<AudioStreamPacket>();
packet->sample_rate = sample_rate;
packet->frame_duration = 60;
packet->payload.resize(pkt_len);
std::memcpy(packet->payload.data(), pkt_ptr, pkt_len);
PushPacketToDecodeQueue(std::move(packet), true);
}
offset = body_off + body_size;
}
}
/// Reports whether the encode, decode, playback and testing queues are all
/// empty (the send queue is not considered).
///
/// @return true when all four queues are empty.
bool AudioService::IsIdle() {
    std::lock_guard<std::mutex> guard(audio_queue_mutex_);
    const bool codec_queues_empty = audio_encode_queue_.empty() && audio_decode_queue_.empty();
    return codec_queues_empty && audio_playback_queue_.empty() && audio_testing_queue_.empty();
}
/// Blocks the caller until both the decode queue and the playback queue are
/// empty, or until the service is stopped.
void AudioService::WaitForPlaybackQueueEmpty() {
    const auto drained_or_stopped = [this]() {
        if (service_stopped_) {
            return true;
        }
        return audio_decode_queue_.empty() && audio_playback_queue_.empty();
    };

    std::unique_lock<std::mutex> queue_lock(audio_queue_mutex_);
    audio_queue_cv_.wait(queue_lock, drained_or_stopped);
}
/// Resets the Opus decoder state (if a decoder exists) and discards all
/// pending timestamps and decode, playback and testing queue entries, then
/// wakes every waiter on the queue condition variable.
void AudioService::ResetDecoder() {
    std::lock_guard<std::mutex> queue_guard(audio_queue_mutex_);

    {
        // decoder_mutex_ is only needed for the decoder call itself.
        std::lock_guard<std::mutex> decoder_guard(decoder_mutex_);
        if (opus_decoder_ != nullptr) {
            esp_opus_dec_reset(opus_decoder_);
        }
    }

    timestamp_queue_.clear();
    audio_decode_queue_.clear();
    audio_playback_queue_.clear();
    audio_testing_queue_.clear();
    audio_queue_cv_.notify_all();
}
/// Power-timer callback body: disables the codec input and/or output once the
/// respective direction has been inactive for longer than
/// AUDIO_POWER_TIMEOUT_MS, and stops the timer when both are disabled.
void AudioService::CheckAndUpdateAudioPowerState() {
    const auto now = std::chrono::steady_clock::now();
    // Milliseconds elapsed since the given activity time stamp.
    const auto idle_ms = [&now](const auto& last_activity) {
        return std::chrono::duration_cast<std::chrono::milliseconds>(now - last_activity).count();
    };
    const auto input_idle = idle_ms(last_input_time_);
    const auto output_idle = idle_ms(last_output_time_);

    if (input_idle > AUDIO_POWER_TIMEOUT_MS && codec_->input_enabled()) {
        codec_->EnableInput(false);
    }
    if (output_idle > AUDIO_POWER_TIMEOUT_MS && codec_->output_enabled()) {
        codec_->EnableOutput(false);
    }

    // Nothing left to watch: the timer is restarted when a path is re-enabled.
    const bool any_path_enabled = codec_->input_enabled() || codec_->output_enabled();
    if (!any_path_enabled) {
        esp_timer_stop(audio_power_timer_);
    }
}
/// Stores the speech model list and selects the wake word engine from it.
/// On ESP32-S3 / ESP32-P4: CustomWakeWord if a model with ESP_MN_PREFIX is
/// present, otherwise AfeWakeWord if one with ESP_WN_PREFIX is present.
/// On other targets: EspWakeWord if a model with ESP_WN_PREFIX is present.
/// With no matching model, no wake word engine is kept. A selected engine is
/// wired to forward detections to `on_wake_word_detected`.
///
/// @param models_list Model list handed to esp_srmodel_filter and kept in models_list_.
void AudioService::SetModelsList(srmodel_list_t* models_list) {
    models_list_ = models_list;

#if CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4
    const bool has_mn_model = esp_srmodel_filter(models_list_, ESP_MN_PREFIX, NULL) != nullptr;
    if (has_mn_model) {
        wake_word_ = std::make_unique<CustomWakeWord>();
    } else if (esp_srmodel_filter(models_list_, ESP_WN_PREFIX, NULL) != nullptr) {
        wake_word_ = std::make_unique<AfeWakeWord>();
    } else {
        wake_word_.reset();
    }
#else
    const bool has_wn_model = esp_srmodel_filter(models_list_, ESP_WN_PREFIX, NULL) != nullptr;
    if (has_wn_model) {
        wake_word_ = std::make_unique<EspWakeWord>();
    } else {
        wake_word_.reset();
    }
#endif

    if (!wake_word_) {
        return;
    }

    // Relay detections to the application callback, if one is registered.
    wake_word_->OnWakeWordDetected([this](const std::string& wake_word) {
        if (callbacks_.on_wake_word_detected) {
            callbacks_.on_wake_word_detected(wake_word);
        }
    });
}
/// Tells whether the active wake word engine is an AfeWakeWord.
///
/// @return true only on ESP32-S3 / ESP32-P4 builds when wake_word_ holds an
///         AfeWakeWord instance; false otherwise (including when it is null).
bool AudioService::IsAfeWakeWord() {
#if CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4
    // dynamic_cast on a null pointer yields null, so no separate null test is needed.
    const auto* afe_engine = dynamic_cast<AfeWakeWord*>(wake_word_.get());
    return afe_engine != nullptr;
#else
    return false;
#endif
}