diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp index ed13e6b458..e729cdf954 100644 --- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp +++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp @@ -220,6 +220,8 @@ size_t I2SAudioSpeaker::play(const uint8_t *data, size_t length) { return index; } +bool I2SAudioSpeaker::has_buffered_data() const { return uxQueueMessagesWaiting(this->buffer_queue_) > 0; } + } // namespace i2s_audio } // namespace esphome diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h index b075722e1b..20c36a69d3 100644 --- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h +++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.h @@ -56,6 +56,8 @@ class I2SAudioSpeaker : public Component, public speaker::Speaker, public I2SAud size_t play(const uint8_t *data, size_t length) override; + bool has_buffered_data() const override; + protected: void start_(); // void stop_(); diff --git a/esphome/components/speaker/speaker.h b/esphome/components/speaker/speaker.h index 3f520e3c5e..b494873160 100644 --- a/esphome/components/speaker/speaker.h +++ b/esphome/components/speaker/speaker.h @@ -18,6 +18,8 @@ class Speaker { virtual void start() = 0; virtual void stop() = 0; + virtual bool has_buffered_data() const = 0; + bool is_running() const { return this->state_ == STATE_RUNNING; } protected: diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp index 9b13a71039..c0e706305d 100644 --- a/esphome/components/voice_assistant/voice_assistant.cpp +++ b/esphome/components/voice_assistant/voice_assistant.cpp @@ -273,28 +273,27 @@ void VoiceAssistant::loop() { bool playing = false; #ifdef USE_SPEAKER if (this->speaker_ != nullptr) { + ssize_t received_len = 0; if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) { - auto len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE); - if (len > 0) { - this->speaker_buffer_index_ += len; - this->speaker_buffer_size_ += len; + received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE); + if (received_len > 0) { + this->speaker_buffer_index_ += received_len; + this->speaker_buffer_size_ += received_len; + this->speaker_bytes_received_ += received_len; } } else { - ESP_LOGW(TAG, "Receive buffer full"); - } - if (this->speaker_buffer_size_ > 0) { - size_t written = this->speaker_->play(this->speaker_buffer_, this->speaker_buffer_size_); - if (written > 0) { - memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written); - this->speaker_buffer_size_ -= written; - this->speaker_buffer_index_ -= written; - this->set_timeout("speaker-timeout", 2000, [this]() { this->speaker_->stop(); }); - } else { - ESP_LOGW(TAG, "Speaker buffer full"); - } + ESP_LOGD(TAG, "Receive buffer full"); } + // Build a small buffer of audio before sending to the speaker + if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4) + this->write_speaker_(); if (this->wait_for_stream_end_) { this->cancel_timeout("playing"); + if (this->stream_ended_ && received_len < 0) { + ESP_LOGD(TAG, "End of audio stream received"); + this->cancel_timeout("speaker-timeout"); + this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED); + } break; // We dont want to timeout here as the STREAM_END event will take care of that. } playing = this->speaker_->is_running(); @@ -316,14 +315,26 @@ void VoiceAssistant::loop() { case State::RESPONSE_FINISHED: { #ifdef USE_SPEAKER if (this->speaker_ != nullptr) { + if (this->speaker_buffer_size_ > 0) { + this->write_speaker_(); + break; + } + if (this->speaker_->has_buffered_data() || this->speaker_->is_running()) { + break; + } + ESP_LOGD(TAG, "Speaker has finished outputting all audio"); this->speaker_->stop(); this->cancel_timeout("speaker-timeout"); this->cancel_timeout("playing"); this->speaker_buffer_size_ = 0; this->speaker_buffer_index_ = 0; + this->speaker_bytes_received_ = 0; memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE); + this->wait_for_stream_end_ = false; + this->stream_ended_ = false; + + this->tts_stream_end_trigger_->trigger(); } - this->wait_for_stream_end_ = false; #endif this->set_state_(State::IDLE, State::IDLE); break; @@ -333,6 +344,20 @@ void VoiceAssistant::loop() { } } +void VoiceAssistant::write_speaker_() { + if (this->speaker_buffer_size_ > 0) { + size_t written = this->speaker_->play(this->speaker_buffer_, this->speaker_buffer_size_); + if (written > 0) { + memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written); + this->speaker_buffer_size_ -= written; + this->speaker_buffer_index_ -= written; + this->set_timeout("speaker-timeout", 5000, [this]() { this->speaker_->stop(); }); + } else { + ESP_LOGD(TAG, "Speaker buffer full, trying again next loop"); + } + } +} + void VoiceAssistant::client_subscription(api::APIConnection *client, bool subscribe) { if (!subscribe) { if (this->api_client_ == nullptr || client != this->api_client_) { @@ -503,21 +528,20 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { switch (msg.event_type) { case api::enums::VOICE_ASSISTANT_RUN_START: ESP_LOGD(TAG, "Assist Pipeline running"); - this->start_trigger_->trigger(); + this->defer([this]() { this->start_trigger_->trigger(); }); break; case api::enums::VOICE_ASSISTANT_WAKE_WORD_START: break; case api::enums::VOICE_ASSISTANT_WAKE_WORD_END: { ESP_LOGD(TAG, "Wake word detected"); - this->wake_word_detected_trigger_->trigger(); + this->defer([this]() { this->wake_word_detected_trigger_->trigger(); }); break; } case api::enums::VOICE_ASSISTANT_STT_START: ESP_LOGD(TAG, "STT started"); - this->listening_trigger_->trigger(); + this->defer([this]() { this->listening_trigger_->trigger(); }); break; case api::enums::VOICE_ASSISTANT_STT_END: { - this->set_state_(State::STOP_MICROPHONE, State::AWAITING_RESPONSE); std::string text; for (auto arg : msg.data) { if (arg.name == "text") { @@ -529,12 +553,12 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { return; } ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str()); - this->stt_end_trigger_->trigger(text); + this->defer([this, text]() { this->stt_end_trigger_->trigger(text); }); break; } case api::enums::VOICE_ASSISTANT_INTENT_START: ESP_LOGD(TAG, "Intent started"); - this->intent_start_trigger_->trigger(); + this->defer([this]() { this->intent_start_trigger_->trigger(); }); break; case api::enums::VOICE_ASSISTANT_INTENT_END: { for (auto arg : msg.data) { @@ -542,7 +566,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { this->conversation_id_ = std::move(arg.value); } } - this->intent_end_trigger_->trigger(); + this->defer([this]() { this->intent_end_trigger_->trigger(); }); break; } case api::enums::VOICE_ASSISTANT_TTS_START: { @@ -557,10 +581,12 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { return; } ESP_LOGD(TAG, "Response: \"%s\"", text.c_str()); - this->tts_start_trigger_->trigger(text); + this->defer([this, text]() { + this->tts_start_trigger_->trigger(text); #ifdef USE_SPEAKER - this->speaker_->start(); + this->speaker_->start(); #endif + }); break; } case api::enums::VOICE_ASSISTANT_TTS_END: { @@ -575,14 +601,16 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { return; } ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str()); + this->defer([this, url]() { #ifdef USE_MEDIA_PLAYER - if (this->media_player_ != nullptr) { - this->media_player_->make_call().set_media_url(url).perform(); - } + if (this->media_player_ != nullptr) { + this->media_player_->make_call().set_media_url(url).perform(); + } #endif + this->tts_end_trigger_->trigger(url); + }); State new_state = this->local_output_ ? State::STREAMING_RESPONSE : State::IDLE; this->set_state_(new_state, new_state); - this->tts_end_trigger_->trigger(url); break; } case api::enums::VOICE_ASSISTANT_RUN_END: { @@ -599,7 +627,7 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { this->set_state_(State::IDLE, State::IDLE); } } - this->end_trigger_->trigger(); + this->defer([this]() { this->end_trigger_->trigger(); }); break; } case api::enums::VOICE_ASSISTANT_ERROR: { @@ -617,8 +645,10 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { return; } else if (code == "wake-provider-missing" || code == "wake-engine-missing") { // Wake word is not set up or not ready on Home Assistant so stop and do not retry until user starts again. - this->request_stop(); - this->error_trigger_->trigger(code, message); + this->defer([this, code, message]() { + this->request_stop(); + this->error_trigger_->trigger(code, message); + }); return; } ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str()); @@ -626,32 +656,32 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { this->signal_stop_(); this->set_state_(State::STOP_MICROPHONE, State::IDLE); } - this->error_trigger_->trigger(code, message); + this->defer([this, code, message]() { this->error_trigger_->trigger(code, message); }); break; } case api::enums::VOICE_ASSISTANT_TTS_STREAM_START: { #ifdef USE_SPEAKER this->wait_for_stream_end_ = true; ESP_LOGD(TAG, "TTS stream start"); - this->tts_stream_start_trigger_->trigger(); + this->defer([this] { this->tts_stream_start_trigger_->trigger(); }); #endif break; } case api::enums::VOICE_ASSISTANT_TTS_STREAM_END: { - this->set_state_(State::RESPONSE_FINISHED, State::IDLE); #ifdef USE_SPEAKER + this->stream_ended_ = true; ESP_LOGD(TAG, "TTS stream end"); - this->tts_stream_end_trigger_->trigger(); #endif break; } case api::enums::VOICE_ASSISTANT_STT_VAD_START: ESP_LOGD(TAG, "Starting STT by VAD"); - this->stt_vad_start_trigger_->trigger(); + this->defer([this]() { this->stt_vad_start_trigger_->trigger(); }); break; case api::enums::VOICE_ASSISTANT_STT_VAD_END: ESP_LOGD(TAG, "STT by VAD end"); - this->stt_vad_end_trigger_->trigger(); + this->set_state_(State::STOP_MICROPHONE, State::AWAITING_RESPONSE); + this->defer([this]() { this->stt_vad_end_trigger_->trigger(); }); break; default: ESP_LOGD(TAG, "Unhandled event type: %d", msg.event_type); diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h index f6dcd1c563..66bf4c3c57 100644 --- a/esphome/components/voice_assistant/voice_assistant.h +++ b/esphome/components/voice_assistant/voice_assistant.h @@ -156,11 +156,14 @@ class VoiceAssistant : public Component { microphone::Microphone *mic_{nullptr}; #ifdef USE_SPEAKER + void write_speaker_(); speaker::Speaker *speaker_{nullptr}; uint8_t *speaker_buffer_; size_t speaker_buffer_index_{0}; size_t speaker_buffer_size_{0}; + size_t speaker_bytes_received_{0}; bool wait_for_stream_end_{false}; + bool stream_ended_{false}; #endif #ifdef USE_MEDIA_PLAYER media_player::MediaPlayer *media_player_{nullptr};