Reviewer notes (free text ahead of the first "diff --git" header; patch tools skip it):
  * Layout: the patch was received with its newlines collapsed. It is restored here to one
    diff line per physical line. Context-line indentation is reconstructed from the hunk
    line counts and the 2-space style - TODO confirm against the tree before applying.
  * Fix: wait_for_stream_end_ is declared under USE_SPEAKER in voice_assistant.h, but the
    patch wrote to it outside that guard in loop() (RESPONSE_FINISHED) and in on_event()
    (TTS_STREAM_START). Both writes are now inside #ifdef USE_SPEAKER so builds without a
    speaker still compile. The affected hunk headers were updated (+298,79 / +454,7 / +586,16).
  * Fix: dropped the stray ';' after the body of voice_assistant_state_to_string().
  * NOTE(review): the template argument of static_cast on the two removed ESP_LOGD lines was
    missing in the received text; <uint32_t> is assumed - TODO confirm.
  * NOTE(review): the post-image blob id on the voice_assistant.cpp "index" line predates the
    edits above and no longer matches the resulting file.

diff --git a/esphome/components/api/api.proto b/esphome/components/api/api.proto
index ec4a0f7cc9..69765c7a94 100644
--- a/esphome/components/api/api.proto
+++ b/esphome/components/api/api.proto
@@ -1459,6 +1459,8 @@ enum VoiceAssistantEvent {
   VOICE_ASSISTANT_WAKE_WORD_END = 10;
   VOICE_ASSISTANT_STT_VAD_START = 11;
   VOICE_ASSISTANT_STT_VAD_END = 12;
+  VOICE_ASSISTANT_TTS_STREAM_START = 98;
+  VOICE_ASSISTANT_TTS_STREAM_END = 99;
 }
 
 message VoiceAssistantEventData {
diff --git a/esphome/components/api/api_pb2.cpp b/esphome/components/api/api_pb2.cpp
index 225b213a67..65df2312e1 100644
--- a/esphome/components/api/api_pb2.cpp
+++ b/esphome/components/api/api_pb2.cpp
@@ -452,6 +452,10 @@ template<> const char *proto_enum_to_string(enums::V
       return "VOICE_ASSISTANT_STT_VAD_START";
     case enums::VOICE_ASSISTANT_STT_VAD_END:
       return "VOICE_ASSISTANT_STT_VAD_END";
+    case enums::VOICE_ASSISTANT_TTS_STREAM_START:
+      return "VOICE_ASSISTANT_TTS_STREAM_START";
+    case enums::VOICE_ASSISTANT_TTS_STREAM_END:
+      return "VOICE_ASSISTANT_TTS_STREAM_END";
     default:
       return "UNKNOWN";
   }
diff --git a/esphome/components/api/api_pb2.h b/esphome/components/api/api_pb2.h
index a4826f09d2..4c70facf3d 100644
--- a/esphome/components/api/api_pb2.h
+++ b/esphome/components/api/api_pb2.h
@@ -184,6 +184,8 @@ enum VoiceAssistantEvent : uint32_t {
   VOICE_ASSISTANT_WAKE_WORD_END = 10,
   VOICE_ASSISTANT_STT_VAD_START = 11,
   VOICE_ASSISTANT_STT_VAD_END = 12,
+  VOICE_ASSISTANT_TTS_STREAM_START = 98,
+  VOICE_ASSISTANT_TTS_STREAM_END = 99,
 };
 enum AlarmControlPanelState : uint32_t {
   ALARM_STATE_DISARMED = 0,
diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
index 592a27b739..ed13e6b458 100644
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
@@ -158,8 +158,13 @@ void I2SAudioSpeaker::watch_() {
   if (xQueueReceive(this->event_queue_, &event, 0) == pdTRUE) {
     switch (event.type) {
       case TaskEventType::STARTING:
+        ESP_LOGD(TAG, "Starting I2S Audio Speaker");
+        break;
       case TaskEventType::STARTED:
+        ESP_LOGD(TAG, "Started I2S Audio Speaker");
+        break;
       case TaskEventType::STOPPING:
+        ESP_LOGD(TAG, "Stopping I2S Audio Speaker");
         break;
       case TaskEventType::PLAYING:
         this->status_clear_warning();
@@ -170,6 +175,7 @@ void I2SAudioSpeaker::watch_() {
         this->player_task_handle_ = nullptr;
         this->parent_->unlock();
         xQueueReset(this->buffer_queue_);
+        ESP_LOGD(TAG, "Stopped I2S Audio Speaker");
         break;
       case TaskEventType::WARNING:
         ESP_LOGW(TAG, "Error writing to I2S: %s", esp_err_to_name(event.err));
diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp
index 448df61d80..12fbdc97b4 100644
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -281,11 +281,14 @@ void VoiceAssistant::loop() {
             memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
             this->speaker_buffer_size_ -= written;
             this->speaker_buffer_index_ -= written;
-            this->set_timeout("speaker-timeout", 1000, [this]() { this->speaker_->stop(); });
+            this->set_timeout("speaker-timeout", 2000, [this]() { this->speaker_->stop(); });
           } else {
             ESP_LOGW(TAG, "Speaker buffer full.");
           }
         }
+        if (this->wait_for_stream_end_) {
+          break;  // We don't want to timeout here as the STREAM_END event will take care of that.
+        }
         playing = this->speaker_->is_running();
       }
 #endif
@@ -295,28 +298,79 @@ void VoiceAssistant::loop() {
       }
 #endif
       if (playing) {
-        this->set_timeout("playing", 100, [this]() {
+        this->set_timeout("playing", 2000, [this]() {
           this->cancel_timeout("speaker-timeout");
           this->set_state_(State::IDLE, State::IDLE);
         });
       }
       break;
     }
+    case State::RESPONSE_FINISHED: {
+#ifdef USE_SPEAKER
+      if (this->speaker_ != nullptr) {
+        this->speaker_->stop();
+        this->cancel_timeout("speaker-timeout");
+        this->cancel_timeout("playing");
+        this->speaker_buffer_size_ = 0;
+        this->speaker_buffer_index_ = 0;
+        memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
+      }
+      // The flag is only declared in USE_SPEAKER builds, so it has to be reset inside the guard.
+      this->wait_for_stream_end_ = false;
+#endif
+      this->set_state_(State::IDLE, State::IDLE);
+      break;
+    }
     default:
       break;
   }
 }
 
+// Maps a State to its name for log output; values without a case below yield "UNKNOWN".
+static const LogString *voice_assistant_state_to_string(State state) {
+  switch (state) {
+    case State::IDLE:
+      return LOG_STR("IDLE");
+    case State::START_MICROPHONE:
+      return LOG_STR("START_MICROPHONE");
+    case State::STARTING_MICROPHONE:
+      return LOG_STR("STARTING_MICROPHONE");
+    case State::WAIT_FOR_VAD:
+      return LOG_STR("WAIT_FOR_VAD");
+    case State::WAITING_FOR_VAD:
+      return LOG_STR("WAITING_FOR_VAD");
+    case State::START_PIPELINE:
+      return LOG_STR("START_PIPELINE");
+    case State::STARTING_PIPELINE:
+      return LOG_STR("STARTING_PIPELINE");
+    case State::STREAMING_MICROPHONE:
+      return LOG_STR("STREAMING_MICROPHONE");
+    case State::STOP_MICROPHONE:
+      return LOG_STR("STOP_MICROPHONE");
+    case State::STOPPING_MICROPHONE:
+      return LOG_STR("STOPPING_MICROPHONE");
+    case State::AWAITING_RESPONSE:
+      return LOG_STR("AWAITING_RESPONSE");
+    case State::STREAMING_RESPONSE:
+      return LOG_STR("STREAMING_RESPONSE");
+    case State::RESPONSE_FINISHED:
+      return LOG_STR("RESPONSE_FINISHED");
+    default:
+      return LOG_STR("UNKNOWN");
+  }
+}
+
 void VoiceAssistant::set_state_(State state) {
   State old_state = this->state_;
   this->state_ = state;
-  ESP_LOGD(TAG, "State changed from %d to %d", static_cast<uint32_t>(old_state), static_cast<uint32_t>(state));
+  ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
+           LOG_STR_ARG(voice_assistant_state_to_string(state)));
 }
 
 void VoiceAssistant::set_state_(State state, State desired_state) {
   this->set_state_(state);
   this->desired_state_ = desired_state;
-  ESP_LOGD(TAG, "Desired state set to %d", static_cast<uint32_t>(desired_state));
+  ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
 }
 
 void VoiceAssistant::failed_to_start() {
@@ -400,6 +454,7 @@ void VoiceAssistant::request_stop() {
       break;
     case State::AWAITING_RESPONSE:
     case State::STREAMING_RESPONSE:
+    case State::RESPONSE_FINISHED:
       break;  // Let the incoming audio stream finish then it will go to idle.
   }
 }
@@ -531,6 +586,16 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
       this->error_trigger_->trigger(code, message);
       break;
     }
+    case api::enums::VOICE_ASSISTANT_TTS_STREAM_START: {
+#ifdef USE_SPEAKER
+      this->wait_for_stream_end_ = true;
+#endif
+      break;
+    }
+    case api::enums::VOICE_ASSISTANT_TTS_STREAM_END: {
+      this->set_state_(State::RESPONSE_FINISHED, State::IDLE);
+      break;
+    }
     default:
       ESP_LOGD(TAG, "Unhandled event type: %d", msg.event_type);
       break;
diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h
index ce22538a85..cd448293db 100644
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@@ -46,6 +46,7 @@ enum class State {
   STOPPING_MICROPHONE,
   AWAITING_RESPONSE,
   STREAMING_RESPONSE,
+  RESPONSE_FINISHED,
 };
 
 class VoiceAssistant : public Component {
@@ -132,10 +133,10 @@ class VoiceAssistant : public Component {
   uint8_t *speaker_buffer_;
   size_t speaker_buffer_index_{0};
   size_t speaker_buffer_size_{0};
+  bool wait_for_stream_end_{false};
 #endif
 #ifdef USE_MEDIA_PLAYER
   media_player::MediaPlayer *media_player_{nullptr};
-  bool playing_tts_{false};
 #endif
 
   bool local_output_{false};