From 511af5845ecb54c36c8b575013cbbd7e145fa3af Mon Sep 17 00:00:00 2001 From: Jesse Hills <3060199+jesserockz@users.noreply.github.com> Date: Tue, 10 Oct 2023 19:52:42 +1300 Subject: [PATCH] Remote wake word support for voice assistant (#5229) --- esphome/components/api/api.proto | 19 +- esphome/components/api/api_connection.cpp | 13 +- esphome/components/api/api_connection.h | 2 +- esphome/components/api/api_pb2.cpp | 274 +++++++----- esphome/components/api/api_pb2.h | 26 +- esphome/components/api/api_server.cpp | 20 +- esphome/components/api/api_server.h | 11 +- .../microphone/i2s_audio_microphone.cpp | 9 +- .../i2s_audio/speaker/i2s_audio_speaker.cpp | 4 +- .../components/voice_assistant/__init__.py | 86 +++- .../voice_assistant/voice_assistant.cpp | 391 +++++++++++++++--- .../voice_assistant/voice_assistant.h | 97 ++++- tests/test4.yaml | 3 + 13 files changed, 744 insertions(+), 211 deletions(-) diff --git a/esphome/components/api/api.proto b/esphome/components/api/api.proto index 86685aa5e6..ec4a0f7cc9 100644 --- a/esphome/components/api/api.proto +++ b/esphome/components/api/api.proto @@ -1413,6 +1413,18 @@ message SubscribeVoiceAssistantRequest { bool subscribe = 1; } +enum VoiceAssistantRequestFlag { + VOICE_ASSISTANT_REQUEST_NONE = 0; + VOICE_ASSISTANT_REQUEST_USE_VAD = 1; + VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD = 2; +} + +message VoiceAssistantAudioSettings { + uint32 noise_suppression_level = 1; + uint32 auto_gain = 2; + float volume_multiplier = 3; +} + message VoiceAssistantRequest { option (id) = 90; option (source) = SOURCE_SERVER; @@ -1420,7 +1432,8 @@ message VoiceAssistantRequest { bool start = 1; string conversation_id = 2; - bool use_vad = 3; + uint32 flags = 3; + VoiceAssistantAudioSettings audio_settings = 4; } message VoiceAssistantResponse { @@ -1442,6 +1455,10 @@ enum VoiceAssistantEvent { VOICE_ASSISTANT_INTENT_END = 6; VOICE_ASSISTANT_TTS_START = 7; VOICE_ASSISTANT_TTS_END = 8; + VOICE_ASSISTANT_WAKE_WORD_START = 9; + VOICE_ASSISTANT_WAKE_WORD_END = 10; + VOICE_ASSISTANT_STT_VAD_START = 11; + VOICE_ASSISTANT_STT_VAD_END = 12; } message VoiceAssistantEventData { diff --git a/esphome/components/api/api_connection.cpp b/esphome/components/api/api_connection.cpp index ceec53bb65..3172b71fa2 100644 --- a/esphome/components/api/api_connection.cpp +++ b/esphome/components/api/api_connection.cpp @@ -907,21 +907,22 @@ BluetoothConnectionsFreeResponse APIConnection::subscribe_bluetooth_connections_ #endif #ifdef USE_VOICE_ASSISTANT -bool APIConnection::request_voice_assistant(bool start, const std::string &conversation_id, bool use_vad) { +bool APIConnection::request_voice_assistant(const VoiceAssistantRequest &msg) { if (!this->voice_assistant_subscription_) return false; - VoiceAssistantRequest msg; - msg.start = start; - msg.conversation_id = conversation_id; - msg.use_vad = use_vad; + return this->send_voice_assistant_request(msg); } void APIConnection::on_voice_assistant_response(const VoiceAssistantResponse &msg) { if (voice_assistant::global_voice_assistant != nullptr) { + if (msg.error) { + voice_assistant::global_voice_assistant->failed_to_start(); + return; + } struct sockaddr_storage storage; socklen_t len = sizeof(storage); this->helper_->getpeername((struct sockaddr *) &storage, &len); - voice_assistant::global_voice_assistant->start(&storage, msg.port); + voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port); } }; void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) { diff --git a/esphome/components/api/api_connection.h b/esphome/components/api/api_connection.h index acc4578661..2a62c2faff 100644 --- a/esphome/components/api/api_connection.h +++ b/esphome/components/api/api_connection.h @@ -124,7 +124,7 @@ class APIConnection : public APIServerConnection { void subscribe_voice_assistant(const SubscribeVoiceAssistantRequest &msg) override { this->voice_assistant_subscription_ = msg.subscribe; } - bool request_voice_assistant(bool start, const std::string &conversation_id, bool use_vad); + bool request_voice_assistant(const VoiceAssistantRequest &msg); void on_voice_assistant_response(const VoiceAssistantResponse &msg) override; void on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) override; #endif diff --git a/esphome/components/api/api_pb2.cpp b/esphome/components/api/api_pb2.cpp index 6149a970ee..d0711aba7b 100644 --- a/esphome/components/api/api_pb2.cpp +++ b/esphome/components/api/api_pb2.cpp @@ -3,8 +3,6 @@ #include "api_pb2.h" #include "esphome/core/log.h" -#include - namespace esphome { namespace api { @@ -410,6 +408,20 @@ const char *proto_enum_to_string(enums::Bluet } #endif #ifdef HAS_PROTO_MESSAGE_DUMP +template<> const char *proto_enum_to_string(enums::VoiceAssistantRequestFlag value) { + switch (value) { + case enums::VOICE_ASSISTANT_REQUEST_NONE: + return "VOICE_ASSISTANT_REQUEST_NONE"; + case enums::VOICE_ASSISTANT_REQUEST_USE_VAD: + return "VOICE_ASSISTANT_REQUEST_USE_VAD"; + case enums::VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD: + return "VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD"; + default: + return "UNKNOWN"; + } +} +#endif +#ifdef HAS_PROTO_MESSAGE_DUMP template<> const char *proto_enum_to_string(enums::VoiceAssistantEvent value) { switch (value) { case enums::VOICE_ASSISTANT_ERROR: @@ -430,6 +442,14 @@ template<> const char *proto_enum_to_string(enums::V return "VOICE_ASSISTANT_TTS_START"; case enums::VOICE_ASSISTANT_TTS_END: return "VOICE_ASSISTANT_TTS_END"; + case enums::VOICE_ASSISTANT_WAKE_WORD_START: + return "VOICE_ASSISTANT_WAKE_WORD_START"; + case enums::VOICE_ASSISTANT_WAKE_WORD_END: + return "VOICE_ASSISTANT_WAKE_WORD_END"; + case enums::VOICE_ASSISTANT_STT_VAD_START: + return "VOICE_ASSISTANT_STT_VAD_START"; + case enums::VOICE_ASSISTANT_STT_VAD_END: + return "VOICE_ASSISTANT_STT_VAD_END"; default: return "UNKNOWN"; } @@ -524,12 +544,12 @@ void HelloRequest::dump_to(std::string &out) const { out.append("\n"); out.append(" api_version_major: "); - sprintf(buffer, "%" PRIu32, this->api_version_major); + sprintf(buffer, "%u", this->api_version_major); out.append(buffer); out.append("\n"); out.append(" api_version_minor: "); - sprintf(buffer, "%" PRIu32, this->api_version_minor); + sprintf(buffer, "%u", this->api_version_minor); out.append(buffer); out.append("\n"); out.append("}"); @@ -574,12 +594,12 @@ void HelloResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("HelloResponse {\n"); out.append(" api_version_major: "); - sprintf(buffer, "%" PRIu32, this->api_version_major); + sprintf(buffer, "%u", this->api_version_major); out.append(buffer); out.append("\n"); out.append(" api_version_minor: "); - sprintf(buffer, "%" PRIu32, this->api_version_minor); + sprintf(buffer, "%u", this->api_version_minor); out.append(buffer); out.append("\n"); @@ -785,17 +805,17 @@ void DeviceInfoResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" webserver_port: "); - sprintf(buffer, "%" PRIu32, this->webserver_port); + sprintf(buffer, "%u", this->webserver_port); out.append(buffer); out.append("\n"); out.append(" legacy_bluetooth_proxy_version: "); - sprintf(buffer, "%" PRIu32, this->legacy_bluetooth_proxy_version); + sprintf(buffer, "%u", this->legacy_bluetooth_proxy_version); out.append(buffer); out.append("\n"); out.append(" bluetooth_proxy_feature_flags: "); - sprintf(buffer, "%" PRIu32, this->bluetooth_proxy_feature_flags); + sprintf(buffer, "%u", this->bluetooth_proxy_feature_flags); out.append(buffer); out.append("\n"); @@ -808,7 +828,7 @@ void DeviceInfoResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" voice_assistant_version: "); - sprintf(buffer, "%" PRIu32, this->voice_assistant_version); + sprintf(buffer, "%u", this->voice_assistant_version); out.append(buffer); out.append("\n"); out.append("}"); @@ -900,7 +920,7 @@ void ListEntitiesBinarySensorResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -968,7 +988,7 @@ void BinarySensorStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("BinarySensorStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -1071,7 +1091,7 @@ void ListEntitiesCoverResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -1161,7 +1181,7 @@ void CoverStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("CoverStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -1244,7 +1264,7 @@ void CoverCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("CoverCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -1364,7 +1384,7 @@ void ListEntitiesFanResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -1389,7 +1409,7 @@ void ListEntitiesFanResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" supported_speed_count: "); - sprintf(buffer, "%" PRId32, this->supported_speed_count); + sprintf(buffer, "%d", this->supported_speed_count); out.append(buffer); out.append("\n"); @@ -1456,7 +1476,7 @@ void FanStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("FanStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -1477,7 +1497,7 @@ void FanStateResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" speed_level: "); - sprintf(buffer, "%" PRId32, this->speed_level); + sprintf(buffer, "%d", this->speed_level); out.append(buffer); out.append("\n"); out.append("}"); @@ -1557,7 +1577,7 @@ void FanCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("FanCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -1598,7 +1618,7 @@ void FanCommandRequest::dump_to(std::string &out) const { out.append("\n"); out.append(" speed_level: "); - sprintf(buffer, "%" PRId32, this->speed_level); + sprintf(buffer, "%d", this->speed_level); out.append(buffer); out.append("\n"); out.append("}"); @@ -1712,7 +1732,7 @@ void ListEntitiesLightResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -1866,7 +1886,7 @@ void LightStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("LightStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -2089,7 +2109,7 @@ void LightCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("LightCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -2187,7 +2207,7 @@ void LightCommandRequest::dump_to(std::string &out) const { out.append("\n"); out.append(" transition_length: "); - sprintf(buffer, "%" PRIu32, this->transition_length); + sprintf(buffer, "%u", this->transition_length); out.append(buffer); out.append("\n"); @@ -2196,7 +2216,7 @@ void LightCommandRequest::dump_to(std::string &out) const { out.append("\n"); out.append(" flash_length: "); - sprintf(buffer, "%" PRIu32, this->flash_length); + sprintf(buffer, "%u", this->flash_length); out.append(buffer); out.append("\n"); @@ -2304,7 +2324,7 @@ void ListEntitiesSensorResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -2325,7 +2345,7 @@ void ListEntitiesSensorResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" accuracy_decimals: "); - sprintf(buffer, "%" PRId32, this->accuracy_decimals); + sprintf(buffer, "%d", this->accuracy_decimals); out.append(buffer); out.append("\n"); @@ -2389,7 +2409,7 @@ void SensorStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("SensorStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -2478,7 +2498,7 @@ void ListEntitiesSwitchResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -2541,7 +2561,7 @@ void SwitchStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("SwitchStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -2580,7 +2600,7 @@ void SwitchCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("SwitchCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -2654,7 +2674,7 @@ void ListEntitiesTextSensorResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -2720,7 +2740,7 @@ void TextSensorStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("TextSensorStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -3027,7 +3047,7 @@ void GetTimeResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("GetTimeResponse {\n"); out.append(" epoch_seconds: "); - sprintf(buffer, "%" PRIu32, this->epoch_seconds); + sprintf(buffer, "%u", this->epoch_seconds); out.append(buffer); out.append("\n"); out.append("}"); @@ -3111,7 +3131,7 @@ void ListEntitiesServicesResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -3205,7 +3225,7 @@ void ExecuteServiceArgument::dump_to(std::string &out) const { out.append("\n"); out.append(" legacy_int: "); - sprintf(buffer, "%" PRId32, this->legacy_int); + sprintf(buffer, "%d", this->legacy_int); out.append(buffer); out.append("\n"); @@ -3219,7 +3239,7 @@ void ExecuteServiceArgument::dump_to(std::string &out) const { out.append("\n"); out.append(" int_: "); - sprintf(buffer, "%" PRId32, this->int_); + sprintf(buffer, "%d", this->int_); out.append(buffer); out.append("\n"); @@ -3231,7 +3251,7 @@ void ExecuteServiceArgument::dump_to(std::string &out) const { for (const auto &it : this->int_array) { out.append(" int_array: "); - sprintf(buffer, "%" PRId32, it); + sprintf(buffer, "%d", it); out.append(buffer); out.append("\n"); } @@ -3282,7 +3302,7 @@ void ExecuteServiceRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("ExecuteServiceRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -3358,7 +3378,7 @@ void ListEntitiesCameraResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -3424,7 +3444,7 @@ void CameraImageResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("CameraImageResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -3616,7 +3636,7 @@ void ListEntitiesClimateResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -3804,7 +3824,7 @@ void ClimateStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("ClimateStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -3992,7 +4012,7 @@ void ClimateCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("ClimateCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4175,7 +4195,7 @@ void ListEntitiesNumberResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4262,7 +4282,7 @@ void NumberStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("NumberStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4300,7 +4320,7 @@ void NumberCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("NumberCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4382,7 +4402,7 @@ void ListEntitiesSelectResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4454,7 +4474,7 @@ void SelectStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("SelectStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4497,7 +4517,7 @@ void SelectCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("SelectCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4591,7 +4611,7 @@ void ListEntitiesLockResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4662,7 +4682,7 @@ void LockStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("LockStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4717,7 +4737,7 @@ void LockCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("LockCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4804,7 +4824,7 @@ void ListEntitiesButtonResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4850,7 +4870,7 @@ void ButtonCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("ButtonCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); out.append("}"); @@ -4925,7 +4945,7 @@ void ListEntitiesMediaPlayerResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -4994,7 +5014,7 @@ void MediaPlayerStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("MediaPlayerStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -5073,7 +5093,7 @@ void MediaPlayerCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("MediaPlayerCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -5122,7 +5142,7 @@ void SubscribeBluetoothLEAdvertisementsRequest::dump_to(std::string &out) const __attribute__((unused)) char buffer[64]; out.append("SubscribeBluetoothLEAdvertisementsRequest {\n"); out.append(" flags: "); - sprintf(buffer, "%" PRIu32, this->flags); + sprintf(buffer, "%u", this->flags); out.append(buffer); out.append("\n"); out.append("}"); @@ -5169,7 +5189,7 @@ void BluetoothServiceData::dump_to(std::string &out) const { for (const auto &it : this->legacy_data) { out.append(" legacy_data: "); - sprintf(buffer, "%" PRIu32, it); + sprintf(buffer, "%u", it); out.append(buffer); out.append("\n"); } @@ -5249,7 +5269,7 @@ void BluetoothLEAdvertisementResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" rssi: "); - sprintf(buffer, "%" PRId32, this->rssi); + sprintf(buffer, "%d", this->rssi); out.append(buffer); out.append("\n"); @@ -5272,7 +5292,7 @@ void BluetoothLEAdvertisementResponse::dump_to(std::string &out) const { } out.append(" address_type: "); - sprintf(buffer, "%" PRIu32, this->address_type); + sprintf(buffer, "%u", this->address_type); out.append(buffer); out.append("\n"); out.append("}"); @@ -5322,12 +5342,12 @@ void BluetoothLERawAdvertisement::dump_to(std::string &out) const { out.append("\n"); out.append(" rssi: "); - sprintf(buffer, "%" PRId32, this->rssi); + sprintf(buffer, "%d", this->rssi); out.append(buffer); out.append("\n"); out.append(" address_type: "); - sprintf(buffer, "%" PRIu32, this->address_type); + sprintf(buffer, "%u", this->address_type); out.append(buffer); out.append("\n"); @@ -5410,7 +5430,7 @@ void BluetoothDeviceRequest::dump_to(std::string &out) const { out.append("\n"); out.append(" address_type: "); - sprintf(buffer, "%" PRIu32, this->address_type); + sprintf(buffer, "%u", this->address_type); out.append(buffer); out.append("\n"); out.append("}"); @@ -5458,12 +5478,12 @@ void BluetoothDeviceConnectionResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" mtu: "); - sprintf(buffer, "%" PRIu32, this->mtu); + sprintf(buffer, "%u", this->mtu); out.append(buffer); out.append("\n"); out.append(" error: "); - sprintf(buffer, "%" PRId32, this->error); + sprintf(buffer, "%d", this->error); out.append(buffer); out.append("\n"); out.append("}"); @@ -5523,7 +5543,7 @@ void BluetoothGATTDescriptor::dump_to(std::string &out) const { } out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); out.append("}"); @@ -5579,12 +5599,12 @@ void BluetoothGATTCharacteristic::dump_to(std::string &out) const { } out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); out.append(" properties: "); - sprintf(buffer, "%" PRIu32, this->properties); + sprintf(buffer, "%u", this->properties); out.append(buffer); out.append("\n"); @@ -5641,7 +5661,7 @@ void BluetoothGATTService::dump_to(std::string &out) const { } out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); @@ -5748,7 +5768,7 @@ void BluetoothGATTReadRequest::dump_to(std::string &out) const { out.append("\n"); out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); out.append("}"); @@ -5793,7 +5813,7 @@ void BluetoothGATTReadResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); @@ -5847,7 +5867,7 @@ void BluetoothGATTWriteRequest::dump_to(std::string &out) const { out.append("\n"); out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); @@ -5889,7 +5909,7 @@ void BluetoothGATTReadDescriptorRequest::dump_to(std::string &out) const { out.append("\n"); out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); out.append("}"); @@ -5934,7 +5954,7 @@ void BluetoothGATTWriteDescriptorRequest::dump_to(std::string &out) const { out.append("\n"); out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); @@ -5977,7 +5997,7 @@ void BluetoothGATTNotifyRequest::dump_to(std::string &out) const { out.append("\n"); out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); @@ -6026,7 +6046,7 @@ void BluetoothGATTNotifyDataResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); @@ -6065,12 +6085,12 @@ void BluetoothConnectionsFreeResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("BluetoothConnectionsFreeResponse {\n"); out.append(" free: "); - sprintf(buffer, "%" PRIu32, this->free); + sprintf(buffer, "%u", this->free); out.append(buffer); out.append("\n"); out.append(" limit: "); - sprintf(buffer, "%" PRIu32, this->limit); + sprintf(buffer, "%u", this->limit); out.append(buffer); out.append("\n"); out.append("}"); @@ -6109,12 +6129,12 @@ void BluetoothGATTErrorResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); out.append(" error: "); - sprintf(buffer, "%" PRId32, this->error); + sprintf(buffer, "%d", this->error); out.append(buffer); out.append("\n"); out.append("}"); @@ -6148,7 +6168,7 @@ void BluetoothGATTWriteResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); out.append("}"); @@ -6182,7 +6202,7 @@ void BluetoothGATTNotifyResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" handle: "); - sprintf(buffer, "%" PRIu32, this->handle); + sprintf(buffer, "%u", this->handle); out.append(buffer); out.append("\n"); out.append("}"); @@ -6225,7 +6245,7 @@ void BluetoothDevicePairingResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" error: "); - sprintf(buffer, "%" PRId32, this->error); + sprintf(buffer, "%d", this->error); out.append(buffer); out.append("\n"); out.append("}"); @@ -6268,7 +6288,7 @@ void BluetoothDeviceUnpairingResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" error: "); - sprintf(buffer, "%" PRId32, this->error); + sprintf(buffer, "%d", this->error); out.append(buffer); out.append("\n"); out.append("}"); @@ -6317,7 +6337,7 @@ void BluetoothDeviceClearCacheResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" error: "); - sprintf(buffer, "%" PRId32, this->error); + sprintf(buffer, "%d", this->error); out.append(buffer); out.append("\n"); out.append("}"); @@ -6344,6 +6364,56 @@ void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const { out.append("}"); } #endif +bool VoiceAssistantAudioSettings::decode_varint(uint32_t field_id, ProtoVarInt value) { + switch (field_id) { + case 1: { + this->noise_suppression_level = value.as_uint32(); + return true; + } + case 2: { + this->auto_gain = value.as_uint32(); + return true; + } + default: + return false; + } +} +bool VoiceAssistantAudioSettings::decode_32bit(uint32_t field_id, Proto32Bit value) { + switch (field_id) { + case 3: { + this->volume_multiplier = value.as_float(); + return true; + } + default: + return false; + } +} +void VoiceAssistantAudioSettings::encode(ProtoWriteBuffer buffer) const { + buffer.encode_uint32(1, this->noise_suppression_level); + buffer.encode_uint32(2, this->auto_gain); + buffer.encode_float(3, this->volume_multiplier); +} +#ifdef HAS_PROTO_MESSAGE_DUMP +void VoiceAssistantAudioSettings::dump_to(std::string &out) const { + __attribute__((unused)) char buffer[64]; + out.append("VoiceAssistantAudioSettings {\n"); + out.append(" noise_suppression_level: "); + sprintf(buffer, "%u", this->noise_suppression_level); + out.append(buffer); + out.append("\n"); + + out.append(" auto_gain: "); + sprintf(buffer, "%u", this->auto_gain); + out.append(buffer); + out.append("\n"); + + out.append(" volume_multiplier: "); + sprintf(buffer, "%g", this->volume_multiplier); + out.append(buffer); + out.append("\n"); + out.append("}"); +} +#endif bool VoiceAssistantRequest::decode_varint(uint32_t field_id, ProtoVarInt value) { switch (field_id) { case 1: { @@ -6351,7 +6421,7 @@ bool VoiceAssistantRequest::decode_varint(uint32_t field_id, ProtoVarInt value) return true; } case 3: { - this->use_vad = value.as_bool(); + this->flags = value.as_uint32(); return true; } default: @@ -6364,6 +6434,10 @@ bool VoiceAssistantRequest::decode_length(uint32_t field_id, ProtoLengthDelimite this->conversation_id = value.as_string(); return true; } + case 4: { + this->audio_settings = value.as_message(); + return true; + } default: return false; } @@ -6371,7 +6445,8 @@ bool VoiceAssistantRequest::decode_length(uint32_t field_id, ProtoLengthDelimite void VoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { buffer.encode_bool(1, this->start); buffer.encode_string(2, this->conversation_id); - buffer.encode_bool(3, this->use_vad); + buffer.encode_uint32(3, this->flags); + buffer.encode_message(4, this->audio_settings); } #ifdef HAS_PROTO_MESSAGE_DUMP void VoiceAssistantRequest::dump_to(std::string &out) const { @@ -6385,8 +6460,13 @@ void VoiceAssistantRequest::dump_to(std::string &out) const { out.append("'").append(this->conversation_id).append("'"); out.append("\n"); - out.append(" use_vad: "); - out.append(YESNO(this->use_vad)); + out.append(" flags: "); + sprintf(buffer, "%u", this->flags); + out.append(buffer); + out.append("\n"); + + out.append(" audio_settings: "); + this->audio_settings.dump_to(out); out.append("\n"); out.append("}"); } @@ -6414,7 +6494,7 @@ void VoiceAssistantResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("VoiceAssistantResponse {\n"); out.append(" port: "); - sprintf(buffer, "%" PRIu32, this->port); + sprintf(buffer, "%u", this->port); out.append(buffer); out.append("\n"); @@ -6577,7 +6657,7 @@ void ListEntitiesAlarmControlPanelResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -6602,7 +6682,7 @@ void ListEntitiesAlarmControlPanelResponse::dump_to(std::string &out) const { out.append("\n"); out.append(" supported_features: "); - sprintf(buffer, "%" PRIu32, this->supported_features); + sprintf(buffer, "%u", this->supported_features); out.append(buffer); out.append("\n"); @@ -6645,7 +6725,7 @@ void AlarmControlPanelStateResponse::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("AlarmControlPanelStateResponse {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); @@ -6695,7 +6775,7 @@ void AlarmControlPanelCommandRequest::dump_to(std::string &out) const { __attribute__((unused)) char buffer[64]; out.append("AlarmControlPanelCommandRequest {\n"); out.append(" key: "); - sprintf(buffer, "%" PRIu32, this->key); + sprintf(buffer, "%u", this->key); out.append(buffer); out.append("\n"); diff --git a/esphome/components/api/api_pb2.h b/esphome/components/api/api_pb2.h index 627165953d..a4826f09d2 100644 --- a/esphome/components/api/api_pb2.h +++ b/esphome/components/api/api_pb2.h @@ -165,6 +165,11 @@ enum BluetoothDeviceRequestType : uint32_t { BLUETOOTH_DEVICE_REQUEST_TYPE_CONNECT_V3_WITHOUT_CACHE = 5, BLUETOOTH_DEVICE_REQUEST_TYPE_CLEAR_CACHE = 6, }; +enum VoiceAssistantRequestFlag : uint32_t { + VOICE_ASSISTANT_REQUEST_NONE = 0, + VOICE_ASSISTANT_REQUEST_USE_VAD = 1, + VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD = 2, +}; enum VoiceAssistantEvent : uint32_t { VOICE_ASSISTANT_ERROR = 0, VOICE_ASSISTANT_RUN_START = 1, @@ -175,6 +180,10 @@ enum VoiceAssistantEvent : uint32_t { VOICE_ASSISTANT_INTENT_END = 6, VOICE_ASSISTANT_TTS_START = 7, VOICE_ASSISTANT_TTS_END = 8, + VOICE_ASSISTANT_WAKE_WORD_START = 9, + VOICE_ASSISTANT_WAKE_WORD_END = 10, + VOICE_ASSISTANT_STT_VAD_START = 11, + VOICE_ASSISTANT_STT_VAD_END = 12, }; enum AlarmControlPanelState : uint32_t { ALARM_STATE_DISARMED = 0, @@ -1651,11 +1660,26 @@ class SubscribeVoiceAssistantRequest : public ProtoMessage { protected: bool decode_varint(uint32_t field_id, ProtoVarInt value) override; }; +class VoiceAssistantAudioSettings : public ProtoMessage { + public: + uint32_t noise_suppression_level{0}; + uint32_t auto_gain{0}; + float volume_multiplier{0.0f}; + void encode(ProtoWriteBuffer buffer) const override; +#ifdef HAS_PROTO_MESSAGE_DUMP + void dump_to(std::string &out) const override; +#endif + + protected: + bool decode_32bit(uint32_t field_id, Proto32Bit value) override; + bool decode_varint(uint32_t field_id, ProtoVarInt value) override; +}; class VoiceAssistantRequest : public ProtoMessage { public: bool start{false}; std::string conversation_id{}; - bool use_vad{false}; + uint32_t flags{0}; + VoiceAssistantAudioSettings audio_settings{}; void encode(ProtoWriteBuffer buffer) const override; #ifdef HAS_PROTO_MESSAGE_DUMP void dump_to(std::string &out) const override; diff --git a/esphome/components/api/api_server.cpp b/esphome/components/api/api_server.cpp index f70d45ecd0..54266ff0f0 100644 --- a/esphome/components/api/api_server.cpp +++ b/esphome/components/api/api_server.cpp @@ -1,13 +1,13 @@ #include "api_server.h" +#include #include "api_connection.h" +#include "esphome/components/network/util.h" #include "esphome/core/application.h" #include "esphome/core/defines.h" +#include "esphome/core/hal.h" #include "esphome/core/log.h" #include "esphome/core/util.h" #include "esphome/core/version.h" -#include "esphome/core/hal.h" -#include "esphome/components/network/util.h" -#include #ifdef USE_LOGGER #include "esphome/components/logger/logger.h" @@ -323,16 +323,24 @@ void APIServer::on_shutdown() { } #ifdef USE_VOICE_ASSISTANT -bool APIServer::start_voice_assistant(const std::string &conversation_id, bool use_vad) { +bool APIServer::start_voice_assistant(const std::string &conversation_id, uint32_t flags, + const api::VoiceAssistantAudioSettings &audio_settings) { + VoiceAssistantRequest msg; + msg.start = true; + msg.conversation_id = conversation_id; + msg.flags = flags; + msg.audio_settings = audio_settings; for (auto &c : this->clients_) { - if (c->request_voice_assistant(true, conversation_id, use_vad)) + if (c->request_voice_assistant(msg)) return true; } return false; } void APIServer::stop_voice_assistant() { + VoiceAssistantRequest msg; + msg.start = false; for (auto &c : this->clients_) { - if (c->request_voice_assistant(false, "", false)) + if (c->request_voice_assistant(msg)) return; } } diff --git a/esphome/components/api/api_server.h b/esphome/components/api/api_server.h index 9b40a5ef02..a4454d4b84 100644 --- a/esphome/components/api/api_server.h +++ b/esphome/components/api/api_server.h @@ -1,16 +1,16 @@ #pragma once +#include "api_noise_context.h" +#include "api_pb2.h" +#include "api_pb2_service.h" +#include "esphome/components/socket/socket.h" #include "esphome/core/component.h" #include "esphome/core/controller.h" #include "esphome/core/defines.h" #include "esphome/core/log.h" -#include "esphome/components/socket/socket.h" -#include "api_pb2.h" -#include "api_pb2_service.h" #include "list_entities.h" #include "subscribe_state.h" #include "user_services.h" -#include "api_noise_context.h" #include @@ -81,7 +81,8 @@ class APIServer : public Component, public Controller { #endif #ifdef USE_VOICE_ASSISTANT - bool start_voice_assistant(const std::string &conversation_id, bool use_vad); + bool start_voice_assistant(const std::string &conversation_id, uint32_t flags, + const api::VoiceAssistantAudioSettings &audio_settings); void stop_voice_assistant(); #endif diff --git a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp index cf0628d638..44c73eb8fd 100644 --- a/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp +++ b/esphome/components/i2s_audio/microphone/i2s_audio_microphone.cpp @@ -66,8 +66,9 @@ void I2SAudioMicrophone::start_() { i2s_set_adc_mode(ADC_UNIT_1, this->adc_channel_); i2s_adc_enable(this->parent_->get_port()); - } else { + } else #endif + { if (this->pdm_) config.mode = (i2s_mode_t) (config.mode | I2S_MODE_PDM); @@ -77,9 +78,7 @@ void I2SAudioMicrophone::start_() { pin_config.data_in_num = this->din_pin_; i2s_set_pin(this->parent_->get_port(), &pin_config); -#if SOC_I2S_SUPPORTS_ADC } -#endif this->state_ = microphone::STATE_RUNNING; this->high_freq_.start(); } @@ -110,6 +109,10 @@ size_t I2SAudioMicrophone::read(int16_t *buf, size_t len) { this->status_set_warning(); return 0; } + if (bytes_read == 0) { + this->status_set_warning(); + return 0; + } this->status_clear_warning(); if (this->bits_per_sample_ == I2S_BITS_PER_SAMPLE_16BIT) { return bytes_read; diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp index 43bc005136..a0934e3844 100644 --- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp +++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp @@ -158,10 +158,10 @@ void I2SAudioSpeaker::watch_() { this->status_clear_warning(); break; case TaskEventType::STOPPED: - this->parent_->unlock(); this->state_ = speaker::STATE_STOPPED; vTaskDelete(this->player_task_handle_); this->player_task_handle_ = nullptr; + this->parent_->unlock(); break; case TaskEventType::WARNING: ESP_LOGW(TAG, "Error writing to I2S: %s", esp_err_to_name(event.err)); @@ -177,9 +177,9 @@ void I2SAudioSpeaker::loop() { this->start_(); break; case speaker::STATE_RUNNING: + case speaker::STATE_STOPPING: this->watch_(); break; - case speaker::STATE_STOPPING: case speaker::STATE_STOPPED: break; } diff --git a/esphome/components/voice_assistant/__init__.py b/esphome/components/voice_assistant/__init__.py index 55d995be88..14176ad7cf 100644 --- a/esphome/components/voice_assistant/__init__.py +++ b/esphome/components/voice_assistant/__init__.py @@ -19,11 +19,18 @@ CODEOWNERS = ["@jesserockz"] CONF_SILENCE_DETECTION = "silence_detection" CONF_ON_LISTENING = "on_listening" CONF_ON_START = "on_start" +CONF_ON_WAKE_WORD_DETECTED = "on_wake_word_detected" CONF_ON_STT_END = "on_stt_end" CONF_ON_TTS_START = "on_tts_start" CONF_ON_TTS_END = "on_tts_end" CONF_ON_END = "on_end" CONF_ON_ERROR = "on_error" +CONF_USE_WAKE_WORD = "use_wake_word" +CONF_VAD_THRESHOLD = "vad_threshold" + +CONF_NOISE_SUPPRESSION_LEVEL = "noise_suppression_level" +CONF_AUTO_GAIN = "auto_gain" +CONF_VOLUME_MULTIPLIER = "volume_multiplier" voice_assistant_ns = cg.esphome_ns.namespace("voice_assistant") @@ -42,23 +49,40 @@ IsRunningCondition = voice_assistant_ns.class_( "IsRunningCondition", automation.Condition, cg.Parented.template(VoiceAssistant) ) - -CONFIG_SCHEMA = cv.Schema( - { - cv.GenerateID(): cv.declare_id(VoiceAssistant), - cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone), - cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker), - cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id(media_player.MediaPlayer), - cv.Optional(CONF_SILENCE_DETECTION, default=True): cv.boolean, - cv.Optional(CONF_ON_LISTENING): automation.validate_automation(single=True), - cv.Optional(CONF_ON_START): automation.validate_automation(single=True), - cv.Optional(CONF_ON_STT_END): automation.validate_automation(single=True), - cv.Optional(CONF_ON_TTS_START): automation.validate_automation(single=True), - cv.Optional(CONF_ON_TTS_END): automation.validate_automation(single=True), - cv.Optional(CONF_ON_END): automation.validate_automation(single=True), - cv.Optional(CONF_ON_ERROR): automation.validate_automation(single=True), - } -).extend(cv.COMPONENT_SCHEMA) +CONFIG_SCHEMA = cv.All( + cv.Schema( + { + cv.GenerateID(): cv.declare_id(VoiceAssistant), + cv.GenerateID(CONF_MICROPHONE): cv.use_id(microphone.Microphone), + cv.Exclusive(CONF_SPEAKER, "output"): cv.use_id(speaker.Speaker), + cv.Exclusive(CONF_MEDIA_PLAYER, "output"): cv.use_id( + media_player.MediaPlayer + ), + cv.Optional(CONF_USE_WAKE_WORD, default=False): cv.boolean, + cv.Optional(CONF_VAD_THRESHOLD): cv.All( + cv.requires_component("esp_adf"), cv.only_with_esp_idf, cv.uint8_t + ), + cv.Optional(CONF_NOISE_SUPPRESSION_LEVEL, default=0): cv.int_range(0, 4), + cv.Optional(CONF_AUTO_GAIN, default="0dBFS"): cv.All( + cv.float_with_unit("decibel full scale", "(dBFS|dbfs|DBFS)"), + cv.int_range(0, 31), + ), + cv.Optional(CONF_VOLUME_MULTIPLIER, default=1.0): cv.float_range( + min=0.0, min_included=False + ), + cv.Optional(CONF_ON_LISTENING): automation.validate_automation(single=True), + cv.Optional(CONF_ON_START): automation.validate_automation(single=True), + cv.Optional(CONF_ON_WAKE_WORD_DETECTED): automation.validate_automation( + single=True + ), + cv.Optional(CONF_ON_STT_END): automation.validate_automation(single=True), + cv.Optional(CONF_ON_TTS_START): automation.validate_automation(single=True), + cv.Optional(CONF_ON_TTS_END): automation.validate_automation(single=True), + cv.Optional(CONF_ON_END): automation.validate_automation(single=True), + cv.Optional(CONF_ON_ERROR): automation.validate_automation(single=True), + } + ).extend(cv.COMPONENT_SCHEMA), +) async def to_code(config): @@ -76,7 +100,14 @@ async def to_code(config): mp = await cg.get_variable(config[CONF_MEDIA_PLAYER]) cg.add(var.set_media_player(mp)) - cg.add(var.set_silence_detection(config[CONF_SILENCE_DETECTION])) + cg.add(var.set_use_wake_word(config[CONF_USE_WAKE_WORD])) + + if (vad_threshold := config.get(CONF_VAD_THRESHOLD)) is not None: + cg.add(var.set_vad_threshold(vad_threshold)) + + cg.add(var.set_noise_suppression_level(config[CONF_NOISE_SUPPRESSION_LEVEL])) + cg.add(var.set_auto_gain(config[CONF_AUTO_GAIN])) + cg.add(var.set_volume_multiplier(config[CONF_VOLUME_MULTIPLIER])) if CONF_ON_LISTENING in config: await automation.build_automation( @@ -88,6 +119,13 @@ async def to_code(config): var.get_start_trigger(), [], config[CONF_ON_START] ) + if CONF_ON_WAKE_WORD_DETECTED in config: + await automation.build_automation( + var.get_wake_word_detected_trigger(), + [], + config[CONF_ON_WAKE_WORD_DETECTED], + ) + if CONF_ON_STT_END in config: await automation.build_automation( var.get_stt_end_trigger(), [(cg.std_string, "x")], config[CONF_ON_STT_END] @@ -128,10 +166,20 @@ VOICE_ASSISTANT_ACTION_SCHEMA = cv.Schema({cv.GenerateID(): cv.use_id(VoiceAssis StartContinuousAction, VOICE_ASSISTANT_ACTION_SCHEMA, ) -@register_action("voice_assistant.start", StartAction, VOICE_ASSISTANT_ACTION_SCHEMA) +@register_action( + "voice_assistant.start", + StartAction, + VOICE_ASSISTANT_ACTION_SCHEMA.extend( + { + cv.Optional(CONF_SILENCE_DETECTION, default=True): cv.boolean, + } + ), +) async def voice_assistant_listen_to_code(config, action_id, template_arg, args): var = cg.new_Pvariable(action_id, template_arg) await cg.register_parented(var, config[CONF_ID]) + if CONF_SILENCE_DETECTION in config: + cg.add(var.set_silence_detection(config[CONF_SILENCE_DETECTION])) return var diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp index 217ddb6354..802ae508ff 100644 --- a/esphome/components/voice_assistant/voice_assistant.cpp +++ b/esphome/components/voice_assistant/voice_assistant.cpp @@ -11,6 +11,17 @@ namespace voice_assistant { static const char *const TAG = "voice_assistant"; +#ifdef SAMPLE_RATE_HZ +#undef SAMPLE_RATE_HZ +#endif + +static const size_t SAMPLE_RATE_HZ = 16000; +static const size_t INPUT_BUFFER_SIZE = 32 * SAMPLE_RATE_HZ / 1000; // 32ms * 16kHz / 1000ms +static const size_t BUFFER_SIZE = 1000 * SAMPLE_RATE_HZ / 1000; // 1s +static const size_t SEND_BUFFER_SIZE = INPUT_BUFFER_SIZE * sizeof(int16_t); +static const size_t RECEIVE_SIZE = 1024; +static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE; + float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; } void VoiceAssistant::setup() { @@ -47,7 +58,6 @@ void VoiceAssistant::setup() { this->mark_failed(); return; } - server.ss_family = AF_INET; err = socket_->bind((struct sockaddr *) &server, sizeof(server)); if (err != 0) { @@ -55,60 +65,262 @@ void VoiceAssistant::setup() { this->mark_failed(); return; } + + ExternalRAMAllocator speaker_allocator(ExternalRAMAllocator::ALLOW_FAILURE); + this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE); + if (this->speaker_buffer_ == nullptr) { + ESP_LOGW(TAG, "Could not allocate speaker buffer."); + this->mark_failed(); + return; + } } #endif - this->mic_->add_data_callback([this](const std::vector &data) { - if (!this->running_) { - return; + ExternalRAMAllocator allocator(ExternalRAMAllocator::ALLOW_FAILURE); + this->input_buffer_ = allocator.allocate(INPUT_BUFFER_SIZE); + if (this->input_buffer_ == nullptr) { + ESP_LOGW(TAG, "Could not allocate input buffer."); + this->mark_failed(); + return; + } + +#ifdef USE_ESP_ADF + this->vad_instance_ = vad_create(VAD_MODE_4); + + this->ring_buffer_ = rb_create(BUFFER_SIZE, sizeof(int16_t)); + if (this->ring_buffer_ == nullptr) { + ESP_LOGW(TAG, "Could not allocate ring buffer."); + this->mark_failed(); + return; + } +#endif + + ExternalRAMAllocator send_allocator(ExternalRAMAllocator::ALLOW_FAILURE); + this->send_buffer_ = send_allocator.allocate(SEND_BUFFER_SIZE); + if (send_buffer_ == nullptr) { + ESP_LOGW(TAG, "Could not allocate send buffer."); + this->mark_failed(); + return; + } +} + +int VoiceAssistant::read_microphone_() { + size_t bytes_read = 0; + if (this->mic_->is_running()) { // Read audio into input buffer + bytes_read = this->mic_->read(this->input_buffer_, INPUT_BUFFER_SIZE * sizeof(int16_t)); + if (bytes_read == 0) { + memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t)); + return 0; } - this->socket_->sendto(data.data(), data.size() * sizeof(int16_t), 0, (struct sockaddr *) &this->dest_addr_, - sizeof(this->dest_addr_)); - }); +#ifdef USE_ESP_ADF + // Write audio into ring buffer + int available = rb_bytes_available(this->ring_buffer_); + if (available < bytes_read) { + rb_read(this->ring_buffer_, nullptr, bytes_read - available, 0); + } + rb_write(this->ring_buffer_, (char *) this->input_buffer_, bytes_read, 0); +#endif + } else { + ESP_LOGD(TAG, "microphone not running"); + } + return bytes_read; } void VoiceAssistant::loop() { -#ifdef USE_SPEAKER - if (this->speaker_ != nullptr) { - uint8_t buf[1024]; - auto len = this->socket_->read(buf, sizeof(buf)); - if (len == -1) { - return; + if (this->state_ != State::IDLE && this->state_ != State::STOP_MICROPHONE && + this->state_ != State::STOPPING_MICROPHONE && !api::global_api_server->is_connected()) { + if (this->mic_->is_running() || this->state_ == State::STARTING_MICROPHONE) { + this->set_state_(State::STOP_MICROPHONE, State::IDLE); + } else { + this->set_state_(State::IDLE, State::IDLE); } - this->speaker_->play(buf, len); - this->set_timeout("data-incoming", 200, [this]() { - if (this->continuous_) { - this->request_start(true); - } - }); + this->continuous_ = false; + this->signal_stop_(); return; } + switch (this->state_) { + case State::IDLE: { + if (this->continuous_ && this->desired_state_ == State::IDLE) { +#ifdef USE_ESP_ADF + if (this->use_wake_word_) { + rb_reset(this->ring_buffer_); + this->set_state_(State::START_MICROPHONE, State::WAIT_FOR_VAD); + } else +#endif + { + this->set_state_(State::START_PIPELINE, State::START_MICROPHONE); + } + } else { + this->high_freq_.stop(); + } + break; + } + case State::START_MICROPHONE: { + ESP_LOGD(TAG, "Starting Microphone"); + memset(this->send_buffer_, 0, SEND_BUFFER_SIZE); + memset(this->input_buffer_, 0, INPUT_BUFFER_SIZE * sizeof(int16_t)); + this->mic_->start(); + this->high_freq_.start(); + this->set_state_(State::STARTING_MICROPHONE); + break; + } + case State::STARTING_MICROPHONE: { + if (this->mic_->is_running()) { + this->set_state_(this->desired_state_); + } + break; + } +#ifdef USE_ESP_ADF + case State::WAIT_FOR_VAD: { + this->read_microphone_(); + ESP_LOGD(TAG, "Waiting for speech..."); + this->set_state_(State::WAITING_FOR_VAD); + break; + } + case State::WAITING_FOR_VAD: { + size_t bytes_read = this->read_microphone_(); + if (bytes_read > 0) { + vad_state_t vad_state = + vad_process(this->vad_instance_, this->input_buffer_, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); + if (vad_state == VAD_SPEECH) { + if (this->vad_counter_ < this->vad_threshold_) { + this->vad_counter_++; + } else { + ESP_LOGD(TAG, "VAD detected speech"); + this->set_state_(State::START_PIPELINE, State::STREAMING_MICROPHONE); + } + } else { + if (this->vad_counter_ > 0) { + this->vad_counter_--; + } + } + } + break; + } +#endif + case State::START_PIPELINE: { + this->read_microphone_(); + ESP_LOGD(TAG, "Requesting start..."); + uint32_t flags = 0; + if (this->use_wake_word_) + flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_WAKE_WORD; + if (this->silence_detection_) + flags |= api::enums::VOICE_ASSISTANT_REQUEST_USE_VAD; + api::VoiceAssistantAudioSettings audio_settings; + audio_settings.noise_suppression_level = this->noise_suppression_level_; + audio_settings.auto_gain = this->auto_gain_; + audio_settings.volume_multiplier = this->volume_multiplier_; + if (!api::global_api_server->start_voice_assistant(this->conversation_id_, flags, audio_settings)) { + ESP_LOGW(TAG, "Could not request start."); + this->error_trigger_->trigger("not-connected", "Could not request start."); + this->continuous_ = false; + this->set_state_(State::IDLE, State::IDLE); + break; + } + this->set_state_(State::STARTING_PIPELINE); + this->set_timeout("reset-conversation_id", 5 * 60 * 1000, [this]() { this->conversation_id_ = ""; }); + break; + } + case State::STARTING_PIPELINE: { + this->read_microphone_(); + break; // State changed when udp server port received + } + case State::STREAMING_MICROPHONE: { + size_t bytes_read = this->read_microphone_(); +#ifdef USE_ESP_ADF + if (rb_bytes_filled(this->ring_buffer_) >= SEND_BUFFER_SIZE) { + rb_read(this->ring_buffer_, (char *) this->send_buffer_, SEND_BUFFER_SIZE, 0); + this->socket_->sendto(this->send_buffer_, SEND_BUFFER_SIZE, 0, (struct sockaddr *) &this->dest_addr_, + sizeof(this->dest_addr_)); + } +#else + if (bytes_read > 0) { + this->socket_->sendto(this->input_buffer_, bytes_read, 0, (struct sockaddr *) &this->dest_addr_, + sizeof(this->dest_addr_)); + } +#endif + break; + } + case State::STOP_MICROPHONE: { + if (this->mic_->is_running()) { + this->mic_->stop(); + this->set_state_(State::STOPPING_MICROPHONE); + } else { + this->set_state_(this->desired_state_); + } + break; + } + case State::STOPPING_MICROPHONE: { + if (this->mic_->is_stopped()) { + this->set_state_(this->desired_state_); + } + break; + } + case State::AWAITING_RESPONSE: { + break; // State changed by events + } + case State::STREAMING_RESPONSE: { + bool playing = false; +#ifdef USE_SPEAKER + if (this->speaker_ != nullptr) { + if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) { + auto len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE); + if (len > 0) { + this->speaker_buffer_index_ += len; + this->speaker_buffer_size_ += len; + } + } else { + ESP_LOGW(TAG, "Speaker buffer full."); + } + if (this->speaker_buffer_size_ > 0) { + size_t written = this->speaker_->play(this->speaker_buffer_, this->speaker_buffer_size_); + memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written); + this->speaker_buffer_size_ -= written; + this->speaker_buffer_index_ -= written; + } + playing = this->speaker_->is_running(); + } #endif #ifdef USE_MEDIA_PLAYER - if (this->media_player_ != nullptr) { - if (!this->playing_tts_ || - this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_PLAYING) { - return; - } - this->set_timeout("playing-media", 1000, [this]() { - this->playing_tts_ = false; - if (this->continuous_) { - this->request_start(true); + if (this->media_player_ != nullptr) { + playing = (this->media_player_->state == media_player::MediaPlayerState::MEDIA_PLAYER_STATE_PLAYING); } - }); - return; - } #endif - // Set a 1 second timeout to start the voice assistant again. - this->set_timeout("continuous-no-sound", 1000, [this]() { - if (this->continuous_) { - this->request_start(true); + if (playing) { + this->set_timeout("playing", 100, [this]() { this->set_state_(State::IDLE, State::IDLE); }); + } + break; } - }); + default: + break; + } } -void VoiceAssistant::start(struct sockaddr_storage *addr, uint16_t port) { - ESP_LOGD(TAG, "Starting..."); +void VoiceAssistant::set_state_(State state) { + State old_state = this->state_; + this->state_ = state; + ESP_LOGD(TAG, "State changed from %d to %d", static_cast(old_state), static_cast(state)); +} + +void VoiceAssistant::set_state_(State state, State desired_state) { + this->set_state_(state); + this->desired_state_ = desired_state; + ESP_LOGD(TAG, "Desired state set to %d", static_cast(desired_state)); +} + +void VoiceAssistant::failed_to_start() { + ESP_LOGE(TAG, "Failed to start server. See Home Assistant logs for more details."); + this->error_trigger_->trigger("failed-to-start", "Failed to start server. See Home Assistant logs for more details."); + this->set_state_(State::STOP_MICROPHONE, State::IDLE); +} + +void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) { + if (this->state_ != State::STARTING_PIPELINE) { + this->signal_stop_(); + return; + } + + ESP_LOGD(TAG, "Client started, streaming microphone"); memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_)); if (this->dest_addr_.ss_family == AF_INET) { @@ -123,38 +335,90 @@ void VoiceAssistant::start(struct sockaddr_storage *addr, uint16_t port) { ESP_LOGW(TAG, "Unknown address family: %d", this->dest_addr_.ss_family); return; } - this->running_ = true; - this->mic_->start(); - this->listening_trigger_->trigger(); + + if (this->mic_->is_running()) { + this->set_state_(State::STREAMING_MICROPHONE, State::STREAMING_MICROPHONE); + } else { + this->set_state_(State::START_MICROPHONE, State::STREAMING_MICROPHONE); + } } -void VoiceAssistant::request_start(bool continuous) { - ESP_LOGD(TAG, "Requesting start..."); - if (!api::global_api_server->start_voice_assistant(this->conversation_id_, this->silence_detection_)) { - ESP_LOGW(TAG, "Could not request start."); - this->error_trigger_->trigger("not-connected", "Could not request start."); +void VoiceAssistant::request_start(bool continuous, bool silence_detection) { + if (!api::global_api_server->is_connected()) { + ESP_LOGE(TAG, "No API client connected"); + this->set_state_(State::IDLE, State::IDLE); this->continuous_ = false; return; } - this->continuous_ = continuous; - this->set_timeout("reset-conversation_id", 5 * 60 * 1000, [this]() { this->conversation_id_ = ""; }); + if (this->state_ == State::IDLE) { + this->continuous_ = continuous; + this->silence_detection_ = silence_detection; +#ifdef USE_ESP_ADF + if (this->use_wake_word_) { + rb_reset(this->ring_buffer_); + this->set_state_(State::START_MICROPHONE, State::WAIT_FOR_VAD); + } else +#endif + { + this->set_state_(State::START_PIPELINE, State::START_MICROPHONE); + } + } } -void VoiceAssistant::signal_stop() { +void VoiceAssistant::request_stop() { + this->continuous_ = false; + + switch (this->state_) { + case State::IDLE: + break; + case State::START_MICROPHONE: + case State::STARTING_MICROPHONE: + case State::WAIT_FOR_VAD: + case State::WAITING_FOR_VAD: + case State::START_PIPELINE: + this->set_state_(State::STOP_MICROPHONE, State::IDLE); + break; + case State::STARTING_PIPELINE: + case State::STREAMING_MICROPHONE: + this->signal_stop_(); + this->set_state_(State::STOP_MICROPHONE, State::IDLE); + break; + case State::STOP_MICROPHONE: + case State::STOPPING_MICROPHONE: + this->desired_state_ = State::IDLE; + break; + case State::AWAITING_RESPONSE: + case State::STREAMING_RESPONSE: + break; // Let the incoming audio stream finish then it will go to idle. + } +} + +void VoiceAssistant::signal_stop_() { ESP_LOGD(TAG, "Signaling stop..."); - this->mic_->stop(); - this->running_ = false; api::global_api_server->stop_voice_assistant(); memset(&this->dest_addr_, 0, sizeof(this->dest_addr_)); } void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { + ESP_LOGD(TAG, "Event Type: %d", msg.event_type); switch (msg.event_type) { case api::enums::VOICE_ASSISTANT_RUN_START: ESP_LOGD(TAG, "Assist Pipeline running"); this->start_trigger_->trigger(); break; + case api::enums::VOICE_ASSISTANT_WAKE_WORD_START: + break; + case api::enums::VOICE_ASSISTANT_WAKE_WORD_END: { + ESP_LOGD(TAG, "Wake word detected"); + this->wake_word_detected_trigger_->trigger(); + break; + } + case api::enums::VOICE_ASSISTANT_STT_START: + ESP_LOGD(TAG, "STT Started"); + this->listening_trigger_->trigger(); + break; case api::enums::VOICE_ASSISTANT_STT_END: { + this->set_state_(State::STOP_MICROPHONE, State::AWAITING_RESPONSE); std::string text; for (auto arg : msg.data) { if (arg.name == "text") { @@ -166,7 +430,6 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { return; } ESP_LOGD(TAG, "Speech recognised as: \"%s\"", text.c_str()); - this->signal_stop(); this->stt_end_trigger_->trigger(text); break; } @@ -191,6 +454,9 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { } ESP_LOGD(TAG, "Response: \"%s\"", text.c_str()); this->tts_start_trigger_->trigger(text); +#ifdef USE_SPEAKER + this->speaker_->start(); +#endif break; } case api::enums::VOICE_ASSISTANT_TTS_END: { @@ -207,17 +473,22 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { ESP_LOGD(TAG, "Response URL: \"%s\"", url.c_str()); #ifdef USE_MEDIA_PLAYER if (this->media_player_ != nullptr) { - this->playing_tts_ = true; this->media_player_->make_call().set_media_url(url).perform(); } #endif + State new_state = this->local_output_ ? State::STREAMING_RESPONSE : State::IDLE; + this->set_state_(new_state, new_state); this->tts_end_trigger_->trigger(url); break; } - case api::enums::VOICE_ASSISTANT_RUN_END: + case api::enums::VOICE_ASSISTANT_RUN_END: { ESP_LOGD(TAG, "Assist Pipeline ended"); + if (this->state_ != State::STREAMING_RESPONSE && this->state_ != State::IDLE) { + this->set_state_(State::IDLE, State::IDLE); + } this->end_trigger_->trigger(); break; + } case api::enums::VOICE_ASSISTANT_ERROR: { std::string code = ""; std::string message = ""; @@ -228,12 +499,20 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) { message = std::move(arg.value); } } + if (code == "wake-word-timeout" || code == "wake_word_detection_aborted") { + this->set_state_(State::STOP_MICROPHONE, State::IDLE); + return; + } ESP_LOGE(TAG, "Error: %s - %s", code.c_str(), message.c_str()); - this->continuous_ = false; - this->signal_stop(); + if (this->state_ != State::IDLE) { + this->signal_stop_(); + this->set_state_(State::STOP_MICROPHONE, State::IDLE); + } this->error_trigger_->trigger(code, message); + break; } default: + ESP_LOGD(TAG, "Unhandled event type: %d", msg.event_type); break; } } diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h index 75c17965bc..ce22538a85 100644 --- a/esphome/components/voice_assistant/voice_assistant.h +++ b/esphome/components/voice_assistant/voice_assistant.h @@ -19,6 +19,11 @@ #endif #include "esphome/components/socket/socket.h" +#ifdef USE_ESP_ADF +#include +#include +#endif + namespace esphome { namespace voice_assistant { @@ -28,19 +33,41 @@ namespace voice_assistant { static const uint32_t INITIAL_VERSION = 1; static const uint32_t SPEAKER_SUPPORT = 2; +enum class State { + IDLE, + START_MICROPHONE, + STARTING_MICROPHONE, + WAIT_FOR_VAD, + WAITING_FOR_VAD, + START_PIPELINE, + STARTING_PIPELINE, + STREAMING_MICROPHONE, + STOP_MICROPHONE, + STOPPING_MICROPHONE, + AWAITING_RESPONSE, + STREAMING_RESPONSE, +}; + class VoiceAssistant : public Component { public: void setup() override; void loop() override; float get_setup_priority() const override; - void start(struct sockaddr_storage *addr, uint16_t port); + void start_streaming(struct sockaddr_storage *addr, uint16_t port); + void failed_to_start(); void set_microphone(microphone::Microphone *mic) { this->mic_ = mic; } #ifdef USE_SPEAKER - void set_speaker(speaker::Speaker *speaker) { this->speaker_ = speaker; } + void set_speaker(speaker::Speaker *speaker) { + this->speaker_ = speaker; + this->local_output_ = true; + } #endif #ifdef USE_MEDIA_PLAYER - void set_media_player(media_player::MediaPlayer *media_player) { this->media_player_ = media_player; } + void set_media_player(media_player::MediaPlayer *media_player) { + this->media_player_ = media_player; + this->local_output_ = true; + } #endif uint32_t get_version() const { @@ -52,19 +79,29 @@ class VoiceAssistant : public Component { return INITIAL_VERSION; } - void request_start(bool continuous = false); - void signal_stop(); + void request_start(bool continuous, bool silence_detection); + void request_stop(); void on_event(const api::VoiceAssistantEventResponse &msg); - bool is_running() const { return this->running_; } + bool is_running() const { return this->state_ != State::IDLE; } void set_continuous(bool continuous) { this->continuous_ = continuous; } bool is_continuous() const { return this->continuous_; } - void set_silence_detection(bool silence_detection) { this->silence_detection_ = silence_detection; } + void set_use_wake_word(bool use_wake_word) { this->use_wake_word_ = use_wake_word; } +#ifdef USE_ESP_ADF + void set_vad_threshold(uint8_t vad_threshold) { this->vad_threshold_ = vad_threshold; } +#endif + + void set_noise_suppression_level(uint8_t noise_suppression_level) { + this->noise_suppression_level_ = noise_suppression_level; + } + void set_auto_gain(uint8_t auto_gain) { this->auto_gain_ = auto_gain; } + void set_volume_multiplier(float volume_multiplier) { this->volume_multiplier_ = volume_multiplier; } Trigger<> *get_listening_trigger() const { return this->listening_trigger_; } Trigger<> *get_start_trigger() const { return this->start_trigger_; } + Trigger<> *get_wake_word_detected_trigger() const { return this->wake_word_detected_trigger_; } Trigger *get_stt_end_trigger() const { return this->stt_end_trigger_; } Trigger *get_tts_start_trigger() const { return this->tts_start_trigger_; } Trigger *get_tts_end_trigger() const { return this->tts_end_trigger_; } @@ -72,11 +109,17 @@ class VoiceAssistant : public Component { Trigger *get_error_trigger() const { return this->error_trigger_; } protected: + int read_microphone_(); + void set_state_(State state); + void set_state_(State state, State desired_state); + void signal_stop_(); + std::unique_ptr socket_ = nullptr; struct sockaddr_storage dest_addr_; Trigger<> *listening_trigger_ = new Trigger<>(); Trigger<> *start_trigger_ = new Trigger<>(); + Trigger<> *wake_word_detected_trigger_ = new Trigger<>(); Trigger *stt_end_trigger_ = new Trigger(); Trigger *tts_start_trigger_ = new Trigger(); Trigger *tts_end_trigger_ = new Trigger(); @@ -86,35 +129,61 @@ class VoiceAssistant : public Component { microphone::Microphone *mic_{nullptr}; #ifdef USE_SPEAKER speaker::Speaker *speaker_{nullptr}; + uint8_t *speaker_buffer_; + size_t speaker_buffer_index_{0}; + size_t speaker_buffer_size_{0}; #endif #ifdef USE_MEDIA_PLAYER media_player::MediaPlayer *media_player_{nullptr}; bool playing_tts_{false}; #endif + bool local_output_{false}; + std::string conversation_id_{""}; - bool running_{false}; + HighFrequencyLoopRequester high_freq_; + +#ifdef USE_ESP_ADF + vad_handle_t vad_instance_; + ringbuf_handle_t ring_buffer_; + uint8_t vad_threshold_{5}; + uint8_t vad_counter_{0}; +#endif + + bool use_wake_word_; + uint8_t noise_suppression_level_; + uint8_t auto_gain_; + float volume_multiplier_; + + uint8_t *send_buffer_; + int16_t *input_buffer_; + bool continuous_{false}; bool silence_detection_; + + State state_{State::IDLE}; + State desired_state_{State::IDLE}; }; template class StartAction : public Action, public Parented { public: - void play(Ts... x) override { this->parent_->request_start(); } + void play(Ts... x) override { this->parent_->request_start(false, this->silence_detection_); } + + void set_silence_detection(bool silence_detection) { this->silence_detection_ = silence_detection; } + + protected: + bool silence_detection_; }; template class StartContinuousAction : public Action, public Parented { public: - void play(Ts... x) override { this->parent_->request_start(true); } + void play(Ts... x) override { this->parent_->request_start(true, true); } }; template class StopAction : public Action, public Parented { public: - void play(Ts... x) override { - this->parent_->set_continuous(false); - this->parent_->signal_stop(); - } + void play(Ts... x) override { this->parent_->request_stop(); } }; template class IsRunningCondition : public Condition, public Parented { diff --git a/tests/test4.yaml b/tests/test4.yaml index bb4357e28d..3d0ed2f658 100644 --- a/tests/test4.yaml +++ b/tests/test4.yaml @@ -767,6 +767,9 @@ speaker: voice_assistant: microphone: mic_id_external + speaker: speaker_id + on_listening: + - logger.log: "Voice assistant microphone listening" on_start: - logger.log: "Voice assistant started" on_stt_end: