Send/Receive Voice Assistant audio via API (#6471)

Co-authored-by: Michael Hansen <mike@rhasspy.org>
2025-02-01 23:22:57 +01:00 · 2024-04-08 16:19:22 +12:00 · 2024-04-08 16:19:22 +12:00 · 6f71363d9b
commit 6f71363d9b
parent 97ff87b718
9 changed files with 275 additions and 48 deletions
--- a/esphome/components/api/api.proto
+++ b/esphome/components/api/api.proto
@ -217,7 +217,8 @@ message DeviceInfoResponse {

  string friendly_name = 13;

-  uint32 voice_assistant_version = 14;
+  uint32 legacy_voice_assistant_version = 14;
+  uint32 voice_assistant_feature_flags = 17;

  string suggested_area = 16;
 }
@ -1422,12 +1423,18 @@ message BluetoothDeviceClearCacheResponse {
 }

 // ==================== PUSH TO TALK ====================
+enum VoiceAssistantSubscribeFlag {  // presumably the bit values carried in SubscribeVoiceAssistantRequest.flags — confirm
+  VOICE_ASSISTANT_SUBSCRIBE_NONE = 0;
+  VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO = 1;  // NOTE(review): looks like "exchange audio over the API connection" — confirm with the client side
+}
+
 message SubscribeVoiceAssistantRequest {
  option (id) = 89;
  option (source) = SOURCE_CLIENT;
  option (ifdef) = "USE_VOICE_ASSISTANT";

  bool subscribe = 1;
+  uint32 flags = 2;
 }

 enum VoiceAssistantRequestFlag {
@ -1495,6 +1502,16 @@ message VoiceAssistantEventResponse {
  repeated VoiceAssistantEventData data = 2;
 }

+message VoiceAssistantAudio {  // one chunk of voice assistant audio; SOURCE_BOTH, so it is declared for both directions
+  option (id) = 106;
+  option (source) = SOURCE_BOTH;
+  option (ifdef) = "USE_VOICE_ASSISTANT";
+
+  bytes data = 1;  // audio payload; the sample format is not stated in this file
+  bool end = 2;  // NOTE(review): presumably marks the last chunk of a stream — confirm which side acts on it
+}
+
+
 // ==================== ALARM CONTROL PANEL ====================
 enum AlarmControlPanelState {
  ALARM_STATE_DISARMED = 0;
--- a/esphome/components/api/api_connection.cpp
+++ b/esphome/components/api/api_connection.cpp
@ -1040,10 +1040,15 @@ void APIConnection::on_voice_assistant_response(const VoiceAssistantResponse &ms
      voice_assistant::global_voice_assistant->failed_to_start();
      return;
    }
-    struct sockaddr_storage storage;
-    socklen_t len = sizeof(storage);
-    this->helper_->getpeername((struct sockaddr *) &storage, &len);
-    voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port);
+    if (msg.port == 0) {
+      // Use API Audio
+      voice_assistant::global_voice_assistant->start_streaming();
+    } else {
+      struct sockaddr_storage storage;
+      socklen_t len = sizeof(storage);
+      this->helper_->getpeername((struct sockaddr *) &storage, &len);
+      voice_assistant::global_voice_assistant->start_streaming(&storage, msg.port);
+    }
  }
 };
 void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) {
@ -1055,6 +1060,15 @@ void APIConnection::on_voice_assistant_event_response(const VoiceAssistantEventR
    voice_assistant::global_voice_assistant->on_event(msg);
  }
 }
+// Handles a VoiceAssistantAudio message received on this connection.
+// The message is forwarded to the global voice assistant only when one
+// exists and this connection is the one returned by its
+// get_api_connection(); in every other case the message is dropped.
+void APIConnection::on_voice_assistant_audio(const VoiceAssistantAudio &msg) {
+  auto *assistant = voice_assistant::global_voice_assistant;
+  if (assistant == nullptr || assistant->get_api_connection() != this) {
+    return;
+  }
+  assistant->on_audio(msg);
+}

 #endif

@ -1142,7 +1156,7 @@ HelloResponse APIConnection::hello(const HelloRequest &msg) {

  HelloResponse resp;
  resp.api_version_major = 1;
-  resp.api_version_minor = 9;
+  resp.api_version_minor = 10;
  resp.server_info = App.get_name() + " (esphome v" ESPHOME_VERSION ")";
  resp.name = App.get_name();

@ -1203,7 +1217,8 @@ DeviceInfoResponse APIConnection::device_info(const DeviceInfoRequest &msg) {
  resp.bluetooth_proxy_feature_flags = bluetooth_proxy::global_bluetooth_proxy->get_feature_flags();
 #endif
 #ifdef USE_VOICE_ASSISTANT
-  resp.voice_assistant_version = voice_assistant::global_voice_assistant->get_version();
+  resp.legacy_voice_assistant_version = voice_assistant::global_voice_assistant->get_legacy_version();
+  resp.voice_assistant_feature_flags = voice_assistant::global_voice_assistant->get_feature_flags();
 #endif
  return resp;
 }
--- a/esphome/components/api/api_connection.h
+++ b/esphome/components/api/api_connection.h
@ -134,6 +134,7 @@ class APIConnection : public APIServerConnection {
  void subscribe_voice_assistant(const SubscribeVoiceAssistantRequest &msg) override;
  void on_voice_assistant_response(const VoiceAssistantResponse &msg) override;
  void on_voice_assistant_event_response(const VoiceAssistantEventResponse &msg) override;
+  void on_voice_assistant_audio(const VoiceAssistantAudio &msg) override;
 #endif

 #ifdef USE_ALARM_CONTROL_PANEL
--- a/esphome/components/api/api_pb2.cpp
+++ b/esphome/components/api/api_pb2.cpp
@ -410,6 +410,19 @@ const char *proto_enum_to_string<enums::BluetoothDeviceRequestType>(enums::Bluet
 }
 #endif
 #ifdef HAS_PROTO_MESSAGE_DUMP
+template<>
+const char *proto_enum_to_string<enums::VoiceAssistantSubscribeFlag>(enums::VoiceAssistantSubscribeFlag value) {
+  switch (value) {  // each declared enumerator maps to a string literal equal to its own name
+    case enums::VOICE_ASSISTANT_SUBSCRIBE_NONE:
+      return "VOICE_ASSISTANT_SUBSCRIBE_NONE";
+    case enums::VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO:
+      return "VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO";
+    default:  // any value that is not a declared enumerator
+      return "UNKNOWN";
+  }
+}
+#endif
+#ifdef HAS_PROTO_MESSAGE_DUMP
 template<> const char *proto_enum_to_string<enums::VoiceAssistantRequestFlag>(enums::VoiceAssistantRequestFlag value) {
  switch (value) {
    case enums::VOICE_ASSISTANT_REQUEST_NONE:
@ -716,7 +729,11 @@ bool DeviceInfoResponse::decode_varint(uint32_t field_id, ProtoVarInt value) {
      return true;
    }
    case 14: {
-      this->voice_assistant_version = value.as_uint32();
+      this->legacy_voice_assistant_version = value.as_uint32();
+      return true;
+    }
+    case 17: {
+      this->voice_assistant_feature_flags = value.as_uint32();
      return true;
    }
    default:
@ -784,7 +801,8 @@ void DeviceInfoResponse::encode(ProtoWriteBuffer buffer) const {
  buffer.encode_uint32(15, this->bluetooth_proxy_feature_flags);
  buffer.encode_string(12, this->manufacturer);
  buffer.encode_string(13, this->friendly_name);
-  buffer.encode_uint32(14, this->voice_assistant_version);
+  buffer.encode_uint32(14, this->legacy_voice_assistant_version);
+  buffer.encode_uint32(17, this->voice_assistant_feature_flags);
  buffer.encode_string(16, this->suggested_area);
 }
 #ifdef HAS_PROTO_MESSAGE_DUMP
@ -850,8 +868,13 @@ void DeviceInfoResponse::dump_to(std::string &out) const {
  out.append("'").append(this->friendly_name).append("'");
  out.append("\n");

-  out.append("  voice_assistant_version: ");
-  sprintf(buffer, "%" PRIu32, this->voice_assistant_version);
+  out.append("  legacy_voice_assistant_version: ");
+  sprintf(buffer, "%" PRIu32, this->legacy_voice_assistant_version);
+  out.append(buffer);
+  out.append("\n");
+
+  out.append("  voice_assistant_feature_flags: ");
+  sprintf(buffer, "%" PRIu32, this->voice_assistant_feature_flags);
  out.append(buffer);
  out.append("\n");

@ -6514,11 +6537,18 @@ bool SubscribeVoiceAssistantRequest::decode_varint(uint32_t field_id, ProtoVarIn
      this->subscribe = value.as_bool();
      return true;
    }
+    case 2: {
+      this->flags = value.as_uint32();
+      return true;
+    }
    default:
      return false;
  }
 }
-void SubscribeVoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const { buffer.encode_bool(1, this->subscribe); }
+void SubscribeVoiceAssistantRequest::encode(ProtoWriteBuffer buffer) const {  // writes both fields under the same field numbers decode_varint reads
+  buffer.encode_bool(1, this->subscribe);
+  buffer.encode_uint32(2, this->flags);
+}
 #ifdef HAS_PROTO_MESSAGE_DUMP
 void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const {
  __attribute__((unused)) char buffer[64];
@ -6526,6 +6556,11 @@ void SubscribeVoiceAssistantRequest::dump_to(std::string &out) const {
  out.append("  subscribe: ");
  out.append(YESNO(this->subscribe));
  out.append("\n");
+
+  out.append("  flags: ");
+  sprintf(buffer, "%" PRIu32, this->flags);
+  out.append(buffer);
+  out.append("\n");
  out.append("}");
 }
 #endif
@ -6752,6 +6787,44 @@ void VoiceAssistantEventResponse::dump_to(std::string &out) const {
  out.append("}");
 }
 #endif
+bool VoiceAssistantAudio::decode_varint(uint32_t field_id, ProtoVarInt value) {  // returns true when field_id was recognised and stored
+  switch (field_id) {
+    case 2: {  // field 2 is `end`
+      this->end = value.as_bool();
+      return true;
+    }
+    default:  // unrecognised varint field: reported as not handled
+      return false;
+  }
+}
+bool VoiceAssistantAudio::decode_length(uint32_t field_id, ProtoLengthDelimited value) {  // returns true when field_id was recognised and stored
+  switch (field_id) {
+    case 1: {  // field 1 is `data`; the payload is kept in a std::string
+      this->data = value.as_string();
+      return true;
+    }
+    default:  // unrecognised length-delimited field: reported as not handled
+      return false;
+  }
+}
+void VoiceAssistantAudio::encode(ProtoWriteBuffer buffer) const {  // writes `data` as field 1 and `end` as field 2
+  buffer.encode_string(1, this->data);
+  buffer.encode_bool(2, this->end);
+}
+#ifdef HAS_PROTO_MESSAGE_DUMP
+void VoiceAssistantAudio::dump_to(std::string &out) const {  // appends a text rendering of this message to `out`
+  __attribute__((unused)) char buffer[64];  // not used by either field of this message
+  out.append("VoiceAssistantAudio {\n");
+  out.append("  data: ");
+  out.append("'").append(this->data).append("'");  // NOTE(review): `data` is a bytes field and is appended unescaped — confirm this is acceptable for log output
+  out.append("\n");
+
+  out.append("  end: ");
+  out.append(YESNO(this->end));
+  out.append("\n");
+  out.append("}");
+}
+#endif
 bool ListEntitiesAlarmControlPanelResponse::decode_varint(uint32_t field_id, ProtoVarInt value) {
  switch (field_id) {
    case 6: {
--- a/esphome/components/api/api_pb2.h
+++ b/esphome/components/api/api_pb2.h
@ -165,6 +165,10 @@ enum BluetoothDeviceRequestType : uint32_t {
  BLUETOOTH_DEVICE_REQUEST_TYPE_CONNECT_V3_WITHOUT_CACHE = 5,
  BLUETOOTH_DEVICE_REQUEST_TYPE_CLEAR_CACHE = 6,
 };
+enum VoiceAssistantSubscribeFlag : uint32_t {  // same enumerator names and values as the api.proto enum of this name
+  VOICE_ASSISTANT_SUBSCRIBE_NONE = 0,
+  VOICE_ASSISTANT_SUBSCRIBE_API_AUDIO = 1,
+};
 enum VoiceAssistantRequestFlag : uint32_t {
  VOICE_ASSISTANT_REQUEST_NONE = 0,
  VOICE_ASSISTANT_REQUEST_USE_VAD = 1,
@ -327,7 +331,8 @@ class DeviceInfoResponse : public ProtoMessage {
  uint32_t bluetooth_proxy_feature_flags{0};
  std::string manufacturer{};
  std::string friendly_name{};
-  uint32_t voice_assistant_version{0};
+  uint32_t legacy_voice_assistant_version{0};
+  uint32_t voice_assistant_feature_flags{0};
  std::string suggested_area{};
  void encode(ProtoWriteBuffer buffer) const override;
 #ifdef HAS_PROTO_MESSAGE_DUMP
@ -1674,6 +1679,7 @@ class BluetoothDeviceClearCacheResponse : public ProtoMessage {
 class SubscribeVoiceAssistantRequest : public ProtoMessage {
 public:
  bool subscribe{false};
+  uint32_t flags{0};
  void encode(ProtoWriteBuffer buffer) const override;
 #ifdef HAS_PROTO_MESSAGE_DUMP
  void dump_to(std::string &out) const override;
@ -1749,6 +1755,19 @@ class VoiceAssistantEventResponse : public ProtoMessage {
  bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override;
  bool decode_varint(uint32_t field_id, ProtoVarInt value) override;
 };
+class VoiceAssistantAudio : public ProtoMessage {  // C++ counterpart of the api.proto message VoiceAssistantAudio
+ public:
+  std::string data{};  // proto field 1 (`bytes data`)
+  bool end{false};  // proto field 2 (`bool end`)
+  void encode(ProtoWriteBuffer buffer) const override;
+#ifdef HAS_PROTO_MESSAGE_DUMP
+  void dump_to(std::string &out) const override;
+#endif
+
+ protected:
+  bool decode_length(uint32_t field_id, ProtoLengthDelimited value) override;  // handles field 1
+  bool decode_varint(uint32_t field_id, ProtoVarInt value) override;  // handles field 2
+};
 class ListEntitiesAlarmControlPanelResponse : public ProtoMessage {
 public:
  std::string object_id{};
--- a/esphome/components/api/api_pb2_service.cpp
+++ b/esphome/components/api/api_pb2_service.cpp
@ -476,6 +476,14 @@ bool APIServerConnectionBase::send_voice_assistant_request(const VoiceAssistantR
 #endif
 #ifdef USE_VOICE_ASSISTANT
 #endif
+#ifdef USE_VOICE_ASSISTANT
+bool APIServerConnectionBase::send_voice_assistant_audio(const VoiceAssistantAudio &msg) {  // returns whatever send_message_ returns
+#ifdef HAS_PROTO_MESSAGE_DUMP
+  ESP_LOGVV(TAG, "send_voice_assistant_audio: %s", msg.dump().c_str());
+#endif
+  return this->send_message_<VoiceAssistantAudio>(msg, 106);  // 106 is the `option (id)` of VoiceAssistantAudio in api.proto
+}
+#endif
 #ifdef USE_ALARM_CONTROL_PANEL
 bool APIServerConnectionBase::send_list_entities_alarm_control_panel_response(
    const ListEntitiesAlarmControlPanelResponse &msg) {
@ -971,6 +979,17 @@ bool APIServerConnectionBase::read_message(uint32_t msg_size, uint32_t msg_type,
      ESP_LOGVV(TAG, "on_date_command_request: %s", msg.dump().c_str());
 #endif
      this->on_date_command_request(msg);
+#endif
+      break;
+    }
+    case 106: {
+#ifdef USE_VOICE_ASSISTANT
+      VoiceAssistantAudio msg;
+      msg.decode(msg_data, msg_size);
+#ifdef HAS_PROTO_MESSAGE_DUMP
+      ESP_LOGVV(TAG, "on_voice_assistant_audio: %s", msg.dump().c_str());
+#endif
+      this->on_voice_assistant_audio(msg);
 #endif
      break;
    }
--- a/esphome/components/api/api_pb2_service.h
+++ b/esphome/components/api/api_pb2_service.h
@ -240,6 +240,10 @@ class APIServerConnectionBase : public ProtoService {
 #ifdef USE_VOICE_ASSISTANT
  virtual void on_voice_assistant_event_response(const VoiceAssistantEventResponse &value){};
 #endif
+#ifdef USE_VOICE_ASSISTANT
+  bool send_voice_assistant_audio(const VoiceAssistantAudio &msg);  // outgoing VoiceAssistantAudio (message id 106)
+  virtual void on_voice_assistant_audio(const VoiceAssistantAudio &value){};  // incoming VoiceAssistantAudio; default does nothing, subclasses override
+#endif
 #ifdef USE_ALARM_CONTROL_PANEL
  bool send_list_entities_alarm_control_panel_response(const ListEntitiesAlarmControlPanelResponse &msg);
 #endif
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@ -24,28 +24,24 @@ static const size_t SPEAKER_BUFFER_SIZE = 16 * RECEIVE_SIZE;

 float VoiceAssistant::get_setup_priority() const { return setup_priority::AFTER_CONNECTION; }

-void VoiceAssistant::setup() {
-  ESP_LOGCONFIG(TAG, "Setting up Voice Assistant...");
-
-  global_voice_assistant = this;
-
+bool VoiceAssistant::start_udp_socket_() {
  this->socket_ = socket::socket(AF_INET, SOCK_DGRAM, IPPROTO_IP);
-  if (socket_ == nullptr) {
-    ESP_LOGW(TAG, "Could not create socket");
+  if (this->socket_ == nullptr) {
+    ESP_LOGE(TAG, "Could not create socket");
    this->mark_failed();
-    return;
+    return false;
  }
  int enable = 1;
-  int err = socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
+  int err = this->socket_->setsockopt(SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(int));
  if (err != 0) {
    ESP_LOGW(TAG, "Socket unable to set reuseaddr: errno %d", err);
    // we can still continue
  }
-  err = socket_->setblocking(false);
+  err = this->socket_->setblocking(false);
  if (err != 0) {
-    ESP_LOGW(TAG, "Socket unable to set nonblocking mode: errno %d", err);
+    ESP_LOGE(TAG, "Socket unable to set nonblocking mode: errno %d", err);
    this->mark_failed();
-    return;
+    return false;
  }

 #ifdef USE_SPEAKER
@ -54,18 +50,30 @@ void VoiceAssistant::setup() {

    socklen_t sl = socket::set_sockaddr_any((struct sockaddr *) &server, sizeof(server), 6055);
    if (sl == 0) {
-      ESP_LOGW(TAG, "Socket unable to set sockaddr: errno %d", errno);
+      ESP_LOGE(TAG, "Socket unable to set sockaddr: errno %d", errno);
      this->mark_failed();
-      return;
+      return false;
    }

-    err = socket_->bind((struct sockaddr *) &server, sizeof(server));
+    err = this->socket_->bind((struct sockaddr *) &server, sizeof(server));
    if (err != 0) {
-      ESP_LOGW(TAG, "Socket unable to bind: errno %d", errno);
+      ESP_LOGE(TAG, "Socket unable to bind: errno %d", errno);
      this->mark_failed();
-      return;
+      return false;
    }
+  }
+#endif
+  this->udp_socket_running_ = true;
+  return true;
+}

+void VoiceAssistant::setup() {
+  ESP_LOGCONFIG(TAG, "Setting up Voice Assistant...");
+
+  global_voice_assistant = this;
+
+#ifdef USE_SPEAKER
+  if (this->speaker_ != nullptr) {
    ExternalRAMAllocator<uint8_t> speaker_allocator(ExternalRAMAllocator<uint8_t>::ALLOW_FAILURE);
    this->speaker_buffer_ = speaker_allocator.allocate(SPEAKER_BUFFER_SIZE);
    if (this->speaker_buffer_ == nullptr) {
@ -238,8 +246,20 @@ void VoiceAssistant::loop() {
      size_t available = this->ring_buffer_->available();
      while (available >= SEND_BUFFER_SIZE) {
        size_t read_bytes = this->ring_buffer_->read((void *) this->send_buffer_, SEND_BUFFER_SIZE, 0);
-        this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
-                              sizeof(this->dest_addr_));
+        if (this->audio_mode_ == AUDIO_MODE_API) {
+          api::VoiceAssistantAudio msg;
+          msg.data.assign((char *) this->send_buffer_, read_bytes);
+          this->api_client_->send_voice_assistant_audio(msg);
+        } else {
+          if (!this->udp_socket_running_) {
+            if (!this->start_udp_socket_()) {
+              this->set_state_(State::STOP_MICROPHONE, State::IDLE);
+              break;
+            }
+          }
+          this->socket_->sendto(this->send_buffer_, read_bytes, 0, (struct sockaddr *) &this->dest_addr_,
+                                sizeof(this->dest_addr_));
+        }
        available = this->ring_buffer_->available();
      }

@ -268,22 +288,25 @@ void VoiceAssistant::loop() {
 #ifdef USE_SPEAKER
      if (this->speaker_ != nullptr) {
        ssize_t received_len = 0;
-        if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
-          received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
-          if (received_len > 0) {
-            this->speaker_buffer_index_ += received_len;
-            this->speaker_buffer_size_ += received_len;
-            this->speaker_bytes_received_ += received_len;
+        if (this->audio_mode_ == AUDIO_MODE_UDP) {
+          if (this->speaker_buffer_index_ + RECEIVE_SIZE < SPEAKER_BUFFER_SIZE) {
+            received_len = this->socket_->read(this->speaker_buffer_ + this->speaker_buffer_index_, RECEIVE_SIZE);
+            if (received_len > 0) {
+              this->speaker_buffer_index_ += received_len;
+              this->speaker_buffer_size_ += received_len;
+              this->speaker_bytes_received_ += received_len;
+            }
+          } else {
+            ESP_LOGD(TAG, "Receive buffer full");
          }
-        } else {
-          ESP_LOGD(TAG, "Receive buffer full");
        }
        // Build a small buffer of audio before sending to the speaker
-        if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4)
+        bool end_of_stream = this->stream_ended_ && (this->audio_mode_ == AUDIO_MODE_API || received_len < 0);
+        if (this->speaker_bytes_received_ > RECEIVE_SIZE * 4 || end_of_stream)
          this->write_speaker_();
        if (this->wait_for_stream_end_) {
          this->cancel_timeout("playing");
-          if (this->stream_ended_ && received_len < 0) {
+          if (end_of_stream) {
            ESP_LOGD(TAG, "End of audio stream received");
            this->cancel_timeout("speaker-timeout");
            this->set_state_(State::RESPONSE_FINISHED, State::RESPONSE_FINISHED);
@ -428,6 +451,22 @@ void VoiceAssistant::failed_to_start() {
  this->set_state_(State::STOP_MICROPHONE, State::IDLE);
 }

+// Starts microphone streaming with audio carried over the API connection
+// (audio_mode_ becomes AUDIO_MODE_API). Only valid while the pipeline is
+// starting; in any other state a stop is signalled and nothing else changes.
+void VoiceAssistant::start_streaming() {
+  const bool pipeline_starting = this->state_ == State::STARTING_PIPELINE;
+  if (!pipeline_starting) {
+    this->signal_stop_();
+    return;
+  }
+
+  ESP_LOGD(TAG, "Client started, streaming microphone");
+  this->audio_mode_ = AUDIO_MODE_API;
+
+  // A running microphone goes straight to streaming; otherwise it is started first.
+  const State next_state = this->mic_->is_running() ? State::STREAMING_MICROPHONE : State::START_MICROPHONE;
+  this->set_state_(next_state, State::STREAMING_MICROPHONE);
+}
+
 void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t port) {
  if (this->state_ != State::STARTING_PIPELINE) {
    this->signal_stop_();
@ -435,6 +474,7 @@ void VoiceAssistant::start_streaming(struct sockaddr_storage *addr, uint16_t por
  }

  ESP_LOGD(TAG, "Client started, streaming microphone");
+  this->audio_mode_ = AUDIO_MODE_UDP;

  memcpy(&this->dest_addr_, addr, sizeof(this->dest_addr_));
  if (this->dest_addr_.ss_family == AF_INET) {
@ -688,6 +728,17 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
  }
 }

+// Appends one chunk of audio received over the API connection to the speaker buffer.
+//
+// The chunk is copied at speaker_buffer_index_, and the index, size and
+// bytes-received counters advance by its length. A chunk that does not fit
+// in the remaining space is dropped whole and an error is logged.
+// msg.end is not consulted here.
+//
+// setup() only allocates the speaker buffer when a speaker is configured,
+// and that allocation is allowed to fail, so audio is ignored when the
+// build has no speaker support or no buffer exists.
+void VoiceAssistant::on_audio(const api::VoiceAssistantAudio &msg) {
+#ifdef USE_SPEAKER
+  if (this->speaker_buffer_ == nullptr) {
+    ESP_LOGW(TAG, "Cannot receive audio, no speaker buffer");
+    return;
+  }
+
+  const size_t chunk_len = msg.data.length();
+  if (this->speaker_buffer_index_ + chunk_len >= SPEAKER_BUFFER_SIZE) {
+    ESP_LOGE(TAG, "Cannot receive audio, buffer is full");
+    return;
+  }
+
+  memcpy(this->speaker_buffer_ + this->speaker_buffer_index_, msg.data.data(), chunk_len);
+  this->speaker_buffer_index_ += chunk_len;
+  this->speaker_buffer_size_ += chunk_len;
+  this->speaker_bytes_received_ += chunk_len;
+#endif
+}
+
 VoiceAssistant *global_voice_assistant = nullptr;  // NOLINT(cppcoreguidelines-avoid-non-const-global-variables)

 }  // namespace voice_assistant
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@ -29,9 +29,14 @@ namespace voice_assistant {

 // Version 1: Initial version
 // Version 2: Adds raw speaker support
-// Version 3: Unused/skip
-static const uint32_t INITIAL_VERSION = 1;
-static const uint32_t SPEAKER_SUPPORT = 2;
+static const uint32_t LEGACY_INITIAL_VERSION = 1;
+static const uint32_t LEGACY_SPEAKER_SUPPORT = 2;
+
+enum VoiceAssistantFeature : uint32_t {  // single-bit flags OR-ed together by VoiceAssistant::get_feature_flags()
+  FEATURE_VOICE_ASSISTANT = 1 << 0,  // always set by get_feature_flags()
+  FEATURE_SPEAKER = 1 << 1,  // set when a speaker is configured
+  FEATURE_API_AUDIO = 1 << 2,  // NOTE(review): presumably "audio can be exchanged over the API connection" — confirm
+};

 enum class State {
  IDLE,
@ -49,11 +54,17 @@ enum class State {
  RESPONSE_FINISHED,
 };

+enum AudioMode : uint8_t {  // transport for voice assistant audio; stored in VoiceAssistant::audio_mode_
+  AUDIO_MODE_UDP,  // selected by start_streaming(addr, port); also the member's initial value
+  AUDIO_MODE_API,  // selected by start_streaming() with no arguments
+};
+
 class VoiceAssistant : public Component {
 public:
  void setup() override;
  void loop() override;
  float get_setup_priority() const override;
+  void start_streaming();
  void start_streaming(struct sockaddr_storage *addr, uint16_t port);
  void failed_to_start();

@ -71,19 +82,32 @@ class VoiceAssistant : public Component {
  }
 #endif

-  uint32_t get_version() const {
+  uint32_t get_legacy_version() const {
 #ifdef USE_SPEAKER
    if (this->speaker_ != nullptr) {
-      return SPEAKER_SUPPORT;
+      return LEGACY_SPEAKER_SUPPORT;
    }
 #endif
-    return INITIAL_VERSION;
+    return LEGACY_INITIAL_VERSION;
+  }
+
+  // Returns the VoiceAssistantFeature bitmask that device_info() reports as
+  // voice_assistant_feature_flags.
+  uint32_t get_feature_flags() const {
+    uint32_t flags = 0;
+    flags |= VoiceAssistantFeature::FEATURE_VOICE_ASSISTANT;
+    // Sending microphone audio over the API connection does not involve the
+    // speaker, so API audio is advertised whether or not one is configured.
+    flags |= VoiceAssistantFeature::FEATURE_API_AUDIO;
+#ifdef USE_SPEAKER
+    if (this->speaker_ != nullptr) {
+      flags |= VoiceAssistantFeature::FEATURE_SPEAKER;
+    }
+#endif
+    return flags;
+  }

  void request_start(bool continuous, bool silence_detection);
  void request_stop();

  void on_event(const api::VoiceAssistantEventResponse &msg);
+  void on_audio(const api::VoiceAssistantAudio &msg);

  bool is_running() const { return this->state_ != State::IDLE; }
  void set_continuous(bool continuous) { this->continuous_ = continuous; }
@ -201,6 +225,10 @@ class VoiceAssistant : public Component {

  State state_{State::IDLE};
  State desired_state_{State::IDLE};
+
+  AudioMode audio_mode_{AUDIO_MODE_UDP};
+  bool udp_socket_running_{false};
+  bool start_udp_socket_();
 };

 template<typename... Ts> class StartAction : public Action<Ts...>, public Parented<VoiceAssistant> {