diff --git a/esphome/components/api/api.proto b/esphome/components/api/api.proto
index ec4a0f7cc9..69765c7a94 100644
--- a/esphome/components/api/api.proto
+++ b/esphome/components/api/api.proto
@@ -1459,6 +1459,8 @@ enum VoiceAssistantEvent {
   VOICE_ASSISTANT_WAKE_WORD_END = 10;
   VOICE_ASSISTANT_STT_VAD_START = 11;
   VOICE_ASSISTANT_STT_VAD_END = 12;
+  VOICE_ASSISTANT_TTS_STREAM_START = 98;
+  VOICE_ASSISTANT_TTS_STREAM_END = 99;
 }
 
 message VoiceAssistantEventData {
diff --git a/esphome/components/api/api_pb2.cpp b/esphome/components/api/api_pb2.cpp
index 225b213a67..65df2312e1 100644
--- a/esphome/components/api/api_pb2.cpp
+++ b/esphome/components/api/api_pb2.cpp
@@ -452,6 +452,10 @@ template<> const char *proto_enum_to_string<enums::VoiceAssistantEvent>(enums::V
       return "VOICE_ASSISTANT_STT_VAD_START";
     case enums::VOICE_ASSISTANT_STT_VAD_END:
       return "VOICE_ASSISTANT_STT_VAD_END";
+    case enums::VOICE_ASSISTANT_TTS_STREAM_START:
+      return "VOICE_ASSISTANT_TTS_STREAM_START";
+    case enums::VOICE_ASSISTANT_TTS_STREAM_END:
+      return "VOICE_ASSISTANT_TTS_STREAM_END";
     default:
       return "UNKNOWN";
   }
diff --git a/esphome/components/api/api_pb2.h b/esphome/components/api/api_pb2.h
index a4826f09d2..4c70facf3d 100644
--- a/esphome/components/api/api_pb2.h
+++ b/esphome/components/api/api_pb2.h
@@ -184,6 +184,8 @@ enum VoiceAssistantEvent : uint32_t {
   VOICE_ASSISTANT_WAKE_WORD_END = 10,
   VOICE_ASSISTANT_STT_VAD_START = 11,
   VOICE_ASSISTANT_STT_VAD_END = 12,
+  VOICE_ASSISTANT_TTS_STREAM_START = 98,
+  VOICE_ASSISTANT_TTS_STREAM_END = 99,
 };
 enum AlarmControlPanelState : uint32_t {
   ALARM_STATE_DISARMED = 0,
diff --git a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
index 592a27b739..ed13e6b458 100644
--- a/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
+++ b/esphome/components/i2s_audio/speaker/i2s_audio_speaker.cpp
@@ -158,8 +158,13 @@ void I2SAudioSpeaker::watch_() {
   if (xQueueReceive(this->event_queue_, &event, 0) == pdTRUE) {
     switch (event.type) {
       case TaskEventType::STARTING:
+        ESP_LOGD(TAG, "Starting I2S Audio Speaker");
+        break;
       case TaskEventType::STARTED:
+        ESP_LOGD(TAG, "Started I2S Audio Speaker");
+        break;
       case TaskEventType::STOPPING:
+        ESP_LOGD(TAG, "Stopping I2S Audio Speaker");
         break;
       case TaskEventType::PLAYING:
         this->status_clear_warning();
@@ -170,6 +175,7 @@ void I2SAudioSpeaker::watch_() {
         this->player_task_handle_ = nullptr;
         this->parent_->unlock();
         xQueueReset(this->buffer_queue_);
+        ESP_LOGD(TAG, "Stopped I2S Audio Speaker");
         break;
       case TaskEventType::WARNING:
         ESP_LOGW(TAG, "Error writing to I2S: %s", esp_err_to_name(event.err));
diff --git a/esphome/components/voice_assistant/voice_assistant.cpp b/esphome/components/voice_assistant/voice_assistant.cpp
index 448df61d80..12fbdc97b4 100644
--- a/esphome/components/voice_assistant/voice_assistant.cpp
+++ b/esphome/components/voice_assistant/voice_assistant.cpp
@@ -281,11 +281,14 @@ void VoiceAssistant::loop() {
             memmove(this->speaker_buffer_, this->speaker_buffer_ + written, this->speaker_buffer_size_ - written);
             this->speaker_buffer_size_ -= written;
             this->speaker_buffer_index_ -= written;
-            this->set_timeout("speaker-timeout", 1000, [this]() { this->speaker_->stop(); });
+            this->set_timeout("speaker-timeout", 2000, [this]() { this->speaker_->stop(); });
           } else {
             ESP_LOGW(TAG, "Speaker buffer full.");
           }
         }
+        if (this->wait_for_stream_end_) {
+          break;  // We don't want to time out here as the STREAM_END event will take care of that.
+        }
         playing = this->speaker_->is_running();
       }
 #endif
@@ -295,28 +298,77 @@ void VoiceAssistant::loop() {
       }
 #endif
       if (playing) {
-        this->set_timeout("playing", 100, [this]() {
+        this->set_timeout("playing", 2000, [this]() {
           this->cancel_timeout("speaker-timeout");
           this->set_state_(State::IDLE, State::IDLE);
         });
       }
       break;
     }
+    case State::RESPONSE_FINISHED: {
+#ifdef USE_SPEAKER
+      if (this->speaker_ != nullptr) {
+        this->speaker_->stop();
+        this->cancel_timeout("speaker-timeout");
+        this->cancel_timeout("playing");
+        this->speaker_buffer_size_ = 0;
+        this->speaker_buffer_index_ = 0;
+        memset(this->speaker_buffer_, 0, SPEAKER_BUFFER_SIZE);
+      }
+#endif
+      this->wait_for_stream_end_ = false;
+      this->set_state_(State::IDLE, State::IDLE);
+      break;
+    }
     default:
       break;
   }
 }
 
+static const LogString *voice_assistant_state_to_string(State state) {  // Name of `state` for log output.
+  switch (state) {
+    case State::IDLE:
+      return LOG_STR("IDLE");
+    case State::START_MICROPHONE:
+      return LOG_STR("START_MICROPHONE");
+    case State::STARTING_MICROPHONE:
+      return LOG_STR("STARTING_MICROPHONE");
+    case State::WAIT_FOR_VAD:
+      return LOG_STR("WAIT_FOR_VAD");
+    case State::WAITING_FOR_VAD:
+      return LOG_STR("WAITING_FOR_VAD");
+    case State::START_PIPELINE:
+      return LOG_STR("START_PIPELINE");
+    case State::STARTING_PIPELINE:
+      return LOG_STR("STARTING_PIPELINE");
+    case State::STREAMING_MICROPHONE:
+      return LOG_STR("STREAMING_MICROPHONE");
+    case State::STOP_MICROPHONE:
+      return LOG_STR("STOP_MICROPHONE");
+    case State::STOPPING_MICROPHONE:
+      return LOG_STR("STOPPING_MICROPHONE");
+    case State::AWAITING_RESPONSE:
+      return LOG_STR("AWAITING_RESPONSE");
+    case State::STREAMING_RESPONSE:
+      return LOG_STR("STREAMING_RESPONSE");
+    case State::RESPONSE_FINISHED:
+      return LOG_STR("RESPONSE_FINISHED");
+    default:
+      return LOG_STR("UNKNOWN");  // Fallback for any value without a case above.
+  }
+}
+
 void VoiceAssistant::set_state_(State state) {
   State old_state = this->state_;
   this->state_ = state;
-  ESP_LOGD(TAG, "State changed from %d to %d", static_cast<uint8_t>(old_state), static_cast<uint8_t>(state));
+  ESP_LOGD(TAG, "State changed from %s to %s", LOG_STR_ARG(voice_assistant_state_to_string(old_state)),
+           LOG_STR_ARG(voice_assistant_state_to_string(state)));  // Log state names rather than raw enum numbers.
 }
 
 void VoiceAssistant::set_state_(State state, State desired_state) {
   this->set_state_(state);
   this->desired_state_ = desired_state;
-  ESP_LOGD(TAG, "Desired state set to %d", static_cast<uint8_t>(desired_state));
+  ESP_LOGD(TAG, "Desired state set to %s", LOG_STR_ARG(voice_assistant_state_to_string(desired_state)));
 }
 
 void VoiceAssistant::failed_to_start() {
@@ -400,6 +452,7 @@ void VoiceAssistant::request_stop() {
       break;
     case State::AWAITING_RESPONSE:
     case State::STREAMING_RESPONSE:
+    case State::RESPONSE_FINISHED:
       break;  // Let the incoming audio stream finish then it will go to idle.
   }
 }
@@ -531,6 +584,14 @@ void VoiceAssistant::on_event(const api::VoiceAssistantEventResponse &msg) {
       this->error_trigger_->trigger(code, message);
       break;
     }
+    case api::enums::VOICE_ASSISTANT_TTS_STREAM_START: {
+      this->wait_for_stream_end_ = true;
+      break;
+    }
+    case api::enums::VOICE_ASSISTANT_TTS_STREAM_END: {
+      this->set_state_(State::RESPONSE_FINISHED, State::IDLE);
+      break;
+    }
     default:
       ESP_LOGD(TAG, "Unhandled event type: %d", msg.event_type);
       break;
diff --git a/esphome/components/voice_assistant/voice_assistant.h b/esphome/components/voice_assistant/voice_assistant.h
index ce22538a85..cd448293db 100644
--- a/esphome/components/voice_assistant/voice_assistant.h
+++ b/esphome/components/voice_assistant/voice_assistant.h
@@ -46,6 +46,7 @@ enum class State {
   STOPPING_MICROPHONE,
   AWAITING_RESPONSE,
   STREAMING_RESPONSE,
+  RESPONSE_FINISHED,
 };
 
 class VoiceAssistant : public Component {
@@ -132,10 +133,10 @@ class VoiceAssistant : public Component {
   uint8_t *speaker_buffer_;
   size_t speaker_buffer_index_{0};
   size_t speaker_buffer_size_{0};
 #endif
+  bool wait_for_stream_end_{false};  // Unconditional: also written by code outside USE_SPEAKER guards.
 #ifdef USE_MEDIA_PLAYER
   media_player::MediaPlayer *media_player_{nullptr};
-  bool playing_tts_{false};
 #endif
 
   bool local_output_{false};