diff --git a/esphome/components/rtttl/__init__.py b/esphome/components/rtttl/__init__.py
index e9453896ac..6163129529 100644
--- a/esphome/components/rtttl/__init__.py
+++ b/esphome/components/rtttl/__init__.py
@@ -4,7 +4,15 @@ import esphome.config_validation as cv
 import esphome.final_validate as fv
 from esphome import automation
 from esphome.components.output import FloatOutput
-from esphome.const import CONF_ID, CONF_OUTPUT, CONF_PLATFORM, CONF_TRIGGER_ID
+from esphome.components.speaker import Speaker
+
+from esphome.const import (
+    CONF_ID,
+    CONF_OUTPUT,
+    CONF_PLATFORM,
+    CONF_TRIGGER_ID,
+    CONF_SPEAKER,
+)
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -24,17 +32,23 @@ IsPlayingCondition = rtttl_ns.class_("IsPlayingCondition", automation.Condition)
 
 MULTI_CONF = True
 
-CONFIG_SCHEMA = cv.Schema(
-    {
-        cv.GenerateID(CONF_ID): cv.declare_id(Rtttl),
-        cv.Required(CONF_OUTPUT): cv.use_id(FloatOutput),
-        cv.Optional(CONF_ON_FINISHED_PLAYBACK): automation.validate_automation(
-            {
-                cv.GenerateID(CONF_TRIGGER_ID): cv.declare_id(FinishedPlaybackTrigger),
-            }
-        ),
-    }
-).extend(cv.COMPONENT_SCHEMA)
+CONFIG_SCHEMA = cv.All(  # key schema first, then the cross-key check at the end
+    cv.Schema(
+        {
+            cv.GenerateID(CONF_ID): cv.declare_id(Rtttl),
+            cv.Optional(CONF_OUTPUT): cv.use_id(FloatOutput),  # optional on its own
+            cv.Optional(CONF_SPEAKER): cv.use_id(Speaker),  # optional on its own
+            cv.Optional(CONF_ON_FINISHED_PLAYBACK): automation.validate_automation(
+                {
+                    cv.GenerateID(CONF_TRIGGER_ID): cv.declare_id(
+                        FinishedPlaybackTrigger
+                    ),
+                }
+            ),
+        }
+    ).extend(cv.COMPONENT_SCHEMA),
+    cv.has_exactly_one_key(CONF_OUTPUT, CONF_SPEAKER),  # output XOR speaker
+)
 
 
 def validate_parent_output_config(value):
@@ -63,9 +77,9 @@ def validate_parent_output_config(value):
 
 FINAL_VALIDATE_SCHEMA = cv.Schema(
     {
-        cv.Required(CONF_OUTPUT): fv.id_declaration_match_schema(
+        cv.Optional(CONF_OUTPUT): fv.id_declaration_match_schema(
             validate_parent_output_config
-        )
+        ),
     },
     extra=cv.ALLOW_EXTRA,
 )
@@ -75,8 +89,14 @@ async def to_code(config):
     var = cg.new_Pvariable(config[CONF_ID])
     await cg.register_component(var, config)
 
-    out = await cg.get_variable(config[CONF_OUTPUT])
-    cg.add(var.set_output(out))
+    if CONF_OUTPUT in config:
+        out = await cg.get_variable(config[CONF_OUTPUT])
+        cg.add(var.set_output(out))
+        cg.add_define("USE_OUTPUT")  # rtttl.cpp/.h guard the output code with this
+
+    if CONF_SPEAKER in config:  # NOTE(review): USE_SPEAKER is not defined here; confirm
+        out = await cg.get_variable(config[CONF_SPEAKER])  # the speaker component sets it
+        cg.add(var.set_speaker(out))
 
     for conf in config.get(CONF_ON_FINISHED_PLAYBACK, []):
         trigger = cg.new_Pvariable(conf[CONF_TRIGGER_ID], var)
diff --git a/esphome/components/rtttl/rtttl.cpp b/esphome/components/rtttl/rtttl.cpp
index 6274e69ba3..199a373785 100644
--- a/esphome/components/rtttl/rtttl.cpp
+++ b/esphome/components/rtttl/rtttl.cpp
@@ -1,4 +1,5 @@
 #include "rtttl.h"
+#include <cmath>
 #include "esphome/core/hal.h"
 #include "esphome/core/log.h"
 
@@ -15,104 +16,185 @@ static const uint16_t NOTES[] = {0,    262,  277,  294,  311,  330,  349,  370,
                                  1109, 1175, 1245, 1319, 1397, 1480, 1568, 1661, 1760, 1865, 1976, 2093, 2217,
                                  2349, 2489, 2637, 2794, 2960, 3136, 3322, 3520, 3729, 3951};
 
+static const uint16_t I2S_SPEED = 1600;  // divisor for sample_rate_ * ms; NOTE(review): ms->s would be 1000, confirm
+
+#undef HALF_PI
+static const double HALF_PI = 1.5707963267948966192313216916398;  // NOTE(review): unused in the visible hunks
+
+inline double deg2rad(double degrees) {  // convert an angle from degrees to radians
+  static const double RAD_PER_DEG = atan(1.0) * 4.0 / 180.0;  // pi / 180
+  return RAD_PER_DEG * degrees;
+}
+
 void Rtttl::dump_config() { ESP_LOGCONFIG(TAG, "Rtttl"); }
 
 void Rtttl::play(std::string rtttl) {
-  rtttl_ = std::move(rtttl);
+  this->rtttl_ = std::move(rtttl);
+
+  this->default_duration_ = 4;
+  this->default_octave_ = 6;
+  this->note_duration_ = 0;  // stay idle until the header parses; the error returns below leave it at 0
 
-  default_duration_ = 4;
-  default_octave_ = 6;
   int bpm = 63;
   uint8_t num;
 
   // Get name
-  position_ = rtttl_.find(':');
+  this->position_ = this->rtttl_.find(':');
 
   // it's somewhat documented to be up to 10 characters but let's be a bit flexible here
-  if (position_ == std::string::npos || position_ > 15) {
+  if (this->position_ == std::string::npos || this->position_ > 15) {
     ESP_LOGE(TAG, "Missing ':' when looking for name.");
     return;
   }
 
-  auto name = this->rtttl_.substr(0, position_);
+  auto name = this->rtttl_.substr(0, this->position_);
   ESP_LOGD(TAG, "Playing song %s", name.c_str());
 
   // get default duration
-  position_ = this->rtttl_.find("d=", position_);
-  if (position_ == std::string::npos) {
+  this->position_ = this->rtttl_.find("d=", this->position_);
+  if (this->position_ == std::string::npos) {
     ESP_LOGE(TAG, "Missing 'd='");
     return;
   }
-  position_ += 2;
+  this->position_ += 2;
   num = this->get_integer_();
   if (num > 0)
-    default_duration_ = num;
+    this->default_duration_ = num;
 
   // get default octave
-  position_ = rtttl_.find("o=", position_);
-  if (position_ == std::string::npos) {
+  this->position_ = this->rtttl_.find("o=", this->position_);
+  if (this->position_ == std::string::npos) {
     ESP_LOGE(TAG, "Missing 'o=");
     return;
   }
-  position_ += 2;
+  this->position_ += 2;
   num = get_integer_();
   if (num >= 3 && num <= 7)
-    default_octave_ = num;
+    this->default_octave_ = num;
 
   // get BPM
-  position_ = rtttl_.find("b=", position_);
-  if (position_ == std::string::npos) {
+  this->position_ = this->rtttl_.find("b=", this->position_);
+  if (this->position_ == std::string::npos) {
     ESP_LOGE(TAG, "Missing b=");
     return;
   }
-  position_ += 2;
+  this->position_ += 2;  // NOTE(review): BPM is parsed via uint8_t get_integer_(); values above 255 wrap
   num = get_integer_();
   if (num != 0)
     bpm = num;
 
-  position_ = rtttl_.find(':', position_);
-  if (position_ == std::string::npos) {
+  this->position_ = this->rtttl_.find(':', this->position_);
+  if (this->position_ == std::string::npos) {
     ESP_LOGE(TAG, "Missing second ':'");
     return;
   }
-  position_++;
+  this->position_++;
 
   // BPM usually expresses the number of quarter notes per minute
-  wholenote_ = 60 * 1000L * 4 / bpm;  // this is the time for whole note (in milliseconds)
+  this->wholenote_ = 60 * 1000L * 4 / bpm;  // this is the time for whole note (in milliseconds)
 
-  output_freq_ = 0;
-  last_note_ = millis();
-  note_duration_ = 1;
+  this->output_freq_ = 0;
+  this->last_note_ = millis();
+  this->note_duration_ = 1;  // non-zero so loop() stops returning early and parses the first note
+
+#ifdef USE_SPEAKER
+  this->samples_sent_ = 0;  // equal counters: no samples are pending for the new tune
+  this->samples_count_ = 0;
+#endif
+}
+
+// Halt playback: flag the player idle, then silence whichever sink is attached.
+void Rtttl::stop() {
+  this->note_duration_ = 0;  // zero is the idle marker tested by loop() and is_playing()
+#ifdef USE_OUTPUT
+  if (this->output_ != nullptr)
+    this->output_->set_level(0.0);
+#endif
+#ifdef USE_SPEAKER
+  // Short-circuit evaluation keeps is_running() off a null speaker.
+  const bool speaker_active = this->speaker_ != nullptr && this->speaker_->is_running();
+  if (speaker_active) {
+    this->speaker_->stop();
+  }
+#endif
+}
 
 void Rtttl::loop() {
-  if (note_duration_ == 0 || millis() - last_note_ < note_duration_)
+  if (this->note_duration_ == 0)
     return;
 
-  if (!rtttl_[position_]) {
-    output_->set_level(0.0);
+#ifdef USE_SPEAKER
+  if (this->speaker_ != nullptr) {
+    if (this->samples_sent_ != this->samples_count_) {
+      SpeakerSample sample[SAMPLE_BUFFER_SIZE + 1];  // +1: sample[x] is filled before the exit test below
+      int x = 0;  // samples prepared in this pass
+      double rem = 0.0;
+
+      while (true) {
+        // Try to send out the remainder of the current note, at most one buffer per loop() call
+
+        if (this->samples_per_wave_ != 0 && this->samples_sent_ >= this->samples_gap_) {  // Play note
+          rem = ((this->samples_sent_ << 10) % this->samples_per_wave_) * (360.0 / this->samples_per_wave_);
+          // rem is the position inside the current wave period, expressed in degrees
+          int16_t val = 8192 * sin(deg2rad(rem));
+
+          sample[x].left = val;
+          sample[x].right = val;
+
+        } else {  // rest, or the leading silence gap of a repeated note
+          sample[x].left = 0;
+          sample[x].right = 0;
+        }
+
+        if (x >= SAMPLE_BUFFER_SIZE || this->samples_sent_ >= this->samples_count_) {
+          break;
+        }
+        this->samples_sent_++;
+        x++;
+      }
+      if (x > 0) {
+        int send = this->speaker_->play((uint8_t *) (&sample), x * 4);  // 4 bytes per SpeakerSample
+        if (send != x * 4) {  // NOTE(review): assumes play() returns the byte count accepted; confirm
+          this->samples_sent_ -= (x - (send / 4));  // rewind so the unsent samples are regenerated
+        }
+        return;
+      }
+    }
+  }
+#endif
+#ifdef USE_OUTPUT
+  if (this->output_ != nullptr && millis() - this->last_note_ < this->note_duration_)
+    return;
+#endif
+  if (!this->rtttl_[this->position_]) {
+    this->note_duration_ = 0;  // go idle before the callback so a play() issued from it is not cancelled
+#ifdef USE_OUTPUT
+    if (this->output_ != nullptr) {
+      this->output_->set_level(0.0);
+    }
+#endif
     ESP_LOGD(TAG, "Playback finished");
     this->on_finished_playback_callback_.call();
-    note_duration_ = 0;
     return;
   }
 
   // align to note: most rtttl's out there does not add and space after the ',' separator but just in case...
-  while (rtttl_[position_] == ',' || rtttl_[position_] == ' ')
-    position_++;
+  while (this->rtttl_[this->position_] == ',' || this->rtttl_[this->position_] == ' ')
+    this->position_++;
 
   // first, get note duration, if available
   uint8_t num = this->get_integer_();
 
   if (num) {
-    note_duration_ = wholenote_ / num;
+    this->note_duration_ = this->wholenote_ / num;
   } else {
-    note_duration_ = wholenote_ / default_duration_;  // we will need to check if we are a dotted note after
+    this->note_duration_ =
+        this->wholenote_ / this->default_duration_;  // we will need to check if we are a dotted note after
   }
 
   uint8_t note;
 
-  switch (rtttl_[position_]) {
+  switch (this->rtttl_[this->position_]) {
     case 'c':
       note = 1;
       break;
@@ -138,51 +220,81 @@ void Rtttl::loop() {
     default:
       note = 0;
   }
-  position_++;
+  this->position_++;
 
   // now, get optional '#' sharp
-  if (rtttl_[position_] == '#') {
+  if (this->rtttl_[this->position_] == '#') {
     note++;
-    position_++;
+    this->position_++;
   }
 
   // now, get optional '.' dotted note
-  if (rtttl_[position_] == '.') {
-    note_duration_ += note_duration_ / 2;
-    position_++;
+  if (this->rtttl_[this->position_] == '.') {
+    this->note_duration_ += this->note_duration_ / 2;
+    this->position_++;
   }
 
   // now, get scale
   uint8_t scale = get_integer_();
   if (scale == 0)
-    scale = default_octave_;
+    scale = this->default_octave_;
+  bool need_note_gap = false;
 
   // Now play the note
   if (note) {
     auto note_index = (scale - 4) * 12 + note;
     if (note_index < 0 || note_index >= (int) sizeof(NOTES)) {
       ESP_LOGE(TAG, "Note out of valid range");
+      this->note_duration_ = 0;
       return;
     }
     auto freq = NOTES[note_index];
+    need_note_gap = freq == this->output_freq_;
 
-    if (freq == output_freq_) {
-      // Add small silence gap between same note
-      output_->set_level(0.0);
-      delay(DOUBLE_NOTE_GAP_MS);
-      note_duration_ -= DOUBLE_NOTE_GAP_MS;
-    }
-    output_freq_ = freq;
+    // Remember the frequency; need_note_gap (set above) requests a short silence before a repeated note
+    this->output_freq_ = freq;
 
-    ESP_LOGVV(TAG, "playing note: %d for %dms", note, note_duration_);
-    output_->update_frequency(freq);
-    output_->set_level(0.5);
+    ESP_LOGVV(TAG, "playing note: %d for %dms", note, this->note_duration_);
   } else {
-    ESP_LOGVV(TAG, "waiting: %dms", note_duration_);
-    output_->set_level(0.0);
+    ESP_LOGVV(TAG, "waiting: %dms", this->note_duration_);
+    this->output_freq_ = 0;
   }
 
-  last_note_ = millis();
+#ifdef USE_OUTPUT
+  if (this->output_ != nullptr) {
+    if (need_note_gap) {
+      this->output_->set_level(0.0);
+      delay(DOUBLE_NOTE_GAP_MS);
+      this->note_duration_ -= DOUBLE_NOTE_GAP_MS;
+    }
+    if (this->output_freq_ != 0) {
+      this->output_->update_frequency(this->output_freq_);
+      this->output_->set_level(0.5);
+    } else {
+      this->output_->set_level(0.0);
+    }
+  }
+#endif
+#ifdef USE_SPEAKER
+  if (this->speaker_ != nullptr) {
+    this->samples_sent_ = 0;
+    this->samples_count_ = (this->sample_rate_ * this->note_duration_) / I2S_SPEED;
+    // Samples per wave period as fixed point with 10 fractional bits; 0 means silence (a rest)
+    if (this->output_freq_ != 0) {
+      this->samples_per_wave_ = (this->sample_rate_ << 10) / this->output_freq_;
+    } else {
+      this->samples_per_wave_ = 0;
+    }
+    if (need_note_gap) {
+      this->samples_gap_ = (this->sample_rate_ * DOUBLE_NOTE_GAP_MS) / I2S_SPEED;
+    } else {
+      this->samples_gap_ = 0;
+    }
+  }
+#endif
+
+  this->last_note_ = millis();
 }
+
 }  // namespace rtttl
 }  // namespace esphome
diff --git a/esphome/components/rtttl/rtttl.h b/esphome/components/rtttl/rtttl.h
index ec6fe7f98f..e09b0265be 100644
--- a/esphome/components/rtttl/rtttl.h
+++ b/esphome/components/rtttl/rtttl.h
@@ -1,23 +1,41 @@
 #pragma once
 
-#include "esphome/core/component.h"
 #include "esphome/core/automation.h"
+#include "esphome/core/component.h"
+
+#ifdef USE_OUTPUT
 #include "esphome/components/output/float_output.h"
+#endif
+
+#ifdef USE_SPEAKER
+#include "esphome/components/speaker/speaker.h"
+#endif
 
 namespace esphome {
 namespace rtttl {
 
+#ifdef USE_SPEAKER
+static const size_t SAMPLE_BUFFER_SIZE = 256;  // upper bound on samples generated per loop() pass
+// One left/right sample pair; rtttl.cpp counts each SpeakerSample as 4 bytes when calling play().
+struct SpeakerSample {
+  int16_t left{0};
+  int16_t right{0};
+};
+#endif
+
 class Rtttl : public Component {
  public:
-  void set_output(output::FloatOutput *output) { output_ = output; }
+#ifdef USE_OUTPUT
+  void set_output(output::FloatOutput *output) { this->output_ = output; }
+#endif
+#ifdef USE_SPEAKER
+  void set_speaker(speaker::Speaker *speaker) { this->speaker_ = speaker; }
+#endif
   void play(std::string rtttl);
-  void stop() {
-    note_duration_ = 0;
-    output_->set_level(0.0);
-  }
+  void stop();
   void dump_config() override;
 
-  bool is_playing() { return note_duration_ != 0; }
+  bool is_playing() { return this->note_duration_ != 0; }
   void loop() override;
 
   void add_on_finished_playback_callback(std::function<void()> callback) {
@@ -27,14 +45,14 @@ class Rtttl : public Component {
  protected:
   inline uint8_t get_integer_() {
     uint8_t ret = 0;
-    while (isdigit(rtttl_[position_])) {
-      ret = (ret * 10) + (rtttl_[position_++] - '0');
+    while (isdigit(this->rtttl_[this->position_])) {
+      ret = (ret * 10) + (this->rtttl_[this->position_++] - '0');
     }
     return ret;
   }
 
-  std::string rtttl_;
-  size_t position_;
+  std::string rtttl_{""};
+  size_t position_{0};
   uint16_t wholenote_;
   uint16_t default_duration_;
   uint16_t default_octave_;
@@ -42,7 +60,22 @@ class Rtttl : public Component {
   uint16_t note_duration_;
 
   uint32_t output_freq_;
+
+#ifdef USE_OUTPUT
   output::FloatOutput *output_;
+#endif
+
+  void play_output_();
+
+#ifdef USE_SPEAKER
+  speaker::Speaker *speaker_{nullptr};  // stays null unless set_speaker() runs; stop()/loop() test for nullptr
+  void play_speaker_();     // NOTE(review): no definition in the visible rtttl.cpp hunks; confirm or drop
+  int sample_rate_{16000};  // samples per second used to size notes; TODO confirm it matches the speaker
+  int samples_per_wave_{0};  // wave period in samples, 10-bit fixed point; 0 means silence
+  int samples_sent_{0};      // samples of the current note already handed to the speaker
+  int samples_count_{0};     // total samples in the current note
+  int samples_gap_{0};       // leading silent samples when the same note repeats
+#endif
 
   CallbackManager<void()> on_finished_playback_callback_;
 };