diff --git a/mail/mailer.go b/mail/mailer.go index 1c9a85359..ec7680cfd 100644 --- a/mail/mailer.go +++ b/mail/mailer.go @@ -13,6 +13,7 @@ import ( "net" "net/mail" "net/smtp" + "net/textproto" "strconv" "strings" "time" @@ -151,7 +152,7 @@ func (m *MailerImpl) generateMessage(to []string, subject, body string) ([]byte, addrs := []string{} for _, a := range to { if !core.IsASCII(a) { - return nil, fmt.Errorf("Non-ASCII email address") + return nil, fmt.Errorf("Non-ASCIR email address") } addrs = append(addrs, strconv.Quote(a)) } @@ -292,10 +293,31 @@ func (m *MailerImpl) SendMail(to []string, subject, msg string) error { m.stats.Inc("SendMail.Reconnects", 1) continue } else if err != nil { - // If it wasn't an EOF error it is unexpected and we return from - // SendMail() with an error - m.stats.Inc("SendMail.Errors", 1) - return err + /* + * If the error is an instace of `textproto.Error` with a SMTP error code, + * and that error code is 421 then treat this as a reconnect-able event. + * + * The SMTP RFC defines this error code as: + * 421 Service not available, closing transmission channel + * (This may be a reply to any command if the service knows it + * must shut down) + * + * In practice we see this code being used by our production SMTP server + * when the connection has gone idle for too long. For more information + * see issue #2249[0]. + * + * [0] - https://github.com/letsencrypt/boulder/issues/2249 + */ + if protoErr, ok := err.(*textproto.Error); ok && protoErr.Code == 421 { + m.stats.Inc("SendMail.Errors.SMTP.421", 1) + m.reconnect() + m.stats.Inc("SendMail.Reconnects", 1) + } else { + // If it wasn't an EOF error or a SMTP 421 it is unexpected and we + // return from SendMail() with an error + m.stats.Inc("SendMail.Errors", 1) + return err + } } } diff --git a/test/mail-test-srv/main.go b/test/mail-test-srv/main.go index 3044889f3..a72c6723d 100644 --- a/test/mail-test-srv/main.go +++ b/test/mail-test-srv/main.go @@ -101,11 +101,25 @@ func (srv *mailSrv) handleConn(conn net.Conn) { case "MAIL": srv.connNumberMutex.RLock() if srv.connNumber <= srv.closeFirst { - log.Printf( - "mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n", - srv.connNumber, srv.closeFirst) - clearState() - conn.Close() + // Half of the time, close cleanly to simulate the server side closing + // unexpectedly. + if srv.connNumber%2 == 0 { + log.Printf( + "mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n", + srv.connNumber, srv.closeFirst) + clearState() + conn.Close() + } else { + // The rest of the time, simulate a stale connection timeout by sending + // a SMTP 421 message. This replicates the timeout/close from issue + // 2249 - https://github.com/letsencrypt/boulder/issues/2249 + log.Printf( + "mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting with 421. Bye!\n", + srv.connNumber, srv.closeFirst) + clearState() + conn.Write([]byte("421 1.2.3 foo.bar.baz Error: timeout exceeded \r\n")) + conn.Close() + } } srv.connNumberMutex.RUnlock() clearState()