From b64e51fe511a7a51c19e411753abfe9deeca6169 Mon Sep 17 00:00:00 2001 From: Daniel Date: Wed, 19 Oct 2016 16:07:49 -0400 Subject: [PATCH] Adds reconnect handling for SMTP 421 events. This commit allows the mailer to treat a SMTP 421 err as an event that should produce a reconnect attempt. Issue #2249 describes a case where we see this SMTP error code from the remote server when our connection has been idle for too long. We now reconnect when this happens rather than failing ungracefully. The logic in the `mail-test-srv` used to force a number of initial connections to be disconnected is changed such that half of these forced disconnects are the normal clean connection close and half are a SMTP 421. This allows the existing integration test for server disconnects to be reused to test the 421 reconnect logic. --- mail/mailer.go | 32 +++++++++++++++++++++++++++----- test/mail-test-srv/main.go | 24 +++++++++++++++++++----- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/mail/mailer.go b/mail/mailer.go index 1c9a85359..ec7680cfd 100644 --- a/mail/mailer.go +++ b/mail/mailer.go @@ -13,6 +13,7 @@ import ( "net" "net/mail" "net/smtp" + "net/textproto" "strconv" "strings" "time" @@ -151,7 +152,7 @@ func (m *MailerImpl) generateMessage(to []string, subject, body string) ([]byte, addrs := []string{} for _, a := range to { if !core.IsASCII(a) { - return nil, fmt.Errorf("Non-ASCII email address") + return nil, fmt.Errorf("Non-ASCIR email address") } addrs = append(addrs, strconv.Quote(a)) } @@ -292,10 +293,31 @@ func (m *MailerImpl) SendMail(to []string, subject, msg string) error { m.stats.Inc("SendMail.Reconnects", 1) continue } else if err != nil { - // If it wasn't an EOF error it is unexpected and we return from - // SendMail() with an error - m.stats.Inc("SendMail.Errors", 1) - return err + /* + * If the error is an instace of `textproto.Error` with a SMTP error code, + * and that error code is 421 then treat this as a reconnect-able event. + * + * The SMTP RFC defines this error code as: + * 421 Service not available, closing transmission channel + * (This may be a reply to any command if the service knows it + * must shut down) + * + * In practice we see this code being used by our production SMTP server + * when the connection has gone idle for too long. For more information + * see issue #2249[0]. + * + * [0] - https://github.com/letsencrypt/boulder/issues/2249 + */ + if protoErr, ok := err.(*textproto.Error); ok && protoErr.Code == 421 { + m.stats.Inc("SendMail.Errors.SMTP.421", 1) + m.reconnect() + m.stats.Inc("SendMail.Reconnects", 1) + } else { + // If it wasn't an EOF error or a SMTP 421 it is unexpected and we + // return from SendMail() with an error + m.stats.Inc("SendMail.Errors", 1) + return err + } } } diff --git a/test/mail-test-srv/main.go b/test/mail-test-srv/main.go index 3044889f3..a72c6723d 100644 --- a/test/mail-test-srv/main.go +++ b/test/mail-test-srv/main.go @@ -101,11 +101,25 @@ func (srv *mailSrv) handleConn(conn net.Conn) { case "MAIL": srv.connNumberMutex.RLock() if srv.connNumber <= srv.closeFirst { - log.Printf( - "mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n", - srv.connNumber, srv.closeFirst) - clearState() - conn.Close() + // Half of the time, close cleanly to simulate the server side closing + // unexpectedly. + if srv.connNumber%2 == 0 { + log.Printf( + "mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n", + srv.connNumber, srv.closeFirst) + clearState() + conn.Close() + } else { + // The rest of the time, simulate a stale connection timeout by sending + // a SMTP 421 message. This replicates the timeout/close from issue + // 2249 - https://github.com/letsencrypt/boulder/issues/2249 + log.Printf( + "mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting with 421. Bye!\n", + srv.connNumber, srv.closeFirst) + clearState() + conn.Write([]byte("421 1.2.3 foo.bar.baz Error: timeout exceeded \r\n")) + conn.Close() + } } srv.connNumberMutex.RUnlock() clearState()