Adds reconnect handling for SMTP 421 events.

This commit allows the mailer to treat a SMTP 421 err as an event that
should produce a reconnect attempt. Issue #2249 describes a case where
we see this SMTP error code from the remote server when our connection
has been idle for too long. We now reconnect when this happens rather
than failing ungracefully.

The logic in the `mail-test-srv` used to force a number of initial
connections to be disconnected is changed such that half of these forced
disconnects are the normal clean connection close and half are a SMTP
421. This allows the existing integration test for server disconnects to
be reused to test the 421 reconnect logic.
This commit is contained in:
Daniel 2016-10-19 16:07:49 -04:00
parent abfdce6e0a
commit b64e51fe51
No known key found for this signature in database
GPG Key ID: 08FB2BFC470E75B4
2 changed files with 46 additions and 10 deletions

View File

@ -13,6 +13,7 @@ import (
"net"
"net/mail"
"net/smtp"
"net/textproto"
"strconv"
"strings"
"time"
@ -151,7 +152,7 @@ func (m *MailerImpl) generateMessage(to []string, subject, body string) ([]byte,
addrs := []string{}
for _, a := range to {
if !core.IsASCII(a) {
return nil, fmt.Errorf("Non-ASCII email address")
return nil, fmt.Errorf("Non-ASCIR email address")
}
addrs = append(addrs, strconv.Quote(a))
}
@ -292,10 +293,31 @@ func (m *MailerImpl) SendMail(to []string, subject, msg string) error {
m.stats.Inc("SendMail.Reconnects", 1)
continue
} else if err != nil {
// If it wasn't an EOF error it is unexpected and we return from
// SendMail() with an error
m.stats.Inc("SendMail.Errors", 1)
return err
/*
* If the error is an instace of `textproto.Error` with a SMTP error code,
* and that error code is 421 then treat this as a reconnect-able event.
*
* The SMTP RFC defines this error code as:
* 421 <domain> Service not available, closing transmission channel
* (This may be a reply to any command if the service knows it
* must shut down)
*
* In practice we see this code being used by our production SMTP server
* when the connection has gone idle for too long. For more information
* see issue #2249[0].
*
* [0] - https://github.com/letsencrypt/boulder/issues/2249
*/
if protoErr, ok := err.(*textproto.Error); ok && protoErr.Code == 421 {
m.stats.Inc("SendMail.Errors.SMTP.421", 1)
m.reconnect()
m.stats.Inc("SendMail.Reconnects", 1)
} else {
// If it wasn't an EOF error or a SMTP 421 it is unexpected and we
// return from SendMail() with an error
m.stats.Inc("SendMail.Errors", 1)
return err
}
}
}

View File

@ -101,11 +101,25 @@ func (srv *mailSrv) handleConn(conn net.Conn) {
case "MAIL":
srv.connNumberMutex.RLock()
if srv.connNumber <= srv.closeFirst {
log.Printf(
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n",
srv.connNumber, srv.closeFirst)
clearState()
conn.Close()
// Half of the time, close cleanly to simulate the server side closing
// unexpectedly.
if srv.connNumber%2 == 0 {
log.Printf(
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting client. Bye!\n",
srv.connNumber, srv.closeFirst)
clearState()
conn.Close()
} else {
// The rest of the time, simulate a stale connection timeout by sending
// a SMTP 421 message. This replicates the timeout/close from issue
// 2249 - https://github.com/letsencrypt/boulder/issues/2249
log.Printf(
"mail-test-srv: connection # %d < -closeFirst parameter %d, disconnecting with 421. Bye!\n",
srv.connNumber, srv.closeFirst)
clearState()
conn.Write([]byte("421 1.2.3 foo.bar.baz Error: timeout exceeded \r\n"))
conn.Close()
}
}
srv.connNumberMutex.RUnlock()
clearState()