|
| 1 | +package simple |
| 2 | + |
| 3 | +import ( |
| 4 | + "fmt" |
| 5 | + "log" |
| 6 | + "net" |
| 7 | + "net/url" |
| 8 | + "sync" |
| 9 | + "time" |
| 10 | +) |
| 11 | + |
// Tunables for drain connection handling.
const (
	// connRefreshInterval bounds how long a dialed connection stays usable: getConnection
	// applies it as the write deadline of every new connection.
	//
	// Connections are reused rather than dialed per message for efficiency, but there are two
	// compelling reasons to redial periodically:
	//
	// 1. DNS changes on the remote end of the drain should not go unnoticed for too long.
	//
	// 2. If the drain is using TCP, the underlying TCP stack can potentially take a very long
	//    time waiting for acks and retrying sends for packets that haven't been acked. This
	//    creates a large window where packets can be lost without any warning before the
	//    problem is detected. Redialing periodically creates the opportunity for a failed TCP
	//    handshake, which reveals sooner that something is wrong.
	//
	// Efficiency argues for a long interval and resiliency for a short one. One minute has been
	// arbitrarily selected as a sensible balance of these two concerns.
	connRefreshInterval = time.Minute

	// maxFailedConns is the number of failed dial attempts required before the drain is muted.
	maxFailedConns = 5

	// dialTimeout is how much time we're willing to spend on a single dial attempt.
	dialTimeout = 10 * time.Second

	// mutePeriod is how long the drain stays muted after repeated connection failures.
	mutePeriod = 5 * time.Minute
)
| 35 | + |
// logDrain forwards log messages to a single remote endpoint over UDP or TCP.
type logDrain struct {
	// mutex is held by Send for the whole of each delivery attempt, which serializes use of
	// conn and the dialing done by getConnection.
	mutex sync.Mutex

	// conn is the cached connection reused across sends. It is nil until the first
	// successful dial and is replaced whenever getConnection redials.
	conn net.Conn

	// muted is set by getConnection after maxFailedConns consecutive failed dial attempts and
	// is cleared again once mutePeriod has elapsed. While it is set, Send drops messages.
	muted bool

	// proto is the network name handed to the dialer; NewDrain sets it to "udp" or "tcp".
	proto string

	// uri is the address handed to the dialer; NewDrain builds it from the host and path
	// components of the drain URL.
	uri string
}
| 43 | + |
| 44 | +// NewDrain returns a pointer to a new instance of a drain.LogDrain |
| 45 | +func NewDrain(drainURL string) (*logDrain, error) { |
| 46 | + u, err := url.Parse(drainURL) |
| 47 | + if err != nil { |
| 48 | + return nil, err |
| 49 | + } |
| 50 | + var proto string |
| 51 | + if u.Scheme == "udp" || u.Scheme == "syslog" { |
| 52 | + proto = "udp" |
| 53 | + } else if u.Scheme == "tcp" { |
| 54 | + proto = "tcp" |
| 55 | + } else { |
| 56 | + return nil, fmt.Errorf("Invalid drain url scheme: %s", u.Scheme) |
| 57 | + } |
| 58 | + return &logDrain{proto: proto, uri: u.Host + u.Path}, nil |
| 59 | +} |
| 60 | + |
| 61 | +// Send forwards the provided log message to an external destination |
| 62 | +func (d *logDrain) Send(message string) error { |
| 63 | + if d.muted { |
| 64 | + return nil |
| 65 | + } |
| 66 | + d.mutex.Lock() |
| 67 | + defer d.mutex.Unlock() |
| 68 | + conn, err := d.getConnection(false) |
| 69 | + if err != nil { |
| 70 | + return err |
| 71 | + } |
| 72 | + _, err = fmt.Fprintln(conn, message) |
| 73 | + if err != nil { |
| 74 | + // Try again with a new connection in case the issue was a broken pipe |
| 75 | + conn, err = d.getConnection(true) |
| 76 | + if err != nil { |
| 77 | + return err |
| 78 | + } |
| 79 | + _, err = fmt.Fprintln(conn, message) |
| 80 | + if err != nil { |
| 81 | + return err |
| 82 | + } |
| 83 | + } |
| 84 | + return nil |
| 85 | +} |
| 86 | + |
| 87 | +// getConnection returns a usable connection, often without needing to redial, but still |
| 88 | +// redialing when advised. |
| 89 | +func (d *logDrain) getConnection(forceNew bool) (net.Conn, error) { |
| 90 | + // If we have a connection, it's not old, and we're not focing a new one... |
| 91 | + if d.conn != nil && !forceNew { |
| 92 | + // then return the existing connection |
| 93 | + return d.conn, nil |
| 94 | + } |
| 95 | + // If ANY of those conditions weren't met, it's time for a new connection. |
| 96 | + // If we have an existing one, close it and nil it out, too for good measure. |
| 97 | + if d.conn != nil { |
| 98 | + if err := d.conn.Close(); err != nil { |
| 99 | + log.Println("drain: Error closing connection. Drain may be leaking connections.", err) |
| 100 | + } |
| 101 | + d.conn = nil |
| 102 | + } |
| 103 | + // Try a few times... |
| 104 | + var err error |
| 105 | + for attempt := 1; attempt <= maxFailedConns; attempt++ { |
| 106 | + d.conn, err = net.DialTimeout(d.proto, d.uri, dialTimeout) |
| 107 | + if err == nil { |
| 108 | + // We got our connection... |
| 109 | + // Make it good for only so long. See comment above on connRefreshInterval. |
| 110 | + err = d.conn.SetWriteDeadline(time.Now().Add(connRefreshInterval)) |
| 111 | + if err != nil { |
| 112 | + return nil, err |
| 113 | + } |
| 114 | + // Break out of the loop and return |
| 115 | + return d.conn, nil |
| 116 | + } |
| 117 | + } |
| 118 | + // Multiple attempts to dial have failed. Whatever the problem is, we shouldn't expect that |
| 119 | + // it will resolve itself quickly. |
| 120 | + log.Printf("drain: Experienced %d consecutive failed connection attempts; muting drain for %s.", maxFailedConns, mutePeriod) |
| 121 | + // Immediately "mute" the drain. This will prevent us from wasting resources repeatedly dialing |
| 122 | + // and failing while the message queue gets backed up. This will give the network a break and |
| 123 | + // allow us to empty the queue. |
| 124 | + d.muted = true |
| 125 | + // Unmute the drain when the mute interval has elapsed |
| 126 | + go func() { |
| 127 | + time.Sleep(mutePeriod) |
| 128 | + d.muted = false |
| 129 | + }() |
| 130 | + // Return the error from the last failed connection attempt |
| 131 | + return nil, err |
| 132 | +} |
0 commit comments