1
0
mirror of https://github.com/fumiama/terasu-cloudflared.git synced 2026-06-05 00:50:24 +08:00

CUSTESC-53681: Correct QUIC connection management for datagram handlers

Corrects the pattern of using errgroups and context cancellation to simplify the logic for canceling extra routines for the QUIC connection. The extra context cancellation is redundant because the errgroup also cancels its own provided context when a routine returns (error or not).

For the datagram handler specifically, since it can respond faster to a context cancellation from the QUIC connection, we wrap the error before surfacing it outside of the QUIC connection scope to the supervisor. Additionally, the supervisor will look for this error type to check if it should retry the QUIC connection. These two operations are required because the supervisor does not look for a context canceled error when deciding to retry a connection. If a context canceled error from the datagram handler were to be returned up to the supervisor on the initial connection, the cloudflared application would exit. We want to ensure that cloudflared maintains connection attempts even if any of the services on top of a QUIC connection fail (the datagram handler in this case).

Additional logging is also introduced along these paths to help with understanding the error conditions from the specific handlers on top of a QUIC connection.

Related CUSTESC-53681

Closes TUN-9610
This commit is contained in:
Devin Carr
2025-08-19 16:10:00 -07:00
parent 8825ceecb5
commit 41dffd7f3c
9 changed files with 70 additions and 63 deletions

View File

@@ -132,6 +132,7 @@ func (s *Supervisor) Run(
if err == errEarlyShutdown {
return nil
}
s.log.Logger().Error().Err(err).Msg("initial tunnel connection failed")
return err
}
var tunnelsWaiting []int
@@ -154,6 +155,7 @@ func (s *Supervisor) Run(
// (note that this may also be caused by context cancellation)
case tunnelError := <-s.tunnelErrors:
tunnelsActive--
s.log.ConnAwareLogger().Err(tunnelError.err).Int(connection.LogFieldConnIndex, tunnelError.index).Msg("Connection terminated")
if tunnelError.err != nil && !shuttingDown {
switch tunnelError.err.(type) {
case ReconnectSignal:
@@ -166,7 +168,6 @@ func (s *Supervisor) Run(
if _, retry := s.tunnelsProtocolFallback[tunnelError.index].GetMaxBackoffDuration(ctx); !retry {
continue
}
s.log.ConnAwareLogger().Err(tunnelError.err).Int(connection.LogFieldConnIndex, tunnelError.index).Msg("Connection terminated")
tunnelsWaiting = append(tunnelsWaiting, tunnelError.index)
s.waitForNextTunnel(tunnelError.index)
@@ -285,7 +286,10 @@ func (s *Supervisor) startFirstTunnel(
*quic.IdleTimeoutError,
*quic.ApplicationError,
edgediscovery.DialError,
*connection.EdgeQuicDialError:
*connection.EdgeQuicDialError,
*connection.ControlStreamError,
*connection.StreamListenerError,
*connection.DatagramManagerError:
// Try again for these types of errors
default:
// Uncaught errors should bail startup
@@ -301,13 +305,9 @@ func (s *Supervisor) startTunnel(
index int,
connectedSignal *signal.Signal,
) {
var err error
defer func() {
s.tunnelErrors <- tunnelError{index: index, err: err}
}()
// nolint: gosec
err = s.edgeTunnelServer.Serve(ctx, uint8(index), s.tunnelsProtocolFallback[index], connectedSignal)
err := s.edgeTunnelServer.Serve(ctx, uint8(index), s.tunnelsProtocolFallback[index], connectedSignal)
s.tunnelErrors <- tunnelError{index: index, err: err}
}
func (s *Supervisor) newConnectedTunnelSignal(index int) *signal.Signal {