From afb304937bb9b2369781f68cb7294a03c23c1b48 Mon Sep 17 00:00:00 2001 From: Jorge Leal Date: Sat, 25 Apr 2026 08:52:42 +0200 Subject: [PATCH 1/4] configure etcd client keepalive and auto-sync to fix endpoint failover When an endpoint host crashes hard (power loss, network partition without RST/FIN) the gRPC connection stays in TCP ESTABLISHED until the kernel times out (~13 min on Linux), so the balancer never marks the endpoint unhealthy and Get/Watch can hang against the dead peer. DialTimeout only guards the initial Dial, not subsequent RPCs over the multiplexed HTTP/2 session. Set sensible defaults on the explicit Config: - DialKeepAliveTime = 10s - DialKeepAliveTimeout = 5s - PermitWithoutStream = true (so keepalives fire even with no active RPC, important for the watch) - AutoSyncInterval = 60s (refresh member list from the cluster) Users supplying an etcd config file are unaffected: those values come from the YAML and the file branch is left untouched. --- src/const.go | 11 +++++++---- src/etcd.go | 8 ++++++-- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/const.go b/src/const.go index 8c7a964..ac56b87 100644 --- a/src/const.go +++ b/src/const.go @@ -25,10 +25,13 @@ var ( ) const ( - defaultEndpointIPv4 = "127.0.0.1:2379" - defaultEndpointIPv6 = "[::1]:2379" - defaultDialTimeout = 2 * time.Second - minimumDialTimeout = 10 * time.Millisecond + defaultEndpointIPv4 = "127.0.0.1:2379" + defaultEndpointIPv6 = "[::1]:2379" + defaultDialTimeout = 2 * time.Second + minimumDialTimeout = 10 * time.Millisecond + defaultDialKeepAliveTime = 10 * time.Second + defaultDialKeepAliveTimeout = 5 * time.Second + defaultAutoSyncInterval = 60 * time.Second ) const ( diff --git a/src/etcd.go b/src/etcd.go index df495ab..bce71eb 100644 --- a/src/etcd.go +++ b/src/etcd.go @@ -46,8 +46,12 @@ func (cli *etcdClient) Setup(args *programArgs) (logMessages []string, err error return } cfg := clientv3.Config{ - DialTimeout: *args.DialTimeout, - Endpoints: 
strings.Split(*args.Endpoints, `|`), + DialTimeout: *args.DialTimeout, + DialKeepAliveTime: defaultDialKeepAliveTime, + DialKeepAliveTimeout: defaultDialKeepAliveTimeout, + PermitWithoutStream: true, + AutoSyncInterval: defaultAutoSyncInterval, + Endpoints: strings.Split(*args.Endpoints, `|`), } logMessages = append(logMessages, fmt.Sprintf("%s: %s", dialTimeoutParam, *args.DialTimeout), From 28c8329d51a7cbe1bf98f4cce599cd264e697a2e Mon Sep 17 00:00:00 2001 From: Jorge Leal Date: Sat, 25 Apr 2026 12:12:22 +0200 Subject: [PATCH 2/4] expose etcd client keepalive and auto-sync as runtime parameters Follow-up to the previous commit: keep the same defaults but make every new field overridable both via a CLI flag (standalone mode) and via the remote-connection-string in pipe mode, matching the pattern already used by -timeout / timeout=. New parameters: -dial-keep-alive-time= (default 10s, 0 disables) -dial-keep-alive-timeout= (default 5s) -auto-sync-interval= (default 60s, 0 disables) -permit-without-stream= (default true) Also lift the inline PermitWithoutStream=true to a defaultPermitWithoutStream constant for consistency with the other defaults. 
--- src/const.go | 17 +++++++++------ src/etcd.go | 12 +++++++---- src/pdns-etcd3.go | 55 +++++++++++++++++++++++++++++++++-------------- 3 files changed, 58 insertions(+), 26 deletions(-) diff --git a/src/const.go b/src/const.go index ac56b87..853b84c 100644 --- a/src/const.go +++ b/src/const.go @@ -32,15 +32,20 @@ const ( defaultDialKeepAliveTime = 10 * time.Second defaultDialKeepAliveTimeout = 5 * time.Second defaultAutoSyncInterval = 60 * time.Second + defaultPermitWithoutStream = true ) const ( - pdnsVersionParam = "pdns-version" - prefixParam = "prefix" - logParamPrefix = "log-" - configFileParam = "config-file" - endpointsParam = "endpoints" - dialTimeoutParam = "timeout" + pdnsVersionParam = "pdns-version" + prefixParam = "prefix" + logParamPrefix = "log-" + configFileParam = "config-file" + endpointsParam = "endpoints" + dialTimeoutParam = "timeout" + dialKeepAliveTimeParam = "dial-keep-alive-time" + dialKeepAliveTimeoutParam = "dial-keep-alive-timeout" + autoSyncIntervalParam = "auto-sync-interval" + permitWithoutStreamParam = "permit-without-stream" ) const ( diff --git a/src/etcd.go b/src/etcd.go index bce71eb..8c50d12 100644 --- a/src/etcd.go +++ b/src/etcd.go @@ -47,14 +47,18 @@ func (cli *etcdClient) Setup(args *programArgs) (logMessages []string, err error } cfg := clientv3.Config{ DialTimeout: *args.DialTimeout, - DialKeepAliveTime: defaultDialKeepAliveTime, - DialKeepAliveTimeout: defaultDialKeepAliveTimeout, - PermitWithoutStream: true, - AutoSyncInterval: defaultAutoSyncInterval, + DialKeepAliveTime: *args.DialKeepAliveTime, + DialKeepAliveTimeout: *args.DialKeepAliveTimeout, + PermitWithoutStream: *args.PermitWithoutStream, + AutoSyncInterval: *args.AutoSyncInterval, Endpoints: strings.Split(*args.Endpoints, `|`), } logMessages = append(logMessages, fmt.Sprintf("%s: %s", dialTimeoutParam, *args.DialTimeout), + fmt.Sprintf("%s: %s", dialKeepAliveTimeParam, *args.DialKeepAliveTime), + fmt.Sprintf("%s: %s", dialKeepAliveTimeoutParam, 
*args.DialKeepAliveTimeout), + fmt.Sprintf("%s: %s", autoSyncIntervalParam, *args.AutoSyncInterval), + fmt.Sprintf("%s: %v", permitWithoutStreamParam, *args.PermitWithoutStream), fmt.Sprintf("%s: %s", endpointsParam, *args.Endpoints), ) cli.Client, err = clientv3.New(cfg) diff --git a/src/pdns-etcd3.go b/src/pdns-etcd3.go index 2e79242..3682dc1 100644 --- a/src/pdns-etcd3.go +++ b/src/pdns-etcd3.go @@ -33,14 +33,21 @@ import ( ) type programArgs struct { - ConfigFile *string - Endpoints *string - DialTimeout *time.Duration - Prefix *string + ConfigFile *string + Endpoints *string + DialTimeout *time.Duration + DialKeepAliveTime *time.Duration + DialKeepAliveTimeout *time.Duration + AutoSyncInterval *time.Duration + PermitWithoutStream *bool + Prefix *string } func (pa programArgs) String() string { - return fmt.Sprintf("ConfigFile=%s, Endpoints=%s, DialTimeout=%s, Prefix=%s", val2str(pa.ConfigFile), val2str(pa.Endpoints), val2str(pa.DialTimeout), val2str(pa.Prefix)) + return fmt.Sprintf("ConfigFile=%s, Endpoints=%s, DialTimeout=%s, DialKeepAliveTime=%s, DialKeepAliveTimeout=%s, AutoSyncInterval=%s, PermitWithoutStream=%s, Prefix=%s", + val2str(pa.ConfigFile), val2str(pa.Endpoints), val2str(pa.DialTimeout), + val2str(pa.DialKeepAliveTime), val2str(pa.DialKeepAliveTimeout), val2str(pa.AutoSyncInterval), + val2str(pa.PermitWithoutStream), val2str(pa.Prefix)) } type statusType struct { @@ -124,6 +131,14 @@ func readParameters(params objectType[string], client *pdnsClient) error { case !standalone && k == dialTimeoutParam: mdt := minimumDialTimeout err = setDurationParameterFunc(args.DialTimeout, &mdt)(v) + case !standalone && k == dialKeepAliveTimeParam: + err = setDurationParameterFunc(args.DialKeepAliveTime, nil)(v) + case !standalone && k == dialKeepAliveTimeoutParam: + err = setDurationParameterFunc(args.DialKeepAliveTimeout, nil)(v) + case !standalone && k == autoSyncIntervalParam: + err = setDurationParameterFunc(args.AutoSyncInterval, nil)(v) + case !standalone 
&& k == permitWithoutStreamParam: + err = setBooleanParameterFunc(args.PermitWithoutStream)(v) case !standalone && k == prefixParam: *args.Prefix = v case k == pdnsVersionParam: @@ -300,13 +315,17 @@ func Main(programVersion VersionType, gitVersion string) { } var ( - standaloneArg = flag.String("standalone", "", `Use a standalone mode determined by the given URL (unix:///path/to/socket[?relative=] or http://:)`) - configFileArg = flag.String(configFileParam, "", "Use the given configuration file for the ETCD connection (overrides -endpoints)") - endpointsArg = flag.String(endpointsParam, defaultEndpointIPv6+"|"+defaultEndpointIPv4, "Use the endpoints configuration for ETCD connection") - dialTimeoutArg = flag.Duration(dialTimeoutParam, defaultDialTimeout, "ETCD dial timeout") - prefixArg = flag.String(prefixParam, "", "Global key prefix") - pdnsVersionArg = flag.String(pdnsVersionParam, "", "default PDNS version") - loggingArgs = func() map[logrus.Level]*string { + standaloneArg = flag.String("standalone", "", `Use a standalone mode determined by the given URL (unix:///path/to/socket[?relative=] or http://:)`) + configFileArg = flag.String(configFileParam, "", "Use the given configuration file for the ETCD connection (overrides -endpoints)") + endpointsArg = flag.String(endpointsParam, defaultEndpointIPv6+"|"+defaultEndpointIPv4, "Use the endpoints configuration for ETCD connection") + dialTimeoutArg = flag.Duration(dialTimeoutParam, defaultDialTimeout, "ETCD dial timeout") + dialKeepAliveTimeArg = flag.Duration(dialKeepAliveTimeParam, defaultDialKeepAliveTime, "ETCD dial keep-alive ping interval (0 to disable)") + dialKeepAliveTimeoutArg = flag.Duration(dialKeepAliveTimeoutParam, defaultDialKeepAliveTimeout, "ETCD dial keep-alive ping timeout") + autoSyncIntervalArg = flag.Duration(autoSyncIntervalParam, defaultAutoSyncInterval, "ETCD member list auto-sync interval (0 to disable)") + permitWithoutStreamArg = flag.Bool(permitWithoutStreamParam, 
defaultPermitWithoutStream, "send ETCD client keep-alive pings even with no active RPC stream") + prefixArg = flag.String(prefixParam, "", "Global key prefix") + pdnsVersionArg = flag.String(pdnsVersionParam, "", "default PDNS version") + loggingArgs = func() map[logrus.Level]*string { args := map[logrus.Level]*string{} for _, level := range logrus.AllLevels { args[level] = flag.String(logParamPrefix+level.String(), "", fmt.Sprintf("Set logging level %s to the given components (separated by +)", level)) @@ -327,10 +346,14 @@ func main(programVersion VersionType, gitVersion string, cmdLineArgs []string, o log.main().Printf("pdns-etcd3 %s, Copyright © 2016-2026 nix ", releaseVersion) // handle arguments // TODO handle more arguments, f.e. 'show-defaults' standalone command args = programArgs{ - ConfigFile: configFileArg, - Endpoints: endpointsArg, - DialTimeout: dialTimeoutArg, - Prefix: prefixArg, + ConfigFile: configFileArg, + Endpoints: endpointsArg, + DialTimeout: dialTimeoutArg, + DialKeepAliveTime: dialKeepAliveTimeArg, + DialKeepAliveTimeout: dialKeepAliveTimeoutArg, + AutoSyncInterval: autoSyncIntervalArg, + PermitWithoutStream: permitWithoutStreamArg, + Prefix: prefixArg, } if err := flag.CommandLine.Parse(cmdLineArgs); err != nil { // same as flag.Parse(), but we can pass the arguments instead of being fixed to os.Args[1:] (needed for integration testing) log.main().Panicf("failed to parse command line arguments: %s", err) From 583b52dadac3cedb4bf6dccb52973368574b019e Mon Sep 17 00:00:00 2001 From: Jorge Leal Date: Sat, 25 Apr 2026 12:33:06 +0200 Subject: [PATCH 3/4] fixed integration test (when running single) The previous commit added new pointer fields to programArgs that are populated through flag.* in main(), so the standalone path is fine. The TestPipeRequests literal builds programArgs by hand and was missing the new fields, leading to a nil pointer dereference in etcdClient.Setup when the test bypasses main(). 
--- src/integration_test.go | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/integration_test.go b/src/integration_test.go index b236638..7f053c1 100644 --- a/src/integration_test.go +++ b/src/integration_test.go @@ -95,12 +95,20 @@ func TestPipeRequests(t *testing.T) { defer closeNoError(outR) // this should be done automatically by pdns-etcd3, but just in case config := "" timeout, _ := time.ParseDuration("5s") + keepAliveTime := defaultDialKeepAliveTime + keepAliveTimeout := defaultDialKeepAliveTimeout + autoSync := defaultAutoSyncInterval + permitWithoutStream := defaultPermitWithoutStream prefix := "" args = programArgs{ - ConfigFile: &config, - Endpoints: &etcd.Endpoint, - DialTimeout: &timeout, - Prefix: &prefix, + ConfigFile: &config, + Endpoints: &etcd.Endpoint, + DialTimeout: &timeout, + DialKeepAliveTime: &keepAliveTime, + DialKeepAliveTimeout: &keepAliveTimeout, + AutoSyncInterval: &autoSync, + PermitWithoutStream: &permitWithoutStream, + Prefix: &prefix, } ctx, cancel := context.WithCancel(context.Background()) defer cancel() From ece480847f5dd91f405fbdb577c8efa26f67f6cd Mon Sep 17 00:00:00 2001 From: Jorge Leal Date: Tue, 28 Apr 2026 12:39:41 +0200 Subject: [PATCH 4/4] documented new etcd client parameters in README --- README.md | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 25108e9..ac0c90d 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ enough to connect to ETCD, read all data, and reply to this first request. 
This Example PowerDNS configuration file: ``` launch=remote -remote-connection-string=pipe:command=/path/to/pdns-etcd3[,pdns-version=3|4|5][,][,prefix=][,timeout=][,log-=] +remote-connection-string=pipe:command=/path/to/pdns-etcd3[,pdns-version=3|4|5][,][,prefix=][,timeout=][,dial-keep-alive-time=][,dial-keep-alive-timeout=][,auto-sync-interval=][,permit-without-stream=][,log-=] # since in pipe mode every instance connects to ETCD and loads the data for itself (uses memory), possibly do this: distributor-threads=1 ``` @@ -199,6 +199,25 @@ are tagged by *#STANDALONE*): `timeout=` *config file* (in milliseconds, e.g. `1500` for 1.5 seconds)
An optional parameter which sets the dial timeout to ETCD. Must be a positive value (>= 1ms).
Defaults to 2 seconds. +* `dial-keep-alive-time=` *#STANDALONE*
+ Time of inactivity on the underlying gRPC (HTTP/2) connection after which the client sends a keep-alive ping to ETCD (non-zero values below 10 seconds are raised to 10 seconds by gRPC). + These pings allow the client to detect a dead endpoint (e.g. a host that vanished without sending a TCP RST/FIN) + and rotate to another endpoint instead of waiting for the kernel TCP timeout (~13–15 minutes). + Set to `0` to disable keep-alive pings.
+ Defaults to 10 seconds. +* `dial-keep-alive-timeout=` *#STANDALONE*
+ Time the client waits for an acknowledgement after sending a keep-alive ping. If no ack arrives within this timeout, + the connection is considered dead and the client reconnects (to another endpoint if available).
+ Defaults to 5 seconds. +* `auto-sync-interval=` *#STANDALONE*
+ Interval at which the client replaces its endpoint list with the client URLs advertised by the current ETCD cluster members. + This makes new endpoints (e.g. cluster members added or rotated in after the client connected) reachable + without restarting the backend; the advertised URLs must therefore be reachable from the backend host. Set to `0` to disable.
+ Defaults to 1 minute. +* `permit-without-stream=` *#STANDALONE*
+ When true, the client sends keep-alive pings even when no RPC stream is active on the connection. + This is needed to detect a dead endpoint while the backend is idle (e.g. between watch events on a low-traffic deployment).
+ Defaults to `true`. * `pdns-version=3|4|5`
The (major) PowerDNS version. Version 3 and 4 have incompatible protocols with the backend, so one must use the proper one. Version 5 is accepted, but works currently the same as 4 (no relevant API changes yet).