I implemented a Raft cluster in Go using the module "github.com/hashicorp/raft" and ran into a problem in the following scenario:
There are currently two Raft clusters. Their names, nodes and addresses are listed below (each cluster is initialized through the BootstrapCluster method). Note that the address of node3 appears in both server lists:
Cluster1 BootstrapCluster servers:
- node1: {raft.ServerID: c1-node1, raft.ServerAddress: 192.168.100.1:7000}
- node2: {raft.ServerID: c1-node2, raft.ServerAddress: 192.168.100.2:7000}
- node3: {raft.ServerID: c1-node3, raft.ServerAddress: 192.168.100.3:7000}
Cluster2 BootstrapCluster servers:
- node3: {raft.ServerID: c2-node3, raft.ServerAddress: 192.168.100.3:7000}
- node4: {raft.ServerID: c2-node4, raft.ServerAddress: 192.168.100.4:7000}
- node5: {raft.ServerID: c2-node5, raft.ServerAddress: 192.168.100.5:7000}
Among them, "node1" and "node2" are started according to "Cluster1":
sudo ./raft_svr -cluster 'c1-node1,127.0.0.1,800;c1-node2,127.0.0.2,800;c1-node3,127.0.0.3,800' -id c1-node1
sudo ./raft_svr -cluster 'c1-node1,127.0.0.1,800;c1-node2,127.0.0.2,800;c1-node3,127.0.0.3,800' -id c1-node2
"node3","node4","node5" first start according to "Cluster2":
sudo ./raft_svr -cluster 'c2-node3,127.0.0.3,800;c2-node4,127.0.0.4,800;c2-node5,127.0.0.5,800' -id c2-node3
sudo ./raft_svr -cluster 'c2-node3,127.0.0.3,800;c2-node4,127.0.0.4,800;c2-node5,127.0.0.5,800' -id c2-node4
sudo ./raft_svr -cluster 'c2-node3,127.0.0.3,800;c2-node4,127.0.0.4,800;c2-node5,127.0.0.5,800' -id c2-node5
Then you will find that "node3" will switch back and forth between "Cluster1" and "Cluster2", sometimes belonging to "Cluster1" and sometimes belonging to "Cluster2".
INFO[0170] current state:Follower, servers:[{Suffrage:Voter ID:c2-node3 Address:127.0.0.3:800} {Suffrage:Voter ID:c2-node4 Address:127.0.0.4:800} {Suffrage:Voter ID:c2-node5 Address:127.0.0.5:800}], leader address:127.0.0.5:800, last contact:2025-05-14 15:35:53.330867 +0800 CST m=+169.779019126
INFO[0171] current state:Follower, servers:[{Suffrage:Voter ID:c2-node3 Address:127.0.0.3:800} {Suffrage:Voter ID:c2-node4 Address:127.0.0.4:800} {Suffrage:Voter ID:c2-node5 Address:127.0.0.5:800}], leader address:127.0.0.1:800, last contact:2025-05-14 15:35:54.308388 +0800 CST m=+170.756576126
Is this situation expected?
here is my code:
package main
import (
"flag"
"fmt"
"io"
"net"
"os"
"strconv"
"strings"
"time"
"github.com/hashicorp/raft" // github.com/hashicorp/raft v1.7.3
log "github.com/sirupsen/logrus"
)
// raftCluster groups the identity of the local node, the static member list
// and the raft instance created for it.
type raftCluster struct {
	// localRaftID is this node's ID; Start uses it as the raft LocalID and as
	// the key for looking up the local address in servers.
	localRaftID raft.ServerID
	// servers maps each member's raft ID to its "ip:port" address.
	servers map[raft.ServerID]raft.ServerAddress
	// electionTimeout is computed in Start as the configured election timeout
	// multiplied by twice the number of servers.
	electionTimeout time.Duration
	// raft is the instance assigned by Start.
	raft *raft.Raft
}
// Start creates the local raft node and bootstraps the cluster from r.servers.
//
// It builds a raft config with fixed timeouts, opens a TCP transport on the
// address registered for r.localRaftID, wires in-memory log, stable and
// snapshot stores together with a fresh SimpleFsm, and finally calls
// BootstrapCluster with one server entry per element of r.servers.
//
// An error is returned when r.localRaftID has no entry in r.servers, when the
// local address cannot be resolved, or when creating the transport, creating
// raft or bootstrapping fails. Underlying errors are wrapped with %w so
// callers can inspect them with errors.Is / errors.As.
func (r *raftCluster) Start() error {
	// Fail early with a clear message instead of letting a missing map entry
	// turn into an empty bind address further down.
	localAddr, ok := r.servers[r.localRaftID]
	if !ok {
		return fmt.Errorf("local raft id %q not found in cluster servers", r.localRaftID)
	}
	localAddressPort := string(localAddr)

	config := raft.DefaultConfig()
	config.HeartbeatTimeout = 2000 * time.Millisecond
	config.ElectionTimeout = 5000 * time.Millisecond
	config.CommitTimeout = 2000 * time.Millisecond
	config.LeaderLeaseTimeout = 2000 * time.Millisecond
	config.LocalID = r.localRaftID
	config.LogOutput = log.StandardLogger().Out
	r.electionTimeout = config.ElectionTimeout * time.Duration(len(r.servers)*2)

	tcpAddr, err := net.ResolveTCPAddr("tcp", localAddressPort)
	if err != nil {
		return fmt.Errorf("resolve tcp address %s, %w", localAddressPort, err)
	}
	transport, err := raft.NewTCPTransport(localAddressPort, tcpAddr, 2, 10*time.Second, log.StandardLogger().Out)
	if err != nil {
		return fmt.Errorf("fail to create tcp transport, localAddressPort:%s, tcpAddr:%v, %w",
			localAddressPort, tcpAddr, err)
	}

	// Everything is kept in memory: no state survives a process restart.
	snapshots := raft.NewInmemSnapshotStore()
	logStore := raft.NewInmemStore()
	stableStore := raft.NewInmemStore()
	fm := NewFsm()

	r.raft, err = raft.NewRaft(config, fm, logStore, stableStore, snapshots, transport)
	if err != nil {
		// Release the listener opened by the transport. The creation error is
		// the one worth reporting, so a Close failure is deliberately dropped.
		_ = transport.Close()
		return fmt.Errorf("create raft error, %w", err)
	}

	configuration := raft.Configuration{
		Servers: make([]raft.Server, 0, len(r.servers)),
	}
	for sID, addr := range r.servers {
		configuration.Servers = append(configuration.Servers, raft.Server{
			ID:      sID,
			Address: addr,
		})
	}

	if err := r.raft.BootstrapCluster(configuration).Error(); err != nil {
		// Do not leave a half-initialised node running behind a failed Start.
		// The bootstrap error is reported; a shutdown error is dropped.
		_ = r.raft.Shutdown().Error()
		return fmt.Errorf("raft bootstrap failed, conf:%v, %w", configuration, err)
	}
	log.Infof("bootstrap cluster as config: %v", configuration)
	return nil
}
// checkLeaderState blocks forever and reports the node's view of the cluster.
//
// It logs every value received from the raft leadership channel and, once per
// second, calls VerifyLeader and logs either that this node is the leader or
// the current state, server list, leader address and last contact time.
// Errors from VerifyLeader other than raft.ErrNotLeader, and errors from
// fetching the configuration, are logged as warnings instead of being
// silently ignored.
func (r *raftCluster) checkLeaderState() {
	ticker := time.NewTicker(time.Second)
	defer ticker.Stop()

	for {
		select {
		case leader := <-r.raft.LeaderCh():
			log.Infof("im leader:%v, state:%s, leader address:%s", leader, r.raft.State(), r.raft.Leader())
		case <-ticker.C:
			verifyErr := r.raft.VerifyLeader().Error()

			// A configuration future must be resolved through Error before
			// Configuration may be read.
			cfgFuture := r.raft.GetConfiguration()
			if err := cfgFuture.Error(); err != nil {
				log.Warnf("get raft configuration failed, %v", err)
				continue
			}
			servers := cfgFuture.Configuration().Servers

			switch verifyErr {
			case nil:
				log.Infof("im leader, servers:%v", servers)
			case raft.ErrNotLeader:
				// check cluster leader
				log.Infof("current state:%v, servers:%+v, leader address:%v, last contact:%v",
					r.raft.State(), servers, r.raft.Leader(), r.raft.LastContact())
			default:
				// Any other VerifyLeader outcome used to vanish without a trace.
				log.Warnf("verify leader failed, state:%v, %v", r.raft.State(), verifyErr)
			}
		}
	}
}
// main parses the -cluster and -id flags, builds the raft server map, starts
// the local raft node and then blocks in checkLeaderState.
//
// -cluster is a ';'-separated list of "ID,IP,Port" entries and -id selects
// which of those entries is the local node. Any malformed entry, an
// unparsable or out-of-range port, or an -id that is not part of -cluster
// terminates the process with exit code 1.
func main() {
	var (
		clusters = flag.String("cluster", "",
			"cluster node address, fmt: ID,IP,Port;ID,IP,Port")
		clusterID = flag.String("id", "", "cluster id")
	)
	flag.Parse()
	if *clusterID == "" {
		log.Infof("cluster id missing")
		os.Exit(1)
	}

	servers := make(map[raft.ServerID]raft.ServerAddress)
	for _, cluster := range strings.Split(*clusters, ";") {
		info := strings.Split(cluster, ",")
		if len(info) != 3 {
			log.Infof("cluster args value is bad format")
			os.Exit(1)
		}

		nid := info[0]
		nip := net.ParseIP(info[1])
		if nip == nil {
			log.Infof("cluster %s ip %s parse failed", cluster, info[1])
			os.Exit(1)
		}
		nport, err := strconv.Atoi(info[2])
		if err != nil {
			log.Infof("cluster %s port %s parse failed, %v", cluster, info[2], err)
			// Previously execution continued here with port 0.
			os.Exit(1)
		}
		if nport < 1 || nport > 65535 {
			log.Infof("cluster %s port %d out of range", cluster, nport)
			os.Exit(1)
		}

		log.Infof("cluster node id:%s, ip:%v, port:%d", nid, nip, nport)
		addr := net.TCPAddr{IP: nip, Port: nport}
		servers[raft.ServerID(nid)] = raft.ServerAddress(addr.String())
	}

	// The local node needs an address of its own, so it must be in the list.
	if _, ok := servers[raft.ServerID(*clusterID)]; !ok {
		log.Infof("cluster id %s is not part of the cluster list", *clusterID)
		os.Exit(1)
	}

	r := raftCluster{
		localRaftID: raft.ServerID(*clusterID),
		servers:     servers,
	}
	if err := r.Start(); err != nil {
		log.Infof("rafter cluster start failed, %v", err)
		os.Exit(1)
	}
	r.checkLeaderState()
}
// SimpleFsm is a simple FSM implementation.
// It only holds a database value; Start passes an instance to raft.NewRaft.
type SimpleFsm struct {
db database
}
// NewFsm returns a pointer to a SimpleFsm whose db field is initialised with
// the value returned by NewDatabase.
func NewFsm() *SimpleFsm {
	return &SimpleFsm{db: NewDatabase()}
}
// Apply receives a raft log entry and discards it; it always returns nil.
func (f *SimpleFsm) Apply(l *raft.Log) interface{} {
return nil
}
// Snapshot returns a pointer to the FSM's db field as the raft.FSMSnapshot.
// It never returns an error.
func (f *SimpleFsm) Snapshot() (raft.FSMSnapshot, error) {
return &f.db, nil
}
// Restore ignores the supplied reader and always returns nil. It neither
// reads from nor closes the reader.
func (f *SimpleFsm) Restore(io.ReadCloser) error {
return nil
}
// database is an empty placeholder type whose methods are stubs. A pointer to
// it is what SimpleFsm.Snapshot returns as the raft.FSMSnapshot.
type database struct{}
// NewDatabase returns a zero-value database.
func NewDatabase() database {
return database{}
}
// Get is a stub: it ignores key and always returns the string
// "not implemented".
func (d *database) Get(key string) string {
return "not implemented"
}
// Set is a no-op stub; key and value are ignored.
func (d *database) Set(key, value string) {}
// Persist writes the (empty) snapshot payload to sink and closes it.
//
// If the write fails the sink is cancelled and the write error is returned;
// otherwise the result of closing the sink is returned, so a failed snapshot
// is no longer reported as a success.
func (d *database) Persist(sink raft.SnapshotSink) error {
	if _, err := sink.Write([]byte{}); err != nil {
		// The write error is the one to report; a Cancel failure on an
		// already broken sink adds nothing, so it is deliberately dropped.
		_ = sink.Cancel()
		return err
	}
	return sink.Close()
}
// Release is a no-op.
func (d *database) Release() {}
I implemented a raft cluster function using golang through the module "github.com/hashicorp/raft" and found a problem in the following scenario:
There are currently 2 raft clusters, the cluster names, cluster nodes and IP addresses are as follows: (Raft clusters are initialized through the BootstrapCluster method)
Cluster1 BootstrapCluster servers:
Cluster2 BootstrapCluster servers:
Among them, "node1" and "node2" are started according to "Cluster1":
"node3","node4","node5" first start according to "Cluster2":
Then you will find that "node3" will switch back and forth between "Cluster1" and "Cluster2", sometimes belonging to "Cluster1" and sometimes belonging to "Cluster2".
Is this situation expected?
here is my code: