选举领头sentinel
当sentinelStartFailoverIfNeeded判断需要进入故障恢复(failover)的时候,会调用sentinelStartFailover函数,开始进入failover状态。
这时,会标记master的failover_state为SENTINEL_FAILOVER_STATE_WAIT_START。
int sentinelStartFailoverIfNeeded(sentinelRedisInstance *master) {
/* We can't failover if the master is not in O_DOWN state. */
if (!(master->flags & SRI_O_DOWN)) return 0;
/* Failover already in progress? */
if (master->flags & SRI_FAILOVER_IN_PROGRESS) return 0;
/* Last failover attempt started too little time ago? */
if (mstime() - master->failover_start_time <
master->failover_timeout*2)
{
if (master->failover_delay_logged != master->failover_start_time) {
time_t clock = (master->failover_start_time +
master->failover_timeout*2) / 1000;
char ctimebuf[26];
ctime_r(&clock,ctimebuf);
ctimebuf[24] = ''; /* Remove newline. */
master->failover_delay_logged = master->failover_start_time;
redisLog(REDIS_WARNING,
"Next failover delay: I will not start a failover before %s",
ctimebuf);
}
return 0;
}
sentinelStartFailover(master);
return 1;
}
void sentinelStartFailover(sentinelRedisInstance *master) {
redisAssert(master->flags & SRI_MASTER);
master->failover_state = SENTINEL_FAILOVER_STATE_WAIT_START;
master->flags |= SRI_FAILOVER_IN_PROGRESS;
master->failover_epoch = ++sentinel.current_epoch;
sentinelEvent(REDIS_WARNING,"+new-epoch",master,"%llu",
(unsigned long long) sentinel.current_epoch);
sentinelEvent(REDIS_WARNING,"+try-failover",master,"%@");
master->failover_start_time = mstime()+rand()%SENTINEL_MAX_DESYNC;
master->failover_state_change_time = mstime();
}
按照前面发送is-master-down-by-addr询问的逻辑,这时候sentinel在发送is-master-down-by-addr询问的时候,
会带上自己的run id,表示要选举一个局部领头sentinel。大致选举规则和方法:
* 所有在线的sentinel都有被选为领头sentinel的资格
* 每次进行领头sentinel选举之后,不论选举是否成功,所有sentinel的配置纪元的值都会自增一次。
* 在一个配置纪元里面,所有sentinel都有一次将某个sentinel设置为局部领头sentinel的机会,并且一旦设置,这个配置新纪元就不能再修改
* 每个发现主服务器进入客观下线的sentinel都会要求其他sentinel将自己设置为局部领头sentinel
* 当一个sentinel(源sentinel)向另一个sentinel(目标sentinel)发送is-master-down-by-addr询问,表示要求对方将自己设置为局部领头sentinel
* 局部领头sentinel是先到先得:只有第一个发送is-master-down-by-addr询问的sentinel被设为局部领头sentinel,后续的都会被拒绝
* 目标sentinel回复is-master-down-by-addr询问,回复中会带上局部领头sentinel运行id和配置纪元
* 源sentinel收到目标sentinel返回之后,会检查其中的leader_epoch参数和自己的配置纪元是否相同,如果相同再比较其中的运行id是否和自己的相同,
如果相同,目标sentinel将源sentinel设置成了局部领头sentinel
* 如果有某个sentinel被半数以上sentinel设置局部领头sentinel,则这个sentinel成为领头sentinel
* 因为领头sentinel的产生需要半数以上sentinel支持,并且每个sentinel在一个配置纪元里面只能设置一次局部sentinel,所以一个配置纪元里面,
只会出现一个领头sentinel
* 如果在给定时限内没有选举出领头sentinel,将会在一段时间后再次进行选举。
void sentinelCommand(redisClient *c) {
...
else if (!strcasecmp(c->argv[1]->ptr,"is-master-down-by-addr")) {
/* SENTINEL IS-MASTER-DOWN-BY-ADDR <ip> <port> <current-epoch> <runid>*/
sentinelRedisInstance *ri;
long long req_epoch;
uint64_t leader_epoch = 0;
char *leader = NULL;
long port;
int isdown = 0;
if (c->argc != 6) goto numargserr;
if (getLongFromObjectOrReply(c,c->argv[3],&port,NULL) != REDIS_OK ||
getLongLongFromObjectOrReply(c,c->argv[4],&req_epoch,NULL)
!= REDIS_OK)
return;
ri = getSentinelRedisInstanceByAddrAndRunID(sentinel.masters,
c->argv[2]->ptr,port,NULL);
/* It exists? Is actually a master? Is subjectively down? It's down.
* Note: if we are in tilt mode we always reply with "0". */
if (!sentinel.tilt && ri && (ri->flags & SRI_S_DOWN) &&
(ri->flags & SRI_MASTER))
isdown = 1;
/* Vote for the master (or fetch the previous vote) if the request
* includes a runid, otherwise the sender is not seeking for a vote. */
// 这个时候发过来的是sentinel的run_id,所以要给出一个局部领头sentinel
if (ri && ri->flags & SRI_MASTER && strcasecmp(c->argv[5]->ptr,"*")) {
leader = sentinelVoteLeader(ri,(uint64_t)req_epoch,
c->argv[5]->ptr,
&leader_epoch);
}
/* Reply with a three-elements multi-bulk reply:
* down state, leader, vote epoch. */
addReplyMultiBulkLen(c,3);
addReply(c, isdown ? shared.cone : shared.czero);
addReplyBulkCString(c, leader ? leader : "*");
addReplyLongLong(c, (long long)leader_epoch);
if (leader) sdsfree(leader);
}
...
}
/* Vote for the sentinel with 'req_runid' or return the old vote if already
* voted for the specifed 'req_epoch' or one greater.
*
* If a vote is not available returns NULL, otherwise return the Sentinel
* runid and populate the leader_epoch with the epoch of the vote. */
char *sentinelVoteLeader(sentinelRedisInstance *master, uint64_t req_epoch, char *req_runid, uint64_t *leader_epoch) {
// 源sentinel纪元比当前的新,更新纪元
if (req_epoch > sentinel.current_epoch) {
sentinel.current_epoch = req_epoch;
sentinelFlushConfig();
sentinelEvent(REDIS_WARNING,"+new-epoch",master,"%llu",
(unsigned long long) sentinel.current_epoch);
}
// 当前sentinel在当前纪元还没有领头sentinel,设置源sentinel为局部领头sentinel
if (master->leader_epoch < req_epoch && sentinel.current_epoch <= req_epoch)
{
sdsfree(master->leader);
// 设置局部领头sentinel为源sentinel
master->leader = sdsnew(req_runid);
master->leader_epoch = sentinel.current_epoch;
sentinelFlushConfig();
sentinelEvent(REDIS_WARNING,"+vote-for-leader",master,"%s %llu",
master->leader, (unsigned long long) master->leader_epoch);
/* If we did not voted for ourselves, set the master failover start
* time to now, in order to force a delay before we can start a
* failover for the same master. */
if (strcasecmp(master->leader,server.runid))
master->failover_start_time = mstime()+rand()%SENTINEL_MAX_DESYNC;
}
*leader_epoch = master->leader_epoch;
return master->leader ? sdsnew(master->leader) : NULL;
}
这时候,通过is-master-down-by-addr询问,已经选举出局部领头sentinel,然后继续回到sentinelHandleRedisInstance,
sentinelFailoverStateMachine方法会继续这个选举:
void sentinelFailoverStateMachine(sentinelRedisInstance *ri) {
redisAssert(ri->flags & SRI_MASTER);
if (!(ri->flags & SRI_FAILOVER_IN_PROGRESS)) return;
switch(ri->failover_state) {
case SENTINEL_FAILOVER_STATE_WAIT_START:
sentinelFailoverWaitStart(ri);
break;
case SENTINEL_FAILOVER_STATE_SELECT_SLAVE:
sentinelFailoverSelectSlave(ri);
break;
case SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE:
sentinelFailoverSendSlaveOfNoOne(ri);
break;
case SENTINEL_FAILOVER_STATE_WAIT_PROMOTION:
sentinelFailoverWaitPromotion(ri);
break;
case SENTINEL_FAILOVER_STATE_RECONF_SLAVES:
sentinelFailoverReconfNextSlave(ri);
break;
}
}
void sentinelFailoverWaitStart(sentinelRedisInstance *ri) {
char *leader;
int isleader;
/* Check if we are the leader for the failover epoch. */
// 在等待开始failover的时候,检查leader是否已经选举出来了
leader = sentinelGetLeader(ri, ri->failover_epoch);
isleader = leader && strcasecmp(leader,server.runid) == 0;
sdsfree(leader);
/* If I'm not the leader, and it is not a forced failover via
* SENTINEL FAILOVER, then I can't continue with the failover. */
// 没有被选上作为领头sentinel,放弃进入failover,由领头sentinel去完成
if (!isleader && !(ri->flags & SRI_FORCE_FAILOVER)) {
int election_timeout = SENTINEL_ELECTION_TIMEOUT;
/* The election timeout is the MIN between SENTINEL_ELECTION_TIMEOUT
* and the configured failover timeout. */
if (election_timeout > ri->failover_timeout)
election_timeout = ri->failover_timeout;
/* Abort the failover if I'm not the leader after some time. */
if (mstime() - ri->failover_start_time > election_timeout) {
sentinelEvent(REDIS_WARNING,"-failover-abort-not-elected",ri,"%@");
sentinelAbortFailover(ri);
}
return;
}
sentinelEvent(REDIS_WARNING,"+elected-leader",ri,"%@");
ri->failover_state = SENTINEL_FAILOVER_STATE_SELECT_SLAVE;
ri->failover_state_change_time = mstime();
sentinelEvent(REDIS_WARNING,"+failover-state-select-slave",ri,"%@");
}
char *sentinelGetLeader(sentinelRedisInstance *master, uint64_t epoch) {
dict *counters;
dictIterator *di;
dictEntry *de;
unsigned int voters = 0, voters_quorum;
char *myvote;
char *winner = NULL;
uint64_t leader_epoch;
uint64_t max_votes = 0;
redisAssert(master->flags & (SRI_O_DOWN|SRI_FAILOVER_IN_PROGRESS));
// 用于统计sentinel获得的票数,key为sentinel的run_id,value为获得的票数
counters = dictCreate(&leaderVotesDictType,NULL);
voters = dictSize(master->sentinels)+1; /* All the other sentinels and me. */
/* Count other sentinels votes */
di = dictGetIterator(master->sentinels);
// 迭代所有sentinels,统计每个sentinel记录的局部领头sentinel
while((de = dictNext(di)) != NULL) {
sentinelRedisInstance *ri = dictGetVal(de);
// 只能统计当前纪元的
if (ri->leader != NULL && ri->leader_epoch == sentinel.current_epoch)
sentinelLeaderIncr(counters,ri->leader);
}
dictReleaseIterator(di);
/* Check what's the winner. For the winner to win, it needs two conditions:
* 1) Absolute majority between voters (50% + 1).
* 2) And anyway at least master->quorum votes. */
// 获得刚统计的获得票数最多的sentinel的run_id和最大票数
di = dictGetIterator(counters);
while((de = dictNext(di)) != NULL) {
uint64_t votes = dictGetUnsignedIntegerVal(de);
if (votes > max_votes) {
max_votes = votes;
winner = dictGetKey(de);
}
}
dictReleaseIterator(di);
/* Count this Sentinel vote:
* if this Sentinel did not voted yet, either vote for the most
* common voted sentinel, or for itself if no vote exists at all. */
// 当前sentinel的票数,确定自己能否成为领头sentinel
if (winner)
myvote = sentinelVoteLeader(master,epoch,winner,&leader_epoch);
else
myvote = sentinelVoteLeader(master,epoch,server.runid,&leader_epoch);
if (myvote && leader_epoch == epoch) {
uint64_t votes = sentinelLeaderIncr(counters,myvote);
if (votes > max_votes) {
max_votes = votes;
winner = myvote;
}
}
voters_quorum = voters/2+1;
// 必须大于半数,且大于配置的quonum
if (winner && (max_votes < voters_quorum || max_votes < master->quorum))
winner = NULL;
winner = winner ? sdsnew(winner) : NULL;
sdsfree(myvote);
dictRelease(counters);
return winner;
}
这样,领头sentinel就已经被选择出来,并且failover状态已经变成了SENTINEL_FAILOVER_STATE_SELECT_SLAVE,
当前sentinel作为领头sentinel,将会真正完成master的failover。
故障转移
故障转移(failover)的第一步,就是选出新的master,大致的筛选流程为:
1. 删除列表中所有处于下线或者断线状态的slave
2. 删除列表中所有最近五秒内没有回复过领头sentinel的INFO命令的slave
3. 删除所有与已下线主服务器连接断开超过down-after-milliseconds * 10毫秒的slave(确保slave没有过早与master断开,副本比较新)
4. 根据slave优先级选择
5. 如果优先级相同,选择复制偏移量最大的slave
6. 如果都相同,按照run_id排序,选出run_id最小的slave
void sentinelFailoverSelectSlave(sentinelRedisInstance *ri) {
// 选出slave
sentinelRedisInstance *slave = sentinelSelectSlave(ri);
/* We don't handle the timeout in this state as the function aborts
* the failover or go forward in the next state. */
if (slave == NULL) {
sentinelEvent(REDIS_WARNING,"-failover-abort-no-good-slave",ri,"%@");
sentinelAbortFailover(ri);
} else {
// 修改状态为SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE
sentinelEvent(REDIS_WARNING,"+selected-slave",slave,"%@");
slave->flags |= SRI_PROMOTED;
ri->promoted_slave = slave;
ri->failover_state = SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE;
ri->failover_state_change_time = mstime();
sentinelEvent(REDIS_NOTICE,"+failover-state-send-slaveof-noone",
slave, "%@");
}
}
sentinelRedisInstance *sentinelSelectSlave(sentinelRedisInstance *master) {
sentinelRedisInstance **instance =
zmalloc(sizeof(instance[0])*dictSize(master->slaves));
sentinelRedisInstance *selected = NULL;
int instances = 0;
dictIterator *di;
dictEntry *de;
mstime_t max_master_down_time = 0;
// 计算最长同步延迟
if (master->flags & SRI_S_DOWN)
max_master_down_time += mstime() - master->s_down_since_time;
max_master_down_time += master->down_after_period * 10;
di = dictGetIterator(master->slaves);
while((de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
mstime_t info_validity_time;
// 已经断开的slave,直接忽略
if (slave->flags & (SRI_S_DOWN|SRI_O_DOWN|SRI_DISCONNECTED)) continue;
// 超过5倍ping间隔的slave也忽略
if (mstime() - slave->last_avail_time > SENTINEL_PING_PERIOD*5) continue;
if (slave->slave_priority == 0) continue;
/* If the master is in SDOWN state we get INFO for slaves every second.
* Otherwise we get it with the usual period so we need to account for
* a larger delay. */
if (master->flags & SRI_S_DOWN)
info_validity_time = SENTINEL_PING_PERIOD*5;
else
info_validity_time = SENTINEL_INFO_PERIOD*3;
// INFO响应超过有效时间,忽略
if (mstime() - slave->info_refresh > info_validity_time) continue;
// 和master断开的时间太长,忽略
if (slave->master_link_down_time > max_master_down_time) continue;
instance[instances++] = slave;
}
dictReleaseIterator(di);
if (instances) {
// 快速排序
qsort(instance,instances,sizeof(sentinelRedisInstance*),
compareSlavesForPromotion);
selected = instance[0];
}
zfree(instance);
return selected;
}
int compareSlavesForPromotion(const void *a, const void *b) {
sentinelRedisInstance **sa = (sentinelRedisInstance **)a,
**sb = (sentinelRedisInstance **)b;
char *sa_runid, *sb_runid;
// 先根据slave优先级排序
if ((*sa)->slave_priority != (*sb)->slave_priority)
return (*sa)->slave_priority - (*sb)->slave_priority;
/* If priority is the same, select the slave with greater replication
* offset (processed more data frmo the master). */
// 优先级相同,根据复制偏移量
if ((*sa)->slave_repl_offset > (*sb)->slave_repl_offset) {
return -1; /* a < b */
} else if ((*sa)->slave_repl_offset < (*sb)->slave_repl_offset) {
return 1; /* b > a */
}
/* If the replication offset is the same select the slave with that has
* the lexicographically smaller runid. Note that we try to handle runid
* == NULL as there are old Redis versions that don't publish runid in
* INFO. A NULL runid is considered bigger than any other runid. */
// 到这里选哪个都无所谓了,按照runid来选择
sa_runid = (*sa)->runid;
sb_runid = (*sb)->runid;
if (sa_runid == NULL && sb_runid == NULL) return 0;
else if (sa_runid == NULL) return 1; /* a > b */
else if (sb_runid == NULL) return -1; /* a < b */
return strcasecmp(sa_runid, sb_runid);
}
这样,新的master已经被选出来了。failover的状态变成了SENTINEL_FAILOVER_STATE_SEND_SLAVEOF_NOONE。
新的master被选出来之后,需要对新的master发送命令,让它角色发生变化,从slave变成master:
void sentinelFailoverSendSlaveOfNoOne(sentinelRedisInstance *ri) {
int retval;
/* We can't send the command to the promoted slave if it is now
* disconnected. Retry again and again with this state until the timeout
* is reached, then abort the failover. */
if (ri->promoted_slave->flags & SRI_DISCONNECTED) {
if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
sentinelEvent(REDIS_WARNING,"-failover-abort-slave-timeout",ri,"%@");
sentinelAbortFailover(ri);
}
return;
}
/* Send SLAVEOF NO ONE command to turn the slave into a master.
* We actually register a generic callback for this command as we don't
* really care about the reply. We check if it worked indirectly observing
* if INFO returns a different role (master instead of slave). */
// 发送slaveof no one命令,告知slave成为master
// 由于是否成功通过info命令观察,所以这里发送的时候不关注slaveof的结果
retval = sentinelSendSlaveOf(ri->promoted_slave,NULL,0);
if (retval != REDIS_OK) return;
sentinelEvent(REDIS_NOTICE, "+failover-state-wait-promotion",
ri->promoted_slave,"%@");
// 状态变成SENTINEL_FAILOVER_STATE_WAIT_PROMOTION
ri->failover_state = SENTINEL_FAILOVER_STATE_WAIT_PROMOTION;
ri->failover_state_change_time = mstime();
}
这样,就完成了slave到master的转变。failover状态变成了SENTINEL_FAILOVER_STATE_WAIT_PROMOTION。
在SENTINEL_FAILOVER_STATE_WAIT_PROMOTION这个状态,处理函数不真正去检查slave状态,而是通过定时的INFO命令来修改上位的slave。
void sentinelFailoverWaitPromotion(sentinelRedisInstance *ri) {
/* Just handle the timeout. Switching to the next state is handled
* by the function parsing the INFO command of the promoted slave. */
if (mstime() - ri->failover_state_change_time > ri->failover_timeout) {
sentinelEvent(REDIS_WARNING,"-failover-abort-slave-timeout",ri,"%@");
sentinelAbortFailover(ri);
}
}
void sentinelRefreshInstanceInfo(sentinelRedisInstance *ri, const char *info) {
...
// 发现有slave成为了master(之前发送了slaveof no one)
if ((ri->flags & SRI_SLAVE) && role == SRI_MASTER) {
/* If this is a promoted slave we can change state to the
* failover state machine. */
// 确定的确是sentinel发出的切换指令
if ((ri->flags & SRI_PROMOTED) &&
(ri->master->flags & SRI_FAILOVER_IN_PROGRESS) &&
(ri->master->failover_state ==
SENTINEL_FAILOVER_STATE_WAIT_PROMOTION))
{
/* Now that we are sure the slave was reconfigured as a master
* set the master configuration epoch to the epoch we won the
* election to perform this failover. This will force the other
* Sentinels to update their config (assuming there is not
* a newer one already available). */
// 修改master配置纪元为新的纪元
ri->master->config_epoch = ri->master->failover_epoch;
// failover状态改成SENTINEL_FAILOVER_STATE_RECONF_SLAVES
ri->master->failover_state = SENTINEL_FAILOVER_STATE_RECONF_SLAVES;
ri->master->failover_state_change_time = mstime();
sentinelFlushConfig();
sentinelEvent(REDIS_WARNING,"+promoted-slave",ri,"%@");
sentinelEvent(REDIS_WARNING,"+failover-state-reconf-slaves",
ri->master,"%@");
sentinelCallClientReconfScript(ri->master,SENTINEL_LEADER,
"start",ri->master->addr,ri->addr);
sentinelForceHelloUpdateForMaster(ri->master);
} else {
/* A slave turned into a master. We want to force our view and
* reconfigure as slave. Wait some time after the change before
* going forward, to receive new configs if any. */
// 有一个自称是master的,强行把他变成slave
mstime_t wait_time = SENTINEL_PUBLISH_PERIOD*4;
if (!(ri->flags & SRI_PROMOTED) &&
sentinelMasterLooksSane(ri->master) &&
sentinelRedisInstanceNoDownFor(ri,wait_time) &&
mstime() - ri->role_reported_time > wait_time)
{
int retval = sentinelSendSlaveOf(ri,
ri->master->addr->ip,
ri->master->addr->port);
if (retval == REDIS_OK)
sentinelEvent(REDIS_NOTICE,"+convert-to-slave",ri,"%@");
}
}
}
...
}
在INFO命令回调中,处理了slave->master的切换,这时候failover状态变成了SENTINEL_FAILOVER_STATE_RECONF_SLAVES。
这个状态的时候,状态机处理函数会对所有slave发送slaveof命令,切换slave对应的master,重新建立主备关系。
void sentinelFailoverReconfNextSlave(sentinelRedisInstance *master) {
dictIterator *di;
dictEntry *de;
int in_progress = 0;
di = dictGetIterator(master->slaves);
// 首先判断有多少处于重新建立主备关系
while((de = dictNext(di)) != NULL) {
sentinelRedisInstance *slave = dictGetVal(de);
if (slave->flags & (SRI_RECONF_SENT|SRI_RECONF_INPROG))
in_progress++;
}
dictReleaseIterator(di);
di = dictGetIterator(master->slaves);
// 控制网络传输等,确保并行创建主备的slave小于设置的阈值
while(in_progress < master->parallel_syncs &&
(de = dictNext(di)) != NULL)
{
sentinelRedisInstance *slave = dictGetVal(de);
int retval;
/* Skip the promoted slave, and already configured slaves. */
if (slave->flags & (SRI_PROMOTED|SRI_RECONF_DONE)) continue;
/* If too much time elapsed without the slave moving forward to
* the next state, consider it reconfigured even if it is not.
* Sentinels will detect the slave as misconfigured and fix its
* configuration later. */
// 同步超时,先不管
if ((slave->flags & SRI_RECONF_SENT) &&
(mstime() - slave->slave_reconf_sent_time) >
SENTINEL_SLAVE_RECONF_TIMEOUT)
{
sentinelEvent(REDIS_NOTICE,"-slave-reconf-sent-timeout",slave,"%@");
slave->flags &= ~SRI_RECONF_SENT;
slave->flags |= SRI_RECONF_DONE;
}
/* Nothing to do for instances that are disconnected or already
* in RECONF_SENT state. */
// 已经在同步中
if (slave->flags & (SRI_DISCONNECTED|SRI_RECONF_SENT|SRI_RECONF_INPROG))
continue;
/* Send SLAVEOF <new master>. */
// 还没同步,发送slaveof命令,开始主备复制
retval = sentinelSendSlaveOf(slave,
master->promoted_slave->addr->ip,
master->promoted_slave->addr->port);
if (retval == REDIS_OK) {
slave->flags |= SRI_RECONF_SENT;
slave->slave_reconf_sent_time = mstime();
sentinelEvent(REDIS_NOTICE,"+slave-reconf-sent",slave,"%@");
in_progress++;
}
}
dictReleaseIterator(di);
/* Check if all the slaves are reconfigured and handle timeout. */
sentinelFailoverDetectEnd(master);
}
如果所有的slave都复制完毕,failover会进入SENTINEL_FAILOVER_STATE_UPDATE_CONFIG状态。
SENTINEL_FAILOVER_STATE_UPDATE_CONFIG状态会重置master,将master相关属性全部从原先的master改成被提升的master。
(sentinelResetMasterAndChangeAddress)。
这样整个failover的流程就结束了,redis集群又重新建立了新的主备关系。
转载自:https://coolex.info/blog/475.html