Skip to content

Commit

Permalink
Merge pull request #140 from sjones4/topic-node-reboot-handling
Browse files (browse the repository at this point in the history)
EC2 node controller instance reboot handling
  • Loading branch information
sjones4 committed Dec 13, 2018
2 parents e722a01 + a43105f commit 69d17f7
Show file tree
Hide file tree
Showing 3 changed files with 81 additions and 51 deletions.
22 changes: 11 additions & 11 deletions node/handlers.c
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ bunchOfInstances *global_instances = NULL; //!< pointer to the instance list
bunchOfInstances *global_instances_copy = NULL; //!< pointer to the copied instance list

const int default_staging_cleanup_threshold = 60 * 60 * 2; //!< after this many seconds any STAGING domains will be cleaned up
const int default_booting_cleanup_threshold = 60; //!< after this many seconds any BOOTING domains will be cleaned up
const int default_booting_cleanup_threshold = 60 + MONITORING_PERIOD; //!< after this many seconds any BOOTING domains will be cleaned up
const int default_booting_envwait_threshold = NETWORK_GATE_TIMEOUT_SEC; //!< after this many seconds an instance will fail to boot unless network environment is ready
const int default_bundling_cleanup_threshold = 60 * 60 * 2; //!< after this many seconds any BUNDLING domains will be cleaned up
const int default_createImage_cleanup_threshold = 60 * 60 * 2; //!< after this many seconds any CREATEIMAGE domains will be cleaned up
Expand Down Expand Up @@ -1074,8 +1074,8 @@ void change_state(ncInstance * instance, instance_states state)

euca_strncpy(instance->stateName, instance_state_names[instance->stateCode], CHAR_BUFFER_SIZE);
if (old_state != state) {
LOGDEBUG("[%s] state change for instance: %s -> %s (%s)\n",
instance->instanceId, instance_state_names[old_state], instance_state_names[instance->state], instance_state_names[instance->stateCode]);
LOGINFO("[%s] state change for instance: %s -> %s (%s)\n",
instance->instanceId, instance_state_names[old_state], instance_state_names[instance->state], instance_state_names[instance->stateCode]);
}
}

Expand Down Expand Up @@ -1296,14 +1296,13 @@ static void refresh_instance_info(struct nc_state_t *nc, ncInstance * instance)
}
}

// during reboot ensure that the domain enters reboot before setting instance back to Running
// and that we allow the instance to restart without detecting it as termination of the instance
// on reboot ensure the domain restarts without being detected as shutdown
if ((old_state == BOOTING) && (
((new_state == RUNNING) && (instance->bootTime > (time(NULL) - MONITORING_PERIOD))) ||
((new_state == SHUTOFF || new_state == SHUTDOWN) && (instance->bootTime > (time(NULL) - nc_state.reboot_grace_period_sec)))
((new_state == RUNNING || new_state == SHUTOFF || new_state == SHUTDOWN)
&& (instance->rebootTime > (time(NULL) - nc_state.reboot_grace_period_sec)))
)) {
if (new_state != RUNNING) { // skip logging for running as this happens frequently on reboot
LOGINFO("[%s] ignoring hypervisor reported state %s for booting domain during grace period (%d)\n",
if (new_state != RUNNING) { // running is reported while the instance is shutting down
LOGINFO("[%s] ignoring hypervisor reported state %s for rebooting domain during grace period (%d)\n",
instance->instanceId, instance_state_names[new_state], nc_state.reboot_grace_period_sec);
}
break;
Expand All @@ -1314,7 +1313,8 @@ static void refresh_instance_info(struct nc_state_t *nc, ncInstance * instance)
instance->instanceId, instance_state_names[new_state], nc_state.shutdown_grace_period_sec);
break;
}
LOGWARN("[%s] hypervisor reported previously running domain as %s\n", instance->instanceId, instance_state_names[new_state]);
LOGWARN("[%s] hypervisor reported %s domain as %s\n", instance->instanceId,
instance_state_names[old_state], instance_state_names[new_state]);
}
// change to state, whatever it happens to be
change_state(instance, new_state);
Expand Down Expand Up @@ -2390,7 +2390,7 @@ static int init(void)
GET_VAR_INT(nc_state.sc_request_timeout_sec, CONFIG_SC_REQUEST_TIMEOUT, 45);
GET_VAR_INT(nc_state.concurrent_cleanup_ops, CONFIG_CONCURRENT_CLEANUP_OPS, 30);
GET_VAR_INT(nc_state.disable_snapshots, CONFIG_DISABLE_SNAPSHOTS, 0);
GET_VAR_INT(nc_state.reboot_grace_period_sec, CONFIG_NC_REBOOT_GRACE_PERIOD_SEC, 30);
GET_VAR_INT(nc_state.reboot_grace_period_sec, CONFIG_NC_REBOOT_GRACE_PERIOD_SEC, 60 + MONITORING_PERIOD);
GET_VAR_INT(nc_state.shutdown_grace_period_sec, CONFIG_SHUTDOWN_GRACE_PERIOD_SEC, 60);

strcpy(nc_state.admin_user_id, EUCALYPTUS_ADMIN);
Expand Down
109 changes: 69 additions & 40 deletions node/handlers_kvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,7 @@ static void *rebooting_thread(void *arg)
virConnectPtr conn = NULL;
rebooting_thread_params *params = ((rebooting_thread_params *) arg);
nc = &(params->nc);
boolean shutoff = FALSE;

sem_p(inst_sem);
{
Expand All @@ -345,23 +346,25 @@ static void *rebooting_thread(void *arg)
EUCA_FREE(params);
return NULL;
}
dom = virDomainLookupByName(conn, instance->instanceId);
if (dom == NULL) {
LOGERROR("[%s] cannot locate instance to reboot, giving up\n", instance->instanceId);
unlock_hypervisor_conn();
EUCA_FREE(params);
return NULL;
}
// obtain the most up-to-date XML for domain from libvirt
xml = virDomainGetXMLDesc(dom, 0);
if (xml == NULL) {
LOGERROR("[%s] cannot obtain metadata for instance to reboot, giving up\n", instance->instanceId);
virDomainFree(dom); // release libvirt resource
unlock_hypervisor_conn();
EUCA_FREE(params);
return NULL;
}
virDomainFree(dom); // release libvirt resource
{ // hypervisor lock
dom = virDomainLookupByName(conn, instance->instanceId);
if (dom == NULL) {
LOGERROR("[%s] cannot locate instance to reboot, giving up\n", instance->instanceId);
unlock_hypervisor_conn();
EUCA_FREE(params);
return NULL;
}
// obtain the most up-to-date XML for domain from libvirt
xml = virDomainGetXMLDesc(dom, 0);
if (xml == NULL) {
LOGERROR("[%s] cannot obtain metadata for instance to reboot, giving up\n", instance->instanceId);
virDomainFree(dom); // release libvirt resource
unlock_hypervisor_conn();
EUCA_FREE(params);
return NULL;
}
virDomainFree(dom); // release libvirt resource
} // end hypervisor lock
unlock_hypervisor_conn();

LOGINFO("[%s] shutting down\n", instance->instanceId);
Expand All @@ -384,21 +387,32 @@ static void *rebooting_thread(void *arg)
EUCA_FREE(params);
return NULL;
}
// domain is now shut down, create a new one with the same XML
LOGINFO("[%s] rebooting\n", instance->instanceId);
if (!strcmp(nc->pEucaNet->sMode, NETMODE_VPCMIDO)) {
// need to sleep to allow midolman to update the VM interface
sleep(10);
}
dom = virDomainCreateLinux(conn, xml, 0);
if (dom == NULL) {
LOGERROR("[%s] failed to restart instance\n", instance->instanceId);
change_state(instance, SHUTOFF);
} else {
euca_strncpy(resourceName[0], instance->instanceId, MAX_SENSOR_NAME_LEN);
sensor_refresh_resources(resourceName, resourceAlias, 1); // refresh stats so we set base value accurately
virDomainFree(dom);
{ // hypervisor lock
// domain is now shut down, create a new one with the same XML
LOGINFO("[%s] rebooting\n", instance->instanceId);
if (!strcmp(nc->pEucaNet->sMode, NETMODE_VPCMIDO)) {
// need to sleep to allow midolman to update the VM interface
sleep(10);
}
dom = virDomainCreateLinux(conn, xml, 0);
if (dom == NULL) {
LOGERROR("[%s] failed to restart instance\n", instance->instanceId);
shutoff = TRUE;
} else {
euca_strncpy(resourceName[0], instance->instanceId, MAX_SENSOR_NAME_LEN);
sensor_refresh_resources(resourceName, resourceAlias, 1); // refresh stats so we set base value accurately
virDomainFree(dom);
}
} // end hypervisor lock
unlock_hypervisor_conn();

if (shutoff == TRUE) {
sem_p(inst_sem);
{
change_state(instance, SHUTOFF);
}
sem_v(inst_sem);
} else {
if (!strcmp(nc->pEucaNet->sMode, NETMODE_VPCMIDO)) {
bridge_instance_interfaces_remove(nc, instance);
}
Expand All @@ -409,11 +423,14 @@ static void *rebooting_thread(void *arg)
snprintf(iface, 16, "vn_%s", instance->instanceId);
bridge_interface_set_hairpin(nc, instance, iface);
}

sem_p(inst_sem);
{
instance->rebootTime = 0; // clear reboot time when running
}
sem_v(inst_sem);
}
EUCA_FREE(xml);

unlock_hypervisor_conn();
unset_corrid(get_corrid());
EUCA_FREE(params);
return NULL;
Expand All @@ -434,14 +451,21 @@ static int doRebootInstance(struct nc_state_t *nc, ncMetadata * pMeta, char *ins
pthread_t tcb = { 0 };
ncInstance *instance = NULL;
rebooting_thread_params *params = NULL;
int old_state;
int old_state = 0;
int old_bootTime = 0;
sem_p(inst_sem);
{
instance = find_instance(&global_instances, instanceId);
old_state = instance->state;
instance->bootTime = time(NULL); // otherwise nc_state.booting_cleanup_threshold will kick in
change_state(instance, BOOTING); // not STAGING, since in that mode we don't poll hypervisor for info
LOGDEBUG("[%s] is set to BOOTING stage\n", instanceId);
if (instance != NULL) {
old_state = instance->state;
if (old_state != BOOTING) {
old_bootTime = instance->bootTime;
instance->bootTime = time(NULL); // otherwise nc_state.booting_cleanup_threshold will kick in
instance->rebootTime = instance->bootTime; // reboot time will not be set on initial boot
change_state(instance, BOOTING); // not STAGING, since in that mode we don't poll hypervisor for info
LOGDEBUG("[%s] is set to BOOTING stage\n", instanceId);
}
}
}
sem_v(inst_sem);

Expand All @@ -450,6 +474,11 @@ static int doRebootInstance(struct nc_state_t *nc, ncMetadata * pMeta, char *ins
return (EUCA_NOT_FOUND_ERROR);
}

if ((old_state == BOOTING) && (instance->rebootTime > 0)) {
LOGINFO("[%s] reboot in progress, ignoring reboot request.\n", instanceId);
return (EUCA_OK);
}

params = EUCA_ZALLOC(1, sizeof(rebooting_thread_params));
memcpy(&(params->instanceId), instanceId, CHAR_BUFFER_SIZE);
memcpy(&(params->nc), nc, sizeof(struct nc_state_t));
Expand All @@ -460,8 +489,8 @@ static int doRebootInstance(struct nc_state_t *nc, ncMetadata * pMeta, char *ins
{
instance = find_instance(&global_instances, instanceId);
// if instance state is still BOOTING set it back to the old one
if (instance->state == BOOTING) {
instance->bootTime = 0;
if (instance != NULL && instance->state == BOOTING) {
instance->bootTime = old_bootTime;
change_state(instance, old_state);
}
}
Expand Down
1 change: 1 addition & 0 deletions util/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,7 @@ typedef struct ncInstance_t {
int launchTime; //!< timestamp of RunInstances request arrival
int expiryTime; //!< timestamp of instance ->RUNNING expiration
int bootTime; //!< timestamp of STAGING->BOOTING transition
int rebootTime; //!< timestamp of RUNNING->BOOTING transition (while active)
int bundlingTime; //!< timestamp of ->BUNDLING transition
int createImageTime; //!< timestamp of ->CREATEIMAGE transition
int terminationRequestedTime; //!< timestamp of TerminateInstance request arrival
Expand Down

0 comments on commit 69d17f7

Please sign in to comment.