Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Store Consul cluster/snapshot state to fix failed cluster behavior #326

Merged
merged 12 commits into from
Jun 28, 2022
1 change: 1 addition & 0 deletions docs/data-sources/consul_cluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ data "hcp_consul_cluster" "example" {
- `scale` (Number) The the number of Consul server nodes in the cluster.
- `self_link` (String) A unique URL identifying the HCP Consul cluster.
- `size` (String) The t-shirt size representation of each server VM that this Consul cluster is provisioned with. Valid option for development tier - `x_small`. Valid options for other tiers - `small`, `medium`, `large`. For more details - https://cloud.hashicorp.com/pricing/consul
- `state` (String) The state of the HCP Consul cluster.
- `tier` (String) The tier that the HCP Consul cluster will be provisioned as. Only `development`, `standard` and `plus` are available at this time.

<a id="nestedblock--timeouts"></a>
Expand Down
1 change: 1 addition & 0 deletions docs/resources/consul_cluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ resource "hcp_consul_cluster" "example" {
- `region` (String) The region where the HCP Consul cluster is located.
- `scale` (Number) The number of Consul server nodes in the cluster.
- `self_link` (String) A unique URL identifying the HCP Consul cluster.
- `state` (String) The state of the HCP Consul cluster.

<a id="nestedblock--timeouts"></a>
### Nested Schema for `timeouts`
Expand Down
1 change: 1 addition & 0 deletions docs/resources/consul_snapshot.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ resource "hcp_consul_snapshot" "example" {
- `restored_at` (String) Timestamp of when the snapshot was restored. If the snapshot has not been restored, this field will be blank.
- `size` (Number) The size of the snapshot in bytes.
- `snapshot_id` (String) The ID of the Consul snapshot
- `state` (String) The state of an HCP Consul snapshot.

<a id="nestedblock--timeouts"></a>
### Nested Schema for `timeouts`
Expand Down
49 changes: 37 additions & 12 deletions internal/provider/data_source_consul_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,11 @@ func dataSourceConsulCluster() *schema.Resource {
Type: schema.TypeString,
Computed: true,
},
"state": {
Description: "The state of the HCP Consul cluster.",
Type: schema.TypeString,
Computed: true,
},
"connect_enabled": {
Description: "Denotes the Consul connect feature should be enabled for this cluster. Default to true.",
Type: schema.TypeBool,
Expand Down Expand Up @@ -168,22 +173,30 @@ func dataSourceConsulClusterRead(ctx context.Context, d *schema.ResourceData, me

d.SetId(url)

// cluster found, set data source attributes
if err := setConsulClusterDataSourceAttributes(d, cluster); err != nil {
return diag.FromErr(err)
}

// get the cluster's Consul client config files
clientConfigFiles, err := clients.GetConsulClientConfigFiles(ctx, client, loc, clusterID)
if err != nil {
return diag.Errorf("unable to retrieve Consul cluster (%s) client config files: %v", clusterID, err)
}

// Cluster found, update resource data
if err := setConsulClusterDataSourceAttributes(d, cluster, clientConfigFiles); err != nil {
// client config found, set data source attributes
if err := setConsulClusterClientConfigDataSourceAttributes(d, clientConfigFiles); err != nil {
return diag.FromErr(err)
}

return nil
}

func setConsulClusterDataSourceAttributes(d *schema.ResourceData, cluster *consulmodels.HashicorpCloudConsul20210204Cluster,
clientConfigFiles *consulmodels.HashicorpCloudConsul20210204GetClientConfigResponse) error {
// setConsulClusterDataSourceAttributes sets all data source attributes from the cluster
func setConsulClusterDataSourceAttributes(
d *schema.ResourceData,
cluster *consulmodels.HashicorpCloudConsul20210204Cluster,
) error {

if err := d.Set("cluster_id", cluster.ID); err != nil {
return err
Expand Down Expand Up @@ -218,6 +231,10 @@ func setConsulClusterDataSourceAttributes(d *schema.ResourceData, cluster *consu
return err
}

if err := d.Set("state", cluster.State); err != nil {
return err
}

if err := d.Set("scale", cluster.Config.CapacityConfig.Scale); err != nil {
return err
}
Expand All @@ -238,14 +255,6 @@ func setConsulClusterDataSourceAttributes(d *schema.ResourceData, cluster *consu
return err
}

if err := d.Set("consul_config_file", clientConfigFiles.ConsulConfigFile.String()); err != nil {
return err
}

if err := d.Set("consul_ca_file", clientConfigFiles.CaFile.String()); err != nil {
return err
}

if err := d.Set("connect_enabled", cluster.Config.ConsulConfig.ConnectEnabled); err != nil {
return err
}
Expand Down Expand Up @@ -292,3 +301,19 @@ func setConsulClusterDataSourceAttributes(d *schema.ResourceData, cluster *consu

return nil
}

// setConsulClusterClientConfigResourceData sets all resource data that's derived from client config meta
func setConsulClusterClientConfigDataSourceAttributes(
d *schema.ResourceData,
clientConfigFiles *consulmodels.HashicorpCloudConsul20210204GetClientConfigResponse,
) error {
if err := d.Set("consul_config_file", clientConfigFiles.ConsulConfigFile.String()); err != nil {
return err
}

if err := d.Set("consul_ca_file", clientConfigFiles.CaFile.String()); err != nil {
return err
}

return nil
}
71 changes: 48 additions & 23 deletions internal/provider/resource_consul_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,11 @@ func resourceConsulCluster() *schema.Resource {
Type: schema.TypeString,
Computed: true,
},
"state": {
Description: "The state of the HCP Consul cluster.",
Type: schema.TypeString,
Computed: true,
},
"consul_automatic_upgrades": {
Description: "Denotes that automatic Consul upgrades are enabled.",
Type: schema.TypeBool,
Expand Down Expand Up @@ -382,12 +387,20 @@ func resourceConsulClusterCreate(ctx context.Context, d *schema.ResourceData, me
return diag.Errorf("unable to retrieve Consul cluster (%s): %v", payload.Cluster.ID, err)
}

if err := setConsulClusterResourceData(d, cluster); err != nil {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When the create fails, where would the tf have failed previously? Was it in attempting to get the client config files?

I had assumed the WaitForOperation call back on line 379 still prior to this call.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you're right and yes, this was failing up above in the Wait. Where I saw this was an issue was in resourceConsulClusterRead (the GetConsulClusterByID call succeeds while GetConsulClientConfigFiles fails). I then applied the same pattern everywhere we make these two API calls (storing results of first)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it fails during the Wait then will we still clean it up?

Copy link
Contributor Author

@jjti jjti Jun 13, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we'll still clean it up on the next terraform delete or terraform apply. A PR from last year addressed this by storing the ID of the resource before polling: #59

Current behavior:

  • tf create fail during poll: bail, return diag.Error, we store nothing but ID
  • tf plan, read: GET with ID, see it's FAILED, purge from state
  • tf apply: create a new cluster. Don't delete prior cluster (it was purged). Probably (unless user knows to go manually delete the old resources) run into a duplicate cluster/hvn ID issue

Behavior after this change:

  • tf create fail during poll: bail, return diag.Error, we store nothing but ID
  • tf plan, read: GET with ID, set state
  • tf apply, old cluster is tainted, delete, create a new cluster

Copy link
Contributor

@bcmdarroch bcmdarroch Jun 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for summing up the behavior change here 📝 💡

One thing I'm considering is how useful this computed output state might be in other circumstances, since it seems more of an internal detail. Would a user ever need to use the output of state or react to it? I think that was the main reason for keeping it out of the resource schema until now.

We could also resolve this issue without state by instead deleting any cluster that returns in a failed state, in addition to setting the ID to nil in the TF state. We could put this operation in a retry function.

Which operation takes more time on the Consul service side? Deleting a failed cluster or re-attempting the creation of a failed cluster? Or any other advantages?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I realized and messed up and replied out of thread: #326 (comment)

return diag.FromErr(err)
}

// get the cluster's Consul client config files
clientConfigFiles, err := clients.GetConsulClientConfigFiles(ctx, client, loc, payload.Cluster.ID)
if err != nil {
return diag.Errorf("unable to retrieve Consul cluster (%s) client config files: %v", payload.Cluster.ID, err)
}

if err := setConsulClusterClientConfigResourceData(d, clientConfigFiles); err != nil {
return diag.FromErr(err)
}

// create customer root ACL token
rootACLToken, err := clients.CreateCustomerRootACLToken(ctx, client, loc, payload.Cluster.ID)
if err != nil {
Expand All @@ -403,19 +416,13 @@ func resourceConsulClusterCreate(ctx context.Context, d *schema.ResourceData, me
return diag.FromErr(err)
}

if err := setConsulClusterResourceData(d, cluster, clientConfigFiles); err != nil {
return diag.FromErr(err)
}

return nil
}

// setConsulClusterResourceData sets the KV pairs of the Consul cluster resource schema.
// We do not set consul_root_token_accessor_id and consul_root_token_secret_id here since
// the original root token is only available during cluster creation.
func setConsulClusterResourceData(d *schema.ResourceData, cluster *consulmodels.HashicorpCloudConsul20210204Cluster,
clientConfigFiles *consulmodels.HashicorpCloudConsul20210204GetClientConfigResponse) error {

func setConsulClusterResourceData(d *schema.ResourceData, cluster *consulmodels.HashicorpCloudConsul20210204Cluster) error {
if err := d.Set("cluster_id", cluster.ID); err != nil {
return err
}
Expand All @@ -440,6 +447,10 @@ func setConsulClusterResourceData(d *schema.ResourceData, cluster *consulmodels.
return err
}

if err := d.Set("state", cluster.State); err != nil {
return err
}

publicEndpoint := !cluster.Config.NetworkConfig.Private
if err := d.Set("public_endpoint", publicEndpoint); err != nil {
return err
Expand Down Expand Up @@ -469,14 +480,6 @@ func setConsulClusterResourceData(d *schema.ResourceData, cluster *consulmodels.
return err
}

if err := d.Set("consul_config_file", clientConfigFiles.ConsulConfigFile.String()); err != nil {
return err
}

if err := d.Set("consul_ca_file", clientConfigFiles.CaFile.String()); err != nil {
return err
}

if err := d.Set("connect_enabled", cluster.Config.ConsulConfig.ConnectEnabled); err != nil {
return err
}
Expand Down Expand Up @@ -528,6 +531,22 @@ func setConsulClusterResourceData(d *schema.ResourceData, cluster *consulmodels.
return nil
}

// setConsulClusterClientConfigResourceData sets all resource data that's derived from client config meta
func setConsulClusterClientConfigResourceData(
d *schema.ResourceData,
clientConfigFiles *consulmodels.HashicorpCloudConsul20210204GetClientConfigResponse,
) error {
if err := d.Set("consul_config_file", clientConfigFiles.ConsulConfigFile.String()); err != nil {
return err
}

if err := d.Set("consul_ca_file", clientConfigFiles.CaFile.String()); err != nil {
return err
}

return nil
}

func resourceConsulClusterRead(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
client := meta.(*clients.Client)

Expand All @@ -552,22 +571,24 @@ func resourceConsulClusterRead(ctx context.Context, d *schema.ResourceData, meta
return diag.Errorf("unable to fetch Consul cluster (%s): %v", clusterID, err)
}

// The Consul cluster failed to provision properly so we want to let the user know and
// remove it from state
if cluster.State == consulmodels.HashicorpCloudConsul20210204ClusterStateFAILED {
log.Printf("[WARN] Consul cluster (%s) failed to provision, removing from state", clusterID)
// we should only ever get a CodeNotFound response if the cluster is deleted. The below is precautionary
if cluster.State == consulmodels.HashicorpCloudConsul20210204ClusterStateDELETED {
log.Printf("[WARN] Consul cluster (%s) was deleted", clusterID)
d.SetId("")
return nil
}

if err := setConsulClusterResourceData(d, cluster); err != nil {
return diag.FromErr(err)
}

// get the cluster's Consul client config files
clientConfigFiles, err := clients.GetConsulClientConfigFiles(ctx, client, loc, clusterID)
if err != nil {
return diag.Errorf("unable to retrieve Consul cluster (%s) client config files: %v", clusterID, err)
}

// Cluster found, update resource data
if err := setConsulClusterResourceData(d, cluster, clientConfigFiles); err != nil {
if err := setConsulClusterClientConfigResourceData(d, clientConfigFiles); err != nil {
return diag.FromErr(err)
}

Expand Down Expand Up @@ -666,13 +687,17 @@ func resourceConsulClusterUpdate(ctx context.Context, d *schema.ResourceData, me
return diag.Errorf("unable to retrieve Consul cluster (%s): %v", clusterID, err)
}

// get the cluster's Consul client config files
if err := setConsulClusterResourceData(d, updatedCluster); err != nil {
return diag.FromErr(err)
}

// Get the cluster's Consul client config files
clientConfigFiles, err := clients.GetConsulClientConfigFiles(ctx, client, cluster.Location, clusterID)
if err != nil {
return diag.Errorf("unable to retrieve Consul cluster (%s) client config files: %v", clusterID, err)
}

if err := setConsulClusterResourceData(d, updatedCluster, clientConfigFiles); err != nil {
if err := setConsulClusterClientConfigResourceData(d, clientConfigFiles); err != nil {
return diag.FromErr(err)
}

Expand Down
17 changes: 10 additions & 7 deletions internal/provider/resource_consul_cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ var consulCluster = `
resource "hcp_consul_cluster" "test" {
cluster_id = "test-consul-cluster"
hvn_id = hcp_hvn.test.hvn_id
tier = "standard"
tier = "development"
min_consul_version = data.hcp_consul_versions.test.recommended
}
`
Expand All @@ -24,7 +24,7 @@ resource "hcp_consul_cluster" "test" {
cluster_id = "test-consul-cluster"
hvn_id = hcp_hvn.test.hvn_id
tier = "standard"
size = "medium"
size = "small"
}
`

Expand Down Expand Up @@ -71,16 +71,17 @@ func TestAccConsulCluster(t *testing.T) {
testAccCheckConsulClusterExists(resourceName),
resource.TestCheckResourceAttr(resourceName, "cluster_id", "test-consul-cluster"),
resource.TestCheckResourceAttr(resourceName, "hvn_id", "test-hvn"),
resource.TestCheckResourceAttr(resourceName, "tier", "STANDARD"),
resource.TestCheckResourceAttr(resourceName, "tier", "DEVELOPMENT"),
resource.TestCheckResourceAttr(resourceName, "cloud_provider", "aws"),
resource.TestCheckResourceAttr(resourceName, "region", "us-west-2"),
resource.TestCheckResourceAttr(resourceName, "public_endpoint", "false"),
resource.TestCheckResourceAttr(resourceName, "datacenter", "test-consul-cluster"),
resource.TestCheckResourceAttr(resourceName, "scale", "3"),
resource.TestCheckResourceAttr(resourceName, "scale", "1"),
resource.TestCheckResourceAttr(resourceName, "consul_snapshot_interval", "24h"),
resource.TestCheckResourceAttr(resourceName, "consul_snapshot_retention", "30d"),
resource.TestCheckResourceAttr(resourceName, "connect_enabled", "true"),
resource.TestCheckResourceAttr(resourceName, "auto_hvn_to_hvn_peering", "false"),
resource.TestCheckResourceAttr(resourceName, "state", "RUNNING"),
resource.TestCheckResourceAttrSet(resourceName, "organization_id"),
resource.TestCheckResourceAttrSet(resourceName, "project_id"),
resource.TestCheckResourceAttrSet(resourceName, "consul_config_file"),
Expand Down Expand Up @@ -118,15 +119,16 @@ func TestAccConsulCluster(t *testing.T) {
testAccCheckConsulClusterExists(resourceName),
resource.TestCheckResourceAttr(resourceName, "cluster_id", "test-consul-cluster"),
resource.TestCheckResourceAttr(resourceName, "hvn_id", "test-hvn"),
resource.TestCheckResourceAttr(resourceName, "tier", "STANDARD"),
resource.TestCheckResourceAttr(resourceName, "tier", "DEVELOPMENT"),
resource.TestCheckResourceAttr(resourceName, "cloud_provider", "aws"),
resource.TestCheckResourceAttr(resourceName, "region", "us-west-2"),
resource.TestCheckResourceAttr(resourceName, "public_endpoint", "false"),
resource.TestCheckResourceAttr(resourceName, "datacenter", "test-consul-cluster"),
resource.TestCheckResourceAttr(resourceName, "scale", "3"),
resource.TestCheckResourceAttr(resourceName, "scale", "1"),
resource.TestCheckResourceAttr(resourceName, "consul_snapshot_interval", "24h"),
resource.TestCheckResourceAttr(resourceName, "consul_snapshot_retention", "30d"),
resource.TestCheckResourceAttr(resourceName, "connect_enabled", "true"),
resource.TestCheckResourceAttr(resourceName, "state", "RUNNING"),
resource.TestCheckResourceAttrSet(resourceName, "organization_id"),
resource.TestCheckResourceAttrSet(resourceName, "project_id"),
resource.TestCheckResourceAttrSet(resourceName, "consul_config_file"),
Expand Down Expand Up @@ -161,6 +163,7 @@ func TestAccConsulCluster(t *testing.T) {
resource.TestCheckResourceAttrPair(resourceName, "consul_config_file", dataSourceName, "consul_config_file"),
resource.TestCheckResourceAttrPair(resourceName, "consul_ca_file", dataSourceName, "consul_ca_file"),
resource.TestCheckResourceAttrPair(resourceName, "consul_version", dataSourceName, "consul_version"),
resource.TestCheckResourceAttrPair(resourceName, "state", dataSourceName, "state"),
resource.TestCheckResourceAttrPair(resourceName, "consul_public_endpoint_url", dataSourceName, "consul_public_endpoint_url"),
resource.TestCheckResourceAttrPair(resourceName, "consul_private_endpoint_url", dataSourceName, "consul_private_endpoint_url"),
testAccCheckFullURL(resourceName, "consul_private_endpoint_url", ""),
Expand All @@ -186,7 +189,7 @@ func TestAccConsulCluster(t *testing.T) {
Config: testConfig(setTestAccConsulClusterConfig(updatedConsulCluster)),
Check: resource.ComposeTestCheckFunc(
testAccCheckConsulClusterExists(resourceName),
resource.TestCheckResourceAttr(resourceName, "size", "MEDIUM"),
resource.TestCheckResourceAttr(resourceName, "size", "SMALL"),
),
},
},
Expand Down
17 changes: 9 additions & 8 deletions internal/provider/resource_consul_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,11 @@ func resourceConsulSnapshot() *schema.Resource {
Type: schema.TypeString,
Computed: true,
},
"state": {
Description: "The state of an HCP Consul snapshot.",
Type: schema.TypeString,
Computed: true,
},
},
}
}
Expand Down Expand Up @@ -167,14 +172,6 @@ func resourceConsulSnapshotRead(ctx context.Context, d *schema.ResourceData, met
return diag.Errorf("unable to fetch Consul snapshot (%s): %v", snapshotID, err)
}

// The Consul snapshot failed to provision properly so we want to let the user know and
// remove it from state
if snapshotResp.Snapshot.State == consulmodels.HashicorpCloudConsul20210204SnapshotSnapshotStateCREATINGFAILED {
log.Printf("[WARN] Consul snapshot (%s) failed to provision, removing from state", snapshotResp.Snapshot.ID)
d.SetId("")
return nil
}

if err := setConsulSnapshotResourceData(d, snapshotResp.Snapshot); err != nil {
return diag.FromErr(err)
}
Expand Down Expand Up @@ -264,6 +261,10 @@ func setConsulSnapshotResourceData(d *schema.ResourceData, snapshot *consulmodel
return err
}

if err := d.Set("state", snapshot.State); err != nil {
return err
}

if snapshot.Meta != nil {
size, err := strconv.Atoi(snapshot.Meta.Size)
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions internal/provider/resource_consul_snapshot_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ func TestAccConsulSnapshot(t *testing.T) {
resource.TestCheckResourceAttrSet(resourceName, "organization_id"),
resource.TestCheckResourceAttrSet(resourceName, "size"),
resource.TestCheckResourceAttrSet(resourceName, "consul_version"),
resource.TestCheckResourceAttrSet(resourceName, "state"),
resource.TestCheckNoResourceAttr(resourceName, "restored_at"), // Not a restored snapshot
),
},
Expand All @@ -61,6 +62,7 @@ func TestAccConsulSnapshot(t *testing.T) {
resource.TestCheckResourceAttrSet(resourceName, "organization_id"),
resource.TestCheckResourceAttrSet(resourceName, "size"),
resource.TestCheckResourceAttrSet(resourceName, "consul_version"),
resource.TestCheckResourceAttrSet(resourceName, "state"),
resource.TestCheckNoResourceAttr(resourceName, "restored_at"), // Not a restored snapshot
),
},
Expand Down