Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

resource/aws_security_group: error updating Security Group (sg-xxx): couldn't find resource #21628

Closed
ialidzhikov opened this issue Nov 4, 2021 · 9 comments · Fixed by #22420
Labels
bug Addresses a defect in current functionality. service/ec2 Issues and PRs that pertain to the ec2 service. service/iam Issues and PRs that pertain to the iam service.
Milestone

Comments

@ialidzhikov
Copy link
Contributor

ialidzhikov commented Nov 4, 2021

Community Note

  • Please vote on this issue by adding a 👍 reaction to the original issue to help the community and maintainers prioritize this request
  • Please do not leave "+1" or other comments that do not add relevant new information or questions, they generate extra noise for issue followers and do not help prioritize the request
  • If you are interested in working on this issue or have submitted a pull request, please leave a comment

Terraform CLI and Terraform AWS Provider Version

terraform version - 0.12.31
provider-aws version - 3.54.0

Affected Resource(s)

  • aws_security_group
  • aws_security_group_rule

Terraform Configuration Files

Please include all Terraform configurations required to reproduce the bug. Bug reports without a functional reproduction may be closed without investigation.

    provider "aws" {
      access_key = "${var.ACCESS_KEY_ID}"
      secret_key = "${var.SECRET_ACCESS_KEY}"
      region     = "eu-west-1"
    }

    resource "aws_vpc_dhcp_options" "vpc_dhcp_options" {
      domain_name         = "eu-west-1.compute.internal"
      domain_name_servers = ["AmazonProvidedDNS"]
    }

    resource "aws_vpc" "vpc" {
      cidr_block           = "10.250.0.0/16"
      enable_dns_support   = true
      enable_dns_hostnames = true
    }

    resource "aws_vpc_dhcp_options_association" "vpc_dhcp_options_association" {
      vpc_id          = "${aws_vpc.vpc.id}"
      dhcp_options_id = "${aws_vpc_dhcp_options.vpc_dhcp_options.id}"
    }

    resource "aws_default_security_group" "default" {
      vpc_id = "${aws_vpc.vpc.id}"
    }

    resource "aws_internet_gateway" "igw" {
      vpc_id = "${aws_vpc.vpc.id}"
    }

    resource "aws_route_table" "routetable_main" {
      vpc_id = "${aws_vpc.vpc.id}"
    }

    resource "aws_route" "public" {
      route_table_id         = "${aws_route_table.routetable_main.id}"
      destination_cidr_block = "0.0.0.0/0"
      gateway_id             = "${aws_internet_gateway.igw.id}"
    }

    resource "aws_security_group" "nodes" {
      name        = "foo-nodes"
      description = "Security group for nodes"
      vpc_id      = "${aws_vpc.vpc.id}"
    }

    resource "aws_security_group_rule" "nodes_self" {
      type              = "ingress"
      from_port         = 0
      to_port           = 0
      protocol          = "-1"
      self              = true
      security_group_id = "${aws_security_group.nodes.id}"
    }

    resource "aws_security_group_rule" "nodes_tcp_all" {
      type              = "ingress"
      from_port         = 30000
      to_port           = 32767
      protocol          = "tcp"
      cidr_blocks       = ["0.0.0.0/0"]
      security_group_id = "${aws_security_group.nodes.id}"
    }

    resource "aws_security_group_rule" "nodes_udp_all" {
      type              = "ingress"
      from_port         = 30000
      to_port           = 32767
      protocol          = "udp"
      cidr_blocks       = ["0.0.0.0/0"]
      security_group_id = "${aws_security_group.nodes.id}"
    }

    resource "aws_security_group_rule" "nodes_egress_all" {
      type              = "egress"
      from_port         = 0
      to_port           = 0
      protocol          = "-1"
      cidr_blocks       = ["0.0.0.0/0"]
      security_group_id = "${aws_security_group.nodes.id}"
    }


    resource "aws_subnet" "nodes_z0" {
      vpc_id            = "${aws_vpc.vpc.id}"
      cidr_block        = "10.250.0.0/19"
      availability_zone = "eu-west-1c"
    }

    output "subnet_nodes_z0" {
      value = "${aws_subnet.nodes_z0.id}"
    }

    resource "aws_subnet" "private_utility_z0" {
      vpc_id            = "${aws_vpc.vpc.id}"
      cidr_block        = "10.250.112.0/22"
      availability_zone = "eu-west-1c"
    }

    resource "aws_security_group_rule" "nodes_tcp_internal_z0" {
      type              = "ingress"
      from_port         = 30000
      to_port           = 32767
      protocol          = "tcp"
      cidr_blocks       = ["10.250.112.0/22"]
      security_group_id = "${aws_security_group.nodes.id}"
    }

    resource "aws_security_group_rule" "nodes_udp_internal_z0" {
      type              = "ingress"
      from_port         = 30000
      to_port           = 32767
      protocol          = "udp"
      cidr_blocks       = ["10.250.112.0/22"]
      security_group_id = "${aws_security_group.nodes.id}"
    }

    resource "aws_subnet" "public_utility_z0" {
      vpc_id            = "${aws_vpc.vpc.id}"
      cidr_block        = "10.250.96.0/22"
      availability_zone = "eu-west-1c"
    }

    output "subnet_public_utility_z0" {
      value = "${aws_subnet.public_utility_z0.id}"
    }

    resource "aws_security_group_rule" "nodes_tcp_public_z0" {
      type              = "ingress"
      from_port         = 30000
      to_port           = 32767
      protocol          = "tcp"
      cidr_blocks       = ["10.250.96.0/22"]
      security_group_id = "${aws_security_group.nodes.id}"
    }

    resource "aws_security_group_rule" "nodes_udp_public_z0" {
      type              = "ingress"
      from_port         = 30000
      to_port           = 32767
      protocol          = "udp"
      cidr_blocks       = ["10.250.96.0/22"]
      security_group_id = "${aws_security_group.nodes.id}"
    }

    resource "aws_eip" "eip_natgw_z0" {
      vpc = true
    }

    resource "aws_nat_gateway" "natgw_z0" {
      allocation_id = "${aws_eip.eip_natgw_z0.id}"
      subnet_id     = "${aws_subnet.public_utility_z0.id}"
    }

    resource "aws_route_table" "routetable_private_utility_z0" {
      vpc_id = "${aws_vpc.vpc.id}"
    }

    resource "aws_route" "private_utility_z0_nat" {
      route_table_id         = "${aws_route_table.routetable_private_utility_z0.id}"
      destination_cidr_block = "0.0.0.0/0"
      nat_gateway_id         = "${aws_nat_gateway.natgw_z0.id}"

      timeouts {
        create = "5m"
      }
    }

    resource "aws_route_table_association" "routetable_private_utility_z0_association_private_utility_z0" {
      subnet_id      = "${aws_subnet.private_utility_z0.id}"
      route_table_id = "${aws_route_table.routetable_private_utility_z0.id}"
    }

    resource "aws_route_table_association" "routetable_main_association_public_utility_z0" {
      subnet_id      = "${aws_subnet.public_utility_z0.id}"
      route_table_id = "${aws_route_table.routetable_main.id}"
    }

    resource "aws_route_table_association" "routetable_private_utility_z0_association_nodes_z0" {
      subnet_id      = "${aws_subnet.nodes_z0.id}"
      route_table_id = "${aws_route_table.routetable_private_utility_z0.id}"
    }


    //=====================================================================
    //= IAM instance profiles
    //=====================================================================

    resource "aws_iam_role" "bastions" {
      name = "foo-bastions"
      path = "/"

      assume_role_policy = <<EOF
    {
      "Version": "2012-10-17",
      "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "ec2.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
      ]
    }
    EOF
    }

    resource "aws_iam_instance_profile" "bastions" {
      name = "foo-bastions"
      role = "${aws_iam_role.bastions.name}"
    }

    resource "aws_iam_role_policy" "bastions" {
      name = "foo-bastions"
      role = "${aws_iam_role.bastions.id}"

      policy = <<EOF
    {
      "Version": "2012-10-17",
      "Statement": [
        {
          "Effect": "Allow",
          "Action": [
            "ec2:DescribeRegions"
          ],
          "Resource": [
            "*"
          ]
        }
      ]
    }
    EOF
    }

    resource "aws_iam_role" "nodes" {
      name = "foo-nodes"
      path = "/"

      assume_role_policy = <<EOF
    {
      "Version": "2012-10-17",
      "Statement": [
        {
          "Effect": "Allow",
          "Principal": {
            "Service": "ec2.amazonaws.com"
          },
          "Action": "sts:AssumeRole"
        }
      ]
    }
    EOF
    }

    resource "aws_iam_instance_profile" "nodes" {
      name = "foo-nodes"
      role = "${aws_iam_role.nodes.name}"
    }

    resource "aws_iam_role_policy" "nodes" {
      name = "foo-nodes"
      role = "${aws_iam_role.nodes.id}"

      policy = <<EOF
    {
      "Version": "2012-10-17",
      "Statement": [
        {
          "Effect": "Allow",
          "Action": [
            "ec2:DescribeInstances"
          ],
          "Resource": [
            "*"
          ]
        },
        {
          "Effect": "Allow",
          "Action": [
            "ecr:GetAuthorizationToken",
            "ecr:BatchCheckLayerAvailability",
            "ecr:GetDownloadUrlForLayer",
            "ecr:GetRepositoryPolicy",
            "ecr:DescribeRepositories",
            "ecr:ListImages",
            "ecr:BatchGetImage"
          ],
          "Resource": [
            "*"
          ]
        }
      ]
    }
    EOF
    }

    //=====================================================================
    //= EC2 Key Pair
    //=====================================================================

    resource "aws_key_pair" "kubernetes" {
      key_name   = "foo-ssh-publickey"
      public_key = "ssh-rsa bar"
    }

    //=====================================================================
    //= Output variables
    //=====================================================================

    output "vpc_id" {
      value = "${aws_vpc.vpc.id}"
    }

    output "iamInstanceProfileNodes" {
      value = "${aws_iam_instance_profile.nodes.name}"
    }

    output "keyName" {
      value = "${aws_key_pair.kubernetes.key_name}"
    }

    output "security_group_nodes" {
      value = "${aws_security_group.nodes.id}"
    }

    output "nodes_role_arn" {
      value = "${aws_iam_role.nodes.arn}"
    }

Debug Output

Panic Output

Expected Behavior

Actual Behavior

Initializing the backend...

Initializing provider plugins...

The following providers do not have any version constraints in configuration,
so the latest version was installed.

To prevent automatic upgrades to new major versions that may contain breaking
changes, it is recommended to add version = "..." constraints to the
corresponding provider blocks in configuration, with the constraint strings
suggested below.

* provider.aws: version = "~> 3.54"

Terraform has been successfully initialized!

You may now begin working with Terraform. Try running "terraform plan" to see
any changes that are required for your infrastructure. All Terraform commands
should now work.

If you ever set or change modules or backend configuration for Terraform,
rerun this command to reinitialize your working directory. If you forget, other
commands will detect it and remind you to do so if necessary.

aws_key_pair.kubernetes: Creating...
### Creation logs are omitted

* error updating Security Group (sg-xxx): couldn't find resource
  on tf/main.tf line 34, in resource "aws_security_group" "nodes":
      34: resource "aws_security_group" "nodes" {

Steps to Reproduce

  1. terraform apply the configuration from above

  2. Observe that, on a heavily used AWS account, the apply may fail with the following error:

* error updating Security Group (sg-xxx): couldn't find resource
  on tf/main.tf line 34, in resource "aws_security_group" "nodes":
      34: resource "aws_security_group" "nodes" {

Could cloud provider request limits and throttling also lead to this error?
Can this issue be related to the eventual consistency model of the AWS EC2 API (hence related to #16796)?

Important Factoids

References

  • #0000
@github-actions github-actions bot added needs-triage Waiting for first response or review from a maintainer. service/ec2 Issues and PRs that pertain to the ec2 service. service/iam Issues and PRs that pertain to the iam service. labels Nov 4, 2021
@justinretzolk
Copy link
Member

Hey @ialidzhikov 👋 Thank you for taking the time to file this issue. Unfortunately, it looks like the logs you provided cut off some of the more relevant log lines. Would it be possible to include any additional logging around at least the aws_security_group.nodes resource, redacting as necessary?

@justinretzolk justinretzolk added waiting-response Maintainers are waiting on response from community or contributor. and removed needs-triage Waiting for first response or review from a maintainer. labels Nov 4, 2021
@ialidzhikov
Copy link
Contributor Author

Unfortunately, we don't have all logs from the terraform apply output but only the final error. :(

@github-actions github-actions bot removed the waiting-response Maintainers are waiting on response from community or contributor. label Nov 8, 2021
@alewando
Copy link
Contributor

alewando commented Nov 8, 2021

Just to add to this, we've seen a notable increase in the number of "eventual consistency" errors when creating security groups in the past few weeks.

@justinretzolk justinretzolk added the bug Addresses a defect in current functionality. label Nov 10, 2021
@matthewduren
Copy link

We have been experiencing this issue fairly heavily since late October as well. If there are any logs that would help diagnose the race condition here, I'm happy to provide them.

@matthewduren
Copy link

To provide some more information here: we are also seeing what is presumably the same issue when running applies. Terraform fails to read the existing security group into state during a refresh, then tries to create a named security group that already exists, which results in a duplicate error:

Error: Error creating Security Group: InvalidGroup.Duplicate: The security group 'name-goes-here' already exists for VPC 'vpc-abcd1234'
	status code: 400, request id: 

@ellisroll-b
Copy link

Another form of this error for security groups: resource/aws_security_group: error reading Security Group (sg-xxx): couldn't find resource.
Installed hashicorp/aws v3.65.0 (signed by HashiCorp)

@joelthompson
Copy link
Contributor

Comparing our CloudTrail logs to the provider source code, I think I know what's going on here. After creating a new SG, the provider calls ec2:DescribeSecurityGroups and waits for a single successful response:

// Wait for the security group to truly exist
group, err := WaitSecurityGroupCreated(conn, d.Id(), d.Timeout(schema.TimeoutCreate))
if err != nil {
return fmt.Errorf(
"Error waiting for Security Group (%s) to become available: %w",
d.Id(), err)
}

Then, resourceSecurityGroupCreate returns with a call to resourceSecurityGroupUpdate:

return resourceSecurityGroupUpdate(d, meta)

The update method calls FindSecurityGroupByID, which then makes its own ec2:DescribeSecurityGroups call:

group, err := FindSecurityGroupByID(conn, d.Id())
if err != nil {
return fmt.Errorf("error updating Security Group (%s): %w", d.Id(), err)
}

This subsequent call to ec2:DescribeSecurityGroups fails with a group not found error, and it produces the "error updating Security Group" error message described in the original post.

There's another eventual consistency bug in creating security groups as well. If this first call to FindSecurityGroupByID succeeds, then at the end of resourceSecurityGroupCreate it returns a call to resourceSecurityGroupRead:

return resourceSecurityGroupRead(d, meta)

And this then does nearly the exact same thing as the Update, just with a slightly different error message:

sg, err := FindSecurityGroupByID(conn, d.Id())
var nfe *resource.NotFoundError
if !d.IsNewResource() && errors.As(err, &nfe) {
log.Printf("[WARN] Security group (%s) not found, removing from state", d.Id())
d.SetId("")
return nil
}
if err != nil {
return fmt.Errorf("error reading Security Group (%s): %w", d.Id(), err)
}

So if this last call to FindSecurityGroupByID fails with a resource not found error, then we'll get an "error reading Security Group" error message rather than "error updating Security Group," which is also something we're seeing when attempting to create new SGs.

Ultimately, these subsequent attempts to read the SG out of AWS shouldn't just assume that they will get a successful response on newly created resources but instead should add retries (or even just skip making these extra AWS API calls and instead reuse the response when verifying the resource was actually created).

fomichevmi added a commit to fomichevmi/terraform-provider-aws that referenced this issue Jan 5, 2022
fomichevmi added a commit to fomichevmi/terraform-provider-aws that referenced this issue Jan 5, 2022
Fix for the bug described in hashicorp#21628

Update default_security_group.go
fomichevmi added a commit to fomichevmi/terraform-provider-aws that referenced this issue Jan 5, 2022
fomichevmi added a commit to fomichevmi/terraform-provider-aws that referenced this issue Jan 7, 2022
Fix for hashicorp#21628

Fix updating Security Group error

Fix for hashicorp#21628

Another attempt to fix
fomichevmi added a commit to fomichevmi/terraform-provider-aws that referenced this issue Jan 7, 2022
Fix reading security group right after creation (hashicorp#21628)
fomichevmi added a commit to fomichevmi/terraform-provider-aws that referenced this issue Jan 7, 2022
Fix reading security group right after creation (hashicorp#21628)
@github-actions github-actions bot added this to the v3.73.0 milestone Jan 18, 2022
@github-actions
Copy link

This functionality has been released in v3.73.0 of the Terraform AWS Provider. Please see the Terraform documentation on provider versioning or reach out if you need any assistance upgrading.

For further feature requests or bug reports with this functionality, please create a new GitHub issue following the template. Thank you!

@github-actions
Copy link

I'm going to lock this issue because it has been closed for 30 days ⏳. This helps our maintainers find and focus on the active issues.
If you have found a problem that seems similar to this, please open a new issue and complete the issue template so we can capture all the details necessary to investigate further.

@github-actions github-actions bot locked as resolved and limited conversation to collaborators May 18, 2022
Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.
Labels
bug Addresses a defect in current functionality. service/ec2 Issues and PRs that pertain to the ec2 service. service/iam Issues and PRs that pertain to the iam service.
Projects
None yet
6 participants