我正在尝试使用 AWS 提供的 CloudFormation ECS 服务模板建立一个新的 ECS 集群这里作为指导。我的 ECS 实例在 AutoScaling 组内启动,但随后无法通过健康检查并且总是被终止。
输出实际上并没有告诉我哪些检查失败或为什么失败。
我使用的 CloudFormation 代码与 AWS 文档中提供的常用代码基本相同。我添加了一个具有更广泛权限的安全组(这样我可以在迭代过程中通过 SSH 进入),并将 AMI 更新为 us-east-1 中针对 ECS 优化的 Amazon Linux 的最新版本。
当前模板:
{
"AWSTemplateFormatVersion" : "2010-09-09",
"Description": "Deploys PoC ECS infrastructure.",
"Parameters" : {
"KeyName": {
"Description": "Name of an existing EC2 KeyPair to enable SSH access to the Elastic Beanstalk and Bastion hosts",
"Type": "String",
"MinLength": "1",
"MaxLength": "255",
"AllowedPattern": "[\\x20-\\x7E]*",
"ConstraintDescription": "can contain only ASCII characters.",
"Default": "smx-test-key"
},
"SubnetID": {
"Type": "List<AWS::EC2::Subnet::Id>",
"Description": "Select a default subnet ID."
},
"DesiredCapacity": {
"Type": "Number",
"Default" : "1",
"Description": "Number of instances to launch in your ECS cluster."
},
"MaxSize": {
"Type": "Number",
"Default" : "1",
"Description": "Maximum number of instances that can be launched in your ECS cluster."
},
"ECSInstanceType": {
"Description": "The type of instance to use for ECS app servers",
"Type": "String",
"Default": "t2.micro",
"AllowedValues": ["t2.micro", "t2.small", "t2.medium", "t2.large", "m3.medium", "m3.large", "m3.xlarge" ]
},
"SSHLocation" : {
"Description" : " The IP address range that can be used to SSH to the EC2 instances.",
"Type": "String",
"MinLength": "9",
"MaxLength": "18",
"Default": "0.0.0.0/0",
"AllowedPattern": "(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})\\.(\\d{1,3})/(\\d{1,2})",
"ConstraintDescription": "must be a valid IP CIDR range of the form x.x.x.x/x."
}
},
"Mappings" : {
"AWSRegionToAMI" : {
"us-east-1" : { "AMIID" : "ami-5d1b984a" }
}
},
"Resources" : {
"ECSCluster": {
"Type": "AWS::ECS::Cluster"
},
"taskdefinition": {
"Type": "AWS::ECS::TaskDefinition",
"Properties" : {
"ContainerDefinitions" : [
{
"Name": "simple-app",
"Cpu": "10",
"Essential": "true",
"Image":"httpd:2.4",
"Memory":"300",
"MountPoints": [{
"ContainerPath": "/usr/local/apache2/htdocs",
"SourceVolume": "my-vol"
}],
"PortMappings": [
{ "HostPort": 80, "ContainerPort": 80 }
]
},
{
"Name": "busybox",
"Cpu": 10,
"Command": [
"/bin/sh -c \"while true; do echo '<html> <head> <title>Amazon ECS Sample App</title> <style>body {margin-top: 40px; background-color: #333;} </style> </head><body> <div style=color:white;text-align:center> <h1>Amazon ECS Sample App</h1> <h2>Congratulations!</h2> <p>Your application is now running on a container in Amazon ECS.</p>' > top; /bin/date > date ; echo '</div></body></html>' > bottom; cat top date bottom > /usr/local/apache2/htdocs/index.html ; sleep 1; done\""
],
"EntryPoint": [ "sh", "-c"],
"Essential": false,
"Image": "busybox",
"Memory": 200,
"VolumesFrom": [
{
"SourceContainer": "simple-app"
}
]
}
],
"Volumes": [
{ "Name": "my-vol" }
]
}
},
"EcsElasticLoadBalancer" : {
"Type" : "AWS::ElasticLoadBalancing::LoadBalancer",
"Properties" : {
"Subnets" : { "Ref" : "SubnetID" },
"Listeners" : [ {
"LoadBalancerPort" : "80",
"InstancePort" : "80",
"Protocol" : "HTTP"
} ],
"HealthCheck" : {
"Target" : "HTTP:80/",
"HealthyThreshold" : "2",
"UnhealthyThreshold" : "10",
"Interval" : "30",
"Timeout" : "5"
}
}
},
"ECSAutoScalingGroup" : {
"Type" : "AWS::AutoScaling::AutoScalingGroup",
"Properties" : {
"VPCZoneIdentifier" : { "Ref" : "SubnetID" },
"LaunchConfigurationName" : { "Ref" : "ContainerInstances" },
"MinSize" : "1",
"MaxSize" : { "Ref" : "MaxSize" },
"DesiredCapacity" : { "Ref" : "DesiredCapacity" }
},
"CreationPolicy" : {
"ResourceSignal" : {
"Timeout" : "PT60M"
}
},
"UpdatePolicy": {
"AutoScalingRollingUpdate": {
"MinInstancesInService": "1",
"MaxBatchSize": "1",
"PauseTime" : "PT60M",
"WaitOnResourceSignals": "true"
}
}
},
"ContainerInstances": {
"Type": "AWS::AutoScaling::LaunchConfiguration",
"Metadata" : {
"AWS::CloudFormation::Init" : {
"config" : {
"commands" : {
"01_add_instance_to_cluster" : {
"command" : { "Fn::Join": [ "", [ "#!/bin/bash\n", "echo ECS_CLUSTER=", { "Ref": "ECSCluster" }, " >> /etc/ecs/ecs.config" ] ] }
}
},
"files" : {
"/etc/cfn/cfn-hup.conf" : {
"content" : { "Fn::Join" : ["", [
"[main]\n",
"stack=", { "Ref" : "AWS::StackId" }, "\n",
"region=", { "Ref" : "AWS::Region" }, "\n"
]]},
"mode" : "000400",
"owner" : "root",
"group" : "root"
},
"/etc/cfn/hooks.d/cfn-auto-reloader.conf" : {
"content": { "Fn::Join" : ["", [
"[cfn-auto-reloader-hook]\n",
"triggers=post.update\n",
"path=Resources.ContainerInstances.Metadata.AWS::CloudFormation::Init\n",
"action=/opt/aws/bin/cfn-init -v ",
" --stack ", { "Ref" : "AWS::StackName" },
" --resource ContainerInstances ",
" --region ", { "Ref" : "AWS::Region" }, "\n",
"runas=root\n"
]]}
}
},
"services" : {
"sysvinit" : {
"cfn-hup" : { "enabled" : "true", "ensureRunning" : "true", "files" : ["/etc/cfn/cfn-hup.conf", "/etc/cfn/hooks.d/cfn-auto-reloader.conf"] }
}
}
}
}
},
"Properties": {
"ImageId" : { "Fn::FindInMap" : [ "AWSRegionToAMI", { "Ref" : "AWS::Region" }, "AMIID" ] },
"InstanceType" : { "Ref" : "ECSInstanceType" },
"IamInstanceProfile": { "Ref": "EC2InstanceProfile" },
"KeyName" : { "Ref" : "KeyName" },
"SecurityGroups": { "Ref" : "ECSSecurityGroup" },
"UserData" : { "Fn::Base64" : { "Fn::Join" : ["", [
"#!/bin/bash -xe\n",
"yum install -y aws-cfn-bootstrap\n",
"/opt/aws/bin/cfn-init -v ",
" --stack ", { "Ref" : "AWS::StackName" },
" --resource ContainerInstances ",
" --region ", { "Ref" : "AWS::Region" }, "\n",
"/opt/aws/bin/cfn-signal -e $? ",
" --stack ", { "Ref" : "AWS::StackName" },
" --resource ECSAutoScalingGroup ",
" --region ", { "Ref" : "AWS::Region" }, "\n"
]]}},
"Tags" : [ {"Key" : "Name", "Value" : "ECS autoscaling instance"} ]
}
},
"service": {
"Type": "AWS::ECS::Service",
"DependsOn": ["ECSAutoScalingGroup"],
"Properties" : {
"Cluster": {"Ref": "ECSCluster"},
"DesiredCount": "1",
"LoadBalancers": [
{
"ContainerName": "simple-app",
"ContainerPort": "80",
"LoadBalancerName" : { "Ref" : "EcsElasticLoadBalancer" }
}
],
"Role" : {"Ref":"ECSServiceRole"},
"TaskDefinition" : {"Ref":"taskdefinition"}
}
},
"ECSServiceRole": {
"Type": "AWS::IAM::Role",
"Properties": {
"AssumeRolePolicyDocument": {
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": [
"ecs.amazonaws.com"
]
},
"Action": [
"sts:AssumeRole"
]
}
]
},
"Path": "/",
"Policies": [
{
"PolicyName": "ecs-service",
"PolicyDocument": {
"Statement": [
{
"Effect": "Allow",
"Action": [
"elasticloadbalancing:Describe*",
"elasticloadbalancing:DeregisterInstancesFromLoadBalancer",
"elasticloadbalancing:RegisterInstancesWithLoadBalancer",
"ec2:Describe*",
"ec2:AuthorizeSecurityGroupIngress"
],
"Resource": "*"
}
]
}
}
]
}
},
"EC2Role": {
"Type": "AWS::IAM::Role",
"Properties": {
"AssumeRolePolicyDocument": {
"Statement": [
{
"Effect": "Allow",
"Principal": {
"Service": [
"ec2.amazonaws.com"
]
},
"Action": [
"sts:AssumeRole"
]
}
]
},
"Path": "/",
"Policies": [
{
"PolicyName": "ecs-service",
"PolicyDocument": {
"Statement": [
{
"Effect": "Allow",
"Action": [
"ecs:CreateCluster",
"ecs:DeregisterContainerInstance",
"ecs:DiscoverPollEndpoint",
"ecs:Poll",
"ecs:RegisterContainerInstance",
"ecs:StartTelemetrySession",
"ecs:Submit*",
"logs:CreateLogStream",
"logs:PutLogEvents"
],
"Resource": "*"
}
]
}
}
]
}
},
"EC2InstanceProfile": {
"Type": "AWS::IAM::InstanceProfile",
"Properties": {
"Path": "/",
"Roles": [ { "Ref": "EC2Role" } ]
}
}
"ECSSecurityGroup" : {
"Type" : "AWS::EC2::SecurityGroup",
"Properties" : {
"GroupDescription" : "Fortigate recommended settings. See marketplace for docs.",
"VpcId" : { "Ref" : "VPC" },
"SecurityGroupIngress" : [
{ "IpProtocol" : "tcp", "FromPort" : "22", "ToPort" : "22", "CidrIp" : "0.0.0.0/0" },
{ "IpProtocol" : "tcp", "FromPort" : "80", "ToPort" : "80", "CidrIp" : "0.0.0.0/0" },
{ "IpProtocol" : "icmp", "FromPort" : "-1", "ToPort" : "-1", "CidrIp" : {"Fn::GetAtt" : [ "VPC" , "CidrBlock" ]}}
],
"Tags" : [ {"Key" : "Name", "Value" : "ECS Security Group"} ]
}
}
},
"Outputs" : {
"ecsservice" : {
"Value" : { "Ref" : "service" }
},
"ecscluster" : {
"Value" : { "Ref" : "ECSCluster" }
},
"taskdef" : {
"Value" : { "Ref" : "taskdefinition" }
}
}
}
当我创建堆栈时,AutoScaling 组之前的所有内容都已完成。AS 组已创建,实例已启动。但随后运行状况检查失败,实例终止,堆栈回滚。CloudFormation 显示自动缩放组的创建失败,并显示收到 1 个 SUCCESS 信号中的 0 个。无法满足 100% MinSuccessfulInstancesPercent 要求。
迄今为止采取的故障排除步骤:
- 将 AMI 映射回滚到其示例结果中列出的版本,结果仍旧失败。所以问题不是出在这里。
- 将堆栈设置为更长的超时时间(60 分钟)以便通过 SSH 进入并开始探索实例。
- 看起来 CloudFormation Init 不是从 AWS::AutoScaling::LaunchConfiguration 资源运行的。/etc/ecs/ecs.config不存在,其他文件缺失或缺少配置。但 LaunchConfiguration 资源在 CloudFormation 控制台事件日志中似乎已完成。
几个问题:
- 在我们解决这个问题之前,我想我应该问一下是否有一个更新的、经过验证的工作模板发布在某处?
- “ContainerInstances”资源的 AWS::CloudFormation::Init 代码看起来正常吗?
- AutoScaling 组本身看起来还好吗?
- 我怎样才能最好地解决这些配置问题?
答案1
使用此用户数据:
“UserData”:{“Fn::Base64”:{“Fn::Join”:[“”,[“#!/bin/bash -xe\n”,“sudo mkdir /etc/ecs/\n”,“sudo chmod 777 /etc/ecs/\n”,“echo ECS_CLUSTER=",{“Ref”:“ECSCluster”},“>> /etc/ecs/ecs.config\n”,“yum install -y aws-cfn-bootstrap\n”,“/opt/aws/bin/cfn-signal -e $?”,“--stack”,{“Ref”:“AWS::StackName”},“--resource ECSAutoScalingGroup”,“--region”,{“Ref”:“AWS::Region”},“\n”]]}
并将“cloudformation:SignalResource”添加到EC2Role Policy文档。