DevOps, IaC

Terraform DatadogでAPI監視・自動復旧

Datadog

 

 

そのままでは動かないので参考までに。

 

DatadogのSynthetics(APIテスト)でAPIの外形監視を行い、テストが失敗した場合(ステータスコードが200以外、または応答時間が2000ms以上)は、Datadog(Webhook) -> Lambda(関数URL) -> SSM(Run Command) -> EC2 の経路でEC2上のNginxを再起動します。

 

Datadog側 Terraform

 

terraform.tfvars

# Datadog API / application keys (the values shown here are placeholders)
datadog_api_key = "xxxxx"
datadog_app_key = "yyyyy"

# Health-check URL for SampleApp
health_check_url_sample_app = "https://example.net/login"
# Slack notification target; referenced in the monitor message as "@<value>"
slack_channel_name             = "slack-datadog通知"

 

variables.tf

# Datadog credentials read by the "datadog" provider block.
# Marked sensitive so Terraform redacts them from plan/apply output.
variable "datadog_api_key" {
  type      = string
  sensitive = true
}
variable "datadog_app_key" {
  type      = string
  sensitive = true
}

# URL probed by the SampleApp Synthetics API test.
variable "health_check_url_sample_app" {
  type = string
}

# Notification handle appended after "@" in the Synthetics test message.
variable "slack_channel_name" {
  type = string
}

 

provider.tf

# Terraform settings: pinned CLI version, S3 remote state backend, and required providers.
terraform {
  required_version = "= 1.1.1"
  backend "s3" {
    bucket = "sampleapp-terraform-xxxxx"
    region = "ap-northeast-1"
    # The key must be unique per environment
    key     = "datadog/terraform.tfstate"
    profile = "terraform-local-deployer"
  }
  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "4.10.0"
    }
    # NOTE(review): unlike aws above, no version constraint is given for the Datadog provider.
    datadog = {
      source  = "DataDog/datadog"
    }
  }
}
# Datadog provider; both keys are taken from input variables (set in terraform.tfvars).
provider "datadog" {
  api_key = var.datadog_api_key
  app_key = var.datadog_app_key
}

 

synthetics_test.tf

# Synthetics HTTP API test that probes the SampleApp health-check URL and
# notifies Slack plus the Nginx-restart webhook.
resource "datadog_synthetics_test" "sample_app" {
  type    = "api"
  subtype = "http"
  request_definition {
    method = "GET"
    url    = var.health_check_url_sample_app
  }
  # The test passes only when the endpoint answers 200 ...
  assertion {
    type     = "statusCode"
    operator = "is"
    target   = "200"
  }
  # ... and responds in under 2000 (milliseconds per the Datadog provider docs).
  assertion {
    type     = "responseTime"
    operator = "lessThan"
    target   = "2000"
  }
  locations = ["aws:ap-northeast-1"]
  options_list {
    # Run the check every minute (tick_every = 60)
    tick_every = 60
    # Retry a failed run before the location is considered failed.
    retry {
      count    = 2
      interval = 300
    }
    monitor_options {
      renotify_interval = 60
    }
  }
  name = "An API test on ${var.health_check_url_sample_app} [SampleApp]"
  # Slack is notified on every state change, but the restart webhook is wrapped in
  # {{#is_alert}} so that a recovery notification does not trigger another Nginx
  # restart right after the service came back.
  message = "API TEST on ${var.health_check_url_sample_app} @${var.slack_channel_name} {{#is_alert}}@webhook-${datadog_webhook.restart_nginx_ec2.name}{{/is_alert}} [SampleApp]"
  tags    = ["env:production", "service:proxy"]
  status  = "live"
}

 

webhook.tf

# Datadog webhook that posts a JSON payload to the Lambda function URL.
# It is referenced from the Synthetics test message as "@webhook-<name>".
# The $NAME placeholders in the payload are not Terraform interpolations
# (Terraform only expands ${...}); they are sent to Datadog as written.
resource "datadog_webhook" "restart_nginx_ec2" {
  name           = "restart-nginx-ec2-production"
  url            = "https://xxxxx.lambda-url.ap-northeast-1.on.aws/"
  encode_as      = "json"
  payload        = <<EOF
{
    "body": "$EVENT_MSG",
    "last_updated": "$LAST_UPDATED",
    "event_type": "$EVENT_TYPE",
    "title": "$EVENT_TITLE",
    "date": "$DATE",
    "org": {
        "id": "$ORG_ID",
        "name": "$ORG_NAME"
    },
    "id": "$ID"
}
EOF
}

 

 

 

 

AWS側 Terraform

 

# For Lambda: role used to operate EC2 through SSM
# Trust policy is loaded from policy/assume-lambda.json.
resource "aws_iam_role" "lambda_ssm_role" {
  name               = "lambda-ssm-role-${var.ENV_VALUE_ENVIRONMENT}"
  assume_role_policy = file("${path.module}/policy/assume-lambda.json")
}
# Permission policy for the role above, loaded from policy/lambda-ssm.json.
resource "aws_iam_policy" "lambda_ssm_role_policy" {
  name   = "lambda-ssm-policy-${var.ENV_VALUE_ENVIRONMENT}"
  policy = file("${path.module}/policy/lambda-ssm.json")
}
# Attach the permission policy to the Lambda role.
resource "aws_iam_role_policy_attachment" "lambda_ssm_role_attach_policy" {
  role       = aws_iam_role.lambda_ssm_role.name
  policy_arn = aws_iam_policy.lambda_ssm_role_policy.arn
}

# For EC2
# Trust policy is loaded from policy/assume-ec2.json.
resource "aws_iam_role" "sample_app_ec2_role" {
  name               = "sample_app-ec2-role-${var.ENV_VALUE_ENVIRONMENT}"
  assume_role_policy = file("${path.module}/policy/assume-ec2.json")
}
# Permission policy for the EC2 role, loaded from policy/sample-app-ec2.json.
resource "aws_iam_policy" "sample_app_ec2_role_policy" {
  name   = "sample_app-ec2-policy-${var.ENV_VALUE_ENVIRONMENT}"
  policy = file("${path.module}/policy/sample-app-ec2.json")
}
# Attach the permission policy to the EC2 role.
resource "aws_iam_role_policy_attachment" "sample_app_ec2_role_attach_policy" {
  role       = aws_iam_role.sample_app_ec2_role.name
  policy_arn = aws_iam_policy.sample_app_ec2_role_policy.arn
}
# EC2 instance profile
# Wraps the sample_app EC2 role so it can be attached to an instance.
resource "aws_iam_instance_profile" "ec2_profile" {
  name = "sample_app-ec2-profile-${var.ENV_VALUE_ENVIRONMENT}"
  role = aws_iam_role.sample_app_ec2_role.name
}

 

assume-lambda.json

{
    "Version": "2012-10-17",
    "Statement": [
      {
        "Effect": "Allow",
        "Principal": {
          "Service": "lambda.amazonaws.com"
        },
        "Action": "sts:AssumeRole"
      }
    ]
  }

 

lambda-ssm.json

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "cloudwatch:PutMetricData",
                "ds:CreateComputer",
                "ds:DescribeDirectories",
                "ec2:DescribeInstanceStatus",
                "logs:*",
                "ssm:*",
                "ec2messages:*"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": "iam:CreateServiceLinkedRole",
            "Resource": "arn:aws:iam::*:role/aws-service-role/ssm.amazonaws.com/AWSServiceRoleForAmazonSSM*",
            "Condition": {
                "StringLike": {
                    "iam:AWSServiceName": "ssm.amazonaws.com"
                }
            }
        },
        {
            "Effect": "Allow",
            "Action": [
                "iam:DeleteServiceLinkedRole",
                "iam:GetServiceLinkedRoleDeletionStatus"
            ],
            "Resource": "arn:aws:iam::*:role/aws-service-role/ssm.amazonaws.com/AWSServiceRoleForAmazonSSM*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "ssmmessages:CreateControlChannel",
                "ssmmessages:CreateDataChannel",
                "ssmmessages:OpenControlChannel",
                "ssmmessages:OpenDataChannel"
            ],
            "Resource": "*"
        }
    ]
}

 

 

assume-ec2.json

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "Service": "ec2.amazonaws.com"
            },
            "Action": "sts:AssumeRole"
        }
    ]
}

 

 

sample-app-ec2.json

{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Action": [
                "logs:*"
            ],
            "Effect": "Allow",
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "ssm:DescribeAssociation",
                "ssm:GetDeployablePatchSnapshotForInstance",
                "ssm:GetDocument",
                "ssm:DescribeDocument",
                "ssm:GetManifest",
                "ssm:GetParameters",
                "ssm:ListAssociations",
                "ssm:ListInstanceAssociations",
                "ssm:PutInventory",
                "ssm:PutComplianceItems",
                "ssm:PutConfigurePackageResult",
                "ssm:UpdateAssociationStatus",
                "ssm:UpdateInstanceAssociationStatus",
                "ssm:UpdateInstanceInformation"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "ssmmessages:CreateControlChannel",
                "ssmmessages:CreateDataChannel",
                "ssmmessages:OpenControlChannel",
                "ssmmessages:OpenDataChannel"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "ec2messages:AcknowledgeMessage",
                "ec2messages:DeleteMessage",
                "ec2messages:FailMessage",
                "ec2messages:GetEndpoint",
                "ec2messages:GetMessages",
                "ec2messages:SendReply"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "cloudwatch:PutMetricData"
            ],
            "Resource": "*"
        },
        {
            "Effect": "Allow",
            "Action": [
                "ec2:DescribeInstanceStatus"
            ],
            "Resource": "*"
        }
    ]
}

 

lambda.tf

# Function that restarts Nginx on EC2
# Zips the restartNginxEc2 source directory into the upload directory.
data "archive_file" "restart_nginx_ec2" {
  type        = "zip"
  source_dir = "${path.module}/../../src/${var.ENV_VALUE_ENVIRONMENT}/${var.lambda_repository_name}/lambda/functions/restartNginxEc2"
  output_path = "${path.module}/../../upload/${var.ENV_VALUE_ENVIRONMENT}/${var.lambda_repository_name}/lambda/functions/restartNginxEc2.zip"
}
# Lambda function deployed from the zipped restartNginxEc2 sources.
# Entry point is lambda_handler in function.py.
resource "aws_lambda_function" "restart_nginx_ec2" {
  # Direct references instead of interpolation-only strings ("${...}").
  filename         = data.archive_file.restart_nginx_ec2.output_path
  function_name    = "restart-nginx-ec2-${var.ENV_VALUE_ENVIRONMENT}"
  role             = var.lambda_ssm_role_arn
  handler          = "function.lambda_handler"
  # Changes whenever the archive content changes, which triggers a redeploy.
  source_code_hash = data.archive_file.restart_nginx_ec2.output_base64sha256
  runtime          = "python3.9"

  memory_size = 128
  timeout     = 60
  environment {
    variables = {
      # ID of the target EC2 instance
      EC2_INSTANCE_ID = var.lambda_restart_nginx_ec2_target_ec2_id
    }
  }
  tags = {
    # true for every environment except "develop"
    is_datadog_enable = (var.ENV_VALUE_ENVIRONMENT != "develop")
    role              = var.TAG_ROLE_LAMBDA
  }
}

# Public HTTPS endpoint for the restart function.
# NOTE(review): authorization_type = "NONE" means the URL accepts unauthenticated
# requests, so anyone who learns the URL can invoke the function. Consider having
# the caller send a shared secret that the function verifies.
resource "aws_lambda_function_url" "restart_nginx_ec2" {
  function_name      = aws_lambda_function.restart_nginx_ec2.function_name
  authorization_type = "NONE"
}

 

 

Lambdaリポジトリ

 

{リポジトリ名}/lambda/functions/restartNginxEc2/function.py

import boto3
import logging
import os


logger = logging.getLogger()
logger.setLevel(logging.INFO)

EC2_INSTANCE_ID = os.environ['EC2_INSTANCE_ID']

def lambda_handler(event, context):
    """Restart Nginx on the configured EC2 instance via SSM Run Command.

    Sends the ``AWS-RunShellScript`` document with ``systemctl restart nginx``
    to the instance whose ID is held in ``EC2_INSTANCE_ID``.

    Args:
        event: Invocation payload; not inspected.
        context: Lambda context object; not inspected.

    Returns:
        dict: ``{'message': "script completed."}``. The same value is returned
        whether or not the HTTP status check below passes; a non-200 status
        is only logged.
    """
    ssm = boto3.client('ssm')
    res = ssm.send_command(
        InstanceIds=[EC2_INSTANCE_ID],
        DocumentName="AWS-RunShellScript",
        Parameters={
            "commands": [
                "systemctl restart nginx"
            ],
            "executionTimeout": ["60"]
        },
    )

    if res['ResponseMetadata']['HTTPStatusCode'] != 200:
        # Best-effort: report the failure but still answer the caller.
        logger.error('FAILED to execute ssm.send_command().')

    # The dict must start on the same line as ``return``; a bare ``return``
    # followed by a dict literal on the next line returns None.
    return {
        'message': "script completed."
    }

 

 

 

 

Amazonおすすめ

iPad 9世代 2021年最新作

iPad 9世代出たから買い替え。安いぞ!🐱 初めてならiPad。Kindleを外で見るならiPad mini。ほとんどの人には通常のiPadをおすすめします><

コメントを残す

メールアドレスが公開されることはありません。 * が付いている欄は必須項目です

日本語が含まれない投稿は無視されますのでご注意ください。(スパム対策)