Terraform 自动化

概述

HashiCorp Terraform 是开源的基础设施即代码(Infrastructure as Code, IaC)工具,通过声明式配置文件管理云资源。支持 AWS、Azure、GCP、阿里云等 100+ Provider,以 HCL(HashiCorp Configuration Language)为配置语言,支持状态管理、计划预览、并行执行、资源图谱等特性。

核心特性:

  • 声明式配置 — 描述"要什么"而非"如何做"
  • 执行计划 — terraform plan 预览变更,防止误操作
  • 状态管理 — 跟踪资源实际状态,支持远程状态存储(S3、DynamoDB 锁)
  • 模块化 — Module 复用基础设施模式
  • 多云支持 — 统一工具管理多云资源
  • 团队协作 — 工作区(Workspace)、锁机制防止并发冲突

安装与配置

Linux/macOS 安装


# 直接下载二进制(推荐)
curl -fsSL https://releases.hashicorp.com/terraform/1.6.6/terraform_1.6.6_linux_amd64.zip -o /tmp/terraform.zip
sudo unzip /tmp/terraform.zip -d /usr/local/bin/
terraform version

# 或使用 tfenv 管理多版本
brew install tfenv
tfenv install 1.6.6
tfenv use 1.6.6

Docker 使用


# 运行 Terraform 容器(别名)
alias terraform='docker run --rm -it -v $(pwd):/workspace -w /workspace hashicorp/terraform:1.6.6'

# 持久化配置和插件
docker run --rm -it \
  -v ~/.aws:/root/.aws:ro \
  -v $(pwd):/workspace \
  -w /workspace \
  -e AWS_PROFILE=prod \
  hashicorp/terraform:1.6.6 init

AWS Provider 配置


# versions.tf
terraform {
  required_version = ">= 1.6.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
  }

  # 远程状态存储(S3 + DynamoDB 锁)
  backend "s3" {
    bucket         = "prod-terraform-state"
    key            = "network/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-locks"
  }
}

provider "aws" {
  region = "us-east-1"

  # 多账号配置
  alias = "prod"

  default_tags {
    tags = {
      Environment = "production"
      ManagedBy   = "terraform"
      Project     = "opsdocs"
    }
  }
}

核心语法

资源定义


# VPC 创建
resource "aws_vpc" "prod" {
  cidr_block           = "10.0.0.0/16"
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = {
    Name = "prod-vpc"
  }
}

# 子网
resource "aws_subnet" "prod_private_1" {
  vpc_id                  = aws_vpc.prod.id
  cidr_block              = "10.0.1.0/24"
  availability_zone       = "us-east-1a"
  map_public_ip_on_launch = false

  tags = {
    Name = "prod-private-subnet-az1"
    Tier = "private"
  }
}

resource "aws_subnet" "prod_public_1" {
  vpc_id                  = aws_vpc.prod.id
  cidr_block              = "10.0.2.0/24"
  availability_zone       = "us-east-1a"
  map_public_ip_on_launch = true

  tags = {
    Name = "prod-public-subnet-az1"
    Tier = "public"
  }
}

# 互联网网关
resource "aws_internet_gateway" "prod" {
  vpc_id = aws_vpc.prod.id

  tags = {
    Name = "prod-igw"
  }
}

# 路由表
resource "aws_route_table" "prod_public" {
  vpc_id = aws_vpc.prod.id

  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.prod.id
  }

  tags = {
    Name = "prod-public-rt"
  }
}

resource "aws_route_table_association" "prod_public_1" {
  subnet_id      = aws_subnet.prod_public_1.id
  route_table_id = aws_route_table.prod_public.id
}

数据源(Data Source)


# 查询现有 AMI
data "aws_ami" "ubuntu" {
  most_recent = true
  owners      = ["099720109477"]  # Canonical

  filter {
    name   = "name"
    values = ["ubuntu/images/hvm-ssd/ubuntu-jammy-22.04-amd64-server-*"]
  }

  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }
}

# 查询可用区
data "aws_availability_zones" "available" {
  state = "available"
}

# 查询 IAM 角色
data "aws_iam_role" "ecs_task_role" {
  name = "ecsTaskExecutionRole"
}

变量与输出


# variables.tf
variable "environment" {
  type        = string
  description = "环境名称"
  default     = "prod"
}

variable "vpc_cidr" {
  type        = string
  description = "VPC CIDR 段"
  default     = "10.0.0.0/16"

  validation {
    # can() 本身返回 bool;原写法 can(...) != null 恒为 true,校验永远不会失败
    condition     = can(cidrhost(var.vpc_cidr, 0))
    error_message = "无效的 CIDR 格式。"
  }
}

variable "availability_zones" {
  type        = list(string)
  description = "可用区列表"
  default     = ["us-east-1a", "us-east-1b", "us-east-1c"]
}

variable "tags" {
  type        = map(string)
  description = "全局标签"
  default     = {}
}

# outputs.tf
output "vpc_id" {
  description = "VPC ID"
  value       = aws_vpc.prod.id
}

output "private_subnet_ids" {
  description = "私有子网 ID 列表"
  value       = [aws_subnet.prod_private_1.id]  # 文中仅定义了 _1;定义更多子网后依次追加
}

output "public_subnet_ids" {
  description = "公有子网 ID 列表"
  value       = [aws_subnet.prod_public_1.id]  # 文中仅定义了 _1;定义更多子网后依次追加
}

output "vpc_cidr" {
  description = "VPC CIDR"
  value       = aws_vpc.prod.cidr_block
}

循环与条件


# Count 循环(创建多个资源)
resource "aws_subnet" "private" {
  count = 3

  vpc_id                  = aws_vpc.prod.id
  cidr_block              = cidrsubnet(var.vpc_cidr, 8, count.index + 1)
  availability_zone       = data.aws_availability_zones.available.names[count.index]
  map_public_ip_on_launch = false

  tags = {
    Name = "private-subnet-${count.index + 1}"
  }
}

# for_each 循环(创建 Map 资源,更精细控制)
resource "aws_security_group" "app" {
  for_each = toset(["web", "api", "worker"])

  name        = "sg-${each.value}"
  description = "Security group for ${each.value}"
  vpc_id      = aws_vpc.prod.id

  ingress {
    from_port   = each.value == "web" ? 80 : (each.value == "api" ? 8080 : 9000)
    to_port     = each.value == "web" ? 80 : (each.value == "api" ? 8080 : 9000)
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = {
    Name = "sg-${each.value}"
  }
}

# 条件表达式
resource "aws_db_instance" "prod" {
  # ...
  multi_az               = var.enable_multi_az ? true : false
  backup_retention_period = var.environment == "prod" ? 14 : 1
}

本地资源(Local Values)


locals {
  common_tags = {
    Environment = var.environment
    Project     = "opsdocs"
    ManagedBy   = "terraform"
  }

  # 合并标签
  all_tags = merge(local.common_tags, var.tags)

  # 常用 CIDR 计算
  private_subnets = {
    az1 = cidrsubnet(var.vpc_cidr, 8, 1)
    az2 = cidrsubnet(var.vpc_cidr, 8, 2)
    az3 = cidrsubnet(var.vpc_cidr, 8, 3)
  }
}

resource "aws_vpc" "prod" {
  # ...
  tags = local.all_tags
}

模块(Module)


# modules/vpc/main.tf
variable "vpc_cidr" { type = string }
variable "environment" { type = string }

resource "aws_vpc" "main" {
  cidr_block = var.vpc_cidr
  tags       = { Name = "${var.environment}-vpc" }
}

output "vpc_id" {
  value = aws_vpc.main.id
}

# 调用模块
module "vpc" {
  source = "./modules/vpc"

  vpc_cidr     = "10.0.0.0/16"
  environment  = "prod"
}

状态管理

本地 vs 远程状态


# 本地状态(不推荐生产使用)
terraform {
  backend "local" {
    path = "terraform.tfstate"
  }
}

# 远程状态(S3 + DynamoDB 锁)
terraform {
  backend "s3" {
    bucket         = "prod-terraform-state"
    key            = "network/terraform.tfstate"
    region         = "us-east-1"
    encrypt        = true
    dynamodb_table = "terraform-locks"
  }
}

State 锁定表


# 创建 DynamoDB 锁表
aws dynamodb create-table \
  --table-name terraform-locks \
  --attribute-definitions AttributeName=LockID,AttributeType=S \
  --key-schema AttributeName=LockID,KeyType=HASH \
  --billing-mode PAY_PER_REQUEST

State 操作


# 查看当前 state
terraform state list

# 查看特定资源
terraform state show aws_vpc.prod

# 移动资源(资源重构时使用)
terraform state mv aws_vpc.prod aws_vpc.prod_v2

# 移除丢失的资源(实际已删除)
terraform state rm aws_instance.missing

# 拉取远程 state 到本地
terraform state pull > terraform.tfstate.backup

工作流命令

完整工作流


# 1. 初始化(下载 Provider、Module、初始化 Backend)
terraform init

# 2. 格式化配置文件
terraform fmt

# 3. 验证配置语法
terraform validate

# 4. 预览变更(必看!生产前必须 review)
terraform plan -var-file="prod.tfvars"

# 5. 应用变更
terraform apply -var-file="prod.tfvars"

# 6. 确认销毁(测试环境清理)
terraform destroy -var-file="test.tfvars"

变量传递


# 命令行变量(优先级最高)
terraform apply -var="environment=prod" -var="enable_multi_az=true"

# 文件变量
terraform apply -var-file="prod.tfvars"

# 自动加载顺序(后加载的覆盖先加载的;命令行 -var / -var-file 优先级最高)
# terraform.tfvars → terraform.tfvars.json → *.auto.tfvars → *.auto.tfvars.json(按文件名字母序)

# prod.tfvars
environment        = "prod"
vpc_cidr          = "10.0.0.0/16"
enable_multi_az   = true
instance_type     = "t3.medium"
disk_size         = 100
availability_zones = ["us-east-1a", "us-east-1b", "us-east-1c"]
tags = {
  CostCenter = "engineering"
  Owner      = "ops-team"
}

Import 现有资源


# 将已存在的 AWS 资源导入到 Terraform 管理
# 1. 先写好配置
resource "aws_vpc" "existing" {
  cidr_block = "10.1.0.0/16"
}

# 2. 执行 import
terraform import aws_vpc.existing vpc-0abcd1234efgh5678

# 生成 import 代码(Terraform 1.5+,需先在配置中声明 import 块)
terraform plan -generate-config-out=generated.tf

模块生态

常用 Module


# VPC Module(官方)
module "vpc" {
  source  = "terraform-aws-modules/vpc/aws"
  version = "~> 5.0"

  name = "prod-vpc"
  cidr = "10.0.0.0/16"

  azs             = ["us-east-1a", "us-east-1b", "us-east-1c"]
  private_subnets = ["10.0.1.0/24", "10.0.2.0/24", "10.0.3.0/24"]
  public_subnets  = ["10.0.10.0/24", "10.0.11.0/24", "10.0.12.0/24"]

  enable_nat_gateway     = true
  single_nat_gateway     = false
  enable_dns_hostnames   = true
  enable_dns_support     = true

  tags = {
    Environment = "prod"
  }
}

# RDS Module
module "rds" {
  source  = "terraform-aws-modules/rds/aws"
  version = "~> 6.0"

  identifier = "prod-mysql"

  engine               = "mysql"
  engine_version       = "8.0"
  family               = "mysql8.0"
  major_engine_version = "8.0"

  instance_class    = "db.r6g.large"
  allocated_storage = 100
  storage_encrypted = true

  multi_az               = true
  backup_retention_period = 7
  skip_final_snapshot     = false
  final_snapshot_identifier = "prod-mysql-final-snap"

  db_name  = "appdb"
  username = "admin"
  password = "YourSecurePassword123!"  # 生产使用 secretsmanager

  vpc_security_group_ids = [module.vpc.default_security_group_id]  # vpc 模块无 security_group_ids 输出

  tags = {
    Environment = "prod"
  }
}

# EKS Module
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 19.0"

  cluster_name    = "prod-eks"
  cluster_version = "1.28"

  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets

  eks_managed_node_groups = {
    system = {
      min_size       = 2
      max_size       = 10
      desired_size   = 3
      instance_types = ["t3.medium"]
      capacity_type = "SPOT"
    }
  }

  enable_cluster_creator_admin_permissions = true
}

Terragrunt(DRY 复用)


# prod/terraform.tfvars 目录
# terragrunt.hcl
generate "provider" {
  path      = "provider.tf"
  if_exists = "overwrite"
  contents  = <<EOF
provider "aws" {
  region = "us-east-1"
  default_tags {
    tags = {
      Environment = "prod"
      ManagedBy   = "terragrunt"
    }
  }
}
EOF
}

# 远程模块调用
generate "vpc" {
  path      = "vpc.tf"
  if_exists = "overwrite"
  contents  = <<EOF
module "vpc" {
  source = "git::https://github.com/terraform-aws-modules/terraform-aws-vpc.git?ref=v5.0.0"
  # ... 具体配置
}
EOF
}

inputs = {
  environment = "prod"
  # ...
}

CI/CD 集成

GitHub Actions


# .github/workflows/terraform.yml
name: Terraform

on:
  push:
    branches: [main]
    paths: ['terraform/**']
  pull_request:
    branches: [main]

env:
  TF_VERSION: '1.6.6'
  AWS_REGION: 'us-east-1'

jobs:
  terraform:
    name: Terraform Plan
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: ${{ env.TF_VERSION }}

      - name: Configure AWS Credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Terraform Init
        id: init
        working-directory: terraform
        run: |
          terraform init -upgrade

      - name: Terraform Format
        id: fmt
        working-directory: terraform
        run: terraform fmt -check -recursive

      - name: Terraform Validate
        id: validate
        working-directory: terraform
        run: terraform validate

      - name: Terraform Plan
        id: plan
        working-directory: terraform
        run: terraform plan -no-color
        env:
          TF_VAR_environment: prod

      - name: Update PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: '```\n${{ steps.plan.outputs.stdout }}\n```'
            })

      - name: Terraform Apply
        if: github.ref == 'refs/heads/main' && github.event_name == 'push'
        working-directory: terraform
        run: terraform apply -auto-approve -var-file="prod.tfvars"



## 完整实战项目

### 项目一:生产级 VPC 自动创建

以下配置创建完整的三层网络(公有子网 + 私有子网 + RDS 子网),包含 NAT网关、安全组和路由的完整生产级模板:

─── versions.tf ───────────────────────────────────────────────

terraform {

required_version = ">= 1.6.0"

required_providers {

aws = {

source = "hashicorp/aws"

version = "~> 5.0"

}

}

}

─── main.tf ──────────────────────────────────────────────────

VPC

resource "aws_vpc" "prod" {

cidr_block = var.vpc_cidr

enable_dns_hostnames = true

enable_dns_support = true

tags = merge(local.common_tags, {

Name = "${var.environment}-vpc"

})

}

公有子网(面向 Internet,用于 ALB/NAT Gateway)

resource "aws_subnet" "public" {

count = length(var.availability_zones)

vpc_id = aws_vpc.prod.id

cidr_block = cidrsubnet(var.vpc_cidr, 8, count.index)

availability_zone = var.availability_zones[count.index]

map_public_ip_on_launch = false # 公有子网不放实例,通过 NAT 出去

tags = merge(local.common_tags, {

Name = "${var.environment}-public-${var.availability_zones[count.index]}"

Tier = "public"

})

}

私有子网(应用层,EC2/ECS 所在)

resource "aws_subnet" "private_app" {

count = length(var.availability_zones)

vpc_id = aws_vpc.prod.id

cidr_block = cidrsubnet(var.vpc_cidr, 4, count.index + 1)

availability_zone = var.availability_zones[count.index]

map_public_ip_on_launch = false

tags = merge(local.common_tags, {

Name = "${var.environment}-private-app-${var.availability_zones[count.index]}"

Tier = "private_app"

})

}

数据子网(RDS/ElastiCache 所在,无外部路由)

resource "aws_subnet" "private_data" {

count = length(var.availability_zones)

vpc_id = aws_vpc.prod.id

cidr_block = cidrsubnet(var.vpc_cidr, 4, count.index + 10)

availability_zone = var.availability_zones[count.index]

map_public_ip_on_launch = false

tags = merge(local.common_tags, {

Name = "${var.environment}-private-data-${var.availability_zones[count.index]}"

Tier = "private_data"

})

}

─── 网络层 ───────────────────────────────────────────────────

NAT Gateway(放在公有子网,让私有子网可以上外网)

resource "aws_eip" "nat" {

count = 1 # 单 NAT 节省成本;生产高可用可改为每 AZ 一个(count = length(var.availability_zones))

domain = "vpc"

tags = local.common_tags

}

resource "aws_nat_gateway" "main" {

count = 1 # 与 aws_eip.nat 的 count 保持一致

subnet_id = aws_subnet.public[0].id

allocation_id = aws_eip.nat[0].id

tags = merge(local.common_tags, {

Name = "${var.environment}-nat"

})

}

私有子网路由表(通过 NAT Gateway 出公网)

resource "aws_route_table" "private_app" {

vpc_id = aws_vpc.prod.id

route {

cidr_block = "0.0.0.0/0"

nat_gateway_id = aws_nat_gateway.main[0].id

}

tags = merge(local.common_tags, {

Name = "${var.environment}-private-app-rt"

})

}

数据子网路由表(完全内网,无公网出口)

resource "aws_route_table" "private_data" {

vpc_id = aws_vpc.prod.id

# 无 0.0.0.0/0 路由,RDS/Redis 只能内网访问

tags = merge(local.common_tags, {

Name = "${var.environment}-private-data-rt"

})

}

路由表关联

resource "aws_route_table_association" "private_app" {

count = length(var.availability_zones)

subnet_id = aws_subnet.private_app[count.index].id

route_table_id = aws_route_table.private_app.id

}

resource "aws_route_table_association" "private_data" {

count = length(var.availability_zones)

subnet_id = aws_subnet.private_data[count.index].id

route_table_id = aws_route_table.private_data.id

}

─── 安全组 ──────────────────────────────────────────────────

resource "aws_security_group" "alb" {

name = "${var.environment}-alb-sg"

description = "Load Balancer 安全组"

vpc_id = aws_vpc.prod.id

ingress {

from_port = 443

to_port = 443

protocol = "tcp"

cidr_blocks = ["0.0.0.0/0"]

}

ingress {

from_port = 80

to_port = 80

protocol = "tcp"

cidr_blocks = ["0.0.0.0/0"]

}

egress {

from_port = 0

to_port = 0

protocol = "-1"

cidr_blocks = ["0.0.0.0/0"]

}

tags = merge(local.common_tags, { Name = "${var.environment}-alb-sg" })

}

resource "aws_security_group" "app" {

name = "${var.environment}-app-sg"

description = "应用层安全组(接受 ALB 流量)"

vpc_id = aws_vpc.prod.id

ingress {

from_port = 8080

to_port = 8080

protocol = "tcp"

security_groups = [aws_security_group.alb.id]

}

egress {

from_port = 0

to_port = 0

protocol = "-1"

cidr_blocks = ["0.0.0.0/0"]

}

tags = merge(local.common_tags, { Name = "${var.environment}-app-sg" })

}

resource "aws_security_group" "rds" {

name = "${var.environment}-rds-sg"

description = "RDS 安全组(只允许 App 层访问)"

vpc_id = aws_vpc.prod.id

ingress {

from_port = 3306

to_port = 3306

protocol = "tcp"

security_groups = [aws_security_group.app.id]

}

tags = merge(local.common_tags, { Name = "${var.environment}-rds-sg" })

}

─── RDS MySQL ───────────────────────────────────────────────

resource "aws_db_subnet_group" "prod" {

name = "${var.environment}-db-subnet"

subnet_ids = aws_subnet.private_data[*].id

tags = merge(local.common_tags, { Name = "${var.environment}-db-subnet" })

}

resource "aws_db_instance" "prod" {

identifier = "${var.environment}-mysql"

engine = "mysql"

engine_version = "8.0"

instance_class = var.db_instance_class

allocated_storage = var.db_allocated_storage

storage_encrypted = true

storage_type = "gp3"

db_name = replace(var.environment, "-", "_")

username = "admin"

password = var.db_password # 生产从 Secrets Manager 读取

db_subnet_group_name = aws_db_subnet_group.prod.name

vpc_security_group_ids = [aws_security_group.rds.id]

multi_az = var.environment == "prod" ? true : false

backup_retention_period = var.environment == "prod" ? 14 : 1

backup_window = "03:00-04:00"

maintenance_window = "mon:04:00-mon:05:00"

final_snapshot_identifier = "${var.environment}-mysql-final-snap"

skip_final_snapshot = false

enabled_cloudwatch_logs_exports = ["error", "general", "slowquery"]

tags = local.common_tags

}

─── variables.tf ────────────────────────────────────────────

variable "environment" {

type = string

default = "prod"

}

variable "vpc_cidr" {

type = string

default = "10.0.0.0/16"

}

variable "availability_zones" {

type = list(string)

default = ["us-east-1a", "us-east-1b", "us-east-1c"]

}

variable "db_instance_class" {

type = string

default = "db.r6g.large"

}

variable "db_allocated_storage" {

type = number

default = 100

}

variable "db_password" {

type = string

sensitive = true

default = "" # 生产必须通过 -var 传入或使用 secretsmanager

}

─── locals.tf ────────────────────────────────────────────────

locals {

common_tags = {

Environment = var.environment

Project = "opsdocs"

ManagedBy = "terraform"

Owner = "ops-team"

}

}

─── outputs.tf ───────────────────────────────────────────────

output "vpc_id" { value = aws_vpc.prod.id }

output "private_app_subnets" { value = aws_subnet.private_app[*].id }

output "private_data_subnets" { value = aws_subnet.private_data[*].id }

output "rds_endpoint" { value = aws_db_instance.prod.endpoint }

output "rds_arn" { value = aws_db_instance.prod.arn }

output "security_group_app_id" { value = aws_security_group.app.id }

output "security_group_rds_id" { value = aws_security_group.rds.id }



执行顺序:

terraform init

terraform validate

terraform plan -var="db_password=YourStrongPass123!" -var-file="prod.tfvars"

terraform apply -var="db_password=YourStrongPass123!" -var-file="prod.tfvars"



---

### 项目二:EKS 集群自动创建

─── EKS Cluster ────────────────────────────────────────────

module "eks" {

source = "terraform-aws-modules/eks/aws"

version = "~> 19.0"

cluster_name = "${var.environment}-eks"

cluster_version = "1.28"

vpc_id = var.vpc_id

subnet_ids = var.private_subnet_ids

# EKS 托管节点组(系统 Pod 用)

eks_managed_node_groups = {

system = {

min_size = 2

max_size = 5

desired_size = 2

instance_types = ["t3.medium"]

capacity_type = "ON_DEMAND"

labels = { node-group = "system" }

taints = [

{

key = "node-role"

value = "system"

effect = "NO_SCHEDULE"

}

]

}

app = {

min_size = 2

max_size = 10

desired_size = 3

instance_types = ["t3.medium"]

capacity_type = "SPOT" # 生产推荐混用 SPOT 降低成本

labels = { node-group = "app" }

}

}

# kubeconfig 生成

enable_irsa = true # IAM Roles for Service Accounts(Pod 级 IAM 权限)

create_cluster_security_group = false

cluster_security_group_id = var.eks_cluster_sg_id

tags = {

Environment = var.environment

}

}

─── Karpenter 自动扩缩容 ────────────────────────────────────

注意:Karpenter NodePool 是 Kubernetes CRD(karpenter.sh/v1beta1),AWS Provider 并无 aws_karpenter_node_pool 资源;生产中应通过 kubernetes_manifest 或 Helm 部署,以下仅为示意。

resource "aws_karpenter_node_pool" "default" {

name = "${var.environment}-default"

cluster_name = module.eks.cluster_name

cluster_endpoint = module.eks.cluster_endpoint

cluster_ca_base64 = module.eks.cluster_certificate_authority_data

capacity_types = ["ON_DEMAND", "SPOT"]

instance_types = ["t3.medium", "t3.large", "m6i.large"]

weight = 100

requirements {

key = "node.kubernetes.io/instance-type"

operator = "In"

values = ["t3.medium", "t3.large", "m6i.large"]

}

requirements {

key = "topology.kubernetes.io/zone"

operator = "In"

values = ["us-east-1a", "us-east-1b", "us-east-1c"]

}

requirements {

key = "kubernetes.io/os"

operator = "In"

values = ["linux"]

}

disruption {

consolidationPolicy = "WhenEmpty"

expireAfter = "72h"

}

}

─── AWS Load Balancer Controller ─────────────────────────────

EKS 集群创建后部署 ALB Ingress Controller

resource "helm_release" "lb_controller" {

name = "aws-load-balancer-controller"

repository = "https://aws.github.io/eks-charts"

chart = "aws-load-balancer-controller"

namespace = "kube-system"

version = "1.6.0"

set {

name = "clusterName"

value = module.eks.cluster_name

}

depends_on = [module.eks]

}



---

## Workspace 多环境管理

### 概念与适用场景

Terraform Workspace 通过状态隔离实现同一套代码管理多个环境(dev/stag/prod)。每个 Workspace 有独立的 State 文件。

创建 workspace

terraform workspace new prod

terraform workspace new stag

terraform workspace new dev

切换 workspace

terraform workspace select prod

列出所有 workspace

terraform workspace list



### 多环境 Backend 隔离

每个 Workspace 使用独立的 S3 Key 路径,State 完全隔离:

versions.tf

terraform {

backend "s3" {

bucket = "prod-terraform-state"

region = "us-east-1"

encrypt = true

dynamodb_table = "terraform-locks"

# 注意:backend 块不支持变量插值,不能写 ${terraform.workspace};
# 使用 workspace 时 S3 backend 会自动以 env:/<workspace>/<key> 前缀隔离 State

key = "envs/terraform.tfstate"

}

}

通过 -var-file 传递环境差异配置

prod.tfvars / stag.tfvars / dev.tfvars



### dev/stag/prod 变量示例

dev.tfvars

environment = "dev"

db_instance_class = "db.t3.micro"

db_allocated_storage = 20

enable_multi_az = false

instance_type = "t3.micro"

stag.tfvars

environment = "stag"

db_instance_class = "db.r6g.large"

db_allocated_storage = 50

enable_multi_az = false

instance_type = "t3.medium"

prod.tfvars

environment = "prod"

db_instance_class = "db.r6g.2xlarge"

db_allocated_storage = 200

enable_multi_az = true

instance_type = "m6i.2xlarge"



### Workspace 感知资源

根据 workspace 动态调整资源配置

resource "aws_db_instance" "prod" {

instance_class = terraform.workspace == "prod" ? "db.r6g.2xlarge" : "db.t3.micro"

multi_az = terraform.workspace == "prod" ? true : false

tags = {

Workspace = terraform.workspace

}

}



---

## Terraform Cloud / Atlantis 远程运行

### Terraform Cloud 远程执行

Terraform Cloud 提供免费的远程状态存储、团队执行、策略检查(OPA/Sentinel)。

versions.tf

terraform {

cloud {

organization = "your-org"

workspaces {

name = "prod-infra"

# 或通过 tags 管理:

# tags = ["production", "infrastructure"]

}

}

required_version = ">= 1.6.0"

required_providers {

aws = { source = "hashicorp/aws", version = "~> 5.0" }

}

}



配置完成后,`terraform login` 获取 Token,`terraform init` 自动连接到 Cloud。

.terraformrc 或 ~/.config/terraform/credentials.tfrc.json

credentials "app.terraform.io" {

token = "xxxx.atlasv1.xxxxx"

}



### Atlantis 本地远程运行

适合不想用 Terraform Cloud 的团队,Atlantis 在 Git webhooks 触发 `terraform plan/apply`,通过 PR 评论返回结果:

atlantis.yaml

version: 1

automerge: true

parallel_apply: false

projects:

  • name: prod-vpc

dir: terraform/vpc

workspace: prod

terraform_version: "1.6.6"

delete_source_branch_on_merge: true

apply_requirements: ["approved", "mergeable"]

  • name: prod-eks

dir: terraform/eks

workspace: prod

terraform_version: "1.6.6"

apply_requirements: ["approved", "mergeable"]

autoplan:

when_modified: ["*.tf", "../modules/**/*.tf"]

enabled: true



Atlantis 部署(Docker):

docker run --name atlantis -e ATLANTIS_GH_TOKEN=your_gh_token -e ATLANTIS_GH_USER=atlantis-bot -e ATLANTIS_GH_WEBHOOK_SECRET=your_webhook_secret -e ATLANTIS_REPO_ALLOWLIST="github.com/your-org/*" -v /root/atlantis.yaml:/atlantis.yaml -v /root/.aws:/root/.aws:ro -p 4141:4141 ghcr.io/runatlantis/atlantis:v0.26.0



---

## GitLab CI/CD 集成

### 完整 .gitlab-ci.yml

.gitlab-ci.yml

image:

name: hashicorp/terraform:1.6.6

entrypoint:

  • /usr/bin/env

variables:

AWS_REGION: us-east-1

TF_STATE_BUCKET: prod-terraform-state

TF_DYNAMODB_TABLE: terraform-locks

GIT_DEPTH: 1

stages:

  • validate
  • plan
  • apply
  • destroy

.before_template: &before_template

  • apk add --no-cache aws-cli curl
  • export AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID
  • export AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY
  • export AWS_DEFAULT_REGION=$AWS_REGION
  • terraform --version
  • terraform init -upgrade -backend-config="bucket=$TF_STATE_BUCKET" -backend-config="key=${CI_PROJECT_PATH}/${CI_COMMIT_REF_NAME}/terraform.tfstate" -backend-config="dynamodb_table=$TF_DYNAMODB_TABLE" -backend-config="region=$AWS_REGION"

terraform_validate:

stage: validate

before_script:

  • *before_template

script:

  • terraform validate
  • terraform fmt -check -recursive

only:

  • merge_requests
  • main
  • develop

terraform_plan:

stage: plan

before_script:

  • *before_template

script:

  • terraform plan -var-file="${CI_COMMIT_REF_NAME}.tfvars" -out=tfplan
  • echo "Plan complete"

artifacts:

name: tfplan

paths:

  • tfplan

expire_in: 1 day

only:

  • merge_requests
  • main
  • develop

dependencies:

  • terraform_validate

terraform_apply:

stage: apply

before_script:

  • *before_template

script:

  • terraform apply -auto-approve tfplan   # 使用已保存的 plan 文件时不能再传 -var-file

environment:

name: $CI_COMMIT_REF_NAME

only:

  • main

when: manual

dependencies:

  • terraform_plan

after_script:

  • terraform output -json > terraform_output.json

artifacts:

name: tf_output

paths:

  • terraform_output.json

expire_in: 30 days

terraform_destroy:

stage: destroy

before_script:

  • *before_template

script:

  • terraform destroy -var-file="${CI_COMMIT_REF_NAME}.tfvars" -auto-approve

environment:

name: $CI_COMMIT_REF_NAME

action: destroy

only:

  • develop

when: manual



---

## 生产最佳实践

### 目录结构

terraform/

├── .terraform-version # 指定 Terraform 版本

├── .terraform.lock.hcl # 依赖锁定文件(提交到 Git)

├── versions.tf # Provider 和版本约束

├── provider.tf # Provider 配置

├── variables.tf # 变量定义

├── outputs.tf # 输出定义

├── locals.tf # 本地变量

├── main.tf # 根模块入口

├── terraform.tfvars # 本地测试变量

├── prod.tfvars # 生产变量

├── modules/ # 自定义模块

│ ├── vpc/

│ │ ├── main.tf

│ │ ├── variables.tf

│ │ └── outputs.tf

│ └── eks/

│ └── ...

└── env/

├── prod/

│ ├── main.tf

│ ├── variables.tf

│ └── outputs.tf

└── dev/

└── ...



### 安全实践

使用 KMS 加密 S3 状态

terraform {

backend "s3" {

bucket = "prod-terraform-state"

key = "network/terraform.tfstate"

region = "us-east-1"

encrypt = true

kms_key_id = "arn:aws:kms:us-east-1:123456789012:key/xxxxx"

dynamodb_table = "terraform-locks"

}

}

敏感变量不提交到 Git

.gitignore

*.tfvars

*.tfstate

.terraform/