Ceph 存储运维完全指南 / 08 - CRUSH Map 深入

08 - CRUSH Map 深入

8.1 CRUSH Map 概述

CRUSH Map 是 Ceph 集群的"地图"，它描述了整个存储基础设施的拓扑结构和数据放置规则。理解 CRUSH Map 是优化 Ceph 数据分布和故障域隔离的关键。

CRUSH Map 组成部分

CRUSH Map
├── Devices        # OSD 设备列表
├── Types          # 桶类型层级定义
├── Buckets        # 桶（设备的逻辑分组）
├── Rules          # 放置规则
└── Tunables       # 可调参数

8.2 CRUSH Map 操作

导出与导入

# 导出编译后的 CRUSH Map（二进制格式）
ceph osd getcrushmap -o /tmp/crushmap.bin

# 反编译为可读文本格式
crushtool -d /tmp/crushmap.bin -o /tmp/crushmap.txt

# 查看 CRUSH Map
cat /tmp/crushmap.txt

# 编辑后重新编译（假设已将 /tmp/crushmap.txt 修改并另存为 /tmp/crushmap-edited.txt）
crushtool -c /tmp/crushmap-edited.txt -o /tmp/crushmap-new.bin

# 导入到集群
ceph osd setcrushmap -i /tmp/crushmap-new.bin

完整 CRUSH Map 示例

# begin crush map
tunable choose_local_tries 0
tunable choose_local_fallback_tries 0
tunable choose_total_tries 50
tunable chooseleaf_descend_once 1
tunable chooseleaf_stable 1
tunable straw_calc_version 1
tunable allowed_bucket_algs 54

# devices
device 0 osd.0 class hdd
device 1 osd.1 class hdd
device 2 osd.2 class hdd
device 3 osd.3 class ssd
device 4 osd.4 class ssd
device 5 osd.5 class ssd

# types
type 0 osd
type 1 host
type 2 chassis
type 3 rack
type 4 row
type 5 pdu
type 6 pod
type 7 room
type 8 datacenter
type 9 region
type 10 root

# buckets
host node1 {
    id -2
    alg straw2
    hash 0
    item osd.0 weight 1.000
    item osd.1 weight 1.000
}
host node2 {
    id -3
    alg straw2
    hash 0
    item osd.2 weight 1.000
    item osd.3 weight 0.500
}
rack rack1 {
    id -4
    alg straw2
    hash 0
    item node1 weight 2.000
    item node2 weight 1.500
}
host node3 {
    id -5
    alg straw2
    hash 0
    item osd.4 weight 0.500
    item osd.5 weight 0.500
}
rack rack2 {
    id -6
    alg straw2
    hash 0
    item node3 weight 1.000
}
root default {
    id -1
    alg straw2
    hash 0
    item rack1 weight 3.500
    item rack2 weight 1.000
}

# rules
rule replicated_rule {
    id 0
    type replicated
    step take default
    step chooseleaf firstn 0 type host
    step emit
}

rule ssd_rule {
    id 1
    type replicated
    step take default class ssd
    step chooseleaf firstn 0 type host
    step emit
}

rule rack_rule {
    id 2
    type replicated
    step take default
    step chooseleaf firstn 0 type rack
    step emit
}

rule ec_rule {
    id 3
    type erasure
    step set_chooseleaf_tries 5
    step set_choose_tries 100
    step take default
    step chooseleaf indep 0 type host
    step emit
}

# end crush map

8.3 桶类型详解

CRUSH 使用层级桶结构来描述存储拓扑。桶是设备或子桶的容器。

桶算法

算法	说明	推荐
straw2	基于抽签的公平选择算法	✅ 推荐（所有新集群）
uniform	均匀权重，所有 item 权重相同	仅用于设备完全相同的情况
tree	二叉树算法（已废弃）	❌ 不推荐
straw	straw2 的旧版本（已废弃）	❌ 不推荐

桶类型层级

root (id: 10)          ← 顶层根桶
  ├── datacenter       ← 数据中心
  │   ├── room         ← 机房
  │   │   ├── row      ← 机柜排
  │   │   │   ├── rack ← 机柜
  │   │   │   │   ├── chassis ← 机箱
  │   │   │   │   │   └── host ← 主机
  │   │   │   │   │       └── osd ← OSD 设备

8.4 故障域设计

常见故障域层级

故障域	说明	适用规模
host	主机级隔离	小型集群（3-10 节点）
rack	机柜级隔离	中型集群（10-50 节点）
row	机柜排级隔离	大型集群（50+ 节点）
room	机房级隔离	多机房部署
datacenter	数据中心级隔离	跨数据中心

设计原则

核心原则：确保每个 PG 的副本分布在不同的故障域中

示例：3 副本，host 故障域
  PG 0 → [node1, node2, node3]  ✅ 不同主机
  PG 1 → [node2, node3, node1]  ✅ 不同主机

示例：3 副本，rack 故障域
  PG 0 → [rack1-node1, rack2-node3, rack3-node5]  ✅ 不同机柜

生产环境故障域示例

# 三机柜部署（节选：rack2、rack3 的定义与 rack1 类似，此处省略；在实际的 CRUSH Map 文件中，子桶必须先于引用它的 root 定义）
root default {
    id -1
    alg straw2
    hash 0
    item rack1 weight 10.000
    item rack2 weight 10.000
    item rack3 weight 10.000
}

rack rack1 {
    id -2
    alg straw2
    hash 0
    item node1 weight 3.300
    item node2 weight 3.300
    item node3 weight 3.400
}

# 规则：在不同 rack 上放置副本
rule production_rule {
    id 0
    type replicated
    step take default
    step chooseleaf firstn 0 type rack   # 故障域 = rack
    step emit
}

8.5 自定义 CRUSH 规则

规则语法

rule <name> {
    id <id>
    type <replicated|erasure>
    step take <bucket-name> [class <device-class>]
    step choose [firstn|indep] <N> type <bucket-type>
    step chooseleaf [firstn|indep] <N> type <bucket-type>
    step emit
}

常用规则模板

# 规则 1：跨主机 3 副本（默认）
rule replicated_host {
    id 0
    type replicated
    step take default
    step chooseleaf firstn 0 type host
    step emit
}

# 规则 2：跨机柜 3 副本
rule replicated_rack {
    id 1
    type replicated
    step take default
    step chooseleaf firstn 0 type rack
    step emit
}

# 规则 3：仅使用 SSD
rule ssd_only {
    id 2
    type replicated
    step take default class ssd
    step chooseleaf firstn 0 type host
    step emit
}

# 规则 4：仅使用 HDD
rule hdd_only {
    id 3
    type replicated
    step take default class hdd
    step chooseleaf firstn 0 type host
    step emit
}

# 规则 5：混合 SSD(WAL/DB) + HDD(Data)——WAL/DB 的放置在部署 OSD（BlueStore）时决定，CRUSH 只看到 hdd 类 OSD，因此规则内容与 hdd_only 相同
rule mixed_storage {
    id 4
    type replicated
    step take default class hdd
    step chooseleaf firstn 0 type host
    step emit
}

# 规则 6：指定数据中心
rule dc1_only {
    id 5
    type replicated
    step take dc1
    step chooseleaf firstn 0 type host
    step emit
}

# 规则 7：EC 规则
rule ec_k4m2 {
    id 6
    type erasure
    step set_chooseleaf_tries 5
    step set_choose_tries 100
    step take default
    step chooseleaf indep 0 type host
    step emit
}

8.6 设备类（Device Class）

设备类允许根据磁盘类型（SSD/HDD/NVMe）分别管理 OSD。

# 查看设备类
ceph osd crush class ls

# 设置 OSD 的设备类
ceph osd crush set-device-class ssd osd.3
ceph osd crush set-device-class hdd osd.0

# 使用设备类的 CRUSH 规则
ceph osd crush rule create-replicated ssd_rule default host ssd
ceph osd crush rule create-replicated hdd_rule default host hdd

# 查看设备类下的 OSD
ceph osd crush class ls-osd ssd

# 重命名设备类
ceph osd crush class rename old_name new_name

# 移除设备类
ceph osd crush rm-device-class osd.3

设备类应用场景

场景：混合存储架构
├── NVMe class (高性能)
│   └── 热数据池 → rule_nvme
├── SSD class (中等性能)
│   └── 温数据池 → rule_ssd
└── HDD class (大容量)
    └── 冷数据/EC 池 → rule_hdd

配置示例：
ceph osd pool create hot_pool 128 128 replicated
ceph osd pool set hot_pool crush_rule ssd_rule

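# EC 池：先创建 k=4、m=2 的纠删码 profile，并通过 crush-device-class 限定只使用 HDD 设备类
ceph osd erasure-code-profile set ec_k4m2 k=4 m=2 crush-failure-domain=host crush-device-class=hdd
ceph osd pool create cold_pool 256 256 erasure ec_k4m2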

8.7 CRUSH 调优参数

参数	默认值	说明
`choose_local_tries`	0	本地重试次数
`choose_total_tries`	50	总重试次数
`chooseleaf_descend_once`	1	chooseleaf 是否仅下降一次
`chooseleaf_stable`	1	稳定的 chooseleaf 算法

# 查看当前参数
ceph osd crush show-tunables

# 调整参数（谨慎操作）
ceph osd crush tunables optimal   # 使用推荐的最优参数
ceph osd crush tunables legacy    # 使用旧版兼容参数
ceph osd crush tunables bobtail   # 使用 bobtail 版本参数

8.8 CRUSH Map 验证和模拟

# 测试 CRUSH Map（编译检查）
crushtool -c /tmp/crushmap.txt -o /tmp/crushmap-test.bin

# 模拟数据分布
crushtool -i /tmp/crushmap-test.bin --test --show-mappings --num-rep 3 --min-x 0 --max-x 100

# 模拟并统计分布均匀性
crushtool -i /tmp/crushmap-test.bin --test --show-statistics --num-rep 3 --min-x 0 --max-x 10000

# 比较两个 CRUSH Map 的分布差异
crushtool -i /tmp/crushmap-old.bin --compare /tmp/crushmap-new.bin \
    --min-x 0 --max-x 10000

8.9 业务场景：多机房部署

# 跨机房 CRUSH Map 设计
root dc1 {
    id -10
    alg straw2
    hash 0
    item rack-dc1-1 weight 5.000
    item rack-dc1-2 weight 5.000
}

root dc2 {
    id -11
    alg straw2
    hash 0
    item rack-dc2-1 weight 5.000
    item rack-dc2-2 weight 5.000
}

# 规则：2 副本在主数据中心，1 副本在备数据中心
rule cross_dc {
    id 10
    type replicated
    step take dc1
    step chooseleaf firstn 2 type rack    # dc1 中选 2 个不同机柜
    step emit
    step take dc2
    step chooseleaf firstn 1 type rack    # dc2 中选 1 个机柜
    step emit
}

注意事项：
跨机房部署需要低延迟、高带宽的网络连接
延迟超过 10ms 会显著影响写入性能
建议使用独立的集群网络（cluster_network）

扩展阅读

下一章：09 - 监控与告警 — 学习 Ceph 集群的监控方法、Prometheus/Grafana 集成和告警配置。