jemalloc 内存分配器完全指南 / 08 - 基准测试
第 8 章:基准测试
8.1 基准测试概述
内存分配器的基准测试需要评估以下维度:
| 维度 | 指标 | 工具/方法 |
|---|---|---|
| 吞吐量 | ops/sec (malloc+free/s) | 多线程循环计时 |
| 延迟 | p50/p99/p999 延迟 | 高精度计时 |
| 碎片率 | RSS / 实际使用量 | RSS 监控 |
| 可扩展性 | 多线程下的性能衰减 | 不同线程数测试 |
| CPU 开销 | perf 上报的热点 | perf record |
8.2 自定义基准测试框架
8.2.1 多线程吞吐量测试
// bench_throughput.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <time.h>
#include <getopt.h>
#define MAX_THREADS 128
/*
 * Per-thread benchmark descriptor: main fills in the inputs before starting
 * the thread, and the worker writes its measured duration back into elapsed.
 * Field order matters: main initializes this struct positionally.
 */
typedef struct {
/* Thread index assigned by main. */
int id;
/* Number of allocations the worker performs. */
int n_ops;
/* Size in bytes of every allocated object. */
int obj_size;
/* Output: wall-clock seconds the worker spent in its malloc/free loop. */
double elapsed;
} thread_arg_t;
static void *bench_worker(void *arg) {
thread_arg_t *a = (thread_arg_t *)arg;
void **ptrs = malloc(a->n_ops * sizeof(void *));
struct timespec t0, t1;
clock_gettime(CLOCK_MONOTONIC, &t0);
// 交替 malloc/free 模拟真实场景
for (int i = 0; i < a->n_ops; i++) {
ptrs[i] = malloc(a->obj_size);
if (ptrs[i]) memset(ptrs[i], 0xAB, a->obj_size);
// 每分配 4 个释放 1 个(模拟真实分配模式)
if (i >= 4 && (i % 4 == 0)) {
free(ptrs[i - 4]);
ptrs[i - 4] = NULL;
}
}
// 释放剩余
for (int i = 0; i < a->n_ops; i++) {
if (ptrs[i]) free(ptrs[i]);
}
clock_gettime(CLOCK_MONOTONIC, &t1);
a->elapsed = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
free(ptrs);
return NULL;
}
/*
 * Multi-threaded malloc/free throughput benchmark.
 *
 * Options:
 *   -t N   number of worker threads (1..MAX_THREADS, default 8)
 *   -n N   allocations per thread   (>= 1, default 100000)
 *   -s N   object size in bytes     (>= 1, default 256)
 *
 * Prints one result line with the wall-clock time from just before the
 * first thread is created until the last thread has been joined.
 * Returns 0 on success, 1 on invalid arguments or thread-creation failure.
 */
int main(int argc, char *argv[]) {
    int n_threads = 8;
    int n_ops = 100000;
    int obj_size = 256;

    int opt;
    while ((opt = getopt(argc, argv, "t:n:s:")) != -1) {
        switch (opt) {
        case 't': n_threads = atoi(optarg); break;
        case 'n': n_ops = atoi(optarg); break;
        case 's': obj_size = atoi(optarg); break;
        default:
            fprintf(stderr, "Usage: %s [-t threads] [-n ops] [-s obj_size]\n",
                    argv[0]);
            return 1;
        }
    }

    /* threads[] and args[] hold MAX_THREADS entries; a larger -t would
     * write past the end of both arrays. atoi() yields 0 for non-numeric
     * input, which the lower bounds below reject as well. */
    if (n_threads < 1 || n_threads > MAX_THREADS) {
        fprintf(stderr, "error: -t must be between 1 and %d\n", MAX_THREADS);
        return 1;
    }
    if (n_ops < 1 || obj_size < 1) {
        fprintf(stderr, "error: -n and -s must be positive\n");
        return 1;
    }

    pthread_t threads[MAX_THREADS];
    thread_arg_t args[MAX_THREADS];

    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);

    int n_started = 0;
    int create_err = 0;
    for (int i = 0; i < n_threads; i++) {
        args[i] = (thread_arg_t){i, n_ops, obj_size, 0};
        create_err = pthread_create(&threads[i], NULL, bench_worker, &args[i]);
        if (create_err != 0) {
            break;
        }
        n_started++;
    }

    /* Join only the threads that were actually started. */
    for (int i = 0; i < n_started; i++) {
        pthread_join(threads[i], NULL);
    }

    clock_gettime(CLOCK_MONOTONIC, &t1);

    if (create_err != 0) {
        fprintf(stderr, "error: pthread_create failed for thread %d: %s\n",
                n_started, strerror(create_err));
        return 1;
    }

    double total = (double)(t1.tv_sec - t0.tv_sec)
                 + (double)(t1.tv_nsec - t0.tv_nsec) / 1e9;
    /* Each operation is counted twice: one malloc and one free. */
    double total_ops = (double)n_threads * n_ops * 2;

    printf("threads=%d obj_size=%d elapsed=%.3fs throughput=%.1f Mops/s\n",
           n_threads, obj_size, total, total_ops / total / 1e6);
    return 0;
}
# Build. -pthread sets both the compile-time and link-time options needed
# for POSIX threads, which a bare -lpthread does not.
gcc -O2 -g -pthread -o bench_throughput bench_throughput.c
# Default (system) malloc
./bench_throughput -t 8 -n 200000 -s 256
# jemalloc
LD_PRELOAD=/usr/local/lib/libjemalloc.so.2 \
    ./bench_throughput -t 8 -n 200000 -s 256
# tcmalloc (must be installed)
LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 \
    ./bench_throughput -t 8 -n 200000 -s 256
8.2.2 延迟分布测试
// bench_latency.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <inttypes.h>
#define N_SAMPLES 1000000
/*
 * Returns the current CLOCK_MONOTONIC reading in nanoseconds.
 * Declared with (void) so the function has a real prototype; an empty
 * parameter list leaves the arguments unspecified before C23.
 */
static inline uint64_t now_ns(void) {
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}
/*
 * qsort comparator for uint64_t values in ascending order.
 * Returns -1, 0 or 1 when *a is less than, equal to or greater than *b.
 */
static int cmp_u64(const void *a, const void *b) {
    const uint64_t *lhs = a;
    const uint64_t *rhs = b;

    if (*lhs < *rhs) {
        return -1;
    }
    if (*lhs > *rhs) {
        return 1;
    }
    return 0;
}
/*
 * Measures the latency of N_SAMPLES individual malloc(256) calls and prints
 * the p50/p90/p99/p999 percentiles, the maximum and the arithmetic mean,
 * all in nanoseconds. Returns 0 on success, 1 if the sample buffer cannot
 * be allocated.
 */
int main(void) {
    uint64_t *latencies = malloc(N_SAMPLES * sizeof *latencies);
    if (latencies == NULL) {
        fprintf(stderr, "error: cannot allocate %d latency samples\n",
                N_SAMPLES);
        return 1;
    }

    uint64_t sum = 0;

    /* Collect one latency sample per malloc call. */
    for (int i = 0; i < N_SAMPLES; i++) {
        uint64_t t0 = now_ns();
        void *p = malloc(256);
        uint64_t t1 = now_ns();

        /* A malloc/free pair whose result is never used may be removed by
         * the optimizer, which would leave only the timer overhead. The
         * volatile store (outside the timed window) keeps the call alive. */
        if (p != NULL) {
            *(volatile char *)p = 0;
        }

        latencies[i] = t1 - t0;
        sum += latencies[i];
        free(p);
    }

    /* Sort ascending so percentiles can be read by index. */
    qsort(latencies, N_SAMPLES, sizeof(uint64_t), cmp_u64);

    printf("=== malloc(256) Latency (ns) ===\n");
    printf("p50: %6" PRIu64 "\n", latencies[N_SAMPLES * 50 / 100]);
    printf("p90: %6" PRIu64 "\n", latencies[N_SAMPLES * 90 / 100]);
    printf("p99: %6" PRIu64 "\n", latencies[N_SAMPLES * 99 / 100]);
    printf("p999: %6" PRIu64 "\n", latencies[N_SAMPLES * 999 / 1000]);
    printf("max: %6" PRIu64 "\n", latencies[N_SAMPLES - 1]);
    /* Arithmetic mean of all samples; the middle element would only
     * repeat the p50 line. */
    printf("mean: %6.0f\n", (double)sum / N_SAMPLES);

    free(latencies);
    return 0;
}
# Compile the latency benchmark
gcc -O2 bench_latency.c -o bench_latency -lpthread
# Baseline run against the system allocator
printf '%s\n' "=== glibc malloc ==="
./bench_latency
# Same binary with jemalloc injected through LD_PRELOAD
printf '%s\n' "=== jemalloc ==="
LD_PRELOAD=/usr/local/lib/libjemalloc.so.2 ./bench_latency
典型输出:
=== glibc malloc ===
p50: 45
p90: 120
p99: 380
p999: 1200
max: 12500
=== jemalloc ===
p50: 28
p90: 85
p99: 210
p999: 650
max: 4200
8.3 碎片率测试
// bench_fragmentation.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#ifdef USE_JEMALLOC
#include <jemalloc/jemalloc.h>
#endif
#define N_ALLOCS 100000
#define MIN_SIZE 16
#define MAX_SIZE 8192
/*
 * Fragmentation experiment: allocates N_ALLOCS objects of pseudo-random
 * size in [MIN_SIZE, MAX_SIZE], frees roughly 70% of them at random, and
 * then prints the allocator's own view of the heap via mallinfo().
 * The seed is fixed so every run issues the same request sequence.
 * When built with USE_JEMALLOC defined, jemalloc's statistics are printed
 * as well. Always returns 0.
 */
int main(void) {
    /* static: keeps about 1.6 MB of bookkeeping off the stack. */
    static void *ptrs[N_ALLOCS];
    static size_t sizes[N_ALLOCS];
    size_t total_requested = 0;

    /* Random-size allocation phase. */
    srand(42);
    for (int i = 0; i < N_ALLOCS; i++) {
        size_t sz = (size_t)(MIN_SIZE + rand() % (MAX_SIZE - MIN_SIZE + 1));
        ptrs[i] = malloc(sz);
        if (ptrs[i] == NULL) {
            /* A failed request holds no memory, so do not count it. */
            sizes[i] = 0;
            continue;
        }
        memset(ptrs[i], 0xAB, sz);
        sizes[i] = sz;
        total_requested += sz;
    }

    /* Free about 70% of the objects at random to punch holes in the heap. */
    size_t live_requested = total_requested;
    for (int i = 0; i < N_ALLOCS; i++) {
        if (rand() % 100 < 70) {
            free(ptrs[i]);
            ptrs[i] = NULL;
            live_requested -= sizes[i];
            sizes[i] = 0;
        }
    }

    /* mallinfo() is the glibc allocator's interface and its fields are
     * plain int; this workload stays far below 2 GB so they do not wrap.
     * NOTE(review): when jemalloc serves the allocations these numbers
     * presumably do not describe jemalloc's heap - rely on the jemalloc
     * statistics printed below in that build. */
    struct mallinfo mi = mallinfo();

    printf("=== Fragmentation Report ===\n");
    printf("Total requested: %12zu bytes (%.1f MB)\n",
           total_requested, total_requested / 1048576.0);
    printf("Still requested: %12zu bytes (%.1f MB)\n",
           live_requested, live_requested / 1048576.0);
    printf("Arena (from OS): %12d bytes (%.1f MB)\n",
           mi.arena, mi.arena / 1048576.0);
    printf("In-use: %12d bytes (%.1f MB)\n",
           mi.uordblks, mi.uordblks / 1048576.0);
    printf("Free in arena: %12d bytes (%.1f MB)\n",
           mi.fordblks, mi.fordblks / 1048576.0);

    /* Fragmentation = (arena bytes - in-use bytes) / arena bytes.
     * This is based on the allocator's arena size, not on process RSS. */
    double frag_ratio = mi.arena > 0 ?
        (double)(mi.arena - mi.uordblks) / mi.arena * 100 : 0;
    printf("Fragmentation: %.1f%%\n", frag_ratio);

#ifdef USE_JEMALLOC
    printf("\n=== jemalloc Detailed Stats ===\n");
    /* An unprefixed libjemalloc (the kind usable via LD_PRELOAD/-ljemalloc)
     * exports malloc_stats_print; the je_ spelling only exists in builds
     * configured with --with-jemalloc-prefix=je_. */
    malloc_stats_print(NULL, NULL, NULL);
#endif

    /* Release the survivors; free(NULL) is a no-op. */
    for (int i = 0; i < N_ALLOCS; i++) {
        free(ptrs[i]);
    }
    return 0;
}
# glibc malloc: leave USE_JEMALLOC undefined. The source tests it with
# #ifdef, so -DUSE_JEMALLOC=0 would still enable the jemalloc code path
# and the build would fail without the jemalloc header and library.
gcc -O2 -o bench_frag bench_fragmentation.c
./bench_frag
# jemalloc
gcc -O2 -DUSE_JEMALLOC -o bench_frag_jemalloc bench_fragmentation.c -ljemalloc
./bench_frag_jemalloc
8.4 使用 memtier_benchmark 测试 Redis
Redis 是 jemalloc 最重要的用户之一。使用 memtier_benchmark 测试不同分配器的 Redis 性能(注意:Redis 在 Linux 上默认编译时已内置 jemalloc,要获得真正的 glibc 基线,需使用以 `make MALLOC=libc` 编译的 redis-server):
# Install memtier_benchmark (or build it from source)
sudo apt install memtier-benchmark
# Start the glibc baseline Redis.
# NOTE: redis-server on Linux is normally built with its bundled jemalloc;
# for a real glibc baseline use a binary built with `make MALLOC=libc`.
redis-server --daemonize yes --port 6379
memtier_benchmark -p 6379 --protocol=redis --data-size=256 \
    --key-maximum=100000 --threads=8 --clients=50 --test-time=30
# Stop the first server and wait until it is really gone; otherwise the
# next instance can fail to bind port 6379 while the old one shuts down.
kill $(pgrep redis-server)
while pgrep redis-server > /dev/null; do sleep 0.2; done
# Restart Redis with jemalloc preloaded
LD_PRELOAD=/usr/local/lib/libjemalloc.so.2 \
MALLOC_CONF="narenas:4,background_thread:true" \
    redis-server --daemonize yes --port 6379
memtier_benchmark -p 6379 --protocol=redis --data-size=256 \
    --key-maximum=100000 --threads=8 --clients=50 --test-time=30
预期结果对比
| 指标 | glibc malloc | jemalloc | 提升 |
|---|---|---|---|
| Ops/sec | ~120,000 | ~180,000 | +50% |
| p50 latency | 0.8ms | 0.5ms | -38% |
| p99 latency | 2.1ms | 1.2ms | -43% |
| RSS (1GB 数据) | ~1.4GB | ~1.1GB | -21% |
注意:实际数据取决于硬件配置和工作负载。
8.5 综合对比测试脚本
#!/bin/bash
# bench_all.sh - compare allocator throughput across thread counts.
# Each allocator is injected into the same ./bench_throughput binary via
# LD_PRELOAD; the empty preload entry means "use the system allocator".
ALLOCATORS=("glibc" "jemalloc" "tcmalloc" "mimalloc")
PRELOADS=("" \
    "/usr/local/lib/libjemalloc.so.2" \
    "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4" \
    "/usr/lib/x86_64-linux-gnu/libmimalloc.so")

if [ ! -x ./bench_throughput ]; then
    echo "error: ./bench_throughput not found; build it first" >&2
    exit 1
fi

echo "=== Memory Allocator Benchmark ==="
echo "Threads: 1 2 4 8 16"
echo ""
for idx in "${!ALLOCATORS[@]}"; do
    name="${ALLOCATORS[$idx]}"
    preload="${PRELOADS[$idx]}"
    echo "--- $name ---"
    # A missing preload library only produces a loader warning and the run
    # silently falls back to the system allocator, so the numbers would be
    # reported under the wrong name. Skip such allocators instead.
    if [ -n "$preload" ] && [ ! -f "$preload" ]; then
        echo " skipped: $preload not found"
        echo ""
        continue
    fi
    for threads in 1 2 4 8 16; do
        if [ -n "$preload" ]; then
            result=$(LD_PRELOAD="$preload" \
                ./bench_throughput -t "$threads" -n 100000 -s 256 2>&1)
        else
            result=$(./bench_throughput -t "$threads" -n 100000 -s 256 2>&1)
        fi
        printf " threads=%-2d %s\n" "$threads" "$result"
    done
    echo ""
done
8.6 RSS 监控脚本
#!/bin/bash
# monitor_rss.sh - print a process's RSS and virtual size once per second.
# Usage: monitor_rss.sh <pid>
# Values come from VmRSS/VmSize in /proc/<pid>/status (reported in kB) and
# are printed in MB. The loop ends when the process directory disappears.
PID=$1
if [ -z "$PID" ]; then
    echo "Usage: $0 <pid>"
    exit 1
fi
echo "Monitoring RSS for PID $PID (Ctrl+C to stop)"
echo "Time(s) RSS(MB) VSZ(MB)"
# Reference point for the elapsed column; without it the column would show
# seconds since the Unix epoch.
start_time=$(date +%s)
while true; do
    if [ ! -d "/proc/$PID" ]; then
        echo "Process $PID exited"
        break
    fi
    rss=$(awk '/VmRSS/{print $2}' "/proc/$PID/status" 2>/dev/null)
    vsz=$(awk '/VmSize/{print $2}' "/proc/$PID/status" 2>/dev/null)
    elapsed=$(($(date +%s) - start_time))
    # Default to 0 if the process vanished between the check and the read.
    printf "%-10d %-10s %-10s\n" \
        "$elapsed" "$((${rss:-0} / 1024))" "$((${vsz:-0} / 1024))"
    sleep 1
done
8.7 性能分析工具
perf
# Sample CPU hotspots with call graphs from the running server for 30 seconds
perf record -g -p $(pgrep my_server) -- sleep 30
perf report
# Count mmap and brk syscall entries for 10 seconds. This shows how often the
# process asks the kernel for memory, not how often malloc/free are called.
perf stat -e 'syscalls:sys_enter_mmap,syscalls:sys_enter_brk' \
-p $(pgrep my_server) -- sleep 10
Flame Graph
# Generate a flame graph (requires the FlameGraph scripts, cloned here)
git clone https://github.com/brendangregg/FlameGraph.git
# Record 30 seconds of call-graph samples from the running server
perf record -g -p $(pgrep my_server) -- sleep 30
# Fold the recorded stacks and render them as an SVG
perf script | FlameGraph/stackcollapse-perf.pl | FlameGraph/flamegraph.pl > flame.svg
8.8 测试注意事项
| 要点 | 说明 |
|---|---|
| 关闭 ASLR | echo 0 > /proc/sys/kernel/randomize_va_space(可选,减少波动) |
| 固定 CPU 频率 | cpupower frequency-set -g performance |
| 多轮取中位数 | 至少运行 5 轮,取中位数(比均值更不易受离群值影响) |
| 预热 | 先跑一轮预热,使缓存和页表稳定 |
| 隔离环境 | 关闭不必要的后台进程 |
| 相同编译选项 | -O2 -g 确保公平比较 |
8.9 本章小结
| 测试维度 | 关键指标 | 推荐工具 |
|---|---|---|
| 吞吐量 | Mops/sec | 自定义 bench |
| 延迟 | p50/p99/p999 | 自定义 bench + clock_gettime |
| 碎片率 | RSS / requested | mallinfo + RSS 监控 |
| 可扩展性 | 不同线程数的 ops | 自定义 bench |
| 实际应用 | ops/sec, latency | memtier_benchmark |
扩展阅读
上一章:第 7 章:系统集成 下一章:第 9 章:Docker 容器化