第22周：多核架构与性能优化

2026-06-24

字数统计: 1.8k字 | 阅读时长≈ 9分

第22周：多核架构与性能优化

目标：理解 DPDK 的 lcore 模型，掌握多核并行转发和性能优化技巧。

1. DPDK 多核架构

1.1 Lcore 模型

DPDK 将 CPU 核心抽象为 lcore（logical core）：

  ┌──────────┐  ┌──────────┐  ┌──────────┐  ┌──────────┐
  │  lcore 0 │  │  lcore 1 │  │  lcore 2 │  │  lcore 3 │
  │ (MASTER) │  │ (Worker) │  │ (Worker) │  │ (Worker) │
  │          │  │          │  │          │  │          │
  │ 控制面   │  │ 收包 RX0 │  │ 收包 RX1 │  │ 收包 RX2 │
  │ 管理     │  │ 转发     │  │ 转发     │  │ 转发     │
  └──────────┘  └──────────┘  └──────────┘  └──────────┘
       │              │              │              │
       └──────────────┴──────────────┴──────────────┘
                      │
               ┌──────────────┐
               │  Port 0 (RX) │
               │  Port 1 (TX) │
               └──────────────┘

lcore 分类：
  MASTER — 控制面（初始化、统计、配置）
  WORKER — 数据面（转发、处理）

典型配置：
  - 2 个 lcore：1 收 + 1 发
  - 4 个 lcore：1 收 + 2 转发 + 1 发
  - 8 个 lcore：2 收 + 4 转发 + 2 发

1.2 Lcore 分配

// 获取 lcore 信息
uint16_t lcore_id = rte_lcore_id();        // 当前 lcore ID
uint16_t master = rte_get_master_lcore();  // 主 lcore
uint16_t count = rte_lcore_count();        // 总 lcore 数

// 遍历所有 lcore
RTE_LCORE_FOREACH_SLAVE(lcore_id) {
    printf("Slave lcore: %u\n", lcore_id);
}

// 在每个 lcore 上运行函数
int ret = rte_eal_mp_remote_launch(main_loop, NULL, SKIP_MASTER);
// SKIP_MASTER = 不在主 lcore 上运行
// CALL_MASTER = 在主 lcore 上也运行

// 等待所有 lcore 完成
rte_eal_mp_wait_lcore();

1.3 典型多核架构

模式 1：1 收 1 发（2 lcore）

  lcore 0 (MASTER)     lcore 1 (WORKER)
  ┌─────────────────┐   ┌─────────────────┐
  │                 │   │                 │
  │   Port 0 TX     │◀──│  Port 0 RX     │
  │   Port 1 TX     │◀──│  Port 1 RX     │
  │                 │   │                 │
  └─────────────────┘   └─────────────────┘

模式 2：多收多发（4 lcore）

  lcore 0 (MASTER)     lcore 1          lcore 2          lcore 3
  ┌─────────────────┐  ┌──────────┐     ┌──────────┐     ┌──────────┐
  │                 │  │          │     │          │     │          │
  │  Port 0 TX     │◀─│ Port 0   │     │ Port 1   │     │ Port 2   │
  │  Port 1 TX     │◀─│ RX       │     │ RX       │     │ RX       │
  │                 │  │ 转发     │     │ 转发     │     │ 转发     │
  └─────────────────┘  └──────────┘     └──────────┘     └──────────┘

模式 3：Pipeline（流水线，4 lcore）

  lcore 1              lcore 2            lcore 3
  ┌──────────┐    ┌──────────┐    ┌──────────┐
  │ RX Port 0│───▶│  L3      │───▶│ TX Port 1│
  │          │    │ 查表     │    │          │
  └──────────┘    └──────────┘    └──────────┘

2. Burst 模式（批量处理）

2.1 为什么用 Burst？

每次函数调用的固定开销（DPDK 在用户态轮询网卡，收发包不涉及系统调用）：
  - rte_eth_rx_burst()：每次调用有固定开销
  - 收 1 个包 vs 收 32 个包：固定开销相同

使用 Burst：
  - 一次收 32 个包
  - 分摊固定开销
  - 更好利用 DMA 批量传输

典型 Burst 大小：
  - 16：保守，适用于低延迟
  - 32：推荐值
  - 64：高吞吐
  - 128：极高吞吐（可能增加延迟）

2.2 Burst 收发实现

// burst_fwd.c — Burst 模式转发
#define BURST_SIZE 32

/*
 * Receive one burst from queue 0 of in_port, hand every received mbuf to
 * process_packet(), then transmit the whole burst on queue 0 of out_port.
 * Mbufs that the TX call did not accept are freed.
 */
void burst_forward(uint16_t in_port, uint16_t out_port) {
    struct rte_mbuf *burst[BURST_SIZE];

    /* Pull up to BURST_SIZE packets in a single call. */
    uint16_t received = rte_eth_rx_burst(in_port, 0, burst, BURST_SIZE);
    if (received == 0) {
        return;
    }

    /* Per-packet work (table lookup, MAC rewrite, ...). */
    struct rte_mbuf **cur = burst;
    struct rte_mbuf **end = burst + received;
    while (cur != end) {
        process_packet(*cur, out_port);
        cur++;
    }

    /* Push the burst out; the return value is the number accepted. */
    uint16_t sent = rte_eth_tx_burst(out_port, 0, burst, received);
    if (sent >= received) {
        return;
    }

    /* Free the tail that was not accepted for transmission. */
    rte_pktmbuf_free_bulk(burst + sent, received - sent);
}

3. 性能优化

3.1 内存访问优化

// === 1. 数据局部性 ===
// 每个 lcore 使用自己的 mempool 和 ring
// 避免跨 NUMA 访问

// 坏：跨 NUMA 分配
mp = rte_pktmbuf_pool_create("MP", 8192, 256, 0,
                              RTE_MBUF_DEFAULT_BUF_SIZE,
                              1);  // 在 NUMA 1 上
// lcore 在 NUMA 0 上访问 → 跨 NUMA 慢

// Good: allocate on the NUMA node of the calling lcore.
// (SOCKET_ID_ANY places no constraint on the node, so it does not
// guarantee that the pool ends up in local memory.)
mp = rte_pktmbuf_pool_create("MP", 8192, 256, 0,
                              RTE_MBUF_DEFAULT_BUF_SIZE,
                              rte_socket_id());  // NUMA node of this lcore

// === 2. 缓存对齐 ===
struct my_data {
    uint32_t counter;
    char pad[CACHE_LINE_SIZE];  // 填充到 cache line 大小
} __rte_cache_aligned;
// 防止 false sharing（多个 lcore 写同一 cache line）

// === 3. 批量 mbuf 分配 ===
// 好：使用 bulk
rte_pktmbuf_alloc_bulk(pool, pkts, 32);

// 坏：逐个分配
for (int i = 0; i < 32; i++)
    pkts[i] = rte_pktmbuf_alloc(pool);

3.2 CPU 亲和性与隔离

# === 隔离 CPU ===
# 在 GRUB 中设置：
# GRUB_CMDLINE_LINUX="isolcpus=2-7 nohz_full=2-7 rcu_nocbs=2-7"

# 解释：
# isolcpus=2-7     — 隔离 CPU 2-7（用户态任务）
# nohz_full=2-7    — 这些 CPU 不接收 periodic timer tick
# rcu_nocbs=2-7    — 这些 CPU 不处理 RCU callback

# 将 DPDK lcore 绑定到隔离的 CPU
# ./app -l 2-7 --socket-mem 1024 -- ...

# === CPU 频率 ===
# 确保 CPU 频率不被节能策略降频
sudo cpupower frequency-set -g performance

# 查看当前频率
cat /proc/cpuinfo | grep "cpu MHz"

# === IRQ affinity ===
# Move the NIC interrupts onto a CPU that runs no DPDK lcore.
# /proc/interrupts has one "<irq>:" row per interrupt line; take the IRQ
# numbers of the rows that mention eth0 and pin each of them to CPU 0.
# (smp_affinity_list takes CPU numbers, not a bitmask, so CPU 0 is "0".)
for irq in $(awk -F: '/eth0/ {gsub(/ /, "", $1); print $1}' /proc/interrupts); do
    echo 0 | sudo tee "/proc/irq/${irq}/smp_affinity_list"  # CPU 0
done

3.3 mbuf 预取

// 预取数据到缓存（减少 cache miss）
#include <rte_prefetch.h>

/*
 * Run do_something() on every packet of a burst, in order, while
 * prefetching packet data a few packets ahead of the one being processed.
 *
 * A prefetch only helps when it is issued well before the data is used,
 * so the first PREFETCH_OFFSET packets are prefetched up front and each
 * loop iteration then requests the packet PREFETCH_OFFSET positions ahead.
 * Every packet is prefetched exactly once.
 *
 * pkts    - array of mbuf pointers
 * nb_pkts - number of valid entries in pkts
 */
void process_packets(struct rte_mbuf **pkts, int nb_pkts) {
    /* Distance, in packets, between the prefetch and the use. */
    enum { PREFETCH_OFFSET = 3 };
    int i;

    /* Warm-up: start loading the first few packets before the loop. */
    for (i = 0; i < PREFETCH_OFFSET && i < nb_pkts; i++) {
        rte_prefetch0(rte_pktmbuf_mtod(pkts[i], void *));
    }

    for (i = 0; i < nb_pkts; i++) {
        /* Request a later packet so the load overlaps with this work. */
        if (i + PREFETCH_OFFSET < nb_pkts) {
            rte_prefetch0(rte_pktmbuf_mtod(pkts[i + PREFETCH_OFFSET], void *));
        }

        /* Process the current packet. */
        do_something(pkts[i]);
    }
}

3.4 向量化处理

// 使用 SIMD 指令批量处理（编译器自动向量化）
// -O3 会自动使用 SSE/AVX

// UDP/TCP checksum of an IPv4 packet held in a (possibly multi-segment)
// mbuf; l4_off is the offset of the L4 header inside the mbuf
uint16_t rte_ipv4_udptcp_cksum_mbuf(const struct rte_mbuf *m,
                                     const struct rte_ipv4_hdr *ipv4_hdr,
                                     uint16_t l4_off);

// MAC address comparison: returns non-zero when a and b are identical
rte_is_same_ether_addr(&a, &b);

3.5 避免的陷阱

// ❌ 不要做的事：

// 1. 在转发路径中调用 printf/syslog
printf("Processing packet\n");  // 极慢！

// 2. 在转发路径中 malloc/free
char *buf = malloc(256);       // 禁止！
// 用预先分配的缓冲区或 rte_mempool；rte_malloc 的分配/释放路径同样带锁，也不适合放在转发路径

// 3. 在转发路径中加锁
rte_spinlock_lock(&lock);      // 避免！
// 用 per-lcore 数据或 lock-free 结构

// 4. 使用小 burst size
rte_eth_rx_burst(port, 0, pkts, 1);  // 每次只收 1 个！

// 5. 跨 NUMA 访问
mp = rte_pktmbuf_pool_create(..., 1);  // 创建在 NUMA 1
// lcore 在 NUMA 0 → 跨 NUMA 慢

// 6. 共享 mutable 状态
static int counter;  // 多 lcore 同时写 → 竞争
// 用 per-lcore 计数器
static __thread int counter;  // thread-local

4. 多核转发示例

// multicore_fwd.c
#include <stdio.h>

#include <rte_eal.h>
#include <rte_ethdev.h>
#include <rte_launch.h>
#include <rte_lcore.h>
#include <rte_mbuf.h>

#define BURST_SIZE 32
#define NB_MBUF    65536

static struct rte_mempool *mbuf_pool;

// Per-lcore forwarding configuration: the port/queue an lcore receives
// from and the port/queue it transmits to.
struct lcore_conf {
    uint16_t rx_port;   // port ID passed to rte_eth_rx_burst()
    uint16_t tx_port;   // port ID passed to rte_eth_tx_burst()
    uint8_t  rx_queue;  // RX queue index used on rx_port
    uint8_t  tx_queue;  // TX queue index used on tx_port
} __rte_cache_aligned;

// One slot per possible lcore; main_loop indexes it with rte_lcore_id()
static struct lcore_conf lcore_confs[RTE_MAX_LCORE];

// Forwarding loop; one instance runs on each lcore it is launched on.
// It polls the RX port/queue configured for the calling lcore, swaps the
// Ethernet source and destination addresses of every received frame and
// transmits the burst on the configured TX port/queue. Frames the TX call
// does not accept are freed. The loop has no exit condition.
static int main_loop(__rte_unused void *arg) {
    const unsigned int self = rte_lcore_id();
    struct lcore_conf *conf = &lcore_confs[self];
    struct rte_mbuf *burst[BURST_SIZE];

    printf("lcore %u: RX=%u TX=%u\n", self, conf->rx_port, conf->tx_port);

    for (;;) {
        // Receive
        uint16_t received = rte_eth_rx_burst(conf->rx_port, conf->rx_queue,
                                             burst, BURST_SIZE);
        if (received == 0) {
            continue;
        }

        // Rewrite each frame: exchange source and destination MAC
        for (uint16_t idx = 0; idx < received; idx++) {
            struct rte_ether_hdr *hdr =
                rte_pktmbuf_mtod(burst[idx], struct rte_ether_hdr *);
            struct rte_ether_addr old_src = hdr->src_addr;

            hdr->src_addr = hdr->dst_addr;
            hdr->dst_addr = old_src;
        }

        // Transmit
        uint16_t sent = rte_eth_tx_burst(conf->tx_port, conf->tx_queue,
                                         burst, received);

        // Free whatever was not accepted for transmission
        if (sent < received) {
            rte_pktmbuf_free_bulk(&burst[sent], received - sent);
        }
    }

    return 0;  // not reached
}

// Program entry point: initialise the EAL, create the mbuf pool,
// initialise every available port, give each port to its own worker
// lcore and start main_loop on exactly those lcores.
//
// Returns 1 when initialisation fails (EAL, no ports, no mbuf pool,
// no worker lcore); otherwise it blocks in rte_eal_mp_wait_lcore().
int main(int argc, char **argv) {
    int ret;
    uint16_t port_id;
    int nb_ports;
    unsigned int lcore;
    unsigned int nb_launched = 0;

    // EAL initialisation; on success ret is the number of parsed args
    ret = rte_eal_init(argc, argv);
    if (ret < 0) return 1;
    argc -= ret; argv += ret;

    // Without ports there is nothing to forward; this also keeps the
    // "% nb_ports" below from dividing by zero
    nb_ports = rte_eth_dev_count_avail();
    if (nb_ports == 0) {
        fprintf(stderr, "No Ethernet ports available\n");
        return 1;
    }

    // Create the mbuf pool on the NUMA node of the calling lcore
    mbuf_pool = rte_pktmbuf_pool_create("MBUF", NB_MBUF, 256,
                                         0, RTE_MBUF_DEFAULT_BUF_SIZE,
                                         rte_socket_id());
    if (mbuf_pool == NULL) {
        fprintf(stderr, "Cannot create mbuf pool\n");
        return 1;
    }

    // Initialise the ports
    RTE_ETH_FOREACH_DEV(port_id) {
        port_init(port_id);
    }

    // Map ports to worker lcores, one port per worker, and launch the
    // loop on each worker as soon as its slot is filled in.
    // rte_get_next_lcore(prev, 1, 0) skips the main lcore and returns
    // RTE_MAX_LCORE once the workers are exhausted. Only configured
    // lcores are launched: an unconfigured one would read its
    // zero-initialised slot and poll port 0 / queue 0 alongside the
    // lcore that owns it.
    lcore = rte_get_next_lcore(-1, 1, 0);
    RTE_ETH_FOREACH_DEV(port_id) {
        if (lcore >= RTE_MAX_LCORE) {
            fprintf(stderr, "No worker lcore left for port %u\n", port_id);
            break;
        }

        lcore_confs[lcore].rx_port = port_id;
        lcore_confs[lcore].tx_port = (port_id + 1) % nb_ports;

        if (rte_eal_remote_launch(main_loop, NULL, lcore) == 0)
            nb_launched++;
        else
            fprintf(stderr, "Cannot launch main_loop on lcore %u\n", lcore);

        lcore = rte_get_next_lcore(lcore, 1, 0);
    }

    if (nb_launched == 0) {
        fprintf(stderr, "No worker lcore running; start with at least 2 lcores\n");
        return 1;
    }

    // Wait for the launched lcores
    rte_eal_mp_wait_lcore();

    return 0;
}

5. 性能测试

5.1 吞吐测试

# === testpmd 吞吐测试 ===

# Start testpmd in interactive mode (-i) so that the commands below can
# be typed at the "testpmd>" prompt
sudo ./build/app/dpdk-testpmd -l 0-3 --socket-mem 1024 \
    -- -i --portmask=0x3 --forward-mode=io \
    --nb-cores=2 --mbcache=250 --txd=1024 --rxd=1024

# At the testpmd prompt:
testpmd> start tx_first  # send an initial burst on each port, then start forwarding
testpmd> show port stats all  # display the statistics of all ports

# 目标指标（10GbE，64B 包）：
#   PPS:    ≥ 14.88 Mpps（线速）
#   吞吐:   ≥ 9.5 Gbps

5.2 延迟测试

# === Latency statistics with testpmd ===

# dpdk-l2fwd has no latency option. testpmd can collect latency
# statistics with --latencystats=N, where N is the lcore that updates them
sudo ./build/app/dpdk-testpmd -l 0-3 --socket-mem 1024 \
    -- --portmask=0x3 --forward-mode=io --latencystats=1

# Or measure it yourself:
# take a TSC timestamp when a packet is sent and another when it is received
// uint64_t start = rte_rdtsc();
// ... forward ...
// uint64_t end = rte_rdtsc();
// double latency_ns = (double)(end - start) * 1e9 / rte_get_tsc_hz();

5.3 性能基准参考

单 lcore 转发性能（Intel Xeon E5-2680 v4）:

包大小  | 64B  | 128B  | 256B  | 512B  | 1024B | 1518B
------- |------|-------|-------|-------|-------|------
PPS     | 12M  | 10M   | 7M    | 4M    | 2.5M  | 2M
吞吐    | 6.2G | 10.2G | 14.4G | 16.5G | 20.5G | 24.4G

10GbE 线速（64B）：14.88 Mpps
25GbE 线速（64B）：37.20 Mpps
100GbE 线速（64B）：148.81 Mpps

CPU 占用（单 lcore 跑满 10GbE 64B）：
  - 约 70-80% 单核
  - 需要 2 个 lcore 留余量

6. 性能调优检查清单

# 1. CPU isolation (isolcpus + nohz_full + rcu_nocbs on the data-plane CPUs)
sudo grubby --update-kernel=ALL --args="isolcpus=2-7 nohz_full=2-7 rcu_nocbs=2-7"
sudo reboot

# 2. CPU frequency governor
sudo cpupower frequency-set -g performance

# 3. Hugepages (the mount point has to exist before mounting)
echo 1024 | sudo tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages
sudo mkdir -p /mnt/huge
sudo mount -t hugetlbfs nodev /mnt/huge

# 4. Bind the NICs to vfio-pci
sudo dpdk-devbind.py --bind=vfio-pci 0000:01:00.0 0000:01:00.1

# 5. Disable power saving on the data-plane CPUs
for cpu in /sys/devices/system/cpu/cpu[2-7]; do
    echo performance | sudo tee "$cpu/cpufreq/scaling_governor"
done

# 6. Check the IRQ placement
grep eth /proc/interrupts

# 7. Run and observe
sudo ./my_fwd -l 2-7 --socket-mem 1024

本文作者： CoderSong
本文链接： https://jack-song-gif.github.io/2026/06/24/第22周：多核架构与性能优化/
版权声明： 本博客所有文章除特别声明外，均采用 MIT 许可协议。转载请注明出处！