DPDK Series, Part 38: NIC Optimization

1. NIC Optimization

Generally speaking, optimizing a device well means looking beyond the hardware itself to the software and hardware upstream and downstream of it, and tuning them in concert; only then can the optimum be reached. The same holds for DPDK: besides the software layer, the hardware platform and its related settings have to be brought under control. Only by treating the whole as a single system can the optimization truly be done well.

2. The Software Layer

In real deployments, traffic may be sparse at times and a sustained, high-volume torrent at others; most often the two alternate. The most familiar example is the daily rhythm: many people online during some hours, very few during others, with peaks and troughs much like rush-hour road traffic. DPDK therefore needs to adapt its behavior dynamically, and the software layer is the natural place to design for this. In fact, DPDK provides three modes matching these scenarios:
1. Asynchronous interrupt mode
Strictly speaking, all interrupts are asynchronous; one hardly ever hears of a "synchronous interrupt". This mode suits light send/receive traffic, since the CPU is freed for other work until a packet actually arrives, making better use of hardware resources.
2. Polling mode
This simply means querying the interface over and over again; it is especially suited to sustained, high-volume throughput.
3. Hybrid mode
This accommodates both of the scenarios above, polling while traffic is heavy and falling back to interrupts when the queue goes quiet; in practice it is often the better fit. A sketch of such a loop follows below.
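Below is a minimal sketch of what such a hybrid loop can look like when written against the public ethdev interrupt API, modeled loosely on DPDK's l3fwd-power example. It assumes the port was configured with dev_conf.intr_conf.rxq = 1 and already started; port_id, queue_id, BURST and IDLE_LIMIT are placeholders, and the race between the last empty poll and arming the interrupt is glossed over:

#include <rte_ethdev.h>
#include <rte_interrupts.h>
#include <rte_mbuf.h>

#define BURST      32
#define IDLE_LIMIT 300	/* hypothetical: empty polls tolerated before sleeping */

static void hybrid_rx_loop(uint16_t port_id, uint16_t queue_id)
{
	struct rte_mbuf *pkts[BURST];
	struct rte_epoll_event ev;
	unsigned int idle = 0;

	/* Register the queue's interrupt with this thread's epoll instance. */
	rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD,
				  RTE_INTR_EVENT_ADD, NULL);

	for (;;) {
		uint16_t nb = rte_eth_rx_burst(port_id, queue_id, pkts, BURST);
		if (nb > 0) {
			idle = 0;
			/* ... process and free the nb packets ... */
			continue;
		}
		if (++idle < IDLE_LIMIT)
			continue;	/* keep spinning: traffic may be bursty */

		/* Quiet for a while: arm the interrupt and block until the
		 * NIC signals new packets, then return to pure polling. */
		rte_eth_dev_rx_intr_enable(port_id, queue_id);
		rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);
		rte_eth_dev_rx_intr_disable(port_id, queue_id);
		idle = 0;
	}
}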

3. IO Optimization

This concerns the data path itself: the granularity of the data, the batch size used when processing it, cache-line alignment, and SIMD instruction tuning for specific hardware. DPDK sends and receives packets in bursts, which cuts the number of memory and Cache accesses per packet, and uses this batching to improve both latency and throughput.
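As an illustration, here is a minimal run-to-completion forwarding loop in this Burst style; the port variables, queue 0, and the batch size of 32 are assumptions. The point is that the fixed cost of each call (function dispatch, descriptor and Cache traffic) is paid once per batch rather than once per packet:

#include <rte_ethdev.h>
#include <rte_mbuf.h>

#define BURST_SIZE 32	/* a common batch size; the best value is workload-dependent */

static void forward_loop(uint16_t rx_port, uint16_t tx_port)
{
	struct rte_mbuf *bufs[BURST_SIZE];

	for (;;) {
		/* One call pulls up to BURST_SIZE packets off the RX ring. */
		uint16_t nb_rx = rte_eth_rx_burst(rx_port, 0, bufs, BURST_SIZE);
		if (nb_rx == 0)
			continue;

		/* Hand the whole batch to the TX ring in one call as well. */
		uint16_t nb_tx = rte_eth_tx_burst(tx_port, 0, bufs, nb_rx);

		/* Free whatever the TX ring could not accept. */
		for (uint16_t i = nb_tx; i < nb_rx; i++)
			rte_pktmbuf_free(bufs[i]);
	}
}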

4. Platform and Configuration Optimization

There is a lot in this category, and generally it has less to do with the software technique itself than with the hardware, the OS, and the specific features they expose. Support from the BIOS and OS may also be involved: huge page memory, power management settings, features in the NIC's own firmware, bus (PCIe) control, and so on.
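Most of these knobs live outside the application (BIOS screens, kernel boot parameters, firmware), but several surface through EAL options. Here is a hedged sketch of wiring core pinning, memory channels and per-socket hugepage memory into rte_eal_init(); every value below is illustrative, not a recommendation:

#include <rte_eal.h>

int main(void)
{
	char *eal_args[] = {
		"app",
		"-l", "1-3",		  /* run lcores 1-3, away from housekeeping core 0 */
		"-n", "4",		  /* number of memory channels */
		"--socket-mem", "1024,0"  /* hugepage memory per NUMA socket, in MB */
	};
	int eal_argc = sizeof(eal_args) / sizeof(eal_args[0]);

	if (rte_eal_init(eal_argc, eal_args) < 0)
		return -1;

	/* ... port probing and configuration would follow here ... */
	return 0;
}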

5. Adapting the Relevant Settings

Adapting the relevant settings means tuning the lengths of the receive and transmit queues, and the thresholds for how many releasable descriptors a queue may hold before action is taken. These should be arranged according to the documentation and the needs of the actual scenario, so that the software layer adapts as well as possible, the exchange of packets between software and NIC stays coordinated, and the optimum is reached.
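A sketch of what such adaptation can look like through the ethdev API: start from the defaults the driver reports via rte_eth_dev_info_get(), then override the ring sizes and free-descriptor thresholds. The concrete numbers are placeholders to be tuned against the NIC documentation and the workload:

#include <rte_ethdev.h>

static int setup_queues(uint16_t port_id, struct rte_mempool *mb_pool)
{
	struct rte_eth_dev_info info;
	struct rte_eth_rxconf rxq_conf;
	struct rte_eth_txconf txq_conf;
	int ret;

	ret = rte_eth_dev_info_get(port_id, &info);
	if (ret != 0)
		return ret;

	rxq_conf = info.default_rxconf;	/* start from the driver's defaults */
	rxq_conf.rx_free_thresh = 32;	/* freed descriptors held back before RDT write-back */

	txq_conf = info.default_txconf;
	txq_conf.tx_free_thresh = 32;	/* reclaim TX descriptors in batches */

	/* 1024-entry rings on queue 0 only, kept on the port's NUMA node. */
	ret = rte_eth_rx_queue_setup(port_id, 0, 1024,
				     rte_eth_dev_socket_id(port_id),
				     &rxq_conf, mb_pool);
	if (ret < 0)
		return ret;

	return rte_eth_tx_queue_setup(port_id, 0, 1024,
				      rte_eth_dev_socket_id(port_id),
				      &txq_conf);
}

Note how rx_free_thresh reappears as rxq->rx_free_thresh in the e1000 receive loop analyzed below: it is the threshold that gates when the RDT register is advanced.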

6. Source Code Analysis

With the optimization measures in mind, look at the relevant DPDK source. Among DPDK's poll-mode drivers there are drivers for Gigabit, 10-Gigabit, and 40-Gigabit NICs as well as for paravirtualized IO, and both the run-to-completion (RTC) and the pipeline model are supported; neither is expanded on here:

// lib/librte_ethdev/rte_ethdev.h
static inline uint16_t
rte_eth_rx_burst(uint16_t port_id, uint16_t queue_id,
		 struct rte_mbuf **rx_pkts, const uint16_t nb_pkts)
{
	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
	uint16_t nb_rx;

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
	RTE_FUNC_PTR_OR_ERR_RET(*dev->rx_pkt_burst, 0);

	if (queue_id >= dev->data->nb_rx_queues) {
		RTE_ETHDEV_LOG(ERR, "Invalid RX queue_id=%u\n", queue_id);
		return 0;
	}
#endif
	nb_rx = (*dev->rx_pkt_burst)(dev->data->rx_queues[queue_id],
				     rx_pkts, nb_pkts);

#ifdef RTE_ETHDEV_RXTX_CALLBACKS
	struct rte_eth_rxtx_callback *cb;

	/* __ATOMIC_RELEASE memory order was used when the
	 * call back was inserted into the list.
	 * Since there is a clear dependency between loading
	 * cb and cb->fn/cb->next, __ATOMIC_ACQUIRE memory order is
	 * not required.
	 */
	cb = __atomic_load_n(&dev->post_rx_burst_cbs[queue_id],
				__ATOMIC_RELAXED);

	if (unlikely(cb != NULL)) {
		do {
			nb_rx = cb->fn.rx(port_id, queue_id, rx_pkts, nb_rx,
						nb_pkts, cb->param);
			cb = cb->next;
		} while (cb != NULL);
	}
#endif

	return nb_rx;
}
static inline uint16_t
rte_eth_tx_burst(uint16_t port_id, uint16_t queue_id,
		 struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
{
	struct rte_eth_dev *dev = &rte_eth_devices[port_id];

#ifdef RTE_LIBRTE_ETHDEV_DEBUG
	RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, 0);
	RTE_FUNC_PTR_OR_ERR_RET(*dev->tx_pkt_burst, 0);

	if (queue_id >= dev->data->nb_tx_queues) {
		RTE_ETHDEV_LOG(ERR, "Invalid TX queue_id=%u\n", queue_id);
		return 0;
	}
#endif

#ifdef RTE_ETHDEV_RXTX_CALLBACKS
	struct rte_eth_rxtx_callback *cb;

	/* __ATOMIC_RELEASE memory order was used when the
	 * call back was inserted into the list.
	 * Since there is a clear dependency between loading
	 * cb and cb->fn/cb->next, __ATOMIC_ACQUIRE memory order is
	 * not required.
	 */
	cb = __atomic_load_n(&dev->pre_tx_burst_cbs[queue_id],
				__ATOMIC_RELAXED);

	if (unlikely(cb != NULL)) {
		do {
			nb_pkts = cb->fn.tx(port_id, queue_id, tx_pkts, nb_pkts,
					cb->param);
			cb = cb->next;
		} while (cb != NULL);
	}
#endif

	return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
}

In the burst RX function above, two pieces of code deserve attention: the call through rx_pkt_burst, which does the real work of receiving, and the loop that walks the chain of RX callbacks looking for registered handlers. Calling this function over and over from a loop above the PMD, as in the burst loop sketched in section 3, is exactly what polling amounts to. As for the former, the source comment describes "eth_rx_burst_t rx_pkt_burst" as a pointer to the PMD receive function, which makes the picture clear. The assignment can be seen in eth_em_rx_init (drivers/net/e1000/em_rxtx.c):

int
eth_em_rx_init(struct rte_eth_dev *dev)
{
	struct e1000_hw *hw;
......
	if (hw->mac.type == e1000_82573)
		E1000_WRITE_REG(hw, E1000_RDTR, 0x20);

	dev->rx_pkt_burst = (eth_rx_burst_t)eth_em_recv_pkts;
  ......
}

uint16_t
eth_em_recv_pkts(void *rx_queue, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts)
{
	volatile struct e1000_rx_desc *rx_ring;
	volatile struct e1000_rx_desc *rxdp;
	struct em_rx_queue *rxq;
	struct em_rx_entry *sw_ring;
	struct em_rx_entry *rxe;
	struct rte_mbuf *rxm;
	struct rte_mbuf *nmb;
	struct e1000_rx_desc rxd;
	uint64_t dma_addr;
	uint16_t pkt_len;
	uint16_t rx_id;
	uint16_t nb_rx;
	uint16_t nb_hold;
	uint8_t status;

	rxq = rx_queue;

	nb_rx = 0;
	nb_hold = 0;
	rx_id = rxq->rx_tail;
	rx_ring = rxq->rx_ring;
	sw_ring = rxq->sw_ring;
	while (nb_rx < nb_pkts) {
		/*
		 * The order of operations here is important as the DD status
		 * bit must not be read after any other descriptor fields.
		 * rx_ring and rxdp are pointing to volatile data so the order
		 * of accesses cannot be reordered by the compiler. If they were
		 * not volatile, they could be reordered which could lead to
		 * using invalid descriptor fields when read from rxd.
		 */
		rxdp = &rx_ring[rx_id];
		status = rxdp->status;
		if (! (status & E1000_RXD_STAT_DD))
			break;
		rxd = *rxdp;

		/*
		 * End of packet.
		 *
		 * If the E1000_RXD_STAT_EOP flag is not set, the RX packet is
		 * likely to be invalid and to be dropped by the various
		 * validation checks performed by the network stack.
		 *
		 * Allocate a new mbuf to replenish the RX ring descriptor.
		 * If the allocation fails:
		 *    - arrange for that RX descriptor to be the first one
		 *      being parsed the next time the receive function is
		 *      invoked [on the same queue].
		 *
		 *    - Stop parsing the RX ring and return immediately.
		 *
		 * This policy does not drop the packet received in the RX
		 * descriptor for which the allocation of a new mbuf failed.
		 * Thus, it allows that packet to be later retrieved if
		 * mbufs have been freed in the meantime.
		 * As a side effect, holding RX descriptors instead of
		 * systematically giving them back to the NIC may lead to
		 * RX ring exhaustion situations.
		 * However, the NIC can gracefully prevent such situations
		 * from happening by sending specific "back-pressure" flow
		 * control frames to its peer(s).
		 */
		PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_id=%u "
			   "status=0x%x pkt_len=%u",
			   (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
			   (unsigned) rx_id, (unsigned) status,
			   (unsigned) rte_le_to_cpu_16(rxd.length));

		nmb = rte_mbuf_raw_alloc(rxq->mb_pool);
		if (nmb == NULL) {
			PMD_RX_LOG(DEBUG, "RX mbuf alloc failed port_id=%u "
				   "queue_id=%u",
				   (unsigned) rxq->port_id,
				   (unsigned) rxq->queue_id);
			rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed++;
			break;
		}

		nb_hold++;
		rxe = &sw_ring[rx_id];
		rx_id++;
		if (rx_id == rxq->nb_rx_desc)
			rx_id = 0;

		/* Prefetch next mbuf while processing current one. */
		rte_em_prefetch(sw_ring[rx_id].mbuf);

		/*
		 * When next RX descriptor is on a cache-line boundary,
		 * prefetch the next 4 RX descriptors and the next 8 pointers
		 * to mbufs.
		 */
		if ((rx_id & 0x3) == 0) {
			rte_em_prefetch(&rx_ring[rx_id]);
			rte_em_prefetch(&sw_ring[rx_id]);
		}

		/* Rearm RXD: attach new mbuf and reset status to zero. */

		rxm = rxe->mbuf;
		rxe->mbuf = nmb;
		dma_addr =
			rte_cpu_to_le_64(rte_mbuf_data_iova_default(nmb));
		rxdp->buffer_addr = dma_addr;
		rxdp->status = 0;

		/*
		 * Initialize the returned mbuf.
		 * 1) setup generic mbuf fields:
		 *    - number of segments,
		 *    - next segment,
		 *    - packet length,
		 *    - RX port identifier.
		 * 2) integrate hardware offload data, if any:
		 *    - RSS flag & hash,
		 *    - IP checksum flag,
		 *    - VLAN TCI, if any,
		 *    - error flags.
		 */
		pkt_len = (uint16_t) (rte_le_to_cpu_16(rxd.length) -
				rxq->crc_len);
		rxm->data_off = RTE_PKTMBUF_HEADROOM;
		rte_packet_prefetch((char *)rxm->buf_addr + rxm->data_off);
		rxm->nb_segs = 1;
		rxm->next = NULL;
		rxm->pkt_len = pkt_len;
		rxm->data_len = pkt_len;
		rxm->port = rxq->port_id;

		rxm->ol_flags = rx_desc_status_to_pkt_flags(status);
		rxm->ol_flags = rxm->ol_flags |
				rx_desc_error_to_pkt_flags(rxd.errors);

		/* Only valid if PKT_RX_VLAN set in pkt_flags */
		rxm->vlan_tci = rte_le_to_cpu_16(rxd.special);

		/*
		 * Store the mbuf address into the next entry of the array
		 * of returned packets.
		 */
		rx_pkts[nb_rx++] = rxm;
	}
	rxq->rx_tail = rx_id;

	/*
	 * If the number of free RX descriptors is greater than the RX free
	 * threshold of the queue, advance the Receive Descriptor Tail (RDT)
	 * register.
	 * Update the RDT with the value of the last processed RX descriptor
	 * minus 1, to guarantee that the RDT register is never equal to the
	 * RDH register, which creates a "full" ring situation from the
	 * hardware point of view...
	 */
	nb_hold = (uint16_t) (nb_hold + rxq->nb_rx_hold);
	if (nb_hold > rxq->rx_free_thresh) {
		PMD_RX_LOG(DEBUG, "port_id=%u queue_id=%u rx_tail=%u "
			   "nb_hold=%u nb_rx=%u",
			   (unsigned) rxq->port_id, (unsigned) rxq->queue_id,
			   (unsigned) rx_id, (unsigned) nb_hold,
			   (unsigned) nb_rx);
		rx_id = (uint16_t) ((rx_id == 0) ?
			(rxq->nb_rx_desc - 1) : (rx_id - 1));
		E1000_PCI_REG_WRITE(rxq->rdt_reg_addr, rx_id);
		nb_hold = 0;
	}
	rxq->nb_rx_hold = nb_hold;
	return nb_rx;
}

The code above is the heart of the polling mechanism, and it is fairly easy to follow: DMA writes each packet into memory, i.e. into the mbuf attached to the ring. The line rxq->rx_tail = rx_id; should look familiar; acting much like a counter, it tracks the position at which fresh mbufs keep being obtained and re-attached to the ring.
The polling path itself is also easy to picture: UIO bypasses the kernel, the PMD takes over the hardware interrupt, and user space drives the device directly.
DPDK implements three interrupt backends: uio, the alarm timer, and vfio. An interrupt can be pictured as a signal telling a busy thread to drop what it is doing and attend to something else; to schedule this safely, the CPU must save and restore context (stack, data, and so on), which burns a great many clock cycles. With interrupts arriving frequently, it is easy to imagine how far throughput would fall. That code will be analyzed in a later article.

7. Summary

In short, to optimize a system one must look at it as a whole and leave no single link as an obvious weak point, very much in the spirit of the barrel principle: a barrel holds only as much as its shortest stave allows. Beyond that, the many stages must stay coordinated; a producer must not churn out work with no regard for the consumer. This is also why DPDK offers three processing modes: the right one depends on the scenario and cannot be prescribed once and for all. Getting the best out of DPDK takes not only capable software but also solid operation of the system as a whole.
