Linux delay相关函数实现

1、udelay

与 sleep 相关函数相比，delay 的最大区别是忙等、一直占用 CPU，而 sleep 会让出 CPU 控制权。

mdelay、ndelay都是基于 udelay 来实现的。在 include/linux/delay.h 中，如下：

/*
 * Using udelay() for intervals greater than a few milliseconds can
 * risk overflow for high loops_per_jiffy (high bogomips) machines. The
 * mdelay() provides a wrapper to prevent this.  For delays greater
 * than MAX_UDELAY_MS milliseconds, the wrapper is used.  Architecture
 * specific values can be defined in asm-???/delay.h as an override.
 * The 2nd mdelay() definition ensures GCC will optimize away the 
 * while loop for the common cases where n <= MAX_UDELAY_MS  --  Paul G.
 */
#ifndef MAX_UDELAY_MS
#define MAX_UDELAY_MS   5
#endif

/*
 * mdelay(n) - busy-wait for n milliseconds.
 *
 * When n is a compile-time constant no larger than MAX_UDELAY_MS, the
 * whole delay is issued as a single udelay((n) * 1000) call.  Otherwise
 * the delay is split into n calls of udelay(1000), so the argument handed
 * to udelay() never exceeds 1000 us.
 *
 * Every line of the definition except the last must end in a backslash:
 * the statement expression has to stay inside the macro body.
 */
#ifndef mdelay
#define mdelay(n) (\
    (__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
    ({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
#endif
 
#ifndef ndelay
/*
 * ndelay() - busy-wait for a delay given in nanoseconds.
 *
 * Generic fallback built on udelay(): the request @x is converted to
 * whole microseconds, rounding up, and that value is passed to udelay().
 */
static inline void ndelay(unsigned long x)
{
    unsigned long usecs = DIV_ROUND_UP(x, 1000);

    udelay(usecs);
}
#define ndelay(x) ndelay(x)
#endif
 
/*
 * Divide n by d, rounding the quotient up instead of truncating
 * (adds d - 1 to n before the integer division).
 * Note that d is evaluated twice, so it must not have side effects.
 */
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))

gcc 的内建函数 __builtin_constant_p 用于判断 n 是否为编译时常数，如果 n 是常数，返回 1，否则返回 0。
mdelay 的实现：如果参数 n 是编译期常数，且不大于 MAX_UDELAY_MS（通用默认值为 5，即 5ms），则直接调用一次 udelay((n)*1000)，也就是说通用实现认为单次 udelay 调用最多可以安全延时 MAX_UDELAY_MS 毫秒（默认 5000us）；否则循环调用 n 次 udelay(1000) 来达到延时目的。

当然，从注释中我们可以看到，include/linux/delay.h 文件中只是通用的实现，其中的 MAX_UDELAY_MS 等定义可以被架构相关的 asm/delay.h 覆盖掉。
所以接下来来看 udelay 实现。这里讨论基于 ARM 处理器架构的实现，udelay 实现在arch/arm/include/asm/delay.h中。

/*
 * division by multiplication: you don't have to worry about
 * loss of precision.
 *
 * Use only for very small delays ( < 2 msec).  Should probably use a
 * lookup table, really, as the multiplications take much too long with
 * short delays.  This is a "reasonable" implementation, though (and the
 * first constant multiplications gets optimized away if the delay is
 * a constant)
 */
 #define MAX_UDELAY_MS	2

/* Both helpers call through the udelay / const_udelay members of arm_delay_ops. */
#define __udelay(n)		arm_delay_ops.udelay(n)
#define __const_udelay(n)	arm_delay_ops.const_udelay(n)

/*
 * udelay(n) - delay for n microseconds.
 *
 *  - n is a compile-time constant larger than MAX_UDELAY_MS * 1000:
 *    expands to __bad_udelay().
 *    NOTE(review): __bad_udelay() is not defined in this excerpt; presumably
 *    it is left undefined so an oversized constant delay fails the build.
 *    Confirm against the full header.
 *  - n is any other compile-time constant: the multiplication by
 *    UDELAY_MULT is written out in the macro and the product is passed
 *    to __const_udelay().
 *  - n is not a compile-time constant: n is passed unscaled to __udelay().
 */
#define udelay(n)							\
	(__builtin_constant_p(n) ?					\
	  ((n) > (MAX_UDELAY_MS * 1000) ? __bad_udelay() :		\
			__const_udelay((n) * UDELAY_MULT)) :		\
	  __udelay(n))

1.1 loops_per_jiffy

这里我们先要了解一下，有个很重要的变量：jiffies

全局变量 jiffies 用来记录自系统启动以来产生的节拍的总数。启动时，内核会给该变量设置一个初值（概念上可以认为从 0 开始计数；实际内核中初值为 INITIAL_JIFFIES，目的是让计数器回绕问题尽早暴露），此后，每次时钟中断处理程序都会增加该变量的值。一秒内时钟中断的次数等于 HZ，所以 jiffies 一秒内增加的值也就是 HZ。

而 loops_per_jiffy 的定义就是每个 jiffies 内需要 loop 的个数。Linux 内核在启动时就计算出了当前处理器一个 jiffies 内可以处理的循环次数，也就是 loops_per_jiffy，在 Linux 系统启动过程中可以查看到：

[    0.577610] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
[    0.577674] APIC: Switch to symmetric I/O mode setup
 x2apic enabled
[    0.577973] Switched APIC routing to physical x2apic.
[    0.578765] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[    0.578809] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x23fa781fa7b, max_idle_ns: 440795226023 ns
[    0.578814] Calibrating delay loop (skipped) preset value.. 4992.00 BogoMIPS (lpj=9984004)
[    0.578903] x86/cpu: User Mode Instruction Prevention (UMIP) activated

loops_per_jiffy 是通过 calibrate_delay_converge 函数计算的，init/calibrate.c，我们这里详细讲解下：

/*
 * Number of bits of precision for the binary search in
 * calibrate_delay_converge(): refinement stops once the step size is
 * no larger than lpj >> LPS_PREC.
 */
#define LPS_PREC 8

/*
 * Estimate how many __delay() loops fit into one jiffy by timing
 * __delay() against the jiffies counter.
 *
 * Stage 1: synchronise to a tick edge, then call __delay() with a growing
 *          argument (lpj * band) until jiffies changes.  This yields an
 *          underestimate (lpj_base) and the size of the last,
 *          overshooting step (loopadd_base).
 * Stage 2: binary approximation - repeatedly add a step to lpj, keep it
 *          only if __delay(lpj) still finishes within one tick, and halve
 *          the step until it is no larger than lpj >> LPS_PREC.
 * If every step was accepted, the initial range was too small and the
 * search is restarted with a wider one.
 *
 * Returns the calibrated loop count.
 */
static unsigned long calibrate_delay_converge(void)
{

	/* First stage - slowly accelerate to find initial bounds */
	unsigned long lpj, lpj_base, ticks, loopadd, loopadd_base, chop_limit;
	int trials = 0, band = 0, trial_in_band = 0;

	lpj = (1<<12);

	/* wait for "start of" clock tick */
	ticks = jiffies;
	while (ticks == jiffies)  			// spin until jiffies advances to the next tick
		; /* nothing */
	/* Go .. */
	ticks = jiffies;
	do {

		if (++trial_in_band == (1<<band)) {

			++band;
			trial_in_band = 0;
		}
		__delay(lpj * band);
		trials += band;
	} while (ticks == jiffies);	// keep delaying until the tick changes (the argument of __delay is a loop count)
								// the total number of loops requested so far is lpj * trials
	/*
	 * We overshot, so retreat to a clear underestimate. Then estimate
	 * the largest likely undershoot. This defines our chop bounds.
	 */
	trials -= band;				// discard the final iteration: the tick changed during it, so it must not be counted
	loopadd_base = lpj * band;  // loop count of that final (overshooting) iteration
	lpj_base = lpj * trials;    // loop count without the final iteration: an underestimate of one jiffy

recalibrate:
	lpj = lpj_base;			   // start from the underestimate
	loopadd = loopadd_base;    // initial step: loop count of the final first-stage iteration
							   // the code below only fine-tunes within that final step

	/*
	 * Do a binary approximation to get lpj set to
	 * equal one clock (up to LPS_PREC bits)
	 */
	chop_limit = lpj >> LPS_PREC;  // smallest step worth testing: lpj / 2^LPS_PREC
	/*
	 * Binary search towards the true value: keep refining while the
	 * current step is still larger than chop_limit.
	 */
	while (loopadd > chop_limit) {

		lpj += loopadd;		// tentatively add the step; it is kept only if the delay stays shorter than 1 tick
		ticks = jiffies;
		while (ticks == jiffies)
			; /* nothing */
		ticks = jiffies;
		__delay(lpj);
		if (jiffies != ticks)	/* longer than 1 tick */
			lpj -= loopadd;
		loopadd >>= 1;		// halve the step for the next round
	}
	/*
	 * If we incremented every single time possible, presume we've
	 * massively underestimated initially, and retry with a higher
	 * start, and larger range. (Only seen on x86_64, due to SMIs)
	 */
	if (lpj + loopadd * 2 == lpj_base + loopadd_base * 2) {

		lpj_base = lpj;
		loopadd_base <<= 2;
		goto recalibrate;
	}

	return lpj;
}

上面 calibrate_delay_converge 函数中调用的 __delay 实际上是下面的 __loop_delay 函数。__loop_delay 实现就是将参数一直 subs 递减，反复跳转。所以我的理解，一个 loop 就是一条 arm 递减指令+跳转指令。

@ Busy-wait loop: r0 holds the number of iterations to spin for.
ENTRY(__loop_delay)
		subs	r0, r0, #1		@ r0 = r0 - 1, updating the condition flags
		bhi	__loop_delay		@ loop again while no borrow occurred and the result is non-zero
		ret	lr			@ count exhausted: return to the caller

至此，loops_per_jiffy 变量就已经计算完毕，后面的 udelay 、BogoMIPS 计算都会用到该变量。

1.2 udelay

udelay 函数，最终会调用到 __loop_const_udelay 或者 __loop_udelay，二者实现在 arch/arm/lib/delay-loop.S 中，如下：

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 *  linux/arch/arm/lib/delay.S
 *
 *  Copyright (C) 1995, 1996 Russell King
 */
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/delay.h>

#ifdef CONFIG_ARCH_RPC
		.arch	armv4
#endif

		.text

@ Literal pool: .LC0 holds the address of loops_per_jiffy,
@ .LC1 holds the value of UDELAY_MULT.
.LC0:		.word	loops_per_jiffy
.LC1:		.word	UDELAY_MULT

/*
 * loops = r0 * HZ * loops_per_jiffy / 1000000
 *
 * r0  <= 2000
 * HZ  <= 1000
 */

@ __loop_udelay: r0 = delay in microseconds. Scales r0 by UDELAY_MULT and
@ falls through into __loop_const_udelay.
ENTRY(__loop_udelay)
		ldr	r2, .LC1
		mul	r0, r2, r0		@ r0 = delay_us * UDELAY_MULT
@ __loop_const_udelay: r0 = delay already multiplied by UDELAY_MULT.
@ Converts it into a loop count and falls through into __loop_delay.
ENTRY(__loop_const_udelay)			@ 0 <= r0 <= 0xfffffaf0
		ldr	r2, .LC0		@ r2 = address of loops_per_jiffy
		ldr	r2, [r2]		@ r2 = value of loops_per_jiffy
		umull	r1, r0, r2, r0		@ r0-r1 = r0 * loops_per_jiffy
		@ Adding 0xffffffff sets the carry flag exactly when the low word r1 is non-zero.
		adds	r1, r1, #0xffffffff	@ rounding up ...
		@ r0 = 2 * high word + carry, i.e. the 64-bit product shifted right by 31, rounded up.
		adcs	r0, r0, r0		@ and right shift by 31
		reteq	lr			@ computed loop count is zero: return without looping

		.align 3

@ Delay routine
@ r0 = number of loop iterations to spin for.
ENTRY(__loop_delay)
		subs	r0, r0, #1		@ r0 = r0 - 1, updating the condition flags
#if 0
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
		retls	lr
		subs	r0, r0, #1
#endif
		bhi	__loop_delay		@ loop again while no borrow occurred and the result is non-zero
		ret	lr			@ count exhausted: return to the caller
ENDPROC(__loop_udelay)
ENDPROC(__loop_const_udelay)
ENDPROC(__loop_delay)

1.3 UDELAY_MULT

关于 UDELAY_MULT 变量，可就有说法了，这个值是怎么来的？
首先，这个值定义在 arch/arm/include/asm/delay.h 中。我们看文件一开头的注释：

/*
 * Loop (or tick) based delay:
 *
 * loops = loops_per_jiffy * jiffies_per_sec * delay_us / us_per_sec
 *
 * where:
 *
 * jiffies_per_sec = HZ
 * us_per_sec = 1000000
 *
 * Therefore the constant part is HZ / 1000000 which is a small
 * fractional number. To make this usable with integer math, we
 * scale up this constant by 2^31, perform the actual multiplication,
 * and scale the result back down by 2^31 with a simple shift:
 *
 * loops = (loops_per_jiffy * delay_us * UDELAY_MULT) >> 31
 *
 * where:
 *
 * UDELAY_MULT = 2^31 * HZ / 1000000
 *             = (2^31 / 1000000) * HZ
 *             = 2147.483648 * HZ
 *             = 2147 * HZ + 483648 * HZ / 1000000
 *
 * 31 is the biggest scale shift value that won't overflow 32 bits for
 * delay_us * UDELAY_MULT assuming HZ <= 1000 and delay_us <= 2000.
 */

/* Largest constant delay, in milliseconds, that udelay() accepts on this architecture. */
#define MAX_UDELAY_MS	2
/*
 * 2^31 * HZ / 1000000, computed in integer arithmetic as the integer part
 * (2147 * HZ) plus the fractional part (483648 * HZ / 1000000).
 */
#define UDELAY_MULT	UL(2147 * HZ + 483648 * HZ / 1000000)
/* Scale shift that goes with UDELAY_MULT: results are scaled back down by 2^31. */
#define UDELAY_SHIFT	31

读完一遍，感觉注释已经写的很清楚了。

Therefore the constant part is HZ / 1000000 which is a small fractional number. To make this usable with integer math, we scale up this constant by 2^31, perform the actual multiplication, and scale the result back down by 2^31 with a simple shift:

因此常数部分是HZ / 1000000，这是一个小分数。为了使其可用于整数数学，我们将该常量放大 2^31，执行实际的乘法，然后通过简单的移位将结果缩小 2^31

解释清楚了 UDELAY_MULT 的由来，也就知道为什么 __loop_const_udelay 中会有一个右移 31 位了

ENTRY(__loop_const_udelay)			@ 0 <= r0 <= 0xfffffaf0
		ldr	r2, .LC0
		ldr	r2, [r2]
		umull	r1, r0, r2, r0		@ r0-r1 = r0 * loops_per_jiffy
		adds	r1, r1, #0xffffffff	@ rounding up ...
		adcs	r0, r0, r0		@ and right shift by 31
		reteq	lr

		.align 3

这里解释下上面的 ARM 汇编：

ldr	r2, .LC0
ldr	r2, [r2]

这里分两步：第一条 ldr 把 .LC0 处保存的内容（也就是 loops_per_jiffy 变量的地址）加载到 r2，第二条 ldr 再从该地址读出 loops_per_jiffy 的值，最终 r2 寄存器中保存的就是 loops_per_jiffy

umull	r1, r0, r2, r0		@ r0-r1 = r0 * loops_per_jiffy

将 r2 寄存器值与 r0 寄存器值（__loop_const_udelay 函数入参）相乘、结果保存在 r1（低32位）、r0（高32位）

adds	r1, r1, #0xffffffff	@ rounding up ...

r1 加上 0xffffffff（相当于 r1 - 1）并更新标志位：只要 r1 不为 0 就会产生进位（C=1），r1 为 0 时则没有进位。这一步的目的只是把“乘积的低 32 位是否非零”记录到进位标志中，供下一条指令做向上取整

adcs	r0, r0, r0		@ and right shift by 31

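带进位加，即 r0 = r0 + r0 + 进位，结果只保存在 r0 寄存器中（adcs 不会写 r1）。此时 r0 是 64 位乘积的高 32 位，也就是 乘积 >> 32；把它乘以 2（左移 1 位）就近似等于 乘积 >> 31，最低位则由上一条 adds 产生的进位补上：只要乘积的低 32 位不为 0 就加 1，相当于向上取整。同时 adcs 会更新标志位，如果结果为 0，后面的 reteq lr 就直接返回，不再进入延时循环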

至此，我们就已经计算出了，需要 loop 的总数，保存在 r0 寄存器中，接下来就是实际的去不断减一、跳转…

ENTRY(__loop_delay)
		subs	r0, r0, #1
		bhi	__loop_delay
		ret	lr

udelay 实现结束

2、BogoMIPS

BogoMIPS (Bogo–Bogus–伪的，MIPS–millions of instruction per second) 按照字面的解释是“不太真实的MIPS”。MIPS是millions of instructions per second(百万条指令每秒)的缩写
之所以不太真实，那是因为其计算方法并不十分精确。BogoMIPS 的值在系统启动时，在一闪而过的启动信息里可以看到；也可以通过 dmesg 看到；还可以通过查看 /proc/cpuinfo 看到。BogoMIPS 的值是 linux 内核通过在一个时钟节拍里不断的执行循环指令而估算出来，它实际上反映了 CPU 的速度。

[    0.577610] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
[    0.577674] APIC: Switch to symmetric I/O mode setup
 x2apic enabled
[    0.577973] Switched APIC routing to physical x2apic.
[    0.578765] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[    0.578809] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x23fa781fa7b, max_idle_ns: 440795226023 ns
[    0.578814] Calibrating delay loop (skipped) preset value.. 4992.00 BogoMIPS (lpj=9984004)
[    0.578903] x86/cpu: User Mode Instruction Prevention (UMIP) activated

BogoMIPS 的计算和上面 calibrate_delay_converge 函数在同一个文件中，init/calibrate.c

/*
 * Determine loops-per-jiffy (lpj) for the current CPU and publish it.
 *
 * The value is taken from the first source that provides one, in this
 * order: the per-CPU value already stored for this CPU, preset_lpj,
 * lpj_fine (only before the first report has been printed),
 * calibrate_delay_is_known(), calibrate_delay_direct(), and finally the
 * jiffies-based calibrate_delay_converge().
 *
 * The result is stored in the per-CPU cpu_loops_per_jiffy slot and in the
 * global loops_per_jiffy.  The "Calibrating delay ..." message and the
 * BogoMIPS figure are printed only while the static flag 'printed' is
 * still false, i.e. on the first call.
 */
void calibrate_delay(void)
{

	unsigned long lpj;
	static bool printed;
	int this_cpu = smp_processor_id();

	if (per_cpu(cpu_loops_per_jiffy, this_cpu)) {

		/* This CPU already has a stored value: reuse it. */
		lpj = per_cpu(cpu_loops_per_jiffy, this_cpu);
		if (!printed)
			pr_info("Calibrating delay loop (skipped) "
				"already calibrated this CPU");
	} else if (preset_lpj) {

		/* A preset value is available: use it as-is. */
		lpj = preset_lpj;
		if (!printed)
			pr_info("Calibrating delay loop (skipped) "
				"preset value.. ");
	} else if ((!printed) && lpj_fine) {

		/* lpj_fine is set and nothing has been reported yet: use it. */
		lpj = lpj_fine;
		pr_info("Calibrating delay loop (skipped), "
			"value calculated using timer frequency.. ");
	} else if ((lpj = calibrate_delay_is_known())) {

		/* calibrate_delay_is_known() returned a non-zero value: nothing more to do. */
		;
	} else if ((lpj = calibrate_delay_direct()) != 0) {

		/* calibrate_delay_direct() produced a non-zero value. */
		if (!printed)
			pr_info("Calibrating delay using timer "
				"specific routine.. ");
	} else {

		/* Last resort: measure with the converging delay loop. */
		if (!printed)
			pr_info("Calibrating delay loop... ");
		lpj = calibrate_delay_converge();
	}
	per_cpu(cpu_loops_per_jiffy, this_cpu) = lpj;
	/*
	 * Integer part is lpj / (500000 / HZ); the two printed decimals are
	 * (lpj / (5000 / HZ)) % 100.
	 */
	if (!printed)
		pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n",
			lpj/(500000/HZ),
			(lpj/(5000/HZ)) % 100, lpj);

	loops_per_jiffy = lpj;
	printed = true;

	calibration_delay_done();
}

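可以看到，BogoMIPS 的整数部分就是 lpj/(500000/HZ)，两位小数部分是 (lpj/(5000/HZ)) % 100，整体上相当于 lpj * HZ / 500000。以上面的启动日志为例：lpj=9984004、打印值为 4992.00，可以反推出 500000/HZ = 2000，即 HZ=250，9984004/2000 ≈ 4992.00。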

1、udelay

1.1 loops_per_jiffy

1.2 udelay

1.3 UDELAY_MULT

2、BogoMIPS

相关推荐

最近更新

热门阅读