1、udelay
与 sleep 相关函数相比,delay 的最大区别是忙等、一直占用 CPU,而 sleep 会让出 CPU 控制权。
mdelay、ndelay都是基于 udelay 来实现的。在 include/linux/delay.h 中,如下:
/*
* Using udelay() for intervals greater than a few milliseconds can
* risk overflow for high loops_per_jiffy (high bogomips) machines. The
* mdelay() provides a wrapper to prevent this. For delays greater
* than MAX_UDELAY_MS milliseconds, the wrapper is used. Architecture
* specific values can be defined in asm-???/delay.h as an override.
* The 2nd mdelay() definition ensures GCC will optimize away the
* while loop for the common cases where n <= MAX_UDELAY_MS -- Paul G.
*/
/*
 * Default threshold, in milliseconds, below which mdelay() hands a
 * constant argument straight to udelay(). Only defined here when nothing
 * earlier has already provided a value.
 */
#if !defined(MAX_UDELAY_MS)
# define MAX_UDELAY_MS 5
#endif
#ifndef mdelay
/*
 * mdelay(n) - busy-wait for n milliseconds.
 *
 * When n is a compile-time constant not larger than MAX_UDELAY_MS the
 * whole delay is issued as one udelay((n) * 1000) call; otherwise the
 * delay is split into n separate udelay(1000) calls.
 *
 * Every line of the macro body except the last must end in a backslash;
 * the statement expression is kept on a single continued line so the
 * definition is not cut off after "({".
 */
#define mdelay(n) (\
	(__builtin_constant_p(n) && (n)<=MAX_UDELAY_MS) ? udelay((n)*1000) : \
	({unsigned long __ms=(n); while (__ms--) udelay(1000);}))
#endif
#ifndef ndelay
/*
 * ndelay(x) - busy-wait for x nanoseconds.
 *
 * Fallback used when no ndelay has been defined before this point: the
 * nanosecond count is rounded up to whole microseconds and passed to
 * udelay().
 */
static inline void ndelay(unsigned long x)
{
	unsigned long usecs = DIV_ROUND_UP(x, 1000);

	udelay(usecs);
}
/* Mark ndelay as defined so a later "#ifndef ndelay" is skipped. */
#define ndelay(x) ndelay(x)
#endif
/* Integer division that rounds the quotient up instead of truncating. */
#define DIV_ROUND_UP(dividend, divisor) \
	(((dividend) + (divisor) - 1) / (divisor))
gcc 的内建函数 __builtin_constant_p 用于判断 n 是否为编译时常数,如果 n 是常数,返回 1,否则返回 0。
mdelay 实现:如果参数为编译时常数,且小于等于 MAX_UDELAY_MS(默认为 5,单位毫秒),则直接调用 udelay((n)*1000),说明通用实现中单次 udelay 最大按 5000us 延时使用;否则循环调用 udelay(1000) 达到延时目的。
当然,从注释中我们可以看到,include/linux/delay.h 文件中只是通用的实现,可以被架构相关的实现覆盖掉。
所以接下来来看 udelay 实现。这里讨论基于 ARM 处理器架构的实现,udelay 实现在arch/arm/include/asm/delay.h中。
/*
* division by multiplication: you don't have to worry about
* loss of precision.
*
* Use only for very small delays ( < 2 msec). Should probably use a
* lookup table, really, as the multiplications take much too long with
* short delays. This is a "reasonable" implementation, though (and the
* first constant multiplications gets optimized away if the delay is
* a constant)
*/
/* Upper bound, in milliseconds, checked against constant udelay() arguments below. */
#define MAX_UDELAY_MS 2
/* Delay for a microsecond count, dispatched through arm_delay_ops.udelay. */
#define __udelay(n) arm_delay_ops.udelay(n)
/*
 * Delay for a pre-scaled count, dispatched through arm_delay_ops.const_udelay;
 * udelay() below passes (n) * UDELAY_MULT here.
 */
#define __const_udelay(n) arm_delay_ops.const_udelay(n)
/*
 * udelay(n) - busy-wait for n microseconds.
 *
 * Compile-time constant n:
 *   - n > MAX_UDELAY_MS * 1000 expands to a call to __bad_udelay()
 *     (NOTE(review): presumably left undefined on purpose so that an
 *     over-long constant delay fails the build - confirm);
 *   - otherwise the multiplication by UDELAY_MULT is written on a
 *     constant operand and the result is passed to __const_udelay().
 * Non-constant n: passed unchanged to __udelay().
 */
#define udelay(n) \
(__builtin_constant_p(n) ? \
((n) > (MAX_UDELAY_MS * 1000) ? __bad_udelay() : \
__const_udelay((n) * UDELAY_MULT)) : \
__udelay(n))
1.1 loops_per_jiffy
这里我们先要了解一下,有个很重要的变量:jiffies
全局变量 jiffies 用来记录自系统启动以来产生的节拍的总数。概念上可以认为该变量在启动时从 0 开始计数(实际内核中的初值是 INITIAL_JIFFIES,目的是尽早暴露计数回绕问题),此后,每次时钟中断处理程序都会增加该变量的值。一秒内时钟中断的次数等于 HZ,所以 jiffies 一秒内增加的值也就是 HZ。
而 loops_per_jiffy 的定义就是每个 jiffies 内需要 loop 的个数。Linux 内核在启动时就计算出了当前处理器一个 jiffies 内可以处理的循环次数,也就是 loops_per_jiffy,在 Linux 系统启动过程中可以查看到:
[ 0.577610] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
[ 0.577674] APIC: Switch to symmetric I/O mode setup
x2apic enabled
[ 0.577973] Switched APIC routing to physical x2apic.
[ 0.578765] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[ 0.578809] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x23fa781fa7b, max_idle_ns: 440795226023 ns
[ 0.578814] Calibrating delay loop (skipped) preset value.. 4992.00 BogoMIPS (lpj=9984004)
[ 0.578903] x86/cpu: User Mode Instruction Prevention (UMIP) activated
loops_per_jiffy 是通过 calibrate_delay_converge 函数计算的,init/calibrate.c,我们这里详细讲解下:
/*
 * Precision, in bits, of the binary approximation in
 * calibrate_delay_converge(): refinement stops once the step size drops
 * to lpj >> LPS_PREC.
 */
#define LPS_PREC 8
/*
 * Estimate loops_per_jiffy by timing __delay() against the jiffies counter.
 *
 * Stage one runs __delay() with a growing loop count until one jiffy has
 * elapsed, which yields an underestimate (lpj_base) and the size of the
 * last, overshooting step (loopadd_base). Stage two then refines the value
 * by repeatedly halving the step until it is no larger than
 * lpj >> LPS_PREC.
 *
 * Returns the calibrated number of delay loops per jiffy.
 */
static unsigned long calibrate_delay_converge(void)
{
/* First stage - slowly accelerate to find initial bounds */
unsigned long lpj, lpj_base, ticks, loopadd, loopadd_base, chop_limit;
int trials = 0, band = 0, trial_in_band = 0;
lpj = (1<<12);
/* wait for "start of" clock tick */
ticks = jiffies;
while (ticks == jiffies) // spin until jiffies changes, i.e. the next tick begins
; /* nothing */
/* Go .. */
ticks = jiffies;
do {
if (++trial_in_band == (1<<band)) {
++band;
trial_in_band = 0;
}
__delay(lpj * band);
trials += band;
} while (ticks == jiffies); // keep delaying until jiffies changes; the argument of __delay is a loop count
// at this point the loops issued so far total lpj * trials
/*
* We overshot, so retreat to a clear underestimate. Then estimate
* the largest likely undershoot. This defines our chop bounds.
*/
trials -= band; // drop the last do/while iteration: the tick changed during it, so it must not be counted
loopadd_base = lpj * band; // loop count of that last do/while iteration
lpj_base = lpj * trials; // loop count of one jiffy without the last iteration (an underestimate)
recalibrate:
lpj = lpj_base; // restart from the underestimate
loopadd = loopadd_base; // initial step: the loop count of the last do/while iteration
// everything below fine-tunes lpj within the range covered by that last iteration
/*
* Do a binary approximation to get lpj set to
* equal one clock (up to LPS_PREC bits)
*/
chop_limit = lpj >> LPS_PREC; // stop threshold: lpj / 2^LPS_PREC (2^8 = 256)
/*
* Bisect towards the true value for as long as the current step is
* larger than the stop threshold computed above.
*/
while (loopadd > chop_limit) {
lpj += loopadd; // tentatively raise lpj; kept only if it still fits within one tick
ticks = jiffies;
while (ticks == jiffies)
; /* nothing */
ticks = jiffies;
__delay(lpj);
if (jiffies != ticks) /* longer than 1 tick */
lpj -= loopadd;
loopadd >>= 1; // halve the step for the next round of the bisection
}
/*
* If we incremented every single time possible, presume we've
* massively underestimated initially, and retry with a higher
* start, and larger range. (Only seen on x86_64, due to SMIs)
*/
if (lpj + loopadd * 2 == lpj_base + loopadd_base * 2) {
lpj_base = lpj;
loopadd_base <<= 2;
goto recalibrate;
}
return lpj;
}
上面 calibrate_delay_converge 函数中调用的 __delay 实际上是下面的 __loop_delay 函数。__loop_delay 实现就是将参数一直 subs 递减,反复跳转。所以我的理解,一个 loop 就是一条 arm 递减指令+跳转指令。
@ Busy-wait loop: r0 holds the number of loops still to run.
ENTRY(__loop_delay)
subs r0, r0, #1 @ r0 = r0 - 1 and update the condition flags
bhi __loop_delay @ branch back while the subtraction left an unsigned-higher result (C set, Z clear)
ret lr @ otherwise return through lr
至此,loops_per_jiffy 变量就已经计算完毕,后面的 udelay 、BogoMIPS 计算都会用到该变量。
1.2 udelay
udelay 函数,最终会调用到 __loop_const_udelay 或者 __loop_udelay,二者实现在 arch/arm/lib/delay-loop.S 中,如下:
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* linux/arch/arm/lib/delay.S
*
* Copyright (C) 1995, 1996 Russell King
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
#include <asm/delay.h>
#ifdef CONFIG_ARCH_RPC
.arch armv4
#endif
.text
@ .LC0 holds the address of loops_per_jiffy (it is dereferenced below);
@ .LC1 holds the UDELAY_MULT constant itself.
.LC0: .word loops_per_jiffy
.LC1: .word UDELAY_MULT
/*
* loops = r0 * HZ * loops_per_jiffy / 1000000
*
* r0 <= 2000
* HZ <= 1000
*/
@ __loop_udelay: r0 = delay in microseconds. Scales r0 by UDELAY_MULT and
@ then falls through into __loop_const_udelay (there is no return in between).
ENTRY(__loop_udelay)
ldr r2, .LC1
mul r0, r2, r0 @ r0 = delay_us * UDELAY_MULT
@ __loop_const_udelay: r0 = delay already multiplied by UDELAY_MULT.
@ Turns it into a loop count and falls through into __loop_delay.
ENTRY(__loop_const_udelay) @ 0 <= r0 <= 0xfffffaf0
ldr r2, .LC0 @ r2 = address of loops_per_jiffy
ldr r2, [r2] @ r2 = current value of loops_per_jiffy
umull r1, r0, r2, r0 @ r0-r1 = r0 * loops_per_jiffy
adds r1, r1, #0xffffffff @ rounding up ...
adcs r0, r0, r0 @ and right shift by 31
reteq lr @ result is zero loops: return at once
.align 3
@ Delay routine
@ __loop_delay: r0 = number of loops; decrement until the count runs out.
ENTRY(__loop_delay)
subs r0, r0, #1
@ The "#if 0" section below is compiled out: it is a disabled, unrolled
@ variant of the decrement/return sequence.
#if 0
retls lr
subs r0, r0, #1
retls lr
subs r0, r0, #1
retls lr
subs r0, r0, #1
retls lr
subs r0, r0, #1
retls lr
subs r0, r0, #1
retls lr
subs r0, r0, #1
retls lr
subs r0, r0, #1
#endif
bhi __loop_delay @ loop while the decrement left an unsigned-higher result (C set, Z clear)
ret lr
ENDPROC(__loop_udelay)
ENDPROC(__loop_const_udelay)
ENDPROC(__loop_delay)
1.3 UDELAY_MULT
关于 UDELAY_MULT 变量,可就有说法了,这个值是怎么来的?
首先,这个值定义在 arch/arm/include/asm/delay.h 中。我们看文件一开头的注释:
/*
* Loop (or tick) based delay:
*
* loops = loops_per_jiffy * jiffies_per_sec * delay_us / us_per_sec
*
* where:
*
* jiffies_per_sec = HZ
* us_per_sec = 1000000
*
* Therefore the constant part is HZ / 1000000 which is a small
* fractional number. To make this usable with integer math, we
* scale up this constant by 2^31, perform the actual multiplication,
* and scale the result back down by 2^31 with a simple shift:
*
* loops = (loops_per_jiffy * delay_us * UDELAY_MULT) >> 31
*
* where:
*
* UDELAY_MULT = 2^31 * HZ / 1000000
* = (2^31 / 1000000) * HZ
* = 2147.483648 * HZ
* = 2147 * HZ + 483648 * HZ / 1000000
*
* 31 is the biggest scale shift value that won't overflow 32 bits for
* delay_us * UDELAY_MULT assuming HZ <= 1000 and delay_us <= 2000.
*/
/* Largest delay, in milliseconds, assumed by the overflow analysis above (delay_us <= 2000). */
#define MAX_UDELAY_MS 2
/* Integer form of 2^31 * HZ / 1000000, split into whole and fractional parts as derived above. */
#define UDELAY_MULT UL(2147 * HZ + 483648 * HZ / 1000000)
/* Scale exponent: the product is shifted right by this many bits to undo the 2^31 scaling. */
#define UDELAY_SHIFT 31
读完一遍,感觉注释已经写的很清楚了。
Therefore the constant part is HZ / 1000000 which is a small fractional number. To make this usable with integer math, we scale up this constant by 2^31, perform the actual multiplication, and scale the result back down by 2^31 with a simple shift:
因此常数部分是HZ / 1000000,这是一个小分数。 为了使其可用于整数数学,我们将该常量放大 2^31,执行实际的乘法,然后通过简单的移位将结果缩小 2^31
解释清楚了 UDELAY_MULT 的由来,也就知道为什么 __loop_const_udelay 中会有一个右移 31 位了
ENTRY(__loop_const_udelay) @ 0 <= r0 <= 0xfffffaf0
ldr r2, .LC0
ldr r2, [r2]
umull r1, r0, r2, r0 @ r0-r1 = r0 * loops_per_jiffy
adds r1, r1, #0xffffffff @ rounding up ...
adcs r0, r0, r0 @ and right shift by 31
reteq lr
.align 3
这里解释下上面的 ARM 汇编:
ldr r2, .LC0
ldr r2, [r2]
这里就是简单的,将 .LC0 ,也就是 loops_per_jiffy 加载到 r2 寄存器中
umull r1, r0, r2, r0 @ r0-r1 = r0 * loops_per_jiffy
将 r2 寄存器值与 r0 寄存器值(__loop_const_udelay 函数入参)相乘、结果保存在 r1(低32位)、r0(高32位)
adds r1, r1, #0xffffffff @ rounding up ...
判断 r1 寄存器是否为0,如果不为 0 ,产生进位
adcs r0, r0, r0 @ and right shift by 31
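带进位加,即 r0 = r0 + r0 + 进位,结果只保存在 r0 寄存器中(r1 不再使用)。umull 之后 r0 保存的是 64 位乘积的高 32 位,也就是乘积右移 32 位的结果;再把它乘以 2,就近似等价于将 64 位乘积右移 31 位。上一条 adds 产生的进位表示低 32 位不为 0,在这里被加进来,起到向上取整的作用。同时 adcs 会更新标志位,如果结果为 0,后面的 reteq lr 就直接返回,不再进入延时循环。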
至此,我们就已经计算出了,需要 loop 的总数,保存在 r0 寄存器中,接下来就是实际的去不断减一、跳转…
@ Busy-wait loop: r0 holds the number of loops still to run.
ENTRY(__loop_delay)
subs r0, r0, #1 @ r0 = r0 - 1 and update the condition flags
bhi __loop_delay @ branch back while the subtraction left an unsigned-higher result (C set, Z clear)
ret lr @ otherwise return through lr
udelay 实现结束
2、BogoMIPS
BogoMIPS (Bogo–Bogus–伪的,MIPS–millions of instruction per second) 按照字面的解释是“不太真实的MIPS”。MIPS是millions of instructions per second(百万条指令每秒)的缩写
之所以不太真实,那是因为其计算方法并不十分精确。BogoMIPS 的值在系统启动时,在一闪而过的启动信息里可以看到;也可以通过 dmesg 看到;还可以通过查看 /proc/cpuinfo 看到。BogoMIPS 的值是 Linux 内核通过在一个时钟节拍里不断地执行循环指令而估算出来的,它实际上反映了 CPU 的速度。
[ 0.577610] clocksource: hpet: mask: 0xffffffff max_cycles: 0xffffffff, max_idle_ns: 133484882848 ns
[ 0.577674] APIC: Switch to symmetric I/O mode setup
x2apic enabled
[ 0.577973] Switched APIC routing to physical x2apic.
[ 0.578765] ..TIMER: vector=0x30 apic1=0 pin1=2 apic2=-1 pin2=-1
[ 0.578809] clocksource: tsc-early: mask: 0xffffffffffffffff max_cycles: 0x23fa781fa7b, max_idle_ns: 440795226023 ns
[ 0.578814] Calibrating delay loop (skipped) preset value.. 4992.00 BogoMIPS (lpj=9984004)
[ 0.578903] x86/cpu: User Mode Instruction Prevention (UMIP) activated
BogoMIPS 的计算和上面 calibrate_delay_converge 函数在同一个文件中,init/calibrate.c
/*
 * Determine loops-per-jiffy (lpj) for the current CPU, store it in the
 * per-CPU cpu_loops_per_jiffy slot and in the global loops_per_jiffy, and
 * report it as a BogoMIPS figure.
 *
 * The value is taken from the first source that yields one, in this order:
 * an already stored per-CPU value, preset_lpj, lpj_fine,
 * calibrate_delay_is_known(), calibrate_delay_direct(), and finally the
 * calibrate_delay_converge() measurement. The messages are only emitted
 * while the function-static flag "printed" is still false.
 */
void calibrate_delay(void)
{
unsigned long lpj;
static bool printed;
int this_cpu = smp_processor_id();
/* A non-zero per-CPU value means this CPU already has a result: reuse it. */
if (per_cpu(cpu_loops_per_jiffy, this_cpu)) {
lpj = per_cpu(cpu_loops_per_jiffy, this_cpu);
if (!printed)
pr_info("Calibrating delay loop (skipped) "
"already calibrated this CPU");
} else if (preset_lpj) {
lpj = preset_lpj;
if (!printed)
pr_info("Calibrating delay loop (skipped) "
"preset value.. ");
} else if ((!printed) && lpj_fine) {
lpj = lpj_fine;
pr_info("Calibrating delay loop (skipped), "
"value calculated using timer frequency.. ");
} else if ((lpj = calibrate_delay_is_known())) {
/* Value supplied by calibrate_delay_is_known(); nothing is printed here. */
;
} else if ((lpj = calibrate_delay_direct()) != 0) {
if (!printed)
pr_info("Calibrating delay using timer "
"specific routine.. ");
} else {
/* No other source produced a value: measure it with the delay loop. */
if (!printed)
pr_info("Calibrating delay loop... ");
lpj = calibrate_delay_converge();
}
per_cpu(cpu_loops_per_jiffy, this_cpu) = lpj;
/*
 * BogoMIPS: integer part is lpj / (500000 / HZ); the two decimals are
 * (lpj / (5000 / HZ)) % 100. pr_cont() continues the line started by
 * the pr_info() call above.
 */
if (!printed)
pr_cont("%lu.%02lu BogoMIPS (lpj=%lu)\n",
lpj/(500000/HZ),
(lpj/(5000/HZ)) % 100, lpj);
loops_per_jiffy = lpj;
/* From now on the messages above are suppressed. */
printed = true;
calibration_delay_done();
}
可以看到,BogoMIPS 实际上就是 lpj/(500000/HZ)