负载均衡流程

1、负载均衡流程图

2、触发负载均衡函数trigger_load_balance

/*
 * trigger_load_balance - decide whether load balancing work must be kicked
 * for @rq.
 *
 * Raises SCHED_SOFTIRQ when the runqueue's next_balance deadline has been
 * reached and, on CONFIG_NO_HZ_COMMON builds, additionally kicks the nohz
 * balancer when nohz_kick_needed() asks for it.
 */
void trigger_load_balance(struct rq *rq)
{
    /* A runqueue attached to the NULL domain never needs rebalancing. */
    if (unlikely(on_null_domain(rq))) {
        return;
    }

    /*
     * The balance deadline has been reached: raise the scheduler softirq.
     * Per the original annotation, its handler (run_rebalance_domains) is
     * registered in init_sched_fair_class() via
     * open_softirq(SCHED_SOFTIRQ, run_rebalance_domains).
     */
    if (time_after_eq(jiffies, rq->next_balance)) {
        raise_softirq(SCHED_SOFTIRQ);
    }

#ifdef CONFIG_NO_HZ_COMMON
    /* Kick the nohz balancer when the helper reports that it is needed. */
    if (nohz_kick_needed(rq, false)) {
        nohz_balancer_kick(false);
    }
#endif
}

2.1 run_rebalance_domains

/*
 * run_rebalance_domains - softirq-context entry point for load balancing.
 *
 * Runs the nohz idle balance first, refreshes the blocked averages of the
 * current CPU, and then rebalances the current runqueue's own domains
 * (skipped on CONFIG_NO_HZ_COMMON builds when NOHZ_STATS_KICK is set).
 */
static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
{
    struct rq *this_rq = this_rq();    /* runqueue of the current CPU */
    enum cpu_idle_type idle;

    /* Derive the idle type from the runqueue's idle_balance flag. */
    if (this_rq->idle_balance)
        idle = CPU_IDLE;
    else
        idle = CPU_NOT_IDLE;

    /*
     * When this CPU has a pending nohz balance kick it balances on behalf
     * of the other idle CPUs whose ticks are stopped. That must happen
     * *before* rebalance_domains(): otherwise we might balance only inside
     * the local sched_domain hierarchy and abandon nohz_idle_balance()
     * altogether once some load has been pulled.
     */
    nohz_idle_balance(this_rq, idle);

    /* Refresh the blocked averages of this CPU. */
    update_blocked_averages(this_rq->cpu);

#ifndef CONFIG_NO_HZ_COMMON
    rebalance_domains(this_rq, idle);
#else
    /*
     * With NOHZ_STATS_KICK set on this CPU the domain rebalance is
     * skipped; the flag is cleared afterwards in either case.
     */
    if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)))
        rebalance_domains(this_rq, idle);
    clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
#endif
}

2.1.1 nohz_idle_balance

static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
    int this_cpu = this_rq->cpu;//获取cpu
    struct rq *rq;
    struct sched_domain *sd;
    int balance_cpu;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
#ifdef CONFIG_SPRD_CORE_CTL
    cpumask_t cpus;
#endif

    if (idle != CPU_IDLE ||
        !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))//如果cpu不是空闲,或者设置了NOHZ_BALANCE_KICK,则返回
        goto end;

    /*
     * This cpu is going to update the blocked load of idle CPUs either
     * before doing a rebalancing or just to keep metrics up to date. we
     * can safely update the next update timestamp
     */
    rcu_read_lock();//rcu读锁
    sd = rcu_dereference(this_rq->sd);//获取当前this_rq的调度域
    /*
     * Check whether there is a sched_domain available for this cpu.
     * The last other cpu can have been unplugged since the ILB has been
     * triggered and the sched_domain can now be null. The idle balance
     * sequence will quickly be aborted as there is no more idle CPUs
     */
    if (sd)
        nohz.next_update = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);//计算下一次空闲cpu负载均衡的时间
    rcu_read_unlock();


    cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);移除隔离的cpu
    for_each_cpu(balance_cpu, &cpus) { //遍历空闲cpu

        if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))//如果均衡cpu是当前cpu或者不是空闲的,则进行下一个循环。
            continue;

        /*
         * If this cpu gets work to do, stop the load balancing
         * work being done for other cpus. Next load
         * balancing owner will pick it up.
         */
        if (need_resched())//判断如果此cpu需要调度,则停止均衡
            break;

        rq = cpu_rq(balance_cpu);//获取要均衡cpu的运行队列

        /*
         * If time for next balance is due,
         * do the balance.
         */
        if (time_after_eq(jiffies, rq->next_balance)) {//判断均衡时间有没有到
            struct rq_flags rf;

            rq_lock_irq(rq, &rf);//获取运行队列锁
            update_rq_clock(rq);//更新运行队列时钟
            cpu_load_update_idle(rq);//更新队列负载
            rq_unlock_irq(rq, &rf);//释放锁

            update_blocked_averages(balance_cpu);//更新均衡cpu的阻塞平均值
            /*
             * This idle load balance softirq may have been
             * triggered only to update the blocked load and shares
             * of idle CPUs (which we have just done for
             * balance_cpu). In that case skip the actual balance.
             */
            if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_cpu)))//如果没有设置NOHZ_STATS_KICK,则进行均衡
                rebalance_domains(rq, idle);//域负载均衡
        }

        if (time_after(next_balance, rq->next_balance)) {//更新下一次均衡时间
            next_balance = rq->next_balance;
            update_next_balance = 1;
        }
    }

    /*
     * next_balance will be updated only when there is a need.
     * When the CPU is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))//更新下一次均衡时间
        nohz.next_balance = next_balance;
end:
    clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}

2.2 rebalance_domains函数

/*
 * rebalance_domains - run periodic load balancing for one runqueue.
 * @rq:   runqueue whose CPU (rq->cpu) is being balanced
 * @idle: idle type of that CPU; re-evaluated after a successful balance
 *
 * Walks the sched domains of rq->cpu, decays each domain's
 * max_newidle_lb_cost, calls load_balance() on every domain whose balance
 * interval has expired, and finally records the earliest time the next
 * balance is due in rq->next_balance (and, for an idle CPU on
 * CONFIG_NO_HZ_COMMON builds, in nohz.next_balance).
 */
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
    int continue_balancing = 1;
    int cpu = rq->cpu;
    unsigned long interval;
    struct sched_domain *sd;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize, need_decay = 0;
    u64 max_cost = 0;

    rcu_read_lock();

    for_each_domain(cpu, sd) {// walk the sched domains of this cpu
        /*
         * Decay the newidle max times here because this is a regular
         * visit to all the domains. Decay ~1% per second.
         */
        if (time_after(jiffies, sd->next_decay_max_lb_cost)) {// decay deadline has passed
            sd->max_newidle_lb_cost =
                (sd->max_newidle_lb_cost * 253) / 256;// scale by 253/256, i.e. roughly -1%
            sd->next_decay_max_lb_cost = jiffies + HZ;// next decay is due one second (HZ jiffies) later
            need_decay = 1;
        }
        max_cost += sd->max_newidle_lb_cost;

        if (energy_aware() && !sd_overutilized(sd) && !sd->parent)// energy-aware mode: skip a parentless (top-level) domain that is not overutilized
            continue;

        if (!(sd->flags & SD_LOAD_BALANCE)) { // domain without SD_LOAD_BALANCE: no balancing, only refresh group capacity when due
            if (time_after_eq(jiffies,
                      sd->groups->sgc->next_update))
                update_group_capacity(sd, cpu);// refresh the capacity of this domain's group
            continue;
        }

        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */
        if (!continue_balancing) { // an earlier load_balance() call cleared continue_balancing
            if (need_decay)
                continue;// keep walking so the remaining domains still get their decay
            break;
        }

        interval = get_sd_balance_interval(sd, idle != CPU_IDLE);// balance interval for this domain (second argument: cpu is busy)

        need_serialize = sd->flags & SD_SERIALIZE;// SD_SERIALIZE domains are balanced under the global 'balancing' lock
        if (need_serialize) {
            if (!spin_trylock(&balancing))// lock is contended: skip balancing this domain this time
                goto out;
        }

        if (time_after_eq(jiffies, sd->last_balance + interval)) { // the balance interval has expired
            if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { // non-zero return: tasks were moved
                /*
                 * The LBF_DST_PINNED logic could have changed
                 * env->dst_cpu, so we can't know our idle
                 * state even if we migrated tasks. Update it.
                 */
                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;// re-read the idle state of this cpu
            }
            sd->last_balance = jiffies;// remember when this domain was last balanced
            interval = get_sd_balance_interval(sd, idle != CPU_IDLE);// recompute: idle state may have changed
        }
        if (need_serialize)
            spin_unlock(&balancing);// release the serialization lock
out:
        if (time_after(next_balance, sd->last_balance + interval)) { // this domain is due earlier than the current candidate
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }
    }
    if (need_decay) { // at least one domain decayed its newidle cost during this walk
        /*
         * Ensure the rq-wide value also decays but keep it at a
         * reasonable floor to avoid funnies with rq->avg_idle.
         */
        rq->max_idle_balance_cost =
            max((u64)sysctl_sched_migration_cost, max_cost);
    }
    rcu_read_unlock();

    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance)) {
        rq->next_balance = next_balance;// publish the next balance time for this runqueue

#ifdef CONFIG_NO_HZ_COMMON
        /*
         * If this CPU has been elected to perform the nohz idle
         * balance. Other idle CPUs have already rebalanced with
         * nohz_idle_balance() and nohz.next_balance has been
         * updated accordingly. This CPU is now running the idle load
         * balance for itself and we need to update the
         * nohz.next_balance accordingly.
         */
        if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))// cpu is idle and its next balance is due before nohz.next_balance
            nohz.next_balance = rq->next_balance;// pull nohz.next_balance forward to match
#endif
    }
}

2.2.1 load_balance

static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *continue_balancing)
{
    int ld_moved, cur_ld_moved, active_balance = 0;
    struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
    struct sched_group *group;
    struct rq *busiest;
    struct rq_flags rf;
    struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

    

struct lb_env env = { //负载平衡环境,包含了一组与负载平衡相关的参数和状态信息
        .sd        = sd,//调度域
        .dst_cpu    = this_cpu,//均衡给此cpu
        .dst_rq        = this_rq,//均衡给此队列
        .dst_grpmask    = sched_group_span(sd->groups),//目标调度组掩码
        .idle        = idle,//cpu状态
        .loop_break    = sched_nr_migrate_break,//迁移间隔
        .cpus        = cpus,
        .fbq_type    = all,
        .tasks        = LIST_HEAD_INIT(env.tasks),
    };

    cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);//将调度域中处于active状态的cpu挑选出来

    schedstat_inc(sd->lb_count[idle]);//更新负载均衡idle类型的计数

redo:
    if (!should_we_balance(&env)) { //判断是否应该均衡
        *continue_balancing = 0;
        goto out_balanced;
    }

    group = find_busiest_group(&env);//找到最繁忙的组
    if (!group) {
        schedstat_inc(sd->lb_nobusyg[idle]);
        goto out_balanced;
    }

    busiest = find_busiest_queue(&env, group);//找到最繁忙的队列
    if (!busiest) {
        schedstat_inc(sd->lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == env.dst_rq);//最繁忙的队列不等于目的队列

    schedstat_add(sd->lb_imbalance[idle], env.imbalance);更新负载均衡idle类型不均衡的计数

    env.src_cpu = busiest->cpu;//最繁忙的队列的cpu给要均衡的cpu
    env.src_rq = busiest;//最繁忙的队列给要均衡的队列

    ld_moved = 0;
    if (busiest->nr_running > 1) { 最繁忙的运行队列中的task要大于1
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        env.flags |= LBF_ALL_PINNED;
        env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);//最大循环的次数

more_balance:
        rq_lock_irqsave(busiest, &rf);//获取锁
        update_rq_clock(busiest);//更新最忙的队列的时钟

        /*
         * cur_ld_moved - load moved in current iteration
         * ld_moved     - cumulative load moved across iterations
         */
        cur_ld_moved = detach_tasks(&env, &rf);//出队,将要迁移的task从src cpu中移除并返回出队的个数

        /*
         * We've detached some tasks from busiest_rq. Every
         * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
         * unlock busiest->lock, and we are able to be sure
         * that nobody can manipulate the tasks in parallel.
         * See task_rq_lock() family for the details.
         */

        rq_unlock(busiest, &rf);//释放锁

        if (cur_ld_moved) {
            attach_tasks(&env);//入队,将移除的task加入到新的队列中
            ld_moved += cur_ld_moved;
        }

        local_irq_restore(rf.flags);//恢复本地的中断状态

        if (env.flags & LBF_NEED_BREAK) { //判断是否设置了LBF_NEED_BREAK
            env.flags &= ~LBF_NEED_BREAK;
            goto more_balance;
        }

        /*
         * Revisit (affine) tasks on src_cpu that couldn't be moved to
         * us and move them to an alternate dst_cpu in our sched_group
         * where they can run. The upper limit on how many times we
         * iterate on same src_cpu is dependent on number of cpus in our
         * sched_group.
         *
         * This changes load balance semantics a bit on who can move
         * load to a given_cpu. In addition to the given_cpu itself
         * (or a ilb_cpu acting on its behalf where given_cpu is
         * nohz-idle), we now have balance_cpu in a position to move
         * load to given_cpu. In rare situations, this may cause
         * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
         * _independently_ and at _same_ time to move some load to
         * given_cpu) causing exceess load to be moved to given_cpu.
         * This however should not happen so much in practice and
         * moreover subsequent load balance cycles should correct the
         * excess load moved.
         */
        if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { //果sched domain仍然未达均衡均衡状态,并且在之前的均衡过程中,有因为affinity的原因导致任务无法迁移到dest cpu,这时候要继续在src rq上搜索任务,迁移到备选的dest cpu,因此,这里再次发起均衡操作。这里的均衡上下文的dest cpu设定为备选的cpu,loop也被清零,重新开始扫描。

            /* Prevent to re-select dst_cpu via env's cpus */
            cpumask_clear_cpu(env.dst_cpu, env.cpus);

            env.dst_rq     = cpu_rq(env.new_dst_cpu);//备用cpu队列
            env.dst_cpu     = env.new_dst_cpu;
            env.flags    &= ~LBF_DST_PINNED;
            env.loop     = 0;
            env.loop_break     = sched_nr_migrate_break;

            /*
             * Go back to "more_balance" rather than "redo" since we
             * need to continue with same src_cpu.
             */
            goto more_balance;
        }

        /*
         * We failed to reach balance because of affinity.
         */
        if (sd_parent) { //如果父调度域存在
            int *group_imbalance = &sd_parent->groups->sgc->imbalance;

            if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)//由于亲和性原因不能在目标cpu上迁移而设置了LBF_SOME_PINNED
                *group_imbalance = 1;
        }

        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(env.flags & LBF_ALL_PINNED)) { //设置了LBF_ALL_PINNED,由于亲和性原因在这个运行队列上的所有的任务不能迁移
            cpumask_clear_cpu(cpu_of(busiest), cpus);//清除在cpus中的busiest所在的cpu
            /*
             * Attempting to continue load balancing at the current
             * sched_domain level only makes sense if there are
             * active CPUs remaining as possible busiest CPUs to
             * pull load from which are not contained within the
             * destination group that is receiving any migrated
             * load.
             */
            if (!cpumask_subset(cpus, env.dst_grpmask)) { //如果选中的busiest cpu上的任务全部都是通过affinity锁定在了该cpu上,那么清除该cpu(为了确保下轮均衡不考虑该cpu),再次发起均衡。这种情况下,需要重新搜索source cpu,因此跳转到redo
                env.loop = 0;
                env.loop_break = sched_nr_migrate_break;
                goto redo;
            }
            goto out_all_pinned;
        }
    }

    if (!ld_moved) { //如果前面迁移的task如果为0,则走这里
        schedstat_inc(sd->lb_failed[idle]);//增加负载均衡lb_failed计数
        /*
         * Increment the failure counter only on periodic balance.
         * We do not want newidle balance, which can be very
         * frequent, pollute the failure counter causing
         * excessive cache_hot migrations and active balances.
         */
        if (idle != CPU_NEWLY_IDLE)//如果cpu状态不是刚刚处于空闲状态
            if (env.src_grp_nr_running > 1)//要迁移的调度组中的队列个数大于1
                sd->nr_balance_failed++;//失败计数加一

        if (need_active_balance(&env)) { //判断是否要启动active balance。所谓activebalance就是把当前正在运行的任务迁移到dest cpu上。也就是说经过前面一番折腾,runnable的任务都无法迁移到dest cpu,从而达到均衡,那么就考虑当前正在运行的任务
            unsigned long flags;

            raw_spin_lock_irqsave(&busiest->lock, flags);

            /* don't kick the active_load_balance_cpu_stop,
             * if the curr task on busiest cpu can't be
             * moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { //在启动active balance之前,先看看busiestcpu上当前正在运行的任务是否可以运行在dest cpu上。如果不可以的话,那么不再试图执行均衡操作,跳转到out_one_pinned
                raw_spin_unlock_irqrestore(&busiest->lock,
                                flags);
                env.flags |= LBF_ALL_PINNED;
                goto out_one_pinned;
            }

            /*
             * ->active_balance synchronizes accesses to
             * ->active_balance_work.  Once set, it's cleared
             * only after active load balance is finished.
             */
#ifdef CONFIG_SPRD_CORE_CTL
            if (!busiest->active_balance &&
                !cpu_isolated(cpu_of(busiest))) {
#else
            if (!busiest->active_balance) { //busiest cpu运行队列上设置active balance的标记
#endif
                busiest->active_balance = 1;
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            raw_spin_unlock_irqrestore(&busiest->lock, flags);

            if (active_balance) { //将正在运行的busiest cpu 正在运行的任务停止并进行迁移
                stop_one_cpu_nowait(cpu_of(busiest),
                    active_load_balance_cpu_stop, busiest,
                    &busiest->active_balance_work);
            }

            /* We've kicked active balancing, force task migration. */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;//完成了至少一个任务迁移

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;//重新设置均衡间隔
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * detach_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

    goto out;

out_balanced:
    /*
     * We reach balance although we may have faced some affinity
     * constraints. Clear the imbalance flag if it was set.
     */
    if (sd_parent) {
        int *group_imbalance = &sd_parent->groups->sgc->imbalance;

        if (*group_imbalance)
            *group_imbalance = 0;
    }

out_all_pinned://由于所有的亲和性原因
    /*
     * We reach balance because all tasks are pinned at this level so
     * we can't migrate them. Let the imbalance flag set so parent level
     * can try to migrate them.
     */
    schedstat_inc(sd->lb_balanced[idle]);

    sd->nr_balance_failed = 0;

out_one_pinned://由某个task亲和性原因
    ld_moved = 0;

    /*
     * idle_balance() disregards balance intervals, so we could repeatedly
     * reach this code, which would lead to balance_interval skyrocketting
     * in a short amount of time. Skip the balance_interval increase logic
     * to avoid that.
     */
    if (env.idle == CPU_NEWLY_IDLE)
        goto out;

    /* tune up the balancing interval */
    if (((env.flags & LBF_ALL_PINNED) &&
            sd->balance_interval < MAX_PINNED_INTERVAL) ||
            (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;
out:
    return ld_moved;
}

相关推荐

  1. 云waf的负载均衡均衡流量分发功能

    2024-01-21 12:54:04       33 阅读
  2. 负载均衡

    2024-01-21 12:54:04       28 阅读
  3. 云WAF的负载均衡流量分发功能

    2024-01-21 12:54:04       33 阅读
  4. Nginx实现(负载均衡

    2024-01-21 12:54:04       61 阅读

最近更新

  1. docker php8.1+nginx base 镜像 dockerfile 配置

    2024-01-21 12:54:04       94 阅读
  2. Could not load dynamic library ‘cudart64_100.dll‘

    2024-01-21 12:54:04       101 阅读
  3. 在Django里面运行非项目文件

    2024-01-21 12:54:04       82 阅读
  4. Python语言-面向对象

    2024-01-21 12:54:04       91 阅读

热门阅读

  1. 03 OSPF

    2024-01-21 12:54:04       47 阅读
  2. R语言实现文献计量分析(1)——bibliometrix

    2024-01-21 12:54:04       60 阅读
  3. js变量提升

    2024-01-21 12:54:04       51 阅读
  4. 低代码开发:数据处理与可视化

    2024-01-21 12:54:04       57 阅读
  5. VUE v-if 和 v-show 区别和例子

    2024-01-21 12:54:04       46 阅读
  6. 句子逆序(机试)

    2024-01-21 12:54:04       53 阅读