diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/Documentation/scheduler/sched-cfs-hard-limits.txt linux-2.6.32.1-hard/Documentation/scheduler/sched-cfs-hard-limits.txt
--- linux-2.6.32.1/Documentation/scheduler/sched-cfs-hard-limits.txt	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.32.1-hard/Documentation/scheduler/sched-cfs-hard-limits.txt	2009-12-17 14:48:21.000000000 +0100
@@ -0,0 +1,48 @@
+CPU HARD LIMITS FOR CFS GROUPS
+==============================
+
+1. Overview
+2. Interface
+3. Examples
+
+1. Overview
+-----------
+
+CFS is a proportional-share scheduler which tries to divide CPU time
+proportionally between tasks or groups of tasks (task group/cgroup),
+depending on the priority/weight of the task or the shares assigned to
+the group. In CFS, a task or task group can get more than its share of
+CPU if enough idle CPU cycles are available in the system, due to the
+work-conserving nature of the scheduler. However, in certain scenarios
+(like pay-per-use), it is desirable not to give a group extra time even
+in the presence of idle CPU cycles. This is where hard limiting is useful.
+
+Hard limits for task groups are set by specifying how much CPU runtime a
+group may consume within a given period. If the group consumes more CPU
+time than its runtime in a given period, it gets throttled: none of the
+tasks of the throttled group get to run until the group's runtime is
+refreshed at the beginning of the next period.
+
+2. Interface
+------------
+
+The hard limit feature adds two cgroup files to the CFS group scheduler:
+
+cfs_runtime_us: Hard limit for the group, in microseconds.
+
+cfs_period_us: Time period, in microseconds, within which the hard limit
+is enforced.
+
+A group is created with default values for runtime (infinite runtime,
+which means hard limits are disabled) and period (0.5s). Each group can
+set its own runtime and period independently of other groups in the system.
+
+3. Examples
+-----------
+
+# mount -t cgroup -ocpu none /cgroups/
+# cd /cgroups
+# mkdir 1
+# cd 1/
+# echo 250000 > cfs_runtime_us /* set a 250ms runtime (limit) */
+# echo 500000 > cfs_period_us  /* set a 500ms period */
diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/include/linux/sched.h linux-2.6.32.1-hard/include/linux/sched.h
--- linux-2.6.32.1/include/linux/sched.h	2009-12-14 21:29:46.000000000 +0100
+++ linux-2.6.32.1-hard/include/linux/sched.h	2009-12-17 14:48:20.000000000 +0100
@@ -1183,6 +1183,12 @@ struct sched_entity {
 	u64			nr_wakeups_affine_attempts;
 	u64			nr_wakeups_passive;
 	u64			nr_wakeups_idle;
+#ifdef CONFIG_CFS_HARD_LIMITS
+	u64			throttle_start;
+	u64			throttle_max;
+	u64			throttle_count;
+	u64			throttle_sum;
+#endif
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/init/Kconfig linux-2.6.32.1-hard/init/Kconfig
--- linux-2.6.32.1/init/Kconfig	2009-12-03 20:02:57.000000000 +0100
+++ linux-2.6.32.1-hard/init/Kconfig	2009-12-17 14:48:20.000000000 +0100
@@ -477,6 +477,19 @@ config CGROUP_SCHED
 
 endchoice
 
+config CFS_HARD_LIMITS
+	bool "Hard Limits for CFS Group Scheduler"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED && CGROUP_SCHED
+	default n
+	help
+	  This option enables hard limiting of CPU time obtained by
+	  a fair task group. Use this if you want to throttle a group of tasks
+	  based on its CPU usage. For more details refer to
+	  Documentation/scheduler/sched-cfs-hard-limits.txt
+
+	  Say N if unsure.
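The two new cgroup files documented above can also be driven programmatically. The following user-space sketch is illustrative only: it assumes a kernel built with CONFIG_CFS_HARD_LIMITS and the cpu controller mounted at /cgroups with a child group named 1, exactly as in the Examples section above (those paths come from that example, not from the patch).

/*
 * Illustrative user-space sketch: configure a CFS hard limit through the
 * two cgroup files added by this patch. The /cgroups/1 path is taken from
 * the documentation example above and is an assumption, not part of the
 * kernel interface.
 */
#include <stdio.h>
#include <stdlib.h>

static int write_u64(const char *path, unsigned long long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%llu\n", val);
	return fclose(f);
}

int main(void)
{
	/* Cap the group at 250ms of runtime in every 500ms period,
	 * the same values as in the documentation example. */
	if (write_u64("/cgroups/1/cfs_runtime_us", 250000) ||
	    write_u64("/cgroups/1/cfs_period_us", 500000)) {
		perror("setting CFS hard limit");
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}

Writing a negative value to cfs_runtime_us restores the default unlimited runtime, per the RUNTIME_INF handling in tg_set_cfs_runtime() further down in the patch.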
+ menuconfig CGROUPS boolean "Control Group support" help diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/kernel/sched.c linux-2.6.32.1-hard/kernel/sched.c --- linux-2.6.32.1/kernel/sched.c 2009-12-03 20:02:58.000000000 +0100 +++ linux-2.6.32.1-hard/kernel/sched.c 2009-12-17 14:48:21.000000000 +0100 @@ -237,6 +237,15 @@ static DEFINE_MUTEX(sched_domains_mutex) #include +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_CFS_HARD_LIMITS) +struct cfs_bandwidth { + spinlock_t cfs_runtime_lock; + ktime_t cfs_period; + u64 cfs_runtime; + struct hrtimer cfs_period_timer; +}; +#endif + struct cfs_rq; static LIST_HEAD(task_groups); @@ -257,6 +266,9 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; +#ifdef CONFIG_CFS_HARD_LIMITS + struct cfs_bandwidth cfs_bandwidth; +#endif #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -446,6 +458,19 @@ struct cfs_rq { unsigned long rq_weight; #endif #endif +#ifdef CONFIG_CFS_HARD_LIMITS + /* set when the group is throttled on this cpu */ + int cfs_throttled; + + /* runtime currently consumed by the group on this rq */ + u64 cfs_time; + + /* runtime available to the group on this rq */ + u64 cfs_runtime; + + /* Protects the cfs runtime related fields of this cfs_rq */ + spinlock_t cfs_runtime_lock; +#endif }; /* Real-Time classes' related field in a runqueue: */ @@ -1607,6 +1632,7 @@ static void update_group_shares_cpu(stru } } +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); /* * Re-compute the task group their per cpu shares over the given domain. * This needs to be done in a bottom-up fashion because the rq weight of a @@ -1634,8 +1660,10 @@ static int tg_shares_up(struct task_grou * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to * run here it will not get delayed by group starvation. + * Also if the group is throttled on this cpu, pretend that + * it has no tasks. */ - if (!weight) + if (!weight || cfs_rq_throttled(tg->cfs_rq[i])) weight = NICE_0_LOAD; rq_weight += weight; @@ -1811,6 +1839,175 @@ static void cfs_rq_set_shares(struct cfs static void calc_load_account_active(struct rq *this_rq); + +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) + +#ifdef CONFIG_SMP +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} +#else /* !CONFIG_SMP */ +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_online_mask; +} +#endif /* CONFIG_SMP */ + +#else +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_online_mask; +} + +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_CFS_HARD_LIMITS + +/* + * Runtime allowed for a cfs group before it is hard limited. + * default: Infinite which means no hard limiting. + */ +u64 sched_cfs_runtime = RUNTIME_INF; + +/* + * period over which we hard limit the cfs group's bandwidth. + * default: 0.5s + */ +u64 sched_cfs_period = 500000; + +static inline u64 global_cfs_period(void) +{ + return sched_cfs_period * NSEC_PER_USEC; +} + +static inline u64 global_cfs_runtime(void) +{ + return RUNTIME_INF; +} + +void do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b); + +static inline void cfs_rq_runtime_lock(struct cfs_rq *cfs_rq) +{ + spin_lock(&cfs_rq->cfs_runtime_lock); +} + +static inline void cfs_rq_runtime_unlock(struct cfs_rq *cfs_rq) +{ + spin_unlock(&cfs_rq->cfs_runtime_lock); +} + +/* + * Refresh the runtimes of the throttled groups. 
+ * But nothing much to do now, will populate this in later patches. + */ +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, cfs_period_timer); + + do_sched_cfs_period_timer(cfs_b); + hrtimer_add_expires_ns(timer, ktime_to_ns(cfs_b->cfs_period)); + return HRTIMER_RESTART; +} + +/* + * TODO: Check if this kind of timer setup is sufficient for cfs or + * should we do what rt is doing. + */ +static void start_cfs_bandwidth(struct task_group *tg) +{ + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + /* + * Timer isn't setup for groups with infinite runtime + */ + if (cfs_b->cfs_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&cfs_b->cfs_period_timer)) + return; + + hrtimer_start_range_ns(&cfs_b->cfs_period_timer, cfs_b->cfs_period, + 0, HRTIMER_MODE_REL); +} + +static void init_cfs_bandwidth(struct task_group *tg) +{ + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + cfs_b->cfs_period = ns_to_ktime(global_cfs_period()); + cfs_b->cfs_runtime = global_cfs_runtime(); + + spin_lock_init(&cfs_b->cfs_runtime_lock); + + hrtimer_init(&cfs_b->cfs_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->cfs_period_timer.function = &sched_cfs_period_timer; +} + +static inline void destroy_cfs_bandwidth(struct task_group *tg) +{ + hrtimer_cancel(&tg->cfs_bandwidth.cfs_period_timer); +} + +static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + cfs_rq->cfs_time = 0; + cfs_rq->cfs_throttled = 0; + cfs_rq->cfs_runtime = tg->cfs_bandwidth.cfs_runtime; + spin_lock_init(&cfs_rq->cfs_runtime_lock); +} + +#else /* !CONFIG_CFS_HARD_LIMITS */ + +static void init_cfs_bandwidth(struct task_group *tg) +{ + return; +} + +static inline void destroy_cfs_bandwidth(struct task_group *tg) +{ + return; +} + +static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + return; +} + +static inline void cfs_rq_runtime_lock(struct cfs_rq *cfs_rq) +{ + return; +} + +static inline void cfs_rq_runtime_unlock(struct cfs_rq *cfs_rq) +{ + return; +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline void cfs_rq_runtime_lock(struct cfs_rq *cfs_rq) +{ + return; +} + +static inline void cfs_rq_runtime_unlock(struct cfs_rq *cfs_rq) +{ + return; +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -9164,6 +9361,32 @@ static int update_sched_domains(struct n } #endif +#ifdef CONFIG_SMP +static void disable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_CFS_HARD_LIMITS) + disable_runtime_cfs(rq); +#endif + disable_runtime_rt(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static void enable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_CFS_HARD_LIMITS) + enable_runtime_cfs(rq); +#endif + enable_runtime_rt(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} +#endif + static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -9296,6 +9519,7 @@ static void init_tg_cfs_entry(struct tas struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); + init_cfs_hard_limits(cfs_rq, tg); cfs_rq->tg = tg; if (add) 
list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); @@ -9425,6 +9649,10 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED + init_cfs_bandwidth(&init_task_group); +#endif + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); @@ -9451,6 +9679,7 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED + init_cfs_hard_limits(&rq->cfs, &init_task_group); init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); #ifdef CONFIG_CGROUP_SCHED @@ -9726,6 +9955,7 @@ static void free_fair_sched_group(struct { int i; + destroy_cfs_bandwidth(tg); for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -9752,6 +9982,7 @@ int alloc_fair_sched_group(struct task_g if (!tg->se) goto err; + init_cfs_bandwidth(tg); tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { @@ -10475,6 +10706,100 @@ static u64 cpu_shares_read_u64(struct cg return (u64) tg->shares; } + +#ifdef CONFIG_CFS_HARD_LIMITS + +static int tg_set_cfs_bandwidth(struct task_group *tg, + u64 cfs_period, u64 cfs_runtime) +{ + int i; + + spin_lock_irq(&tg->cfs_bandwidth.cfs_runtime_lock); + tg->cfs_bandwidth.cfs_period = ns_to_ktime(cfs_period); + tg->cfs_bandwidth.cfs_runtime = cfs_runtime; + + for_each_possible_cpu(i) { + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + + cfs_rq_runtime_lock(cfs_rq); + cfs_rq->cfs_runtime = cfs_runtime; + cfs_rq_runtime_unlock(cfs_rq); + } + + start_cfs_bandwidth(tg); + spin_unlock_irq(&tg->cfs_bandwidth.cfs_runtime_lock); + return 0; +} + +int tg_set_cfs_runtime(struct task_group *tg, long cfs_runtime_us) +{ + u64 cfs_runtime, cfs_period; + + cfs_period = ktime_to_ns(tg->cfs_bandwidth.cfs_period); + cfs_runtime = (u64)cfs_runtime_us * NSEC_PER_USEC; + if (cfs_runtime_us < 0) + cfs_runtime = RUNTIME_INF; + + return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime); +} + +long tg_get_cfs_runtime(struct task_group *tg) +{ + u64 cfs_runtime_us; + + if (tg->cfs_bandwidth.cfs_runtime == RUNTIME_INF) + return -1; + + cfs_runtime_us = tg->cfs_bandwidth.cfs_runtime; + do_div(cfs_runtime_us, NSEC_PER_USEC); + return cfs_runtime_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ + u64 cfs_runtime, cfs_period; + + cfs_period = (u64)cfs_period_us * NSEC_PER_USEC; + cfs_runtime = tg->cfs_bandwidth.cfs_runtime; + + if (cfs_period == 0) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime); +} + +long tg_get_cfs_period(struct task_group *tg) +{ + u64 cfs_period_us; + + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.cfs_period); + do_div(cfs_period_us, NSEC_PER_USEC); + return cfs_period_us; +} + +static s64 cpu_cfs_runtime_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_runtime(cgroup_tg(cgrp)); +} + +static int cpu_cfs_runtime_write_s64(struct cgroup *cgrp, struct cftype *cftype, + s64 cfs_runtime_us) +{ + return tg_set_cfs_runtime(cgroup_tg(cgrp), cfs_runtime_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 cfs_period_us) +{ + return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -10508,6 +10833,18 @@ static struct cftype 
cpu_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, +#ifdef CONFIG_CFS_HARD_LIMITS + { + .name = "cfs_runtime_us", + .read_s64 = cpu_cfs_runtime_read_s64, + .write_s64 = cpu_cfs_runtime_write_s64, + }, + { + .name = "cfs_period_us", + .read_u64 = cpu_cfs_period_read_u64, + .write_u64 = cpu_cfs_period_write_u64, + }, +#endif /* CONFIG_CFS_HARD_LIMITS */ #endif #ifdef CONFIG_RT_GROUP_SCHED { diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/kernel/sched_debug.c linux-2.6.32.1-hard/kernel/sched_debug.c --- linux-2.6.32.1/kernel/sched_debug.c 2009-12-03 20:02:58.000000000 +0100 +++ linux-2.6.32.1-hard/kernel/sched_debug.c 2009-12-17 14:48:20.000000000 +0100 @@ -80,6 +80,11 @@ static void print_cfs_group_stats(struct PN(se->wait_max); PN(se->wait_sum); P(se->wait_count); +#ifdef CONFIG_CFS_HARD_LIMITS + PN(se->throttle_max); + PN(se->throttle_sum); + P(se->throttle_count); +#endif #endif P(se->load.weight); #undef PN @@ -214,6 +219,16 @@ void print_cfs_rq(struct seq_file *m, in #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); #endif +#ifdef CONFIG_CFS_HARD_LIMITS + spin_lock_irqsave(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %d\n", "cfs_throttled", + cfs_rq->cfs_throttled); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "cfs_time", + SPLIT_NS(cfs_rq->cfs_time)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "cfs_runtime", + SPLIT_NS(cfs_rq->cfs_runtime)); + spin_unlock_irqrestore(&rq->lock, flags); +#endif /* CONFIG_CFS_HARD_LIMITS */ print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -310,7 +325,7 @@ static int sched_debug_show(struct seq_f u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/kernel/sched_fair.c linux-2.6.32.1-hard/kernel/sched_fair.c --- linux-2.6.32.1/kernel/sched_fair.c 2009-12-03 20:02:58.000000000 +0100 +++ linux-2.6.32.1-hard/kernel/sched_fair.c 2009-12-17 14:48:21.000000000 +0100 @@ -189,7 +189,308 @@ find_matching_se(struct sched_entity **s } } -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_CFS_HARD_LIMITS + +static inline void update_stats_throttle_start(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + schedstat_set(se->throttle_start, rq_of(cfs_rq)->clock); +} + +static inline void update_stats_throttle_end(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + schedstat_set(se->throttle_max, max(se->throttle_max, + rq_of(cfs_rq)->clock - se->throttle_start)); + schedstat_set(se->throttle_count, se->throttle_count + 1); + schedstat_set(se->throttle_sum, se->throttle_sum + + rq_of(cfs_rq)->clock - se->throttle_start); + schedstat_set(se->throttle_start, 0); +} + +static inline +struct cfs_rq *sched_cfs_period_cfs_rq(struct cfs_bandwidth *cfs_b, int cpu) +{ + return container_of(cfs_b, struct task_group, + cfs_bandwidth)->cfs_rq[cpu]; +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_rq->cfs_throttled; +} + +#ifdef CONFIG_SMP +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. 
+ */ +static void disable_runtime_cfs(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + struct cfs_rq *cfs_rq; + + if (unlikely(!scheduler_running)) + return; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + s64 want; + int i; + + spin_lock(&cfs_b->cfs_runtime_lock); + spin_lock(&cfs_rq->cfs_runtime_lock); + + /* + * Either we're all are infinity and nobody needs to borrow, + * or we're already disabled and this have nothing to do, or + * we have exactly the right amount of runtime to take out. + */ + if (cfs_rq->cfs_runtime == RUNTIME_INF || + cfs_rq->cfs_runtime == cfs_b->cfs_runtime) + goto balanced; + spin_unlock(&cfs_rq->cfs_runtime_lock); + + /* + * Calculate the difference between what we started out with + * and what we current have, that's the amount of runtime + * we lend and now have to reclaim. + */ + want = cfs_b->cfs_runtime - cfs_rq->cfs_runtime; + + /* + * Greedy reclaim, take back as much as possible. + */ + for_each_cpu(i, rd->span) { + struct cfs_rq *iter = sched_cfs_period_cfs_rq(cfs_b, i); + s64 diff; + + /* + * Can't reclaim from ourselves or disabled runqueues. + */ + if (iter == cfs_rq || iter->cfs_runtime == RUNTIME_INF) + continue; + + spin_lock(&iter->cfs_runtime_lock); + if (want > 0) { + diff = min_t(s64, iter->cfs_runtime, want); + iter->cfs_runtime -= diff; + want -= diff; + } else { + iter->cfs_runtime -= want; + want -= want; + } + + spin_unlock(&iter->cfs_runtime_lock); + if (!want) + break; + } + + spin_lock(&cfs_rq->cfs_runtime_lock); + /* + * We cannot be left wanting - that would mean some + * runtime leaked out of the system. + */ + BUG_ON(want); +balanced: + /* + * Disable all the borrow logic by pretending we have infinite + * runtime - in which case borrowing doesn't make sense. + */ + cfs_rq->cfs_runtime = RUNTIME_INF; + spin_unlock(&cfs_rq->cfs_runtime_lock); + spin_unlock(&cfs_b->cfs_runtime_lock); + } +} + +static void enable_runtime_cfs(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + if (unlikely(!scheduler_running)) + return; + + /* + * Reset each runqueue's bandwidth settings + */ + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + + spin_lock(&cfs_b->cfs_runtime_lock); + spin_lock(&cfs_rq->cfs_runtime_lock); + cfs_rq->cfs_runtime = cfs_b->cfs_runtime; + cfs_rq->cfs_time = 0; + cfs_rq->cfs_throttled = 0; + spin_unlock(&cfs_rq->cfs_runtime_lock); + spin_unlock(&cfs_b->cfs_runtime_lock); + } +} + +/* + * Ran out of runtime, check if we can borrow some from others + * instead of getting throttled right away. 
+ */ +static void do_cfs_balance_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + const struct cpumask *span = sched_bw_period_mask(); + int i, weight; + u64 cfs_period; + + weight = cpumask_weight(span); + spin_lock(&cfs_b->cfs_runtime_lock); + cfs_period = ktime_to_ns(cfs_b->cfs_period); + + for_each_cpu(i, span) { + struct cfs_rq *borrow_cfs_rq = + sched_cfs_period_cfs_rq(cfs_b, i); + s64 diff; + + if (borrow_cfs_rq == cfs_rq) + continue; + + cfs_rq_runtime_lock(borrow_cfs_rq); + if (borrow_cfs_rq->cfs_runtime == RUNTIME_INF) { + cfs_rq_runtime_unlock(borrow_cfs_rq); + continue; + } + + diff = borrow_cfs_rq->cfs_runtime - borrow_cfs_rq->cfs_time; + if (diff > 0) { + diff = div_u64((u64)diff, weight); + if (cfs_rq->cfs_runtime + diff > cfs_period) + diff = cfs_period - cfs_rq->cfs_runtime; + borrow_cfs_rq->cfs_runtime -= diff; + cfs_rq->cfs_runtime += diff; + if (cfs_rq->cfs_runtime == cfs_period) { + cfs_rq_runtime_unlock(borrow_cfs_rq); + break; + } + } + cfs_rq_runtime_unlock(borrow_cfs_rq); + } + spin_unlock(&cfs_b->cfs_runtime_lock); +} + +/* + * Called with rq->runtime_lock held. + */ +static void cfs_balance_runtime(struct cfs_rq *cfs_rq) +{ + cfs_rq_runtime_unlock(cfs_rq); + do_cfs_balance_runtime(cfs_rq); + cfs_rq_runtime_lock(cfs_rq); +} + +#else /* !CONFIG_SMP */ + +static void cfs_balance_runtime(struct cfs_rq *cfs_rq) +{ + return; +} +#endif /* CONFIG_SMP */ + +/* + * Check if group entity exceeded its runtime. If so, mark the cfs_rq as + * throttled mark the current task for reschedling. + */ +static void sched_cfs_runtime_exceeded(struct sched_entity *se, + struct task_struct *tsk_curr, unsigned long delta_exec) +{ + struct cfs_rq *cfs_rq; + + cfs_rq = group_cfs_rq(se); + + if (cfs_rq->cfs_runtime == RUNTIME_INF) + return; + + cfs_rq->cfs_time += delta_exec; + + if (cfs_rq_throttled(cfs_rq)) + return; + + if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) + cfs_balance_runtime(cfs_rq); + + if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) { + cfs_rq->cfs_throttled = 1; + update_stats_throttle_start(cfs_rq, se); + resched_task(tsk_curr); + } +} + +static inline void update_curr_group(struct sched_entity *curr, + unsigned long delta_exec, struct task_struct *tsk_curr) +{ + sched_cfs_runtime_exceeded(curr, tsk_curr, delta_exec); +} + +static void enqueue_entity_locked(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup); + +static void enqueue_throttled_entity(struct rq *rq, struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + if (se->on_rq || cfs_rq_throttled(gcfs_rq) || + !gcfs_rq->nr_running) + break; + enqueue_entity_locked(cfs_rq_of(se), se, 0); + } +} + +/* + * Refresh runtimes of all cfs_rqs in this group, i,e., + * refresh runtimes of the representative cfs_rq of this + * tg on all cpus. Enqueue any throttled entity back. 
+ */ +void do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b) +{ + int i; + const struct cpumask *span = sched_bw_period_mask(); + unsigned long flags; + + for_each_cpu(i, span) { + struct rq *rq = cpu_rq(i); + struct cfs_rq *cfs_rq = sched_cfs_period_cfs_rq(cfs_b, i); + struct sched_entity *se = cfs_rq->tg->se[i]; + + spin_lock_irqsave(&rq->lock, flags); + cfs_rq_runtime_lock(cfs_rq); + cfs_rq->cfs_time = 0; + if (cfs_rq_throttled(cfs_rq)) { + update_rq_clock(rq); + update_stats_throttle_end(cfs_rq, se); + cfs_rq->cfs_throttled = 0; + enqueue_throttled_entity(rq, se); + } + cfs_rq_runtime_unlock(cfs_rq); + spin_unlock_irqrestore(&rq->lock, flags); + } +} + +#else + +static inline void update_curr_group(struct sched_entity *curr, + unsigned long delta_exec, struct task_struct *tsk_curr) +{ + return; +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ + +#else /* CONFIG_FAIR_GROUP_SCHED */ + +static inline void update_curr_group(struct sched_entity *curr, + unsigned long delta_exec, struct task_struct *tsk_curr) +{ + return; +} static inline struct task_struct *task_of(struct sched_entity *se) { @@ -251,7 +552,6 @@ find_matching_se(struct sched_entity **s #endif /* CONFIG_FAIR_GROUP_SCHED */ - /************************************************************** * Scheduling class tree data structure manipulation methods: */ @@ -489,14 +789,25 @@ __update_curr(struct cfs_rq *cfs_rq, str update_min_vruntime(cfs_rq); } -static void update_curr(struct cfs_rq *cfs_rq) +static void update_curr_task(struct sched_entity *curr, + unsigned long delta_exec) +{ + struct task_struct *curtask = task_of(curr); + + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); + cpuacct_charge(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); +} + +static int update_curr_common(struct cfs_rq *cfs_rq, unsigned long *delta) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock; + struct rq *rq = rq_of(cfs_rq); + u64 now = rq->clock; unsigned long delta_exec; if (unlikely(!curr)) - return; + return 1; /* * Get the amount of time the current task was running @@ -505,20 +816,47 @@ static void update_curr(struct cfs_rq *c */ delta_exec = (unsigned long)(now - curr->exec_start); if (!delta_exec) - return; + return 1; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; + *delta = delta_exec; + return 0; +} - if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); + unsigned long delta_exec; - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cpuacct_charge(curtask, delta_exec); - account_group_exec_runtime(curtask, delta_exec); + if (update_curr_common(cfs_rq, &delta_exec)) + return ; + + if (entity_is_task(curr)) + update_curr_task(curr, delta_exec); + else { + cfs_rq_runtime_lock(group_cfs_rq(curr)); + update_curr_group(curr, delta_exec, rq->curr); + cfs_rq_runtime_unlock(group_cfs_rq(curr)); } } +static void update_curr_locked(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); + unsigned long delta_exec; + + if (update_curr_common(cfs_rq, &delta_exec)) + return ; + + if (entity_is_task(curr)) + update_curr_task(curr, delta_exec); + else + update_curr_group(curr, delta_exec, rq->curr); +} + static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity 
*se) { @@ -740,13 +1078,9 @@ place_entity(struct cfs_rq *cfs_rq, stru se->vruntime = vruntime; } -static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +static void enqueue_entity_common(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup) { - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); if (wakeup) { @@ -760,6 +1094,26 @@ enqueue_entity(struct cfs_rq *cfs_rq, st __enqueue_entity(cfs_rq, se); } +static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int wakeup) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + enqueue_entity_common(cfs_rq, se, wakeup); +} + +static void enqueue_entity_locked(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr_locked(cfs_rq); + enqueue_entity_common(cfs_rq, se, wakeup); +} + static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!se || cfs_rq->last == se) @@ -897,6 +1251,32 @@ static struct sched_entity *pick_next_en return se; } +/* + * Called from put_prev_entity() + * If a group entity (@se) is found to be throttled, it will not be put back + * on @cfs_rq, which is equivalent to dequeing it. + */ +static int dequeue_throttled_entity(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + if (entity_is_task(se)) + return 0; + + cfs_rq_runtime_lock(gcfs_rq); + if (!cfs_rq_throttled(gcfs_rq) && gcfs_rq->nr_running) { + cfs_rq_runtime_unlock(gcfs_rq); + return 0; + } + + __clear_buddies(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); + cfs_rq->curr = NULL; + cfs_rq_runtime_unlock(gcfs_rq); + return 1; +} + static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* @@ -908,6 +1288,8 @@ static void put_prev_entity(struct cfs_r check_spread(cfs_rq, prev); if (prev->on_rq) { + if (dequeue_throttled_entity(cfs_rq, prev)) + return; update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); @@ -1004,10 +1386,28 @@ static inline void hrtick_update(struct } #endif +static int enqueue_group_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int wakeup) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + int ret = 0; + + cfs_rq_runtime_lock(gcfs_rq); + if (cfs_rq_throttled(gcfs_rq)) { + ret = 1; + goto out; + } + enqueue_entity_locked(cfs_rq, se, wakeup); +out: + cfs_rq_runtime_unlock(gcfs_rq); + return ret; +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: + * Don't enqueue a throttled entity further into the hierarchy. */ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { @@ -1017,11 +1417,15 @@ static void enqueue_task_fair(struct rq for_each_sched_entity(se) { if (se->on_rq) break; + cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); + if (entity_is_task(se)) + enqueue_entity(cfs_rq, se, wakeup); + else + if (enqueue_group_entity(cfs_rq, se, wakeup)) + break; wakeup = 1; } - hrtick_update(rq); } @@ -1041,6 +1445,17 @@ static void dequeue_task_fair(struct rq /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; + + /* + * If this cfs_rq is throttled, then it is already + * dequeued. 
+ */ + cfs_rq_runtime_lock(cfs_rq); + if (cfs_rq_throttled(cfs_rq)) { + cfs_rq_runtime_unlock(cfs_rq); + break; + } + cfs_rq_runtime_unlock(cfs_rq); sleep = 1; } @@ -1788,9 +2203,10 @@ load_balance_fair(struct rq *this_rq, in u64 rem_load, moved_load; /* - * empty group + * empty group or throttled group */ - if (!busiest_cfs_rq->task_weight) + if (!busiest_cfs_rq->task_weight || + cfs_rq_throttled(busiest_cfs_rq)) continue; rem_load = (u64)rem_load_move * busiest_weight; @@ -1839,6 +2255,12 @@ move_one_task_fair(struct rq *this_rq, i for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { /* + * Don't move task from a throttled cfs_rq + */ + if (cfs_rq_throttled(busy_cfs_rq)) + continue; + + /* * pass busy_cfs_rq argument into * load_balance_[start|next]_fair iterators */ diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/kernel/sched_rt.c linux-2.6.32.1-hard/kernel/sched_rt.c --- linux-2.6.32.1/kernel/sched_rt.c 2009-12-03 20:02:58.000000000 +0100 +++ linux-2.6.32.1-hard/kernel/sched_rt.c 2009-12-17 14:48:21.000000000 +0100 @@ -235,18 +235,6 @@ static int rt_se_boosted(struct sched_rt return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_rq(smp_processor_id())->rd->span; -} -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif - static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) { @@ -296,11 +284,6 @@ static inline int rt_rq_throttled(struct return rt_rq->rt_throttled; } -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} - static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) { @@ -373,7 +356,7 @@ next: /* * Ensure this RQ takes back all the runtime it lend to its neighbours. */ -static void __disable_runtime(struct rq *rq) +static void disable_runtime_rt(struct rq *rq) { struct root_domain *rd = rq->rd; struct rt_rq *rt_rq; @@ -450,16 +433,7 @@ balanced: } } -static void disable_runtime(struct rq *rq) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __disable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); -} - -static void __enable_runtime(struct rq *rq) +static void enable_runtime_rt(struct rq *rq) { struct rt_rq *rt_rq; @@ -482,15 +456,6 @@ static void __enable_runtime(struct rq * } } -static void enable_runtime(struct rq *rq) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __enable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); -} - static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; @@ -518,7 +483,7 @@ static int do_sched_rt_period_timer(stru if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; - span = sched_rt_period_mask(); + span = sched_bw_period_mask(); for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); @@ -1564,7 +1529,7 @@ static void rq_online_rt(struct rq *rq) if (rq->rt.overloaded) rt_set_overload(rq); - __enable_runtime(rq); + enable_runtime_rt(rq); cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } @@ -1575,7 +1540,7 @@ static void rq_offline_rt(struct rq *rq) if (rq->rt.overloaded) rt_clear_overload(rq); - __disable_runtime(rq); + disable_runtime_rt(rq); cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); }
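Taken together, the sched_fair.c changes implement a per-cpu quota scheme for each task group: update_curr_group() charges execution time to the group's cfs_rq, sched_cfs_runtime_exceeded() first tries to borrow unused runtime from the group's cfs_rqs on other CPUs and throttles the entity only if that fails, and do_sched_cfs_period_timer() clears the throttled state when the period timer refreshes the runtimes. The stand-alone user-space model below is a sketch of that accounting only; the struct and function names are local to the example, locking and the real CPU span are omitted, and the 250ms/500ms figures are the values from the documentation example.

/*
 * Stand-alone model of the per-cpu hard-limit accounting added to
 * sched_fair.c. Not kernel code: names, the fixed 4-CPU "span" and the
 * absence of locking are simplifications for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define NCPU		4
#define PERIOD_NS	500000000ULL	/* 0.5s, the default cfs_period      */
#define RUNTIME_NS	250000000ULL	/* example per-cpu quota (250ms)     */

struct cfs_rq_sim {
	uint64_t runtime;	/* quota available on this cpu         */
	uint64_t time;		/* quota consumed so far in this period */
	int	 throttled;
};

static struct cfs_rq_sim rq[NCPU];

/* Modelled on do_cfs_balance_runtime(): pull spare quota from siblings. */
static void balance_runtime(int cpu)
{
	int i;

	for (i = 0; i < NCPU && rq[cpu].time > rq[cpu].runtime; i++) {
		uint64_t spare;

		if (i == cpu || rq[i].runtime <= rq[i].time)
			continue;
		spare = (rq[i].runtime - rq[i].time) / NCPU;
		if (rq[cpu].runtime + spare > PERIOD_NS)
			spare = PERIOD_NS - rq[cpu].runtime;
		rq[i].runtime -= spare;
		rq[cpu].runtime += spare;
	}
}

/* Modelled on sched_cfs_runtime_exceeded(): charge time, maybe throttle. */
static void account(int cpu, uint64_t delta_ns)
{
	if (rq[cpu].throttled)
		return;
	rq[cpu].time += delta_ns;
	if (rq[cpu].time > rq[cpu].runtime)
		balance_runtime(cpu);
	if (rq[cpu].time > rq[cpu].runtime)
		rq[cpu].throttled = 1;
}

/* Modelled on do_sched_cfs_period_timer(): refresh and unthrottle all. */
static void period_refresh(void)
{
	int i;

	for (i = 0; i < NCPU; i++) {
		rq[i].time = 0;
		rq[i].throttled = 0;
		rq[i].runtime = RUNTIME_NS;
	}
}

int main(void)
{
	int i;

	period_refresh();
	/* cpu 0 runs flat out; the others stay idle and lend their quota. */
	for (i = 0; i < 8 && !rq[0].throttled; i++)
		account(0, 100000000ULL);	/* 100ms of execution */
	printf("cpu0 throttled after %d ticks, consumed %llu ns\n",
	       i, (unsigned long long)rq[0].time);
	period_refresh();
	printf("after refresh: throttled=%d time=%llu\n",
	       rq[0].throttled, (unsigned long long)rq[0].time);
	return 0;
}

In the patch itself these steps run under rq->lock and the cfs_runtime_lock spinlocks, the set of CPUs comes from sched_bw_period_mask(), and the refresh is driven by the cfs_period_timer hrtimer armed in start_cfs_bandwidth().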