diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/Documentation/scheduler/sched-cfs-hard-limits.txt linux-2.6.31.2-vs2.3.0.36.15/Documentation/scheduler/sched-cfs-hard-limits.txt
--- linux-2.6.31.2-vs2.3.0.36.14/Documentation/scheduler/sched-cfs-hard-limits.txt	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.31.2-vs2.3.0.36.15/Documentation/scheduler/sched-cfs-hard-limits.txt	2009-10-06 04:39:46.000000000 +0200
@@ -0,0 +1,52 @@
+CPU HARD LIMITS FOR CFS GROUPS
+==============================
+
+1. Overview
+2. Interface
+3. Examples
+
+1. Overview
+-----------
+
+CFS is a proportional-share scheduler which tries to divide the CPU time
+proportionately between tasks or groups of tasks (task group/cgroup) depending
+on the priority/weight of the task or the shares assigned to groups of tasks.
+In CFS, a task/task group can get more than its share of CPU if there are
+enough idle CPU cycles available in the system, due to the work-conserving
+nature of the scheduler. However, in certain scenarios (like pay-per-use),
+it is desirable not to provide extra time to a group even in the presence
+of idle CPU cycles. This is where hard limiting is useful.
+
+Hard limits for task groups can be set by specifying how much CPU runtime a
+group may consume within a given period. If the group consumes more CPU time
+than its runtime in a given period, it gets throttled. None of the tasks of
+the throttled group get to run until the group's runtime is refreshed at the
+beginning of the next period.
+
+2. Interface
+------------
+
+The hard limit feature adds 3 cgroup files for the CFS group scheduler:
+
+cfs_runtime_us: Hard limit for the group in microseconds.
+
+cfs_period_us: Time period in microseconds over which the hard limit is
+enforced.
+
+cfs_hard_limit: The control file to enable or disable hard limiting for the
+group.
+
+A group is created with default values for runtime and period and with the
+hard limit disabled. Each group can set its own values for runtime and period
+independently of other groups in the system.
+
+3. Examples
+-----------
+
+# mount -t cgroup -ocpu none /cgroups/
+# cd /cgroups
+# mkdir 1
+# cd 1/
+# echo 250000 > cfs_runtime_us /* set a 250ms runtime limit */
+# echo 500000 > cfs_period_us  /* set a 500ms period */
+# echo 1 > cfs_hard_limit      /* enable hard limiting for group 1/ */
diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/include/linux/sched.h linux-2.6.31.2-vs2.3.0.36.15/include/linux/sched.h
--- linux-2.6.31.2-vs2.3.0.36.14/include/linux/sched.h	2009-10-06 05:02:05.000000000 +0200
+++ linux-2.6.31.2-vs2.3.0.36.15/include/linux/sched.h	2009-10-06 04:39:26.000000000 +0200
@@ -1027,7 +1027,7 @@ struct sched_domain;
 struct sched_class {
 	const struct sched_class *next;
 
-	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
+	int (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
 	void (*yield_task) (struct rq *rq);
 
@@ -1127,6 +1127,7 @@ struct sched_entity {
 	u64			nr_failed_migrations_affine;
 	u64			nr_failed_migrations_running;
 	u64			nr_failed_migrations_hot;
+	u64			nr_failed_migrations_throttled;
 	u64			nr_forced_migrations;
 	u64			nr_forced2_migrations;
 
@@ -1139,6 +1140,12 @@ struct sched_entity {
 	u64			nr_wakeups_affine_attempts;
 	u64			nr_wakeups_passive;
 	u64			nr_wakeups_idle;
+#ifdef CONFIG_CFS_HARD_LIMITS
+	u64			throttle_start;
+	u64			throttle_max;
+	u64			throttle_count;
+	u64			throttle_sum;
+#endif
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/init/Kconfig linux-2.6.31.2-vs2.3.0.36.15/init/Kconfig
--- linux-2.6.31.2-vs2.3.0.36.14/init/Kconfig	2009-10-06 05:02:11.000000000 +0200
+++ linux-2.6.31.2-vs2.3.0.36.15/init/Kconfig	2009-10-06 04:38:47.000000000 +0200
@@ -492,6 +492,19 @@ config CGROUP_SCHED
 
 endchoice
 
+config CFS_HARD_LIMITS
+	bool "Hard Limits for CFS Group Scheduler"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED && CGROUP_SCHED
+	default n
+	help
+	  This option enables hard limiting of the CPU time obtained by
+	  a fair task group. Use this if you want to throttle a group of tasks
+	  based on its CPU usage. For more details, refer to
+	  Documentation/scheduler/sched-cfs-hard-limits.txt
+
+	  Say N if unsure.
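
The cgroup files documented above can also be set programmatically. The following is a minimal user-space sketch, not part of this patch: it assumes the cpu cgroup hierarchy is mounted at /cgroups and that group "1" already exists (both taken from the shell example in the documentation), and the helper write_cgroup_value() is invented for this illustration.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helper: write a single numeric value to a cgroup file. */
static int write_cgroup_value(const char *path, long long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%lld\n", val);
	return fclose(f);
}

int main(void)
{
	/* Mirror the shell example: 250ms runtime limit per 500ms period. */
	if (write_cgroup_value("/cgroups/1/cfs_runtime_us", 250000) ||
	    write_cgroup_value("/cgroups/1/cfs_period_us", 500000) ||
	    write_cgroup_value("/cgroups/1/cfs_hard_limit", 1))
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}

With these values, a runqueue of the group may consume roughly 250ms of CPU time per 500ms period (plus any runtime borrowed from other CPUs) before it is throttled until the next period refresh.
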
+ menuconfig CGROUPS boolean "Control Group support" help diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/kernel/sched.c linux-2.6.31.2-vs2.3.0.36.15/kernel/sched.c --- linux-2.6.31.2-vs2.3.0.36.14/kernel/sched.c 2009-10-06 05:02:21.000000000 +0200 +++ linux-2.6.31.2-vs2.3.0.36.15/kernel/sched.c 2009-10-06 04:39:14.000000000 +0200 @@ -264,6 +264,15 @@ static DEFINE_MUTEX(sched_domains_mutex) #include +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_CFS_HARD_LIMITS) +struct cfs_bandwidth { + spinlock_t cfs_runtime_lock; + ktime_t cfs_period; + u64 cfs_runtime; + struct hrtimer cfs_period_timer; +}; +#endif + struct cfs_rq; static LIST_HEAD(task_groups); @@ -284,6 +293,11 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; +#ifdef CONFIG_CFS_HARD_LIMITS + struct cfs_bandwidth cfs_bandwidth; + /* If set, throttle when the group exceeds its bandwidth */ + int hard_limit_enabled; +#endif #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -479,6 +493,20 @@ struct cfs_rq { unsigned long rq_weight; #endif #endif +#ifdef CONFIG_CFS_HARD_LIMITS + /* set when the group is throttled on this cpu */ + int cfs_throttled; + + /* runtime currently consumed by the group on this rq */ + u64 cfs_time; + + /* runtime available to the group on this rq */ + u64 cfs_runtime; +#endif + /* + * Number of tasks at this heirarchy. + */ + unsigned long nr_tasks_running; }; /* Real-Time classes' related field in a runqueue: */ @@ -663,6 +691,11 @@ struct rq { /* BKL stats */ unsigned int bkl_count; #endif + /* + * Protects the cfs runtime related fields of all cfs_rqs under + * this rq + */ + spinlock_t runtime_lock; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -1554,6 +1587,7 @@ update_group_shares_cpu(struct task_grou } } +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); /* * Re-compute the task group their per cpu shares over the given domain. * This needs to be done in a bottom-up fashion because the rq weight of a @@ -1571,9 +1605,11 @@ static int tg_shares_up(struct task_grou * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to * run here it will not get delayed by group starvation. + * Also if the group is throttled on this cpu, pretend that + * it has no tasks. */ weight = tg->cfs_rq[i]->load.weight; - if (!weight) + if (!weight || cfs_rq_throttled(tg->cfs_rq[i])) weight = NICE_0_LOAD; tg->cfs_rq[i]->rq_weight = weight; @@ -1597,6 +1633,7 @@ static int tg_shares_up(struct task_grou * Compute the cpu's hierarchical load factor for each task group. * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. + * A throttled group's h_load is set to 0. 
*/ static int tg_load_down(struct task_group *tg, void *data) { @@ -1605,6 +1642,8 @@ static int tg_load_down(struct task_grou if (!tg->parent) { load = cpu_rq(cpu)->load.weight; + } else if (cfs_rq_throttled(tg->cfs_rq[cpu])) { + load = 0; } else { load = tg->parent->cfs_rq[cpu]->h_load; load *= tg->cfs_rq[cpu]->shares; @@ -1734,6 +1773,187 @@ static void cfs_rq_set_shares(struct cfs static void calc_load_account_active(struct rq *this_rq); + +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) + +#ifdef CONFIG_SMP +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} +#else /* !CONFIG_SMP */ +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_online_mask; +} +#endif /* CONFIG_SMP */ + +#else +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_online_mask; +} + +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_CFS_HARD_LIMITS + +/* + * Runtime allowed for a cfs group before it is hard limited. + * default: Infinite which means no hard limiting. + */ +u64 sched_cfs_runtime = RUNTIME_INF; + +/* + * period over which we hard limit the cfs group's bandwidth. + * default: 0.5s + */ +u64 sched_cfs_period = 500000; + +static inline u64 global_cfs_period(void) +{ + return sched_cfs_period * NSEC_PER_USEC; +} + +static inline u64 global_cfs_runtime(void) +{ + return RUNTIME_INF; +} + +int task_group_throttled(struct task_group *tg, int cpu); +void do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b); + +static inline int cfs_bandwidth_enabled(struct task_group *tg) +{ + return tg->hard_limit_enabled; +} + +static inline void rq_runtime_lock(struct rq *rq) +{ + spin_lock(&rq->runtime_lock); +} + +static inline void rq_runtime_unlock(struct rq *rq) +{ + spin_unlock(&rq->runtime_lock); +} + +/* + * Refresh the runtimes of the throttled groups. + * But nothing much to do now, will populate this in later patches. + */ +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, cfs_period_timer); + + do_sched_cfs_period_timer(cfs_b); + hrtimer_add_expires_ns(timer, ktime_to_ns(cfs_b->cfs_period)); + return HRTIMER_RESTART; +} + +/* + * TODO: Check if this kind of timer setup is sufficient for cfs or + * should we do what rt is doing. + */ +static void start_cfs_bandwidth(struct task_group *tg) +{ + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + /* + * Timer isn't setup for groups with infinite runtime or for groups + * for which hard limiting isn't enabled. 
+ */ + if (!cfs_bandwidth_enabled(tg) || (cfs_b->cfs_runtime == RUNTIME_INF)) + return; + + if (hrtimer_active(&cfs_b->cfs_period_timer)) + return; + + hrtimer_start_range_ns(&cfs_b->cfs_period_timer, cfs_b->cfs_period, + 0, HRTIMER_MODE_REL); +} + +static void init_cfs_bandwidth(struct task_group *tg) +{ + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + cfs_b->cfs_period = ns_to_ktime(global_cfs_period()); + cfs_b->cfs_runtime = global_cfs_runtime(); + + spin_lock_init(&cfs_b->cfs_runtime_lock); + + hrtimer_init(&cfs_b->cfs_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->cfs_period_timer.function = &sched_cfs_period_timer; +} + +static inline void destroy_cfs_bandwidth(struct task_group *tg) +{ + hrtimer_cancel(&tg->cfs_bandwidth.cfs_period_timer); +} + +static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + cfs_rq->cfs_time = 0; + cfs_rq->cfs_throttled = 0; + cfs_rq->cfs_runtime = tg->cfs_bandwidth.cfs_runtime; + tg->hard_limit_enabled = 0; +} + +#else /* !CONFIG_CFS_HARD_LIMITS */ + +static void init_cfs_bandwidth(struct task_group *tg) +{ + return; +} + +static inline void destroy_cfs_bandwidth(struct task_group *tg) +{ + return; +} + +static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + return; +} + +static inline void rq_runtime_lock(struct rq *rq) +{ + return; +} + +static inline void rq_runtime_unlock(struct rq *rq) +{ + return; +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline void rq_runtime_lock(struct rq *rq) +{ + return; +} + +static inline void rq_runtime_unlock(struct rq *rq) +{ + return; +} + +int task_group_throttled(struct task_group *tg, int cpu) +{ + return 0; +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -1783,14 +2003,17 @@ static void update_avg(u64 *avg, u64 sam *avg += diff >> 3; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +static int enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { + int ret; + if (wakeup) p->se.start_runtime = p->se.sum_exec_runtime; sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); + ret = p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; + return ret; } static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) @@ -1865,8 +2088,15 @@ static void activate_task(struct rq *rq, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup); - inc_nr_running(rq); + /* + * Increment rq->nr_running only if enqueue_task() succeeds. + * enqueue_task() can fail when the task being activated belongs + * to a throttled group. In this case, the task gets enqueued to + * throttled group and the group will be enqueued later when it + * gets unthrottled. rq->nr_running gets incremented at that time. + */ + if (!enqueue_task(rq, p, wakeup)) + inc_nr_running(rq); } /* @@ -3211,6 +3441,7 @@ int can_migrate_task(struct task_struct * 1) running (obviously), or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. + * 4) end up in throttled task groups on this CPU. 
*/ if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); @@ -3224,6 +3455,18 @@ int can_migrate_task(struct task_struct } /* + * Don't migrate the task if it belongs to a + * - throttled group on its current cpu + * - throttled group on this_cpu + * - group whose hierarchy is throttled on this_cpu + */ + if (cfs_rq_throttled(cfs_rq_of(&p->se)) || + task_group_throttled(task_group(p), this_cpu)) { + schedstat_inc(p, se.nr_failed_migrations_throttled); + return 0; + } + + /* * Aggressive migration if: * 1) task is cache cold, or * 2) too many balance attempts have failed. @@ -5911,8 +6154,10 @@ void rt_mutex_setprio(struct task_struct oldprio = p->prio; on_rq = p->se.on_rq; running = task_current(rq, p); - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + dec_nr_running(rq); + } if (running) p->sched_class->put_prev_task(rq, p); @@ -5926,7 +6171,8 @@ void rt_mutex_setprio(struct task_struct if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - enqueue_task(rq, p, 0); + if (!enqueue_task(rq, p, 0)) + inc_nr_running(rq); check_class_changed(rq, p, prev_class, oldprio, running); } @@ -5960,8 +6206,10 @@ void set_user_nice(struct task_struct *p goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + dec_nr_running(rq); + } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -5970,7 +6218,8 @@ void set_user_nice(struct task_struct *p delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0); + if (!enqueue_task(rq, p, 0)) + inc_nr_running(rq); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -9134,6 +9383,7 @@ static void init_tg_cfs_entry(struct tas struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); + init_cfs_hard_limits(cfs_rq, tg); cfs_rq->tg = tg; if (add) list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); @@ -9263,6 +9513,10 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED + init_cfs_bandwidth(&init_task_group); +#endif + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); @@ -9279,6 +9533,7 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); + spin_lock_init(&rq->runtime_lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -9552,6 +9807,7 @@ static void free_fair_sched_group(struct { int i; + destroy_cfs_bandwidth(tg); for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -9578,6 +9834,7 @@ int alloc_fair_sched_group(struct task_g if (!tg->se) goto err; + init_cfs_bandwidth(tg); tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { @@ -9810,8 +10067,10 @@ void sched_move_task(struct task_struct running = task_current(rq, tsk); on_rq = tsk->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, tsk, 0); + dec_nr_running(rq); + } if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); @@ -9825,7 +10084,8 @@ void sched_move_task(struct task_struct if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, tsk, 0); + if (!enqueue_task(rq, tsk, 0)) + inc_nr_running(rq); task_rq_unlock(rq, &flags); } @@ -10272,6 +10532,134 @@ static u64 cpu_shares_read_u64(struct cg return (u64) tg->shares; } + +#ifdef CONFIG_CFS_HARD_LIMITS + +static int tg_set_cfs_bandwidth(struct task_group *tg, + u64 cfs_period, u64 cfs_runtime) +{ + 
int i, err = 0; + + spin_lock_irq(&tg->cfs_bandwidth.cfs_runtime_lock); + tg->cfs_bandwidth.cfs_period = ns_to_ktime(cfs_period); + tg->cfs_bandwidth.cfs_runtime = cfs_runtime; + + for_each_possible_cpu(i) { + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + + rq_runtime_lock(rq_of(cfs_rq)); + cfs_rq->cfs_runtime = cfs_runtime; + rq_runtime_unlock(rq_of(cfs_rq)); + } + + start_cfs_bandwidth(tg); + spin_unlock_irq(&tg->cfs_bandwidth.cfs_runtime_lock); + return err; +} + +int tg_set_cfs_runtime(struct task_group *tg, long cfs_runtime_us) +{ + u64 cfs_runtime, cfs_period; + + cfs_period = ktime_to_ns(tg->cfs_bandwidth.cfs_period); + cfs_runtime = (u64)cfs_runtime_us * NSEC_PER_USEC; + if (cfs_runtime_us < 0) + cfs_runtime = RUNTIME_INF; + + return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime); +} + +long tg_get_cfs_runtime(struct task_group *tg) +{ + u64 cfs_runtime_us; + + if (tg->cfs_bandwidth.cfs_runtime == RUNTIME_INF) + return -1; + + cfs_runtime_us = tg->cfs_bandwidth.cfs_runtime; + do_div(cfs_runtime_us, NSEC_PER_USEC); + return cfs_runtime_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ + u64 cfs_runtime, cfs_period; + + cfs_period = (u64)cfs_period_us * NSEC_PER_USEC; + cfs_runtime = tg->cfs_bandwidth.cfs_runtime; + + if (cfs_period == 0) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime); +} + +long tg_get_cfs_period(struct task_group *tg) +{ + u64 cfs_period_us; + + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.cfs_period); + do_div(cfs_period_us, NSEC_PER_USEC); + return cfs_period_us; +} + +int tg_set_hard_limit_enabled(struct task_group *tg, u64 val) +{ + local_irq_disable(); + spin_lock(&tg->cfs_bandwidth.cfs_runtime_lock); + if (val > 0) { + tg->hard_limit_enabled = 1; + start_cfs_bandwidth(tg); + spin_unlock(&tg->cfs_bandwidth.cfs_runtime_lock); + } else { + destroy_cfs_bandwidth(tg); + tg->hard_limit_enabled = 0; + spin_unlock(&tg->cfs_bandwidth.cfs_runtime_lock); + /* + * Hard limiting is being disabled for this group. + * Refresh runtimes and put the throttled entities + * of the group back onto runqueue. 
+ */ + do_sched_cfs_period_timer(&tg->cfs_bandwidth); + } + local_irq_enable(); + return 0; +} + +static s64 cpu_cfs_runtime_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_runtime(cgroup_tg(cgrp)); +} + +static int cpu_cfs_runtime_write_s64(struct cgroup *cgrp, struct cftype *cftype, + s64 cfs_runtime_us) +{ + return tg_set_cfs_runtime(cgroup_tg(cgrp), cfs_runtime_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 cfs_period_us) +{ + return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +static u64 cpu_cfs_hard_limit_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return cfs_bandwidth_enabled(cgroup_tg(cgrp)); +} + +static int cpu_cfs_hard_limit_write_u64(struct cgroup *cgrp, + struct cftype *cftype, u64 val) +{ + return tg_set_hard_limit_enabled(cgroup_tg(cgrp), val); +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -10305,6 +10693,23 @@ static struct cftype cpu_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, +#ifdef CONFIG_CFS_HARD_LIMITS + { + .name = "cfs_runtime_us", + .read_s64 = cpu_cfs_runtime_read_s64, + .write_s64 = cpu_cfs_runtime_write_s64, + }, + { + .name = "cfs_period_us", + .read_u64 = cpu_cfs_period_read_u64, + .write_u64 = cpu_cfs_period_write_u64, + }, + { + .name = "cfs_hard_limit", + .read_u64 = cpu_cfs_hard_limit_read_u64, + .write_u64 = cpu_cfs_hard_limit_write_u64, + }, +#endif /* CONFIG_CFS_HARD_LIMITS */ #endif #ifdef CONFIG_RT_GROUP_SCHED { diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_debug.c linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_debug.c --- linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_debug.c 2009-10-06 05:02:16.000000000 +0200 +++ linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_debug.c 2009-10-06 04:39:26.000000000 +0200 @@ -80,6 +80,11 @@ static void print_cfs_group_stats(struct PN(se->wait_max); PN(se->wait_sum); P(se->wait_count); +#ifdef CONFIG_CFS_HARD_LIMITS + PN(se->throttle_max); + PN(se->throttle_sum); + P(se->throttle_count); +#endif #endif P(se->load.weight); #undef PN @@ -214,6 +219,18 @@ void print_cfs_rq(struct seq_file *m, in #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); #endif + SEQ_printf(m, " .%-30s: %ld\n", "nr_tasks_running", + cfs_rq->nr_tasks_running); +#ifdef CONFIG_CFS_HARD_LIMITS + spin_lock_irqsave(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %d\n", "cfs_throttled", + cfs_rq->cfs_throttled); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "cfs_time", + SPLIT_NS(cfs_rq->cfs_time)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "cfs_runtime", + SPLIT_NS(cfs_rq->cfs_runtime)); + spin_unlock_irqrestore(&rq->lock, flags); +#endif print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -310,7 +327,7 @@ static int sched_debug_show(struct seq_f u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); @@ -415,6 +432,7 @@ void proc_sched_show_task(struct task_st P(se.nr_failed_migrations_affine); P(se.nr_failed_migrations_running); P(se.nr_failed_migrations_hot); + P(se.nr_failed_migrations_throttled); P(se.nr_forced_migrations); P(se.nr_forced2_migrations); P(se.nr_wakeups); @@ -489,6 +507,7 @@ void 
proc_sched_set_task(struct task_str p->se.nr_failed_migrations_affine = 0; p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; + p->se.nr_failed_migrations_throttled = 0; p->se.nr_forced_migrations = 0; p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_fair.c linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_fair.c --- linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_fair.c 2009-10-06 05:02:16.000000000 +0200 +++ linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_fair.c 2009-10-06 04:39:37.000000000 +0200 @@ -186,6 +186,286 @@ find_matching_se(struct sched_entity **s } } +#ifdef CONFIG_CFS_HARD_LIMITS + +static inline void update_stats_throttle_start(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + schedstat_set(se->throttle_start, rq_of(cfs_rq)->clock); +} + +static inline void update_stats_throttle_end(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + schedstat_set(se->throttle_max, max(se->throttle_max, + rq_of(cfs_rq)->clock - se->throttle_start)); + schedstat_set(se->throttle_count, se->throttle_count + 1); + schedstat_set(se->throttle_sum, se->throttle_sum + + rq_of(cfs_rq)->clock - se->throttle_start); + schedstat_set(se->throttle_start, 0); +} + +static void double_rq_runtime_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->runtime_lock) + __acquires(rq2->runtime_lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + spin_lock(&rq1->runtime_lock); + __acquire(rq2->runtime_lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + spin_lock(&rq1->runtime_lock); + spin_lock_nested(&rq2->runtime_lock, + SINGLE_DEPTH_NESTING); + } else { + spin_lock(&rq2->runtime_lock); + spin_lock_nested(&rq1->runtime_lock, + SINGLE_DEPTH_NESTING); + } + } + update_rq_clock(rq1); + update_rq_clock(rq2); +} + +static void double_rq_runtime_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->runtime_lock) + __releases(rq2->runtime_lock) +{ + spin_unlock(&rq1->runtime_lock); + if (rq1 != rq2) + spin_unlock(&rq2->runtime_lock); + else + __release(rq2->runtime_lock); +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_rq->cfs_throttled; +} + +/* + * Ran out of runtime, check if we can borrow some from others + * instead of getting throttled right away. 
+ */ +static void do_cfs_balance_runtime(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + const struct cpumask *span = sched_bw_period_mask(); + int i, weight; + u64 cfs_period; + struct task_group *tg = container_of(cfs_b, struct task_group, + cfs_bandwidth); + + weight = cpumask_weight(span); + spin_lock(&cfs_b->cfs_runtime_lock); + cfs_period = ktime_to_ns(cfs_b->cfs_period); + + for_each_cpu(i, span) { + struct cfs_rq *borrow_cfs_rq = tg->cfs_rq[i]; + struct rq *borrow_rq = rq_of(borrow_cfs_rq); + s64 diff; + + if (borrow_cfs_rq == cfs_rq) + continue; + + double_rq_runtime_lock(rq, borrow_rq); + if (borrow_cfs_rq->cfs_runtime == RUNTIME_INF) { + double_rq_runtime_unlock(rq, borrow_rq); + continue; + } + + diff = borrow_cfs_rq->cfs_runtime - borrow_cfs_rq->cfs_time; + if (diff > 0) { + diff = div_u64((u64)diff, weight); + if (cfs_rq->cfs_runtime + diff > cfs_period) + diff = cfs_period - cfs_rq->cfs_runtime; + borrow_cfs_rq->cfs_runtime -= diff; + cfs_rq->cfs_runtime += diff; + if (cfs_rq->cfs_runtime == cfs_period) { + double_rq_runtime_unlock(rq, borrow_rq); + break; + } + } + double_rq_runtime_unlock(rq, borrow_rq); + } + spin_unlock(&cfs_b->cfs_runtime_lock); +} + +/* + * Called with rq->runtime_lock held. + */ +static void cfs_balance_runtime(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + rq_runtime_unlock(rq); + do_cfs_balance_runtime(cfs_rq); + rq_runtime_lock(rq); +} + +/* + * Check if group entity exceeded its runtime. If so, mark the cfs_rq as + * throttled mark the current task for reschedling. + */ +static void sched_cfs_runtime_exceeded(struct sched_entity *se, + struct task_struct *tsk_curr, unsigned long delta_exec) +{ + struct cfs_rq *cfs_rq; + + cfs_rq = group_cfs_rq(se); + + if (!cfs_bandwidth_enabled(cfs_rq->tg)) + return; + + if (cfs_rq->cfs_runtime == RUNTIME_INF) + return; + + cfs_rq->cfs_time += delta_exec; + + if (cfs_rq_throttled(cfs_rq)) + return; + + if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) + cfs_balance_runtime(cfs_rq); + + if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) { + cfs_rq->cfs_throttled = 1; + update_stats_throttle_start(cfs_rq, se); + resched_task(tsk_curr); + } +} + +/* + * Check if the entity is throttled. + */ +static int entity_throttled(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + /* Only group entities can be throttled */ + if (entity_is_task(se)) + return 0; + + cfs_rq = group_cfs_rq(se); + if (cfs_rq_throttled(cfs_rq)) + return 1; + return 0; +} + +int task_group_throttled(struct task_group *tg, int cpu) +{ + struct sched_entity *se = tg->se[cpu]; + + for_each_sched_entity(se) { + if (entity_throttled(se)) + return 1; + } + return 0; +} + +static void enqueue_entity_locked(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup); +static void add_cfs_rq_tasks_running(struct sched_entity *se, + unsigned long count); +static void sub_cfs_rq_tasks_running(struct sched_entity *se, + unsigned long count); + +static void enqueue_throttled_entity(struct rq *rq, struct sched_entity *se) +{ + unsigned long nr_tasks = 0; + struct sched_entity *se_tmp = se; + int throttled = 0; + + for_each_sched_entity(se) { + if (se->on_rq) + break; + + if (entity_throttled(se)) { + throttled = 1; + break; + } + + enqueue_entity_locked(cfs_rq_of(se), se, 0); + nr_tasks += group_cfs_rq(se)->nr_tasks_running; + } + + if (!nr_tasks) + return; + + /* + * Add the number of tasks this entity has to + * all of its parent entities. 
+ */ + add_cfs_rq_tasks_running(se_tmp, nr_tasks); + + /* + * Add the number of tasks this entity has to + * this cpu's rq only if the entity got enqueued all the + * way up without any throttled entity in the hierarchy. + */ + if (!throttled) + rq->nr_running += nr_tasks; +} + +/* + * Refresh runtimes of all cfs_rqs in this group, i,e., + * refresh runtimes of the representative cfs_rq of this + * tg on all cpus. Enqueue any throttled entity back. + */ +void do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b) +{ + int i; + const struct cpumask *span = sched_bw_period_mask(); + struct task_group *tg = container_of(cfs_b, struct task_group, + cfs_bandwidth); + unsigned long flags; + + for_each_cpu(i, span) { + struct rq *rq = cpu_rq(i); + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct sched_entity *se = tg->se[i]; + + spin_lock_irqsave(&rq->lock, flags); + rq_runtime_lock(rq); + cfs_rq->cfs_time = 0; + if (cfs_rq_throttled(cfs_rq)) { + update_rq_clock(rq); + update_stats_throttle_end(cfs_rq, se); + cfs_rq->cfs_throttled = 0; + enqueue_throttled_entity(rq, se); + } + rq_runtime_unlock(rq); + spin_unlock_irqrestore(&rq->lock, flags); + } +} + +#else + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +int task_group_throttled(struct task_group *tg, int cpu) +{ + return 0; +} + +static void sched_cfs_runtime_exceeded(struct sched_entity *se, + struct task_struct *tsk_curr, unsigned long delta_exec) +{ + return; +} + +static int entity_throttled(struct sched_entity *se) +{ + return 0; +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) @@ -241,8 +521,47 @@ find_matching_se(struct sched_entity **s { } +static void sched_cfs_runtime_exceeded(struct sched_entity *se, + struct task_struct *tsk_curr, unsigned long delta_exec) +{ + return; +} + +static int entity_throttled(struct sched_entity *se) +{ + return 0; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ +static void add_cfs_rq_tasks_running(struct sched_entity *se, + unsigned long count) +{ + struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + /* + * If any entity in the hierarchy is throttled, don't + * propogate the tasks count up since this entity isn't + * on rq yet. 
+ */ + if (entity_throttled(se)) + break; + cfs_rq = cfs_rq_of(se); + cfs_rq->nr_tasks_running += count; + } +} + +static void sub_cfs_rq_tasks_running(struct sched_entity *se, + unsigned long count) +{ + struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + cfs_rq->nr_tasks_running -= count; + } +} /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -481,10 +800,12 @@ __update_curr(struct cfs_rq *cfs_rq, str update_min_vruntime(cfs_rq); } -static void update_curr(struct cfs_rq *cfs_rq) +static void update_curr_common(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock; + struct rq *rq = rq_of(cfs_rq); + struct task_struct *tsk_curr = rq->curr; + u64 now = rq->clock; unsigned long delta_exec; if (unlikely(!curr)) @@ -507,9 +828,23 @@ static void update_curr(struct cfs_rq *c cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); + } else { + sched_cfs_runtime_exceeded(curr, tsk_curr, delta_exec); } } +static void update_curr(struct cfs_rq *cfs_rq) +{ + rq_runtime_lock(rq_of(cfs_rq)); + update_curr_common(cfs_rq); + rq_runtime_unlock(rq_of(cfs_rq)); +} + +static inline void update_curr_locked(struct cfs_rq *cfs_rq) +{ + update_curr_common(cfs_rq); +} + static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -713,13 +1048,9 @@ place_entity(struct cfs_rq *cfs_rq, stru se->vruntime = vruntime; } -static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +static void enqueue_entity_common(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup) { - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); if (wakeup) { @@ -736,6 +1067,26 @@ enqueue_entity(struct cfs_rq *cfs_rq, st vx_activate_task(task_of(se)); } +static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int wakeup) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + enqueue_entity_common(cfs_rq, se, wakeup); +} + +static void enqueue_entity_locked(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr_locked(cfs_rq); + enqueue_entity_common(cfs_rq, se, wakeup); +} + static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->last == se) @@ -849,8 +1200,40 @@ static struct sched_entity *pick_next_en return se; } +/* + * Called from put_prev_entity() + * If a group entity (@se) is found to be throttled, it will not be put back + * on @cfs_rq, which is equivalent to dequeing it. + */ +static void dequeue_throttled_entity(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + unsigned long nr_tasks = group_cfs_rq(se)->nr_tasks_running; + + __clear_buddies(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); + cfs_rq->curr = NULL; + + if (!nr_tasks) + return; + + /* + * Decrement the number of tasks this entity has from + * all of its parent entities. + */ + sub_cfs_rq_tasks_running(se, nr_tasks); + + /* + * Decrement the number of tasks this entity has from + * this cpu's rq. 
+ */ + rq_of(cfs_rq)->nr_running -= nr_tasks; +} + static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { + struct cfs_rq *gcfs_rq = group_cfs_rq(prev); + /* * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: @@ -860,6 +1243,18 @@ static void put_prev_entity(struct cfs_r check_spread(cfs_rq, prev); if (prev->on_rq) { + /* + * If the group entity is throttled or if it has no + * no child entities, then don't enqueue it back. + */ + rq_runtime_lock(rq_of(cfs_rq)); + if (entity_throttled(prev) || + (gcfs_rq && !gcfs_rq->nr_running)) { + dequeue_throttled_entity(cfs_rq, prev); + rq_runtime_unlock(rq_of(cfs_rq)); + return; + } + rq_runtime_unlock(rq_of(cfs_rq)); update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); @@ -960,21 +1355,32 @@ static inline void hrtick_update(struct * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: + * Don't enqueue a throttled entity further into the hierarchy. */ -static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) +static int enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int throttled = 0; + rq_runtime_lock(rq); for_each_sched_entity(se) { if (se->on_rq) break; + if (entity_throttled(se)) { + throttled = 1; + break; + } cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); + enqueue_entity_locked(cfs_rq, se, wakeup); wakeup = 1; } + add_cfs_rq_tasks_running(&p->se, 1); + rq_runtime_unlock(rq); + hrtick_update(rq); + return throttled; } /* @@ -996,6 +1402,7 @@ static void dequeue_task_fair(struct rq sleep = 1; } + sub_cfs_rq_tasks_running(&p->se, 1); hrtick_update(rq); } @@ -1523,6 +1930,7 @@ static struct task_struct *pick_next_tas do { se = pick_next_entity(cfs_rq); + /* * If se was a buddy, clear it so that it will have to earn * the favour again. 
@@ -1632,9 +2040,9 @@ load_balance_fair(struct rq *this_rq, in u64 rem_load, moved_load; /* - * empty group + * empty group or a group with no h_load (throttled) */ - if (!busiest_cfs_rq->task_weight) + if (!busiest_cfs_rq->task_weight || !busiest_h_load) continue; rem_load = (u64)rem_load_move * busiest_weight; diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_rt.c linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_rt.c --- linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_rt.c 2009-10-06 05:02:21.000000000 +0200 +++ linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_rt.c 2009-10-06 04:39:02.000000000 +0200 @@ -222,18 +222,6 @@ static int rt_se_boosted(struct sched_rt return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_rq(smp_processor_id())->rd->span; -} -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif - static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) { @@ -283,11 +271,6 @@ static inline int rt_rq_throttled(struct return rt_rq->rt_throttled; } -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} - static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) { @@ -505,7 +488,7 @@ static int do_sched_rt_period_timer(stru if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; - span = sched_rt_period_mask(); + span = sched_bw_period_mask(); for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); @@ -863,7 +846,7 @@ static void dequeue_rt_entity(struct sch /* * Adding/removing a task to/from a priority array: */ -static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) +static int enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) { struct sched_rt_entity *rt_se = &p->rt; @@ -876,6 +859,7 @@ static void enqueue_task_rt(struct rq *r enqueue_pushable_task(rq, p); inc_cpu_load(rq, p->se.load.weight); + return 0; } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
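
For reference, the runtime-borrowing arithmetic implemented by do_cfs_balance_runtime() in kernel/sched_fair.c above can be followed in isolation. The snippet below is a standalone user-space sketch with made-up per-CPU numbers, not kernel code: it reproduces only the math (take an equal share of each other runqueue's unused runtime, capped so the local runtime never exceeds the period) and omits the locking and kernel data structures.

#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	const long long period = 500000000LL;	/* 500ms period, in ns */
	long long local_runtime = 250000000LL;	/* this cpu's cfs_runtime */
	/* Unused runtime (cfs_runtime - cfs_time) on each other cpu, in ns. */
	long long spare[NR_CPUS] = { 0, 100000000LL, 50000000LL, 200000000LL };
	int i;

	for (i = 1; i < NR_CPUS; i++) {
		long long diff = spare[i];

		if (diff <= 0)
			continue;
		diff /= NR_CPUS;			/* equal share per cpu */
		if (local_runtime + diff > period)	/* never exceed the period */
			diff = period - local_runtime;
		spare[i] -= diff;
		local_runtime += diff;
	}
	printf("cpu0 runtime after borrowing: %lld ns\n", local_runtime);
	return 0;
}

In the patch itself this walk is done under double_rq_runtime_lock() and stops early once the local runtime has grown to the full period.
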