diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/Documentation/scheduler/sched-cfs-hard-limits.txt linux-2.6.31.2-vs2.3.0.36.15/Documentation/scheduler/sched-cfs-hard-limits.txt
--- linux-2.6.31.2-vs2.3.0.36.14/Documentation/scheduler/sched-cfs-hard-limits.txt	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.31.2-vs2.3.0.36.15/Documentation/scheduler/sched-cfs-hard-limits.txt	2009-10-06 04:39:46.000000000 +0200
@@ -0,0 +1,52 @@
+CPU HARD LIMITS FOR CFS GROUPS
+==============================
+
+1. Overview
+2. Interface
+3. Examples
+
+1. Overview
+-----------
+
+CFS is a proportional-share scheduler which tries to divide the CPU time
+proportionately between tasks or groups of tasks (task group/cgroup) depending
+on the priority/weight of the task or the shares assigned to groups of tasks.
+In CFS, a task/task group can get more than its share of CPU if there are
+enough idle CPU cycles available in the system, due to the work-conserving
+nature of the scheduler. However, in certain scenarios (like pay-per-use),
+it is desirable not to provide extra time to a group even in the presence
+of idle CPU cycles. This is where hard limiting is useful.
+
+Hard limits for task groups can be set by specifying how much CPU runtime a
+group may consume within a given period. If the group consumes more CPU time
+than its runtime in a given period, it gets throttled. None of the tasks of
+the throttled group get to run until the group's runtime is refreshed at the
+beginning of the next period.
+
+2. Interface
+------------
+
+The hard limit feature adds 3 cgroup files for the CFS group scheduler:
+
+cfs_runtime_us: Hard limit for the group in microseconds.
+
+cfs_period_us: Time period in microseconds over which the hard limit is
+enforced.
+
+cfs_hard_limit: The control file to enable or disable hard limiting for the
+group.
+
+A group is created with default values for runtime and period and with the
+hard limit disabled. Each group can set its own values for runtime and period
+independently of other groups in the system.
+
+3. Examples
+-----------
+
+# mount -t cgroup -ocpu none /cgroups/
+# cd /cgroups
+# mkdir 1
+# cd 1/
+# echo 250000 > cfs_runtime_us /* set a 250ms runtime limit */
+# echo 500000 > cfs_period_us  /* set a 500ms period */
+# echo 1 > cfs_hard_limit      /* enable hard limiting for group 1/ */
diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/include/linux/sched.h linux-2.6.31.2-vs2.3.0.36.15/include/linux/sched.h
--- linux-2.6.31.2-vs2.3.0.36.14/include/linux/sched.h	2009-10-06 05:02:05.000000000 +0200
+++ linux-2.6.31.2-vs2.3.0.36.15/include/linux/sched.h	2009-10-06 04:39:26.000000000 +0200
@@ -1027,7 +1027,7 @@ struct sched_domain;
 struct sched_class {
 	const struct sched_class *next;
 
-	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
+	int (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
 	void (*yield_task) (struct rq *rq);
 
@@ -1127,6 +1127,7 @@ struct sched_entity {
 	u64			nr_failed_migrations_affine;
 	u64			nr_failed_migrations_running;
 	u64			nr_failed_migrations_hot;
+	u64			nr_failed_migrations_throttled;
 	u64			nr_forced_migrations;
 	u64			nr_forced2_migrations;
 
@@ -1139,6 +1140,12 @@ struct sched_entity {
 	u64			nr_wakeups_affine_attempts;
 	u64			nr_wakeups_passive;
 	u64			nr_wakeups_idle;
+#ifdef CONFIG_CFS_HARD_LIMITS
+	u64			throttle_start;
+	u64			throttle_max;
+	u64			throttle_count;
+	u64			throttle_sum;
+#endif
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/init/Kconfig linux-2.6.31.2-vs2.3.0.36.15/init/Kconfig
--- linux-2.6.31.2-vs2.3.0.36.14/init/Kconfig	2009-10-06 05:02:11.000000000 +0200
+++ linux-2.6.31.2-vs2.3.0.36.15/init/Kconfig	2009-10-06 04:38:47.000000000 +0200
@@ -492,6 +492,19 @@ config CGROUP_SCHED
 
 endchoice
 
+config CFS_HARD_LIMITS
+	bool "Hard Limits for CFS Group Scheduler"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED && CGROUP_SCHED
+	default n
+	help
+	  This option enables hard limiting of the CPU time obtained by
+	  a fair task group. Use this if you want to throttle a group of tasks
+	  based on its CPU usage. For more details, refer to
+	  Documentation/scheduler/sched-cfs-hard-limits.txt
+
+	  Say N if unsure.
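
The cgroup files documented above can also be set programmatically. The following is a minimal user-space sketch, not part of this patch: it assumes the cpu cgroup hierarchy is mounted at /cgroups and that group "1" already exists (both taken from the shell example in the documentation), and the helper write_cgroup_value() is invented for this illustration.

#include <stdio.h>
#include <stdlib.h>

/* Hypothetical helper: write a single numeric value to a cgroup file. */
static int write_cgroup_value(const char *path, long long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%lld\n", val);
	return fclose(f);
}

int main(void)
{
	/* Mirror the shell example: 250ms runtime limit per 500ms period. */
	if (write_cgroup_value("/cgroups/1/cfs_runtime_us", 250000) ||
	    write_cgroup_value("/cgroups/1/cfs_period_us", 500000) ||
	    write_cgroup_value("/cgroups/1/cfs_hard_limit", 1))
		return EXIT_FAILURE;
	return EXIT_SUCCESS;
}

With these values, a runqueue of the group may consume roughly 250ms of CPU time per 500ms period (plus any runtime borrowed from other CPUs) before it is throttled until the next period refresh.
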
+ menuconfig CGROUPS boolean "Control Group support" help diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/kernel/sched.c linux-2.6.31.2-vs2.3.0.36.15/kernel/sched.c --- linux-2.6.31.2-vs2.3.0.36.14/kernel/sched.c 2009-10-06 05:02:21.000000000 +0200 +++ linux-2.6.31.2-vs2.3.0.36.15/kernel/sched.c 2009-10-06 04:39:14.000000000 +0200 @@ -264,6 +264,15 @@ static DEFINE_MUTEX(sched_domains_mutex) #include +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_CFS_HARD_LIMITS) +struct cfs_bandwidth { + spinlock_t cfs_runtime_lock; + ktime_t cfs_period; + u64 cfs_runtime; + struct hrtimer cfs_period_timer; +}; +#endif + struct cfs_rq; static LIST_HEAD(task_groups); @@ -284,6 +293,11 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; +#ifdef CONFIG_CFS_HARD_LIMITS + struct cfs_bandwidth cfs_bandwidth; + /* If set, throttle when the group exceeds its bandwidth */ + int hard_limit_enabled; +#endif #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -479,6 +493,20 @@ struct cfs_rq { unsigned long rq_weight; #endif #endif +#ifdef CONFIG_CFS_HARD_LIMITS + /* set when the group is throttled on this cpu */ + int cfs_throttled; + + /* runtime currently consumed by the group on this rq */ + u64 cfs_time; + + /* runtime available to the group on this rq */ + u64 cfs_runtime; +#endif + /* + * Number of tasks at this heirarchy. + */ + unsigned long nr_tasks_running; }; /* Real-Time classes' related field in a runqueue: */ @@ -663,6 +691,11 @@ struct rq { /* BKL stats */ unsigned int bkl_count; #endif + /* + * Protects the cfs runtime related fields of all cfs_rqs under + * this rq + */ + spinlock_t runtime_lock; }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -1554,6 +1587,7 @@ update_group_shares_cpu(struct task_grou } } +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); /* * Re-compute the task group their per cpu shares over the given domain. * This needs to be done in a bottom-up fashion because the rq weight of a @@ -1571,9 +1605,11 @@ static int tg_shares_up(struct task_grou * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to * run here it will not get delayed by group starvation. + * Also if the group is throttled on this cpu, pretend that + * it has no tasks. */ weight = tg->cfs_rq[i]->load.weight; - if (!weight) + if (!weight || cfs_rq_throttled(tg->cfs_rq[i])) weight = NICE_0_LOAD; tg->cfs_rq[i]->rq_weight = weight; @@ -1597,6 +1633,7 @@ static int tg_shares_up(struct task_grou * Compute the cpu's hierarchical load factor for each task group. * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. + * A throttled group's h_load is set to 0. 
*/ static int tg_load_down(struct task_group *tg, void *data) { @@ -1605,6 +1642,8 @@ static int tg_load_down(struct task_grou if (!tg->parent) { load = cpu_rq(cpu)->load.weight; + } else if (cfs_rq_throttled(tg->cfs_rq[cpu])) { + load = 0; } else { load = tg->parent->cfs_rq[cpu]->h_load; load *= tg->cfs_rq[cpu]->shares; @@ -1734,6 +1773,187 @@ static void cfs_rq_set_shares(struct cfs static void calc_load_account_active(struct rq *this_rq); + +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) + +#ifdef CONFIG_SMP +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} +#else /* !CONFIG_SMP */ +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_online_mask; +} +#endif /* CONFIG_SMP */ + +#else +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_online_mask; +} + +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_CFS_HARD_LIMITS + +/* + * Runtime allowed for a cfs group before it is hard limited. + * default: Infinite which means no hard limiting. + */ +u64 sched_cfs_runtime = RUNTIME_INF; + +/* + * period over which we hard limit the cfs group's bandwidth. + * default: 0.5s + */ +u64 sched_cfs_period = 500000; + +static inline u64 global_cfs_period(void) +{ + return sched_cfs_period * NSEC_PER_USEC; +} + +static inline u64 global_cfs_runtime(void) +{ + return RUNTIME_INF; +} + +int task_group_throttled(struct task_group *tg, int cpu); +void do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b); + +static inline int cfs_bandwidth_enabled(struct task_group *tg) +{ + return tg->hard_limit_enabled; +} + +static inline void rq_runtime_lock(struct rq *rq) +{ + spin_lock(&rq->runtime_lock); +} + +static inline void rq_runtime_unlock(struct rq *rq) +{ + spin_unlock(&rq->runtime_lock); +} + +/* + * Refresh the runtimes of the throttled groups. + * But nothing much to do now, will populate this in later patches. + */ +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, cfs_period_timer); + + do_sched_cfs_period_timer(cfs_b); + hrtimer_add_expires_ns(timer, ktime_to_ns(cfs_b->cfs_period)); + return HRTIMER_RESTART; +} + +/* + * TODO: Check if this kind of timer setup is sufficient for cfs or + * should we do what rt is doing. + */ +static void start_cfs_bandwidth(struct task_group *tg) +{ + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + /* + * Timer isn't setup for groups with infinite runtime or for groups + * for which hard limiting isn't enabled. 
+ */ + if (!cfs_bandwidth_enabled(tg) || (cfs_b->cfs_runtime == RUNTIME_INF)) + return; + + if (hrtimer_active(&cfs_b->cfs_period_timer)) + return; + + hrtimer_start_range_ns(&cfs_b->cfs_period_timer, cfs_b->cfs_period, + 0, HRTIMER_MODE_REL); +} + +static void init_cfs_bandwidth(struct task_group *tg) +{ + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + cfs_b->cfs_period = ns_to_ktime(global_cfs_period()); + cfs_b->cfs_runtime = global_cfs_runtime(); + + spin_lock_init(&cfs_b->cfs_runtime_lock); + + hrtimer_init(&cfs_b->cfs_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->cfs_period_timer.function = &sched_cfs_period_timer; +} + +static inline void destroy_cfs_bandwidth(struct task_group *tg) +{ + hrtimer_cancel(&tg->cfs_bandwidth.cfs_period_timer); +} + +static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + cfs_rq->cfs_time = 0; + cfs_rq->cfs_throttled = 0; + cfs_rq->cfs_runtime = tg->cfs_bandwidth.cfs_runtime; + tg->hard_limit_enabled = 0; +} + +#else /* !CONFIG_CFS_HARD_LIMITS */ + +static void init_cfs_bandwidth(struct task_group *tg) +{ + return; +} + +static inline void destroy_cfs_bandwidth(struct task_group *tg) +{ + return; +} + +static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + return; +} + +static inline void rq_runtime_lock(struct rq *rq) +{ + return; +} + +static inline void rq_runtime_unlock(struct rq *rq) +{ + return; +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline void rq_runtime_lock(struct rq *rq) +{ + return; +} + +static inline void rq_runtime_unlock(struct rq *rq) +{ + return; +} + +int task_group_throttled(struct task_group *tg, int cpu) +{ + return 0; +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -1783,14 +2003,17 @@ static void update_avg(u64 *avg, u64 sam *avg += diff >> 3; } -static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) +static int enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { + int ret; + if (wakeup) p->se.start_runtime = p->se.sum_exec_runtime; sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup); + ret = p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; + return ret; } static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) @@ -1865,8 +2088,15 @@ static void activate_task(struct rq *rq, if (task_contributes_to_load(p)) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup); - inc_nr_running(rq); + /* + * Increment rq->nr_running only if enqueue_task() succeeds. + * enqueue_task() can fail when the task being activated belongs + * to a throttled group. In this case, the task gets enqueued to + * throttled group and the group will be enqueued later when it + * gets unthrottled. rq->nr_running gets incremented at that time. + */ + if (!enqueue_task(rq, p, wakeup)) + inc_nr_running(rq); } /* @@ -3211,6 +3441,7 @@ int can_migrate_task(struct task_struct * 1) running (obviously), or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. + * 4) end up in throttled task groups on this CPU. 
*/ if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { schedstat_inc(p, se.nr_failed_migrations_affine); @@ -3224,6 +3455,18 @@ int can_migrate_task(struct task_struct } /* + * Don't migrate the task if it belongs to a + * - throttled group on its current cpu + * - throttled group on this_cpu + * - group whose hierarchy is throttled on this_cpu + */ + if (cfs_rq_throttled(cfs_rq_of(&p->se)) || + task_group_throttled(task_group(p), this_cpu)) { + schedstat_inc(p, se.nr_failed_migrations_throttled); + return 0; + } + + /* * Aggressive migration if: * 1) task is cache cold, or * 2) too many balance attempts have failed. @@ -5911,8 +6154,10 @@ void rt_mutex_setprio(struct task_struct oldprio = p->prio; on_rq = p->se.on_rq; running = task_current(rq, p); - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + dec_nr_running(rq); + } if (running) p->sched_class->put_prev_task(rq, p); @@ -5926,7 +6171,8 @@ void rt_mutex_setprio(struct task_struct if (running) p->sched_class->set_curr_task(rq); if (on_rq) { - enqueue_task(rq, p, 0); + if (!enqueue_task(rq, p, 0)) + inc_nr_running(rq); check_class_changed(rq, p, prev_class, oldprio, running); } @@ -5960,8 +6206,10 @@ void set_user_nice(struct task_struct *p goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + dec_nr_running(rq); + } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -5970,7 +6218,8 @@ void set_user_nice(struct task_struct *p delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0); + if (!enqueue_task(rq, p, 0)) + inc_nr_running(rq); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -9134,6 +9383,7 @@ static void init_tg_cfs_entry(struct tas struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); + init_cfs_hard_limits(cfs_rq, tg); cfs_rq->tg = tg; if (add) list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); @@ -9263,6 +9513,10 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED + init_cfs_bandwidth(&init_task_group); +#endif + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); @@ -9279,6 +9533,7 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); + spin_lock_init(&rq->runtime_lock); rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; @@ -9552,6 +9807,7 @@ static void free_fair_sched_group(struct { int i; + destroy_cfs_bandwidth(tg); for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -9578,6 +9834,7 @@ int alloc_fair_sched_group(struct task_g if (!tg->se) goto err; + init_cfs_bandwidth(tg); tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { @@ -9810,8 +10067,10 @@ void sched_move_task(struct task_struct running = task_current(rq, tsk); on_rq = tsk->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, tsk, 0); + dec_nr_running(rq); + } if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); @@ -9825,7 +10084,8 @@ void sched_move_task(struct task_struct if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (on_rq) - enqueue_task(rq, tsk, 0); + if (!enqueue_task(rq, tsk, 0)) + inc_nr_running(rq); task_rq_unlock(rq, &flags); } @@ -10272,6 +10532,134 @@ static u64 cpu_shares_read_u64(struct cg return (u64) tg->shares; } + +#ifdef CONFIG_CFS_HARD_LIMITS + +static int tg_set_cfs_bandwidth(struct task_group *tg, + u64 cfs_period, u64 cfs_runtime) +{ + 
int i, err = 0; + + spin_lock_irq(&tg->cfs_bandwidth.cfs_runtime_lock); + tg->cfs_bandwidth.cfs_period = ns_to_ktime(cfs_period); + tg->cfs_bandwidth.cfs_runtime = cfs_runtime; + + for_each_possible_cpu(i) { + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + + rq_runtime_lock(rq_of(cfs_rq)); + cfs_rq->cfs_runtime = cfs_runtime; + rq_runtime_unlock(rq_of(cfs_rq)); + } + + start_cfs_bandwidth(tg); + spin_unlock_irq(&tg->cfs_bandwidth.cfs_runtime_lock); + return err; +} + +int tg_set_cfs_runtime(struct task_group *tg, long cfs_runtime_us) +{ + u64 cfs_runtime, cfs_period; + + cfs_period = ktime_to_ns(tg->cfs_bandwidth.cfs_period); + cfs_runtime = (u64)cfs_runtime_us * NSEC_PER_USEC; + if (cfs_runtime_us < 0) + cfs_runtime = RUNTIME_INF; + + return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime); +} + +long tg_get_cfs_runtime(struct task_group *tg) +{ + u64 cfs_runtime_us; + + if (tg->cfs_bandwidth.cfs_runtime == RUNTIME_INF) + return -1; + + cfs_runtime_us = tg->cfs_bandwidth.cfs_runtime; + do_div(cfs_runtime_us, NSEC_PER_USEC); + return cfs_runtime_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ + u64 cfs_runtime, cfs_period; + + cfs_period = (u64)cfs_period_us * NSEC_PER_USEC; + cfs_runtime = tg->cfs_bandwidth.cfs_runtime; + + if (cfs_period == 0) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime); +} + +long tg_get_cfs_period(struct task_group *tg) +{ + u64 cfs_period_us; + + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.cfs_period); + do_div(cfs_period_us, NSEC_PER_USEC); + return cfs_period_us; +} + +int tg_set_hard_limit_enabled(struct task_group *tg, u64 val) +{ + local_irq_disable(); + spin_lock(&tg->cfs_bandwidth.cfs_runtime_lock); + if (val > 0) { + tg->hard_limit_enabled = 1; + start_cfs_bandwidth(tg); + spin_unlock(&tg->cfs_bandwidth.cfs_runtime_lock); + } else { + destroy_cfs_bandwidth(tg); + tg->hard_limit_enabled = 0; + spin_unlock(&tg->cfs_bandwidth.cfs_runtime_lock); + /* + * Hard limiting is being disabled for this group. + * Refresh runtimes and put the throttled entities + * of the group back onto runqueue. 
+ */ + do_sched_cfs_period_timer(&tg->cfs_bandwidth); + } + local_irq_enable(); + return 0; +} + +static s64 cpu_cfs_runtime_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_runtime(cgroup_tg(cgrp)); +} + +static int cpu_cfs_runtime_write_s64(struct cgroup *cgrp, struct cftype *cftype, + s64 cfs_runtime_us) +{ + return tg_set_cfs_runtime(cgroup_tg(cgrp), cfs_runtime_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 cfs_period_us) +{ + return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +static u64 cpu_cfs_hard_limit_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return cfs_bandwidth_enabled(cgroup_tg(cgrp)); +} + +static int cpu_cfs_hard_limit_write_u64(struct cgroup *cgrp, + struct cftype *cftype, u64 val) +{ + return tg_set_hard_limit_enabled(cgroup_tg(cgrp), val); +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -10305,6 +10693,23 @@ static struct cftype cpu_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, +#ifdef CONFIG_CFS_HARD_LIMITS + { + .name = "cfs_runtime_us", + .read_s64 = cpu_cfs_runtime_read_s64, + .write_s64 = cpu_cfs_runtime_write_s64, + }, + { + .name = "cfs_period_us", + .read_u64 = cpu_cfs_period_read_u64, + .write_u64 = cpu_cfs_period_write_u64, + }, + { + .name = "cfs_hard_limit", + .read_u64 = cpu_cfs_hard_limit_read_u64, + .write_u64 = cpu_cfs_hard_limit_write_u64, + }, +#endif /* CONFIG_CFS_HARD_LIMITS */ #endif #ifdef CONFIG_RT_GROUP_SCHED { diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_debug.c linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_debug.c --- linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_debug.c 2009-10-06 05:02:16.000000000 +0200 +++ linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_debug.c 2009-10-06 04:39:26.000000000 +0200 @@ -80,6 +80,11 @@ static void print_cfs_group_stats(struct PN(se->wait_max); PN(se->wait_sum); P(se->wait_count); +#ifdef CONFIG_CFS_HARD_LIMITS + PN(se->throttle_max); + PN(se->throttle_sum); + P(se->throttle_count); +#endif #endif P(se->load.weight); #undef PN @@ -214,6 +219,18 @@ void print_cfs_rq(struct seq_file *m, in #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); #endif + SEQ_printf(m, " .%-30s: %ld\n", "nr_tasks_running", + cfs_rq->nr_tasks_running); +#ifdef CONFIG_CFS_HARD_LIMITS + spin_lock_irqsave(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %d\n", "cfs_throttled", + cfs_rq->cfs_throttled); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "cfs_time", + SPLIT_NS(cfs_rq->cfs_time)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "cfs_runtime", + SPLIT_NS(cfs_rq->cfs_runtime)); + spin_unlock_irqrestore(&rq->lock, flags); +#endif print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -310,7 +327,7 @@ static int sched_debug_show(struct seq_f u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); @@ -415,6 +432,7 @@ void proc_sched_show_task(struct task_st P(se.nr_failed_migrations_affine); P(se.nr_failed_migrations_running); P(se.nr_failed_migrations_hot); + P(se.nr_failed_migrations_throttled); P(se.nr_forced_migrations); P(se.nr_forced2_migrations); P(se.nr_wakeups); @@ -489,6 +507,7 @@ void 
proc_sched_set_task(struct task_str p->se.nr_failed_migrations_affine = 0; p->se.nr_failed_migrations_running = 0; p->se.nr_failed_migrations_hot = 0; + p->se.nr_failed_migrations_throttled = 0; p->se.nr_forced_migrations = 0; p->se.nr_forced2_migrations = 0; p->se.nr_wakeups = 0; diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_fair.c linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_fair.c --- linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_fair.c 2009-10-06 05:02:16.000000000 +0200 +++ linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_fair.c 2009-10-06 04:39:37.000000000 +0200 @@ -186,6 +186,286 @@ find_matching_se(struct sched_entity **s } } +#ifdef CONFIG_CFS_HARD_LIMITS + +static inline void update_stats_throttle_start(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + schedstat_set(se->throttle_start, rq_of(cfs_rq)->clock); +} + +static inline void update_stats_throttle_end(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + schedstat_set(se->throttle_max, max(se->throttle_max, + rq_of(cfs_rq)->clock - se->throttle_start)); + schedstat_set(se->throttle_count, se->throttle_count + 1); + schedstat_set(se->throttle_sum, se->throttle_sum + + rq_of(cfs_rq)->clock - se->throttle_start); + schedstat_set(se->throttle_start, 0); +} + +static void double_rq_runtime_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->runtime_lock) + __acquires(rq2->runtime_lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + spin_lock(&rq1->runtime_lock); + __acquire(rq2->runtime_lock); /* Fake it out ;) */ + } else { + if (rq1 < rq2) { + spin_lock(&rq1->runtime_lock); + spin_lock_nested(&rq2->runtime_lock, + SINGLE_DEPTH_NESTING); + } else { + spin_lock(&rq2->runtime_lock); + spin_lock_nested(&rq1->runtime_lock, + SINGLE_DEPTH_NESTING); + } + } + update_rq_clock(rq1); + update_rq_clock(rq2); +} + +static void double_rq_runtime_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->runtime_lock) + __releases(rq2->runtime_lock) +{ + spin_unlock(&rq1->runtime_lock); + if (rq1 != rq2) + spin_unlock(&rq2->runtime_lock); + else + __release(rq2->runtime_lock); +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_rq->cfs_throttled; +} + +/* + * Ran out of runtime, check if we can borrow some from others + * instead of getting throttled right away. 
+ */ +static void do_cfs_balance_runtime(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + const struct cpumask *span = sched_bw_period_mask(); + int i, weight; + u64 cfs_period; + struct task_group *tg = container_of(cfs_b, struct task_group, + cfs_bandwidth); + + weight = cpumask_weight(span); + spin_lock(&cfs_b->cfs_runtime_lock); + cfs_period = ktime_to_ns(cfs_b->cfs_period); + + for_each_cpu(i, span) { + struct cfs_rq *borrow_cfs_rq = tg->cfs_rq[i]; + struct rq *borrow_rq = rq_of(borrow_cfs_rq); + s64 diff; + + if (borrow_cfs_rq == cfs_rq) + continue; + + double_rq_runtime_lock(rq, borrow_rq); + if (borrow_cfs_rq->cfs_runtime == RUNTIME_INF) { + double_rq_runtime_unlock(rq, borrow_rq); + continue; + } + + diff = borrow_cfs_rq->cfs_runtime - borrow_cfs_rq->cfs_time; + if (diff > 0) { + diff = div_u64((u64)diff, weight); + if (cfs_rq->cfs_runtime + diff > cfs_period) + diff = cfs_period - cfs_rq->cfs_runtime; + borrow_cfs_rq->cfs_runtime -= diff; + cfs_rq->cfs_runtime += diff; + if (cfs_rq->cfs_runtime == cfs_period) { + double_rq_runtime_unlock(rq, borrow_rq); + break; + } + } + double_rq_runtime_unlock(rq, borrow_rq); + } + spin_unlock(&cfs_b->cfs_runtime_lock); +} + +/* + * Called with rq->runtime_lock held. + */ +static void cfs_balance_runtime(struct cfs_rq *cfs_rq) +{ + struct rq *rq = rq_of(cfs_rq); + + rq_runtime_unlock(rq); + do_cfs_balance_runtime(cfs_rq); + rq_runtime_lock(rq); +} + +/* + * Check if group entity exceeded its runtime. If so, mark the cfs_rq as + * throttled mark the current task for reschedling. + */ +static void sched_cfs_runtime_exceeded(struct sched_entity *se, + struct task_struct *tsk_curr, unsigned long delta_exec) +{ + struct cfs_rq *cfs_rq; + + cfs_rq = group_cfs_rq(se); + + if (!cfs_bandwidth_enabled(cfs_rq->tg)) + return; + + if (cfs_rq->cfs_runtime == RUNTIME_INF) + return; + + cfs_rq->cfs_time += delta_exec; + + if (cfs_rq_throttled(cfs_rq)) + return; + + if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) + cfs_balance_runtime(cfs_rq); + + if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) { + cfs_rq->cfs_throttled = 1; + update_stats_throttle_start(cfs_rq, se); + resched_task(tsk_curr); + } +} + +/* + * Check if the entity is throttled. + */ +static int entity_throttled(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq; + + /* Only group entities can be throttled */ + if (entity_is_task(se)) + return 0; + + cfs_rq = group_cfs_rq(se); + if (cfs_rq_throttled(cfs_rq)) + return 1; + return 0; +} + +int task_group_throttled(struct task_group *tg, int cpu) +{ + struct sched_entity *se = tg->se[cpu]; + + for_each_sched_entity(se) { + if (entity_throttled(se)) + return 1; + } + return 0; +} + +static void enqueue_entity_locked(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup); +static void add_cfs_rq_tasks_running(struct sched_entity *se, + unsigned long count); +static void sub_cfs_rq_tasks_running(struct sched_entity *se, + unsigned long count); + +static void enqueue_throttled_entity(struct rq *rq, struct sched_entity *se) +{ + unsigned long nr_tasks = 0; + struct sched_entity *se_tmp = se; + int throttled = 0; + + for_each_sched_entity(se) { + if (se->on_rq) + break; + + if (entity_throttled(se)) { + throttled = 1; + break; + } + + enqueue_entity_locked(cfs_rq_of(se), se, 0); + nr_tasks += group_cfs_rq(se)->nr_tasks_running; + } + + if (!nr_tasks) + return; + + /* + * Add the number of tasks this entity has to + * all of its parent entities. 
+ */ + add_cfs_rq_tasks_running(se_tmp, nr_tasks); + + /* + * Add the number of tasks this entity has to + * this cpu's rq only if the entity got enqueued all the + * way up without any throttled entity in the hierarchy. + */ + if (!throttled) + rq->nr_running += nr_tasks; +} + +/* + * Refresh runtimes of all cfs_rqs in this group, i,e., + * refresh runtimes of the representative cfs_rq of this + * tg on all cpus. Enqueue any throttled entity back. + */ +void do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b) +{ + int i; + const struct cpumask *span = sched_bw_period_mask(); + struct task_group *tg = container_of(cfs_b, struct task_group, + cfs_bandwidth); + unsigned long flags; + + for_each_cpu(i, span) { + struct rq *rq = cpu_rq(i); + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct sched_entity *se = tg->se[i]; + + spin_lock_irqsave(&rq->lock, flags); + rq_runtime_lock(rq); + cfs_rq->cfs_time = 0; + if (cfs_rq_throttled(cfs_rq)) { + update_rq_clock(rq); + update_stats_throttle_end(cfs_rq, se); + cfs_rq->cfs_throttled = 0; + enqueue_throttled_entity(rq, se); + } + rq_runtime_unlock(rq); + spin_unlock_irqrestore(&rq->lock, flags); + } +} + +#else + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +int task_group_throttled(struct task_group *tg, int cpu) +{ + return 0; +} + +static void sched_cfs_runtime_exceeded(struct sched_entity *se, + struct task_struct *tsk_curr, unsigned long delta_exec) +{ + return; +} + +static int entity_throttled(struct sched_entity *se) +{ + return 0; +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline struct rq *rq_of(struct cfs_rq *cfs_rq) @@ -241,8 +521,47 @@ find_matching_se(struct sched_entity **s { } +static void sched_cfs_runtime_exceeded(struct sched_entity *se, + struct task_struct *tsk_curr, unsigned long delta_exec) +{ + return; +} + +static int entity_throttled(struct sched_entity *se) +{ + return 0; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ +static void add_cfs_rq_tasks_running(struct sched_entity *se, + unsigned long count) +{ + struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + /* + * If any entity in the hierarchy is throttled, don't + * propogate the tasks count up since this entity isn't + * on rq yet. 
+ */ + if (entity_throttled(se)) + break; + cfs_rq = cfs_rq_of(se); + cfs_rq->nr_tasks_running += count; + } +} + +static void sub_cfs_rq_tasks_running(struct sched_entity *se, + unsigned long count) +{ + struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + cfs_rq->nr_tasks_running -= count; + } +} /************************************************************** * Scheduling class tree data structure manipulation methods: @@ -481,10 +800,12 @@ __update_curr(struct cfs_rq *cfs_rq, str update_min_vruntime(cfs_rq); } -static void update_curr(struct cfs_rq *cfs_rq) +static void update_curr_common(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock; + struct rq *rq = rq_of(cfs_rq); + struct task_struct *tsk_curr = rq->curr; + u64 now = rq->clock; unsigned long delta_exec; if (unlikely(!curr)) @@ -507,9 +828,23 @@ static void update_curr(struct cfs_rq *c cpuacct_charge(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); + } else { + sched_cfs_runtime_exceeded(curr, tsk_curr, delta_exec); } } +static void update_curr(struct cfs_rq *cfs_rq) +{ + rq_runtime_lock(rq_of(cfs_rq)); + update_curr_common(cfs_rq); + rq_runtime_unlock(rq_of(cfs_rq)); +} + +static inline void update_curr_locked(struct cfs_rq *cfs_rq) +{ + update_curr_common(cfs_rq); +} + static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { @@ -713,13 +1048,9 @@ place_entity(struct cfs_rq *cfs_rq, stru se->vruntime = vruntime; } -static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +static void enqueue_entity_common(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup) { - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); if (wakeup) { @@ -736,6 +1067,26 @@ enqueue_entity(struct cfs_rq *cfs_rq, st vx_activate_task(task_of(se)); } +static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int wakeup) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + enqueue_entity_common(cfs_rq, se, wakeup); +} + +static void enqueue_entity_locked(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr_locked(cfs_rq); + enqueue_entity_common(cfs_rq, se, wakeup); +} + static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (cfs_rq->last == se) @@ -849,8 +1200,40 @@ static struct sched_entity *pick_next_en return se; } +/* + * Called from put_prev_entity() + * If a group entity (@se) is found to be throttled, it will not be put back + * on @cfs_rq, which is equivalent to dequeing it. + */ +static void dequeue_throttled_entity(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + unsigned long nr_tasks = group_cfs_rq(se)->nr_tasks_running; + + __clear_buddies(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); + cfs_rq->curr = NULL; + + if (!nr_tasks) + return; + + /* + * Decrement the number of tasks this entity has from + * all of its parent entities. + */ + sub_cfs_rq_tasks_running(se, nr_tasks); + + /* + * Decrement the number of tasks this entity has from + * this cpu's rq. 
+ */ + rq_of(cfs_rq)->nr_running -= nr_tasks; +} + static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { + struct cfs_rq *gcfs_rq = group_cfs_rq(prev); + /* * If still on the runqueue then deactivate_task() * was not called and update_curr() has to be done: @@ -860,6 +1243,18 @@ static void put_prev_entity(struct cfs_r check_spread(cfs_rq, prev); if (prev->on_rq) { + /* + * If the group entity is throttled or if it has no + * no child entities, then don't enqueue it back. + */ + rq_runtime_lock(rq_of(cfs_rq)); + if (entity_throttled(prev) || + (gcfs_rq && !gcfs_rq->nr_running)) { + dequeue_throttled_entity(cfs_rq, prev); + rq_runtime_unlock(rq_of(cfs_rq)); + return; + } + rq_runtime_unlock(rq_of(cfs_rq)); update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); @@ -960,21 +1355,32 @@ static inline void hrtick_update(struct * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: + * Don't enqueue a throttled entity further into the hierarchy. */ -static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) +static int enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; + int throttled = 0; + rq_runtime_lock(rq); for_each_sched_entity(se) { if (se->on_rq) break; + if (entity_throttled(se)) { + throttled = 1; + break; + } cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); + enqueue_entity_locked(cfs_rq, se, wakeup); wakeup = 1; } + add_cfs_rq_tasks_running(&p->se, 1); + rq_runtime_unlock(rq); + hrtick_update(rq); + return throttled; } /* @@ -996,6 +1402,7 @@ static void dequeue_task_fair(struct rq sleep = 1; } + sub_cfs_rq_tasks_running(&p->se, 1); hrtick_update(rq); } @@ -1523,6 +1930,7 @@ static struct task_struct *pick_next_tas do { se = pick_next_entity(cfs_rq); + /* * If se was a buddy, clear it so that it will have to earn * the favour again. 
@@ -1632,9 +2040,9 @@ load_balance_fair(struct rq *this_rq, in u64 rem_load, moved_load; /* - * empty group + * empty group or a group with no h_load (throttled) */ - if (!busiest_cfs_rq->task_weight) + if (!busiest_cfs_rq->task_weight || !busiest_h_load) continue; rem_load = (u64)rem_load_move * busiest_weight; diff -NurpP --minimal linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_rt.c linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_rt.c --- linux-2.6.31.2-vs2.3.0.36.14/kernel/sched_rt.c 2009-10-06 05:02:21.000000000 +0200 +++ linux-2.6.31.2-vs2.3.0.36.15/kernel/sched_rt.c 2009-10-06 04:39:02.000000000 +0200 @@ -222,18 +222,6 @@ static int rt_se_boosted(struct sched_rt return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_rq(smp_processor_id())->rd->span; -} -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif - static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) { @@ -283,11 +271,6 @@ static inline int rt_rq_throttled(struct return rt_rq->rt_throttled; } -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} - static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) { @@ -505,7 +488,7 @@ static int do_sched_rt_period_timer(stru if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; - span = sched_rt_period_mask(); + span = sched_bw_period_mask(); for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); @@ -863,7 +846,7 @@ static void dequeue_rt_entity(struct sch /* * Adding/removing a task to/from a priority array: */ -static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) +static int enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) { struct sched_rt_entity *rt_se = &p->rt; @@ -876,6 +859,7 @@ static void enqueue_task_rt(struct rq *r enqueue_pushable_task(rq, p); inc_cpu_load(rq, p->se.load.weight); + return 0; } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
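
For reference, the runtime-borrowing arithmetic implemented by do_cfs_balance_runtime() in kernel/sched_fair.c above can be followed in isolation. The snippet below is a standalone user-space sketch with made-up per-CPU numbers, not kernel code: it reproduces only the math (take an equal share of each other runqueue's unused runtime, capped so the local runtime never exceeds the period) and omits the locking and kernel data structures.

#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	const long long period = 500000000LL;	/* 500ms period, in ns */
	long long local_runtime = 250000000LL;	/* this cpu's cfs_runtime */
	/* Unused runtime (cfs_runtime - cfs_time) on each other cpu, in ns. */
	long long spare[NR_CPUS] = { 0, 100000000LL, 50000000LL, 200000000LL };
	int i;

	for (i = 1; i < NR_CPUS; i++) {
		long long diff = spare[i];

		if (diff <= 0)
			continue;
		diff /= NR_CPUS;			/* equal share per cpu */
		if (local_runtime + diff > period)	/* never exceed the period */
			diff = period - local_runtime;
		spare[i] -= diff;
		local_runtime += diff;
	}
	printf("cpu0 runtime after borrowing: %lld ns\n", local_runtime);
	return 0;
}

In the patch itself this walk is done under double_rq_runtime_lock() and stops early once the local runtime has grown to the full period.
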