diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/Documentation/scheduler/sched-cfs-hard-limits.txt linux-2.6.32.1-hard/Documentation/scheduler/sched-cfs-hard-limits.txt
--- linux-2.6.32.1/Documentation/scheduler/sched-cfs-hard-limits.txt	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.32.1-hard/Documentation/scheduler/sched-cfs-hard-limits.txt	2009-12-17 14:48:21.000000000 +0100
@@ -0,0 +1,48 @@
+CPU HARD LIMITS FOR CFS GROUPS
+==============================
+
+1. Overview
+2. Interface
+3. Examples
+
+1. Overview
+-----------
+
+CFS is a proportional-share scheduler which tries to divide CPU time
+proportionally between tasks or groups of tasks (task group/cgroup),
+depending on the priority/weight of the task or the shares assigned to
+the group. In CFS, a task or task group can get more than its share of
+CPU if enough idle CPU cycles are available in the system, due to the
+work-conserving nature of the scheduler. However, in certain scenarios
+(like pay-per-use), it is desirable not to give a group extra time even
+in the presence of idle CPU cycles. This is where hard limiting is useful.
+
+Hard limits for task groups are set by specifying how much CPU runtime a
+group may consume within a given period. If the group consumes more CPU
+time than its runtime in a given period, it gets throttled: none of the
+tasks of the throttled group get to run until the group's runtime is
+refreshed at the beginning of the next period.
+
+2. Interface
+------------
+
+The hard limit feature adds two cgroup files to the CFS group scheduler:
+
+cfs_runtime_us: Hard limit for the group, in microseconds.
+
+cfs_period_us: Time period, in microseconds, within which the hard limit
+is enforced.
+
+A group is created with default values for runtime (infinite runtime,
+which means hard limits are disabled) and period (0.5s). Each group can
+set its own runtime and period independently of other groups in the system.
+
+3. Examples
+-----------
+
+# mount -t cgroup -ocpu none /cgroups/
+# cd /cgroups
+# mkdir 1
+# cd 1/
+# echo 250000 > cfs_runtime_us /* set a 250ms runtime (limit) */
+# echo 500000 > cfs_period_us  /* set a 500ms period */
diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/include/linux/sched.h linux-2.6.32.1-hard/include/linux/sched.h
--- linux-2.6.32.1/include/linux/sched.h	2009-12-14 21:29:46.000000000 +0100
+++ linux-2.6.32.1-hard/include/linux/sched.h	2009-12-17 14:48:20.000000000 +0100
@@ -1183,6 +1183,12 @@ struct sched_entity {
 	u64			nr_wakeups_affine_attempts;
 	u64			nr_wakeups_passive;
 	u64			nr_wakeups_idle;
+#ifdef CONFIG_CFS_HARD_LIMITS
+	u64			throttle_start;
+	u64			throttle_max;
+	u64			throttle_count;
+	u64			throttle_sum;
+#endif
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/init/Kconfig linux-2.6.32.1-hard/init/Kconfig
--- linux-2.6.32.1/init/Kconfig	2009-12-03 20:02:57.000000000 +0100
+++ linux-2.6.32.1-hard/init/Kconfig	2009-12-17 14:48:20.000000000 +0100
@@ -477,6 +477,19 @@ config CGROUP_SCHED
 
 endchoice
 
+config CFS_HARD_LIMITS
+	bool "Hard Limits for CFS Group Scheduler"
+	depends on EXPERIMENTAL
+	depends on FAIR_GROUP_SCHED && CGROUP_SCHED
+	default n
+	help
+	  This option enables hard limiting of CPU time obtained by
+	  a fair task group. Use this if you want to throttle a group of tasks
+	  based on its CPU usage. For more details refer to
+	  Documentation/scheduler/sched-cfs-hard-limits.txt
+
+	  Say N if unsure.
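The two new cgroup files documented above can also be driven programmatically. The following user-space sketch is illustrative only: it assumes a kernel built with CONFIG_CFS_HARD_LIMITS and the cpu controller mounted at /cgroups with a child group named 1, exactly as in the Examples section above (those paths come from that example, not from the patch).

/*
 * Illustrative user-space sketch: configure a CFS hard limit through the
 * two cgroup files added by this patch. The /cgroups/1 path is taken from
 * the documentation example above and is an assumption, not part of the
 * kernel interface.
 */
#include <stdio.h>
#include <stdlib.h>

static int write_u64(const char *path, unsigned long long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%llu\n", val);
	return fclose(f);
}

int main(void)
{
	/* Cap the group at 250ms of runtime in every 500ms period,
	 * the same values as in the documentation example. */
	if (write_u64("/cgroups/1/cfs_runtime_us", 250000) ||
	    write_u64("/cgroups/1/cfs_period_us", 500000)) {
		perror("setting CFS hard limit");
		return EXIT_FAILURE;
	}
	return EXIT_SUCCESS;
}

Writing a negative value to cfs_runtime_us restores the default unlimited runtime, per the RUNTIME_INF handling in tg_set_cfs_runtime() further down in the patch.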
+ menuconfig CGROUPS boolean "Control Group support" help diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/kernel/sched.c linux-2.6.32.1-hard/kernel/sched.c --- linux-2.6.32.1/kernel/sched.c 2009-12-03 20:02:58.000000000 +0100 +++ linux-2.6.32.1-hard/kernel/sched.c 2009-12-17 14:48:21.000000000 +0100 @@ -237,6 +237,15 @@ static DEFINE_MUTEX(sched_domains_mutex) #include +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_CFS_HARD_LIMITS) +struct cfs_bandwidth { + spinlock_t cfs_runtime_lock; + ktime_t cfs_period; + u64 cfs_runtime; + struct hrtimer cfs_period_timer; +}; +#endif + struct cfs_rq; static LIST_HEAD(task_groups); @@ -257,6 +266,9 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; unsigned long shares; +#ifdef CONFIG_CFS_HARD_LIMITS + struct cfs_bandwidth cfs_bandwidth; +#endif #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -446,6 +458,19 @@ struct cfs_rq { unsigned long rq_weight; #endif #endif +#ifdef CONFIG_CFS_HARD_LIMITS + /* set when the group is throttled on this cpu */ + int cfs_throttled; + + /* runtime currently consumed by the group on this rq */ + u64 cfs_time; + + /* runtime available to the group on this rq */ + u64 cfs_runtime; + + /* Protects the cfs runtime related fields of this cfs_rq */ + spinlock_t cfs_runtime_lock; +#endif }; /* Real-Time classes' related field in a runqueue: */ @@ -1607,6 +1632,7 @@ static void update_group_shares_cpu(stru } } +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); /* * Re-compute the task group their per cpu shares over the given domain. * This needs to be done in a bottom-up fashion because the rq weight of a @@ -1634,8 +1660,10 @@ static int tg_shares_up(struct task_grou * If there are currently no tasks on the cpu pretend there * is one of average load so that when a new task gets to * run here it will not get delayed by group starvation. + * Also if the group is throttled on this cpu, pretend that + * it has no tasks. */ - if (!weight) + if (!weight || cfs_rq_throttled(tg->cfs_rq[i])) weight = NICE_0_LOAD; rq_weight += weight; @@ -1811,6 +1839,175 @@ static void cfs_rq_set_shares(struct cfs static void calc_load_account_active(struct rq *this_rq); + +#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_FAIR_GROUP_SCHED) + +#ifdef CONFIG_SMP +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_rq(smp_processor_id())->rd->span; +} +#else /* !CONFIG_SMP */ +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_online_mask; +} +#endif /* CONFIG_SMP */ + +#else +static inline const struct cpumask *sched_bw_period_mask(void) +{ + return cpu_online_mask; +} + +#endif + +#ifdef CONFIG_FAIR_GROUP_SCHED +#ifdef CONFIG_CFS_HARD_LIMITS + +/* + * Runtime allowed for a cfs group before it is hard limited. + * default: Infinite which means no hard limiting. + */ +u64 sched_cfs_runtime = RUNTIME_INF; + +/* + * period over which we hard limit the cfs group's bandwidth. + * default: 0.5s + */ +u64 sched_cfs_period = 500000; + +static inline u64 global_cfs_period(void) +{ + return sched_cfs_period * NSEC_PER_USEC; +} + +static inline u64 global_cfs_runtime(void) +{ + return RUNTIME_INF; +} + +void do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b); + +static inline void cfs_rq_runtime_lock(struct cfs_rq *cfs_rq) +{ + spin_lock(&cfs_rq->cfs_runtime_lock); +} + +static inline void cfs_rq_runtime_unlock(struct cfs_rq *cfs_rq) +{ + spin_unlock(&cfs_rq->cfs_runtime_lock); +} + +/* + * Refresh the runtimes of the throttled groups. 
+ * But nothing much to do now, will populate this in later patches. + */ +static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) +{ + struct cfs_bandwidth *cfs_b = + container_of(timer, struct cfs_bandwidth, cfs_period_timer); + + do_sched_cfs_period_timer(cfs_b); + hrtimer_add_expires_ns(timer, ktime_to_ns(cfs_b->cfs_period)); + return HRTIMER_RESTART; +} + +/* + * TODO: Check if this kind of timer setup is sufficient for cfs or + * should we do what rt is doing. + */ +static void start_cfs_bandwidth(struct task_group *tg) +{ + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + /* + * Timer isn't setup for groups with infinite runtime + */ + if (cfs_b->cfs_runtime == RUNTIME_INF) + return; + + if (hrtimer_active(&cfs_b->cfs_period_timer)) + return; + + hrtimer_start_range_ns(&cfs_b->cfs_period_timer, cfs_b->cfs_period, + 0, HRTIMER_MODE_REL); +} + +static void init_cfs_bandwidth(struct task_group *tg) +{ + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + + cfs_b->cfs_period = ns_to_ktime(global_cfs_period()); + cfs_b->cfs_runtime = global_cfs_runtime(); + + spin_lock_init(&cfs_b->cfs_runtime_lock); + + hrtimer_init(&cfs_b->cfs_period_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cfs_b->cfs_period_timer.function = &sched_cfs_period_timer; +} + +static inline void destroy_cfs_bandwidth(struct task_group *tg) +{ + hrtimer_cancel(&tg->cfs_bandwidth.cfs_period_timer); +} + +static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + cfs_rq->cfs_time = 0; + cfs_rq->cfs_throttled = 0; + cfs_rq->cfs_runtime = tg->cfs_bandwidth.cfs_runtime; + spin_lock_init(&cfs_rq->cfs_runtime_lock); +} + +#else /* !CONFIG_CFS_HARD_LIMITS */ + +static void init_cfs_bandwidth(struct task_group *tg) +{ + return; +} + +static inline void destroy_cfs_bandwidth(struct task_group *tg) +{ + return; +} + +static void init_cfs_hard_limits(struct cfs_rq *cfs_rq, struct task_group *tg) +{ + return; +} + +static inline void cfs_rq_runtime_lock(struct cfs_rq *cfs_rq) +{ + return; +} + +static inline void cfs_rq_runtime_unlock(struct cfs_rq *cfs_rq) +{ + return; +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ + +static inline void cfs_rq_runtime_lock(struct cfs_rq *cfs_rq) +{ + return; +} + +static inline void cfs_rq_runtime_unlock(struct cfs_rq *cfs_rq) +{ + return; +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +#endif /* CONFIG_FAIR_GROUP_SCHED */ + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -9164,6 +9361,32 @@ static int update_sched_domains(struct n } #endif +#ifdef CONFIG_SMP +static void disable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_CFS_HARD_LIMITS) + disable_runtime_cfs(rq); +#endif + disable_runtime_rt(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static void enable_runtime(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); +#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_CFS_HARD_LIMITS) + enable_runtime_cfs(rq); +#endif + enable_runtime_rt(rq); + spin_unlock_irqrestore(&rq->lock, flags); +} +#endif + static int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -9296,6 +9519,7 @@ static void init_tg_cfs_entry(struct tas struct rq *rq = cpu_rq(cpu); tg->cfs_rq[cpu] = cfs_rq; init_cfs_rq(cfs_rq, rq); + init_cfs_hard_limits(cfs_rq, tg); cfs_rq->tg = tg; if (add) 
list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); @@ -9425,6 +9649,10 @@ void __init sched_init(void) #endif /* CONFIG_USER_SCHED */ #endif /* CONFIG_RT_GROUP_SCHED */ +#ifdef CONFIG_FAIR_GROUP_SCHED + init_cfs_bandwidth(&init_task_group); +#endif + #ifdef CONFIG_GROUP_SCHED list_add(&init_task_group.list, &task_groups); INIT_LIST_HEAD(&init_task_group.children); @@ -9451,6 +9679,7 @@ void __init sched_init(void) init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED + init_cfs_hard_limits(&rq->cfs, &init_task_group); init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); #ifdef CONFIG_CGROUP_SCHED @@ -9726,6 +9955,7 @@ static void free_fair_sched_group(struct { int i; + destroy_cfs_bandwidth(tg); for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); @@ -9752,6 +9982,7 @@ int alloc_fair_sched_group(struct task_g if (!tg->se) goto err; + init_cfs_bandwidth(tg); tg->shares = NICE_0_LOAD; for_each_possible_cpu(i) { @@ -10475,6 +10706,100 @@ static u64 cpu_shares_read_u64(struct cg return (u64) tg->shares; } + +#ifdef CONFIG_CFS_HARD_LIMITS + +static int tg_set_cfs_bandwidth(struct task_group *tg, + u64 cfs_period, u64 cfs_runtime) +{ + int i; + + spin_lock_irq(&tg->cfs_bandwidth.cfs_runtime_lock); + tg->cfs_bandwidth.cfs_period = ns_to_ktime(cfs_period); + tg->cfs_bandwidth.cfs_runtime = cfs_runtime; + + for_each_possible_cpu(i) { + struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + + cfs_rq_runtime_lock(cfs_rq); + cfs_rq->cfs_runtime = cfs_runtime; + cfs_rq_runtime_unlock(cfs_rq); + } + + start_cfs_bandwidth(tg); + spin_unlock_irq(&tg->cfs_bandwidth.cfs_runtime_lock); + return 0; +} + +int tg_set_cfs_runtime(struct task_group *tg, long cfs_runtime_us) +{ + u64 cfs_runtime, cfs_period; + + cfs_period = ktime_to_ns(tg->cfs_bandwidth.cfs_period); + cfs_runtime = (u64)cfs_runtime_us * NSEC_PER_USEC; + if (cfs_runtime_us < 0) + cfs_runtime = RUNTIME_INF; + + return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime); +} + +long tg_get_cfs_runtime(struct task_group *tg) +{ + u64 cfs_runtime_us; + + if (tg->cfs_bandwidth.cfs_runtime == RUNTIME_INF) + return -1; + + cfs_runtime_us = tg->cfs_bandwidth.cfs_runtime; + do_div(cfs_runtime_us, NSEC_PER_USEC); + return cfs_runtime_us; +} + +int tg_set_cfs_period(struct task_group *tg, long cfs_period_us) +{ + u64 cfs_runtime, cfs_period; + + cfs_period = (u64)cfs_period_us * NSEC_PER_USEC; + cfs_runtime = tg->cfs_bandwidth.cfs_runtime; + + if (cfs_period == 0) + return -EINVAL; + + return tg_set_cfs_bandwidth(tg, cfs_period, cfs_runtime); +} + +long tg_get_cfs_period(struct task_group *tg) +{ + u64 cfs_period_us; + + cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.cfs_period); + do_div(cfs_period_us, NSEC_PER_USEC); + return cfs_period_us; +} + +static s64 cpu_cfs_runtime_read_s64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_runtime(cgroup_tg(cgrp)); +} + +static int cpu_cfs_runtime_write_s64(struct cgroup *cgrp, struct cftype *cftype, + s64 cfs_runtime_us) +{ + return tg_set_cfs_runtime(cgroup_tg(cgrp), cfs_runtime_us); +} + +static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +{ + return tg_get_cfs_period(cgroup_tg(cgrp)); +} + +static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, + u64 cfs_period_us) +{ + return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED @@ -10508,6 +10833,18 @@ static struct cftype 
cpu_files[] = { .read_u64 = cpu_shares_read_u64, .write_u64 = cpu_shares_write_u64, }, +#ifdef CONFIG_CFS_HARD_LIMITS + { + .name = "cfs_runtime_us", + .read_s64 = cpu_cfs_runtime_read_s64, + .write_s64 = cpu_cfs_runtime_write_s64, + }, + { + .name = "cfs_period_us", + .read_u64 = cpu_cfs_period_read_u64, + .write_u64 = cpu_cfs_period_write_u64, + }, +#endif /* CONFIG_CFS_HARD_LIMITS */ #endif #ifdef CONFIG_RT_GROUP_SCHED { diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/kernel/sched_debug.c linux-2.6.32.1-hard/kernel/sched_debug.c --- linux-2.6.32.1/kernel/sched_debug.c 2009-12-03 20:02:58.000000000 +0100 +++ linux-2.6.32.1-hard/kernel/sched_debug.c 2009-12-17 14:48:20.000000000 +0100 @@ -80,6 +80,11 @@ static void print_cfs_group_stats(struct PN(se->wait_max); PN(se->wait_sum); P(se->wait_count); +#ifdef CONFIG_CFS_HARD_LIMITS + PN(se->throttle_max); + PN(se->throttle_sum); + P(se->throttle_count); +#endif #endif P(se->load.weight); #undef PN @@ -214,6 +219,16 @@ void print_cfs_rq(struct seq_file *m, in #ifdef CONFIG_SMP SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); #endif +#ifdef CONFIG_CFS_HARD_LIMITS + spin_lock_irqsave(&rq->lock, flags); + SEQ_printf(m, " .%-30s: %d\n", "cfs_throttled", + cfs_rq->cfs_throttled); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "cfs_time", + SPLIT_NS(cfs_rq->cfs_time)); + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "cfs_runtime", + SPLIT_NS(cfs_rq->cfs_runtime)); + spin_unlock_irqrestore(&rq->lock, flags); +#endif /* CONFIG_CFS_HARD_LIMITS */ print_cfs_group_stats(m, cpu, cfs_rq->tg); #endif } @@ -310,7 +325,7 @@ static int sched_debug_show(struct seq_f u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n", + SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/kernel/sched_fair.c linux-2.6.32.1-hard/kernel/sched_fair.c --- linux-2.6.32.1/kernel/sched_fair.c 2009-12-03 20:02:58.000000000 +0100 +++ linux-2.6.32.1-hard/kernel/sched_fair.c 2009-12-17 14:48:21.000000000 +0100 @@ -189,7 +189,308 @@ find_matching_se(struct sched_entity **s } } -#else /* !CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_CFS_HARD_LIMITS + +static inline void update_stats_throttle_start(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + schedstat_set(se->throttle_start, rq_of(cfs_rq)->clock); +} + +static inline void update_stats_throttle_end(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + schedstat_set(se->throttle_max, max(se->throttle_max, + rq_of(cfs_rq)->clock - se->throttle_start)); + schedstat_set(se->throttle_count, se->throttle_count + 1); + schedstat_set(se->throttle_sum, se->throttle_sum + + rq_of(cfs_rq)->clock - se->throttle_start); + schedstat_set(se->throttle_start, 0); +} + +static inline +struct cfs_rq *sched_cfs_period_cfs_rq(struct cfs_bandwidth *cfs_b, int cpu) +{ + return container_of(cfs_b, struct task_group, + cfs_bandwidth)->cfs_rq[cpu]; +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return cfs_rq->cfs_throttled; +} + +#ifdef CONFIG_SMP +/* + * Ensure this RQ takes back all the runtime it lend to its neighbours. 
+ */ +static void disable_runtime_cfs(struct rq *rq) +{ + struct root_domain *rd = rq->rd; + struct cfs_rq *cfs_rq; + + if (unlikely(!scheduler_running)) + return; + + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + s64 want; + int i; + + spin_lock(&cfs_b->cfs_runtime_lock); + spin_lock(&cfs_rq->cfs_runtime_lock); + + /* + * Either we're all are infinity and nobody needs to borrow, + * or we're already disabled and this have nothing to do, or + * we have exactly the right amount of runtime to take out. + */ + if (cfs_rq->cfs_runtime == RUNTIME_INF || + cfs_rq->cfs_runtime == cfs_b->cfs_runtime) + goto balanced; + spin_unlock(&cfs_rq->cfs_runtime_lock); + + /* + * Calculate the difference between what we started out with + * and what we current have, that's the amount of runtime + * we lend and now have to reclaim. + */ + want = cfs_b->cfs_runtime - cfs_rq->cfs_runtime; + + /* + * Greedy reclaim, take back as much as possible. + */ + for_each_cpu(i, rd->span) { + struct cfs_rq *iter = sched_cfs_period_cfs_rq(cfs_b, i); + s64 diff; + + /* + * Can't reclaim from ourselves or disabled runqueues. + */ + if (iter == cfs_rq || iter->cfs_runtime == RUNTIME_INF) + continue; + + spin_lock(&iter->cfs_runtime_lock); + if (want > 0) { + diff = min_t(s64, iter->cfs_runtime, want); + iter->cfs_runtime -= diff; + want -= diff; + } else { + iter->cfs_runtime -= want; + want -= want; + } + + spin_unlock(&iter->cfs_runtime_lock); + if (!want) + break; + } + + spin_lock(&cfs_rq->cfs_runtime_lock); + /* + * We cannot be left wanting - that would mean some + * runtime leaked out of the system. + */ + BUG_ON(want); +balanced: + /* + * Disable all the borrow logic by pretending we have infinite + * runtime - in which case borrowing doesn't make sense. + */ + cfs_rq->cfs_runtime = RUNTIME_INF; + spin_unlock(&cfs_rq->cfs_runtime_lock); + spin_unlock(&cfs_b->cfs_runtime_lock); + } +} + +static void enable_runtime_cfs(struct rq *rq) +{ + struct cfs_rq *cfs_rq; + + if (unlikely(!scheduler_running)) + return; + + /* + * Reset each runqueue's bandwidth settings + */ + for_each_leaf_cfs_rq(rq, cfs_rq) { + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + + spin_lock(&cfs_b->cfs_runtime_lock); + spin_lock(&cfs_rq->cfs_runtime_lock); + cfs_rq->cfs_runtime = cfs_b->cfs_runtime; + cfs_rq->cfs_time = 0; + cfs_rq->cfs_throttled = 0; + spin_unlock(&cfs_rq->cfs_runtime_lock); + spin_unlock(&cfs_b->cfs_runtime_lock); + } +} + +/* + * Ran out of runtime, check if we can borrow some from others + * instead of getting throttled right away. 
+ */ +static void do_cfs_balance_runtime(struct cfs_rq *cfs_rq) +{ + struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth; + const struct cpumask *span = sched_bw_period_mask(); + int i, weight; + u64 cfs_period; + + weight = cpumask_weight(span); + spin_lock(&cfs_b->cfs_runtime_lock); + cfs_period = ktime_to_ns(cfs_b->cfs_period); + + for_each_cpu(i, span) { + struct cfs_rq *borrow_cfs_rq = + sched_cfs_period_cfs_rq(cfs_b, i); + s64 diff; + + if (borrow_cfs_rq == cfs_rq) + continue; + + cfs_rq_runtime_lock(borrow_cfs_rq); + if (borrow_cfs_rq->cfs_runtime == RUNTIME_INF) { + cfs_rq_runtime_unlock(borrow_cfs_rq); + continue; + } + + diff = borrow_cfs_rq->cfs_runtime - borrow_cfs_rq->cfs_time; + if (diff > 0) { + diff = div_u64((u64)diff, weight); + if (cfs_rq->cfs_runtime + diff > cfs_period) + diff = cfs_period - cfs_rq->cfs_runtime; + borrow_cfs_rq->cfs_runtime -= diff; + cfs_rq->cfs_runtime += diff; + if (cfs_rq->cfs_runtime == cfs_period) { + cfs_rq_runtime_unlock(borrow_cfs_rq); + break; + } + } + cfs_rq_runtime_unlock(borrow_cfs_rq); + } + spin_unlock(&cfs_b->cfs_runtime_lock); +} + +/* + * Called with rq->runtime_lock held. + */ +static void cfs_balance_runtime(struct cfs_rq *cfs_rq) +{ + cfs_rq_runtime_unlock(cfs_rq); + do_cfs_balance_runtime(cfs_rq); + cfs_rq_runtime_lock(cfs_rq); +} + +#else /* !CONFIG_SMP */ + +static void cfs_balance_runtime(struct cfs_rq *cfs_rq) +{ + return; +} +#endif /* CONFIG_SMP */ + +/* + * Check if group entity exceeded its runtime. If so, mark the cfs_rq as + * throttled mark the current task for reschedling. + */ +static void sched_cfs_runtime_exceeded(struct sched_entity *se, + struct task_struct *tsk_curr, unsigned long delta_exec) +{ + struct cfs_rq *cfs_rq; + + cfs_rq = group_cfs_rq(se); + + if (cfs_rq->cfs_runtime == RUNTIME_INF) + return; + + cfs_rq->cfs_time += delta_exec; + + if (cfs_rq_throttled(cfs_rq)) + return; + + if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) + cfs_balance_runtime(cfs_rq); + + if (cfs_rq->cfs_time > cfs_rq->cfs_runtime) { + cfs_rq->cfs_throttled = 1; + update_stats_throttle_start(cfs_rq, se); + resched_task(tsk_curr); + } +} + +static inline void update_curr_group(struct sched_entity *curr, + unsigned long delta_exec, struct task_struct *tsk_curr) +{ + sched_cfs_runtime_exceeded(curr, tsk_curr, delta_exec); +} + +static void enqueue_entity_locked(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup); + +static void enqueue_throttled_entity(struct rq *rq, struct sched_entity *se) +{ + for_each_sched_entity(se) { + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + if (se->on_rq || cfs_rq_throttled(gcfs_rq) || + !gcfs_rq->nr_running) + break; + enqueue_entity_locked(cfs_rq_of(se), se, 0); + } +} + +/* + * Refresh runtimes of all cfs_rqs in this group, i,e., + * refresh runtimes of the representative cfs_rq of this + * tg on all cpus. Enqueue any throttled entity back. 
+ */ +void do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b) +{ + int i; + const struct cpumask *span = sched_bw_period_mask(); + unsigned long flags; + + for_each_cpu(i, span) { + struct rq *rq = cpu_rq(i); + struct cfs_rq *cfs_rq = sched_cfs_period_cfs_rq(cfs_b, i); + struct sched_entity *se = cfs_rq->tg->se[i]; + + spin_lock_irqsave(&rq->lock, flags); + cfs_rq_runtime_lock(cfs_rq); + cfs_rq->cfs_time = 0; + if (cfs_rq_throttled(cfs_rq)) { + update_rq_clock(rq); + update_stats_throttle_end(cfs_rq, se); + cfs_rq->cfs_throttled = 0; + enqueue_throttled_entity(rq, se); + } + cfs_rq_runtime_unlock(cfs_rq); + spin_unlock_irqrestore(&rq->lock, flags); + } +} + +#else + +static inline void update_curr_group(struct sched_entity *curr, + unsigned long delta_exec, struct task_struct *tsk_curr) +{ + return; +} + +static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) +{ + return 0; +} + +#endif /* CONFIG_CFS_HARD_LIMITS */ + +#else /* CONFIG_FAIR_GROUP_SCHED */ + +static inline void update_curr_group(struct sched_entity *curr, + unsigned long delta_exec, struct task_struct *tsk_curr) +{ + return; +} static inline struct task_struct *task_of(struct sched_entity *se) { @@ -251,7 +552,6 @@ find_matching_se(struct sched_entity **s #endif /* CONFIG_FAIR_GROUP_SCHED */ - /************************************************************** * Scheduling class tree data structure manipulation methods: */ @@ -489,14 +789,25 @@ __update_curr(struct cfs_rq *cfs_rq, str update_min_vruntime(cfs_rq); } -static void update_curr(struct cfs_rq *cfs_rq) +static void update_curr_task(struct sched_entity *curr, + unsigned long delta_exec) +{ + struct task_struct *curtask = task_of(curr); + + trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); + cpuacct_charge(curtask, delta_exec); + account_group_exec_runtime(curtask, delta_exec); +} + +static int update_curr_common(struct cfs_rq *cfs_rq, unsigned long *delta) { struct sched_entity *curr = cfs_rq->curr; - u64 now = rq_of(cfs_rq)->clock; + struct rq *rq = rq_of(cfs_rq); + u64 now = rq->clock; unsigned long delta_exec; if (unlikely(!curr)) - return; + return 1; /* * Get the amount of time the current task was running @@ -505,20 +816,47 @@ static void update_curr(struct cfs_rq *c */ delta_exec = (unsigned long)(now - curr->exec_start); if (!delta_exec) - return; + return 1; __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; + *delta = delta_exec; + return 0; +} - if (entity_is_task(curr)) { - struct task_struct *curtask = task_of(curr); +static void update_curr(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); + unsigned long delta_exec; - trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cpuacct_charge(curtask, delta_exec); - account_group_exec_runtime(curtask, delta_exec); + if (update_curr_common(cfs_rq, &delta_exec)) + return ; + + if (entity_is_task(curr)) + update_curr_task(curr, delta_exec); + else { + cfs_rq_runtime_lock(group_cfs_rq(curr)); + update_curr_group(curr, delta_exec, rq->curr); + cfs_rq_runtime_unlock(group_cfs_rq(curr)); } } +static void update_curr_locked(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr = cfs_rq->curr; + struct rq *rq = rq_of(cfs_rq); + unsigned long delta_exec; + + if (update_curr_common(cfs_rq, &delta_exec)) + return ; + + if (entity_is_task(curr)) + update_curr_task(curr, delta_exec); + else + update_curr_group(curr, delta_exec, rq->curr); +} + static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity 
*se) { @@ -740,13 +1078,9 @@ place_entity(struct cfs_rq *cfs_rq, stru se->vruntime = vruntime; } -static void -enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) +static void enqueue_entity_common(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup) { - /* - * Update run-time statistics of the 'current'. - */ - update_curr(cfs_rq); account_entity_enqueue(cfs_rq, se); if (wakeup) { @@ -760,6 +1094,26 @@ enqueue_entity(struct cfs_rq *cfs_rq, st __enqueue_entity(cfs_rq, se); } +static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int wakeup) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); + enqueue_entity_common(cfs_rq, se, wakeup); +} + +static void enqueue_entity_locked(struct cfs_rq *cfs_rq, + struct sched_entity *se, int wakeup) +{ + /* + * Update run-time statistics of the 'current'. + */ + update_curr_locked(cfs_rq); + enqueue_entity_common(cfs_rq, se, wakeup); +} + static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!se || cfs_rq->last == se) @@ -897,6 +1251,32 @@ static struct sched_entity *pick_next_en return se; } +/* + * Called from put_prev_entity() + * If a group entity (@se) is found to be throttled, it will not be put back + * on @cfs_rq, which is equivalent to dequeing it. + */ +static int dequeue_throttled_entity(struct cfs_rq *cfs_rq, + struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + if (entity_is_task(se)) + return 0; + + cfs_rq_runtime_lock(gcfs_rq); + if (!cfs_rq_throttled(gcfs_rq) && gcfs_rq->nr_running) { + cfs_rq_runtime_unlock(gcfs_rq); + return 0; + } + + __clear_buddies(cfs_rq, se); + account_entity_dequeue(cfs_rq, se); + cfs_rq->curr = NULL; + cfs_rq_runtime_unlock(gcfs_rq); + return 1; +} + static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) { /* @@ -908,6 +1288,8 @@ static void put_prev_entity(struct cfs_r check_spread(cfs_rq, prev); if (prev->on_rq) { + if (dequeue_throttled_entity(cfs_rq, prev)) + return; update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); @@ -1004,10 +1386,28 @@ static inline void hrtick_update(struct } #endif +static int enqueue_group_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + int wakeup) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + int ret = 0; + + cfs_rq_runtime_lock(gcfs_rq); + if (cfs_rq_throttled(gcfs_rq)) { + ret = 1; + goto out; + } + enqueue_entity_locked(cfs_rq, se, wakeup); +out: + cfs_rq_runtime_unlock(gcfs_rq); + return ret; +} + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and * then put the task into the rbtree: + * Don't enqueue a throttled entity further into the hierarchy. */ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { @@ -1017,11 +1417,15 @@ static void enqueue_task_fair(struct rq for_each_sched_entity(se) { if (se->on_rq) break; + cfs_rq = cfs_rq_of(se); - enqueue_entity(cfs_rq, se, wakeup); + if (entity_is_task(se)) + enqueue_entity(cfs_rq, se, wakeup); + else + if (enqueue_group_entity(cfs_rq, se, wakeup)) + break; wakeup = 1; } - hrtick_update(rq); } @@ -1041,6 +1445,17 @@ static void dequeue_task_fair(struct rq /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) break; + + /* + * If this cfs_rq is throttled, then it is already + * dequeued. 
+ */ + cfs_rq_runtime_lock(cfs_rq); + if (cfs_rq_throttled(cfs_rq)) { + cfs_rq_runtime_unlock(cfs_rq); + break; + } + cfs_rq_runtime_unlock(cfs_rq); sleep = 1; } @@ -1788,9 +2203,10 @@ load_balance_fair(struct rq *this_rq, in u64 rem_load, moved_load; /* - * empty group + * empty group or throttled group */ - if (!busiest_cfs_rq->task_weight) + if (!busiest_cfs_rq->task_weight || + cfs_rq_throttled(busiest_cfs_rq)) continue; rem_load = (u64)rem_load_move * busiest_weight; @@ -1839,6 +2255,12 @@ move_one_task_fair(struct rq *this_rq, i for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { /* + * Don't move task from a throttled cfs_rq + */ + if (cfs_rq_throttled(busy_cfs_rq)) + continue; + + /* * pass busy_cfs_rq argument into * load_balance_[start|next]_fair iterators */ diff -NurpP --minimal --exclude '*.orig' linux-2.6.32.1/kernel/sched_rt.c linux-2.6.32.1-hard/kernel/sched_rt.c --- linux-2.6.32.1/kernel/sched_rt.c 2009-12-03 20:02:58.000000000 +0100 +++ linux-2.6.32.1-hard/kernel/sched_rt.c 2009-12-17 14:48:21.000000000 +0100 @@ -235,18 +235,6 @@ static int rt_se_boosted(struct sched_rt return p->prio != p->normal_prio; } -#ifdef CONFIG_SMP -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_rq(smp_processor_id())->rd->span; -} -#else -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} -#endif - static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) { @@ -296,11 +284,6 @@ static inline int rt_rq_throttled(struct return rt_rq->rt_throttled; } -static inline const struct cpumask *sched_rt_period_mask(void) -{ - return cpu_online_mask; -} - static inline struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) { @@ -373,7 +356,7 @@ next: /* * Ensure this RQ takes back all the runtime it lend to its neighbours. */ -static void __disable_runtime(struct rq *rq) +static void disable_runtime_rt(struct rq *rq) { struct root_domain *rd = rq->rd; struct rt_rq *rt_rq; @@ -450,16 +433,7 @@ balanced: } } -static void disable_runtime(struct rq *rq) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __disable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); -} - -static void __enable_runtime(struct rq *rq) +static void enable_runtime_rt(struct rq *rq) { struct rt_rq *rt_rq; @@ -482,15 +456,6 @@ static void __enable_runtime(struct rq * } } -static void enable_runtime(struct rq *rq) -{ - unsigned long flags; - - spin_lock_irqsave(&rq->lock, flags); - __enable_runtime(rq); - spin_unlock_irqrestore(&rq->lock, flags); -} - static int balance_runtime(struct rt_rq *rt_rq) { int more = 0; @@ -518,7 +483,7 @@ static int do_sched_rt_period_timer(stru if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return 1; - span = sched_rt_period_mask(); + span = sched_bw_period_mask(); for_each_cpu(i, span) { int enqueue = 0; struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); @@ -1564,7 +1529,7 @@ static void rq_online_rt(struct rq *rq) if (rq->rt.overloaded) rt_set_overload(rq); - __enable_runtime(rq); + enable_runtime_rt(rq); cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } @@ -1575,7 +1540,7 @@ static void rq_offline_rt(struct rq *rq) if (rq->rt.overloaded) rt_clear_overload(rq); - __disable_runtime(rq); + disable_runtime_rt(rq); cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID); }
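Taken together, the sched_fair.c changes implement a per-cpu quota scheme for each task group: update_curr_group() charges execution time to the group's cfs_rq, sched_cfs_runtime_exceeded() first tries to borrow unused runtime from the group's cfs_rqs on other CPUs and throttles the entity only if that fails, and do_sched_cfs_period_timer() clears the throttled state when the period timer refreshes the runtimes. The stand-alone user-space model below is a sketch of that accounting only; the struct and function names are local to the example, locking and the real CPU span are omitted, and the 250ms/500ms figures are the values from the documentation example.

/*
 * Stand-alone model of the per-cpu hard-limit accounting added to
 * sched_fair.c. Not kernel code: names, the fixed 4-CPU "span" and the
 * absence of locking are simplifications for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define NCPU		4
#define PERIOD_NS	500000000ULL	/* 0.5s, the default cfs_period      */
#define RUNTIME_NS	250000000ULL	/* example per-cpu quota (250ms)     */

struct cfs_rq_sim {
	uint64_t runtime;	/* quota available on this cpu         */
	uint64_t time;		/* quota consumed so far in this period */
	int	 throttled;
};

static struct cfs_rq_sim rq[NCPU];

/* Modelled on do_cfs_balance_runtime(): pull spare quota from siblings. */
static void balance_runtime(int cpu)
{
	int i;

	for (i = 0; i < NCPU && rq[cpu].time > rq[cpu].runtime; i++) {
		uint64_t spare;

		if (i == cpu || rq[i].runtime <= rq[i].time)
			continue;
		spare = (rq[i].runtime - rq[i].time) / NCPU;
		if (rq[cpu].runtime + spare > PERIOD_NS)
			spare = PERIOD_NS - rq[cpu].runtime;
		rq[i].runtime -= spare;
		rq[cpu].runtime += spare;
	}
}

/* Modelled on sched_cfs_runtime_exceeded(): charge time, maybe throttle. */
static void account(int cpu, uint64_t delta_ns)
{
	if (rq[cpu].throttled)
		return;
	rq[cpu].time += delta_ns;
	if (rq[cpu].time > rq[cpu].runtime)
		balance_runtime(cpu);
	if (rq[cpu].time > rq[cpu].runtime)
		rq[cpu].throttled = 1;
}

/* Modelled on do_sched_cfs_period_timer(): refresh and unthrottle all. */
static void period_refresh(void)
{
	int i;

	for (i = 0; i < NCPU; i++) {
		rq[i].time = 0;
		rq[i].throttled = 0;
		rq[i].runtime = RUNTIME_NS;
	}
}

int main(void)
{
	int i;

	period_refresh();
	/* cpu 0 runs flat out; the others stay idle and lend their quota. */
	for (i = 0; i < 8 && !rq[0].throttled; i++)
		account(0, 100000000ULL);	/* 100ms of execution */
	printf("cpu0 throttled after %d ticks, consumed %llu ns\n",
	       i, (unsigned long long)rq[0].time);
	period_refresh();
	printf("after refresh: throttled=%d time=%llu\n",
	       rq[0].throttled, (unsigned long long)rq[0].time);
	return 0;
}

In the patch itself these steps run under rq->lock and the cfs_runtime_lock spinlocks, the set of CPUs comes from sched_bw_period_mask(), and the refresh is driven by the cfs_period_timer hrtimer armed in start_cfs_bandwidth().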