--- olpc-2.6-master.00/fs/proc/array.c 2007-02-28 20:05:28.000000000 -0500 +++ olpc-2.6-master-vs22x.02/fs/proc/array.c 2007-03-01 17:09:59.000000000 -0500 @@ -134,8 +136,9 @@ static const char *task_state_array[] = "D (disk sleep)", /* 2 */ "T (stopped)", /* 4 */ "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "H (on hold)", /* 16 */ + "Z (zombie)", /* 32 */ + "X (dead)", /* 64 */ }; static inline const char * get_task_state(struct task_struct *tsk) --- olpc-2.6-master.00/fs/proc/array.c 2007-02-28 20:05:28.000000000 -0500 +++ olpc-2.6-master-vs22x.02/fs/proc/array.c 2007-03-01 17:09:59.000000000 -0500 @@ -144,7 +147,8 @@ static inline const char * get_task_stat TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE | TASK_STOPPED | - TASK_TRACED)) | + TASK_TRACED | + TASK_ONHOLD)) | (tsk->exit_state & (EXIT_ZOMBIE | EXIT_DEAD)); const char **p = &task_state_array[0]; --- olpc-2.6-master.00/include/linux/sched.h 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/include/linux/sched.h 2007-03-01 11:52:20.000000000 -0500 @@ -145,12 +147,13 @@ extern unsigned long weighted_cpuload(co #define TASK_UNINTERRUPTIBLE 2 #define TASK_STOPPED 4 #define TASK_TRACED 8 +#define TASK_ONHOLD 16 /* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 +#define EXIT_ZOMBIE 32 +#define EXIT_DEAD 64 /* in tsk->state again */ -#define TASK_NONINTERACTIVE 64 -#define TASK_DEAD 128 +#define TASK_NONINTERACTIVE 128 +#define TASK_DEAD 256 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) --- olpc-2.6-master.00/include/linux/vserver/sched.h 1969-12-31 19:00:00.000000000 -0500 +++ olpc-2.6-master-vs22x.02/include/linux/vserver/sched.h 2007-03-01 11:52:20.000000000 -0500 @@ -0,0 +1,26 @@ +#ifndef _VX_SCHED_H +#define _VX_SCHED_H + + +#ifdef __KERNEL__ + +struct timespec; + +void vx_vsi_uptime(struct timespec *, struct timespec *); + + +struct vx_info; + +void vx_update_load(struct vx_info *); + + +int vx_tokens_recalc(struct _vx_sched_pc *, + unsigned long *, unsigned long *, int [2]); + +void vx_update_sched_param(struct _vx_sched *sched, + struct _vx_sched_pc *sched_pc); + +#endif /* __KERNEL__ */ +#else /* _VX_SCHED_H */ +#warning duplicate inclusion +#endif /* _VX_SCHED_H */ --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -255,6 +257,16 @@ struct rq { struct task_struct *migration_thread; struct list_head migration_queue; #endif + unsigned long norm_time; + unsigned long idle_time; +#ifdef CONFIG_VSERVER_IDLETIME + int idle_skip; +#endif +#ifdef CONFIG_VSERVER_HARDCPU + struct list_head hold_queue; + unsigned long nr_onhold; + int idle_tokens; +#endif #ifdef CONFIG_SCHEDSTATS /* latency stats */ --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -687,6 +699,7 @@ sched_info_switch(struct task_struct *pr */ static void dequeue_task(struct task_struct *p, struct prio_array *array) { + BUG_ON(p->state & TASK_ONHOLD); array->nr_active--; list_del(&p->run_list); if (list_empty(array->queue + p->prio)) --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -695,6 +708,7 @@ static void dequeue_task(struct task_str static void enqueue_task(struct task_struct *p, struct prio_array *array) { + BUG_ON(p->state & TASK_ONHOLD); 
sched_info_queued(p); list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -708,6 +722,7 @@ static void enqueue_task(struct task_str */ static void requeue_task(struct task_struct *p, struct prio_array *array) { + BUG_ON(p->state & TASK_ONHOLD); list_move_tail(&p->run_list, array->queue + p->prio); } --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -714,6 +729,7 @@ ***** static inline void enqueue_task_head(struct task_struct *p, struct prio_array *array) { + BUG_ON(p->state & TASK_ONHOLD); list_add(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); array->nr_active++; --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -742,6 +758,10 @@ static inline int __normal_prio(struct t bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; + + /* adjust effective priority */ + prio = vx_adjust_prio(p, prio, MAX_USER_PRIO); + if (prio < MAX_RT_PRIO) prio = MAX_RT_PRIO; if (prio > MAX_PRIO-1) --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -851,6 +871,9 @@ static int effective_prio(struct task_st return p->prio; } +#include "sched_mon.h" + + /* * __activate_task - move a task to the runqueue. */ --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -860,6 +883,7 @@ static void __activate_task(struct task_ if (batch_task(p)) target = rq->expired; + vxm_activate_task(p, rq); enqueue_task(p, target); inc_nr_running(p, rq); } --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -869,6 +893,7 @@ static void __activate_task(struct task_ */ static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) { + vxm_activate_idle(p, rq); enqueue_task_head(p, rq->active); inc_nr_running(p, rq); } --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1003,6 +1028,7 @@ static void activate_task(struct task_st } p->timestamp = now; out: + vx_activate_task(p); __activate_task(p, rq); } --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1007,9 +1033,9 @@ ***** } /* - * deactivate_task - remove a task from the runqueue. + * __deactivate_task - remove a task from the runqueue. 
*/ -static void deactivate_task(struct task_struct *p, struct rq *rq) +static void __deactivate_task(struct task_struct *p, struct rq *rq) { dec_nr_running(p, rq); dequeue_task(p, p->array); --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1013,6 +1039,7 @@ ***** { dec_nr_running(p, rq); dequeue_task(p, p->array); + vxm_deactivate_task(p, rq); p->array = NULL; } --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1016,6 +1043,15 @@ ***** p->array = NULL; } +static inline +void deactivate_task(struct task_struct *p, struct rq *rq) +{ + vx_deactivate_task(p); + __deactivate_task(p, rq); +} + +#include "sched_hard.h" + /* * resched_task - mark a task 'to be rescheduled now'. * --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1091,6 +1127,7 @@ migrate_task(struct task_struct *p, int { struct rq *rq = task_rq(p); + vxm_migrate_task(p, rq, dest_cpu); /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1419,6 +1456,12 @@ static int try_to_wake_up(struct task_st rq = task_rq_lock(p, &flags); old_state = p->state; + + /* we need to unhold suspended tasks */ + if (old_state & TASK_ONHOLD) { + vx_unhold_task(p, rq); + old_state = p->state; + } if (!(old_state & state)) goto out; --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1526,6 +1569,7 @@ out_activate: #endif /* CONFIG_SMP */ if (old_state == TASK_UNINTERRUPTIBLE) { rq->nr_uninterruptible--; + vx_uninterruptible_dec(p); /* * Tasks on involuntary sleep don't earn * sleep_avg beyond just interactive state. --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1577,7 +1621,7 @@ int fastcall wake_up_state(struct task_s return try_to_wake_up(p, state, 0); } -static void task_running_tick(struct rq *rq, struct task_struct *p); +static void task_running_tick(struct rq *rq, struct task_struct *p, int cpu); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1638,7 +1682,7 @@ void fastcall sched_fork(struct task_str * runqueue lock is not a problem. 
*/ current->time_slice = 1; - task_running_tick(cpu_rq(cpu), current); + task_running_tick(cpu_rq(cpu), current, cpu); } local_irq_enable(); put_cpu(); --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1673,6 +1717,7 @@ void fastcall wake_up_new_task(struct ta p->prio = effective_prio(p); + vx_activate_task(p); if (likely(cpu == this_cpu)) { if (!(clone_flags & CLONE_VM)) { /* --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -1684,6 +1729,7 @@ void fastcall wake_up_new_task(struct ta __activate_task(p, rq); else { p->prio = current->prio; + BUG_ON(p->state & TASK_ONHOLD); p->normal_prio = current->normal_prio; list_add_tail(&p->run_list, &current->run_list); p->array = current->array; --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -3090,7 +3139,7 @@ ***** /* Add user time to cpustat. */ tmp = cputime_to_cputime64(cputime); - if (TASK_NICE(p) > 0) + if (nice) cpustat->nice = cputime64_add(cpustat->nice, tmp); else cpustat->user = cputime64_add(cpustat->user, tmp); --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -3148,7 +3199,7 @@ void account_steal_time(struct task_stru cpustat->steal = cputime64_add(cpustat->steal, tmp); } -static void task_running_tick(struct rq *rq, struct task_struct *p) +static void task_running_tick(struct rq *rq, struct task_struct *p, int cpu) { if (p->array != rq->active) { /* Task has expired but was not scheduled yet */ --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -3178,7 +3229,7 @@ static void task_running_tick(struct rq } goto out_unlock; } - if (!--p->time_slice) { + if (vx_need_resched(p, --p->time_slice, cpu)) { dequeue_task(p, rq->active); set_tsk_need_resched(p); p->prio = effective_prio(p); --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -3238,12 +3289,14 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); update_cpu_clock(p, rq, now); + vxm_sync(now, cpu); - if (p == rq->idle) + if (p == rq->idle) { /* Task on the idle queue */ wake_priority_sleeper(rq); - else - task_running_tick(rq, p); + vx_idle_resched(rq); + } else + task_running_tick(rq, p, cpu); #ifdef CONFIG_SMP update_load(rq); if (time_after_eq(jiffies, rq->next_balance)) --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -3495,8 +3548,10 @@ need_resched_nonpreemptible: unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; else { - if (prev->state == TASK_UNINTERRUPTIBLE) + if (prev->state == TASK_UNINTERRUPTIBLE) { rq->nr_uninterruptible++; + vx_uninterruptible_inc(prev); + } deactivate_task(prev, rq); } } --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -3502,7 +3557,16 @@ ***** } cpu = smp_processor_id(); + vx_set_rq_time(rq, jiffies); +try_unhold: + vx_try_unhold(rq, cpu); +pick_next: + if (unlikely(!rq->nr_running)) { + /* can we
skip idle time? */ + if (vx_try_skip(rq, cpu)) + goto try_unhold; + idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -3529,6 +3593,10 @@ need_resched_nonpreemptible: queue = array->queue + idx; next = list_entry(queue->next, struct task_struct, run_list); + /* check before we schedule this context */ + if (!vx_schedule(next, rq, cpu)) + goto pick_next; + if (!rt_task(next) && interactive_sleep(next->sleep_type)) { unsigned long long delta = now - next->timestamp; if (unlikely((long long)(now - next->timestamp) < 0)) --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -4303,6 +4371,7 @@ recheck: oldprio = p->prio; __setscheduler(p, policy, param->sched_priority); if (array) { + vx_activate_task(p); __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -5090,6 +5159,7 @@ static int __migrate_task(struct task_st p->timestamp = p->timestamp - rq_src->most_recent_timestamp + rq_dest->most_recent_timestamp; deactivate_task(p, rq_src); + vx_activate_task(p); __activate_task(p, rq_dest); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -6937,7 +7007,10 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->migration_queue); #endif atomic_set(&rq->nr_iowait, 0); - +#ifdef CONFIG_VSERVER_HARDCPU + INIT_LIST_HEAD(&rq->hold_queue); + rq->nr_onhold = 0; +#endif for (j = 0; j < 2; j++) { array = rq->arrays + j; for (k = 0; k < MAX_PRIO; k++) { --- olpc-2.6-master.00/kernel/sched.c 2007-02-28 20:05:29.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched.c 2007-03-01 11:52:20.000000000 -0500 @@ -7020,6 +7093,7 @@ void normalize_rt_tasks(void) deactivate_task(p, task_rq(p)); __setscheduler(p, SCHED_NORMAL, 0); if (array) { + vx_activate_task(p); __activate_task(p, task_rq(p)); resched_task(rq->curr); } --- olpc-2.6-master.00/kernel/sched_hard.h 1969-12-31 19:00:00.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/sched_hard.h 2007-03-01 11:52:20.000000000 -0500 @@ -0,0 +1,324 @@ + +#ifdef CONFIG_VSERVER_IDLELIMIT + +/* + * vx_idle_resched - reschedule after maxidle + */ +static inline +void vx_idle_resched(struct rq *rq) +{ + /* maybe have a better criterion for paused */ + if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) + set_need_resched(); +} + +#else /* !CONFIG_VSERVER_IDLELIMIT */ + +#define vx_idle_resched(rq) + +#endif /* CONFIG_VSERVER_IDLELIMIT */ + + + +#ifdef CONFIG_VSERVER_IDLETIME + +#define vx_set_rq_min_skip(rq, min) \ + (rq)->idle_skip = (min) + +#define vx_save_min_skip(ret, min, val) \ + __vx_save_min_skip(ret, min, val) + +static inline +void __vx_save_min_skip(int ret, int *min, int val) +{ + if (ret > -2) + return; + if ((*min > val) || !*min) + *min = val; +} + +static inline +int vx_try_skip(struct rq *rq, int cpu) +{ + /* artificially advance time */ + if (rq->idle_skip > 0) { + vxdprintk(list_empty(&rq->hold_queue), + "hold queue empty on cpu %d", cpu); + rq->idle_time += rq->idle_skip; + vxm_idle_skip(rq, cpu); + return 1; + } + return 0; 
+} + +#else /* !CONFIG_VSERVER_IDLETIME */ + +#define vx_set_rq_min_skip(rq, min) \ + ({ int dummy = (min); dummy; }) + +#define vx_save_min_skip(ret, min, val) + +static inline +int vx_try_skip(struct rq *rq, int cpu) +{ + return 0; +} + +#endif /* CONFIG_VSERVER_IDLETIME */ + + + +#ifdef CONFIG_VSERVER_HARDCPU + +#define vx_set_rq_max_idle(rq, max) \ + (rq)->idle_tokens = (max) + +#define vx_save_max_idle(ret, min, val) \ + __vx_save_max_idle(ret, min, val) + +static inline +void __vx_save_max_idle(int ret, int *min, int val) +{ + if (*min > val) + *min = val; +} + + +/* + * vx_hold_task - put a task on the hold queue + */ +static inline +void vx_hold_task(struct task_struct *p, struct rq *rq) +{ + __deactivate_task(p, rq); + p->state |= TASK_ONHOLD; + /* a new one on hold */ + rq->nr_onhold++; + vxm_hold_task(p, rq); + list_add_tail(&p->run_list, &rq->hold_queue); +} + +/* + * vx_unhold_task - put a task back to the runqueue + */ +static inline +void vx_unhold_task(struct task_struct *p, struct rq *rq) +{ + list_del(&p->run_list); + /* one less waiting */ + rq->nr_onhold--; + p->state &= ~TASK_ONHOLD; + enqueue_task(p, rq->expired); + inc_nr_running(p, rq); + vxm_unhold_task(p, rq); + + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; +} + +unsigned long nr_onhold(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_onhold; + + return sum; +} + + + +static inline +int __vx_tokens_avail(struct _vx_sched_pc *sched_pc) +{ + return sched_pc->tokens; +} + +static inline +void __vx_consume_token(struct _vx_sched_pc *sched_pc) +{ + sched_pc->tokens--; +} + +static inline +int vx_need_resched(struct task_struct *p, int slice, int cpu) +{ + struct vx_info *vxi = p->vx_info; + + if (vx_info_flags(vxi, VXF_SCHED_HARD|VXF_SCHED_PRIO, 0)) { + struct _vx_sched_pc *sched_pc = + &vx_per_cpu(vxi, sched_pc, cpu); + int tokens; + + /* maybe we can simplify that to decrement + the token counter unconditional? */ + + if ((tokens = __vx_tokens_avail(sched_pc)) > 0) + __vx_consume_token(sched_pc); + + /* for tokens > 0, one token was consumed */ + if (tokens < 2) + slice = 0; + } + vxm_need_resched(p, slice, cpu); + return (slice == 0); +} + + +#define vx_set_rq_time(rq, time) do { \ + rq->norm_time = time; \ +} while (0) + + +static inline +void vx_try_unhold(struct rq *rq, int cpu) +{ + struct vx_info *vxi = NULL; + struct list_head *l, *n; + int maxidle = HZ; + int minskip = 0; + + /* nothing to do? what about pause? 
*/ + if (list_empty(&rq->hold_queue)) + return; + + list_for_each_safe(l, n, &rq->hold_queue) { + int ret, delta_min[2]; + struct _vx_sched_pc *sched_pc; + struct task_struct *p; + + p = list_entry(l, struct task_struct, run_list); + /* don't bother with same context */ + if (vxi == p->vx_info) + continue; + + vxi = p->vx_info; + /* ignore paused contexts */ + if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0)) + continue; + + sched_pc = &vx_per_cpu(vxi, sched_pc, cpu); + + /* recalc tokens */ + vxm_sched_info(sched_pc, vxi, cpu); + ret = vx_tokens_recalc(sched_pc, + &rq->norm_time, &rq->idle_time, delta_min); + vxm_tokens_recalc(sched_pc, rq, vxi, cpu); + + if (ret > 0) { + /* we found a runable context */ + vx_unhold_task(p, rq); + break; + } + vx_save_max_idle(ret, &maxidle, delta_min[0]); + vx_save_min_skip(ret, &minskip, delta_min[1]); + } + vx_set_rq_max_idle(rq, maxidle); + vx_set_rq_min_skip(rq, minskip); + vxm_rq_max_min(rq, cpu); +} + + +static inline +int vx_schedule(struct task_struct *next, struct rq *rq, int cpu) +{ + struct vx_info *vxi = next->vx_info; + struct _vx_sched_pc *sched_pc; + int delta_min[2]; + int flags, ret; + + if (!vxi) + return 1; + + flags = vxi->vx_flags; + + if (unlikely(vs_check_flags(flags , VXF_SCHED_PAUSE, 0))) + goto put_on_hold; + if (!vs_check_flags(flags , VXF_SCHED_HARD|VXF_SCHED_PRIO, 0)) + return 1; + + sched_pc = &vx_per_cpu(vxi, sched_pc, cpu); +#ifdef CONFIG_SMP + /* update scheduler params */ + if (cpu_isset(cpu, vxi->sched.update)) { + vx_update_sched_param(&vxi->sched, sched_pc); + vxm_update_sched(sched_pc, vxi, cpu); + cpu_clear(cpu, vxi->sched.update); + } +#endif + vxm_sched_info(sched_pc, vxi, cpu); + ret = vx_tokens_recalc(sched_pc, + &rq->norm_time, &rq->idle_time, delta_min); + vxm_tokens_recalc(sched_pc, rq, vxi, cpu); + + if (!vs_check_flags(flags , VXF_SCHED_HARD, 0)) + return 1; + + if (unlikely(ret < 0)) { + vx_save_max_idle(ret, &rq->idle_tokens, delta_min[0]); + vx_save_min_skip(ret, &rq->idle_skip, delta_min[1]); + vxm_rq_max_min(rq, cpu); + put_on_hold: + vx_hold_task(next, rq); + return 0; + } + return 1; +} + + +#else /* CONFIG_VSERVER_HARDCPU */ + +static inline +void vx_hold_task(struct task_struct *p, struct rq *rq) +{ + return; +} + +static inline +void vx_unhold_task(struct task_struct *p, struct rq *rq) +{ + return; +} + +unsigned long nr_onhold(void) +{ + return 0; +} + + +static inline +int vx_need_resched(struct task_struct *p, int slice, int cpu) +{ + return (slice == 0); +} + + +#define vx_set_rq_time(rq, time) + +static inline +void vx_try_unhold(struct rq *rq, int cpu) +{ + return; +} + +static inline +int vx_schedule(struct task_struct *next, struct rq *rq, int cpu) +{ + struct vx_info *vxi = next->vx_info; + struct _vx_sched_pc *sched_pc; + int delta_min[2]; + int ret; + + if (!vx_info_flags(vxi, VXF_SCHED_PRIO, 0)) + return 1; + + sched_pc = &vx_per_cpu(vxi, sched_pc, cpu); + vxm_sched_info(sched_pc, vxi, cpu); + ret = vx_tokens_recalc(sched_pc, + &rq->norm_time, &rq->idle_time, delta_min); + vxm_tokens_recalc(sched_pc, rq, vxi, cpu); + return 1; +} + +#endif /* CONFIG_VSERVER_HARDCPU */ + --- olpc-2.6-master.00/kernel/vserver/sched.c 1969-12-31 19:00:00.000000000 -0500 +++ olpc-2.6-master-vs22x.02/kernel/vserver/sched.c 2007-03-01 17:09:59.000000000 -0500 @@ -0,0 +1,423 @@ +/* + * linux/kernel/vserver/sched.c + * + * Virtual Server: Scheduler Support + * + * Copyright (C) 2004-2006 Herbert Pötzl + * + * V0.01 adapted Sam Vilains version to 2.6.3 + * V0.02 removed legacy interface + * V0.03 changed vcmds to vxi 
arg + * + */ + +#include +#include +#include +#include + +#include +#include + +#define vxd_check_range(val, min, max) do { \ + vxlprintk((val < min) || (val > max), \ + "check_range(%ld,%ld,%ld)", \ + (long)val, (long)min, (long)max, \ + __FILE__, __LINE__); \ + } while (0) + + +void vx_update_sched_param(struct _vx_sched *sched, + struct _vx_sched_pc *sched_pc) +{ + unsigned int set_mask = sched->update_mask; + + if (set_mask & VXSM_FILL_RATE) + sched_pc->fill_rate[0] = sched->fill_rate[0]; + if (set_mask & VXSM_INTERVAL) + sched_pc->interval[0] = sched->interval[0]; + if (set_mask & VXSM_FILL_RATE2) + sched_pc->fill_rate[1] = sched->fill_rate[1]; + if (set_mask & VXSM_INTERVAL2) + sched_pc->interval[1] = sched->interval[1]; + if (set_mask & VXSM_TOKENS) + sched_pc->tokens = sched->tokens; + if (set_mask & VXSM_TOKENS_MIN) + sched_pc->tokens_min = sched->tokens_min; + if (set_mask & VXSM_TOKENS_MAX) + sched_pc->tokens_max = sched->tokens_max; + if (set_mask & VXSM_PRIO_BIAS) + sched_pc->prio_bias = sched->prio_bias; + + if (set_mask & VXSM_IDLE_TIME) + sched_pc->flags |= VXSF_IDLE_TIME; + else + sched_pc->flags &= ~VXSF_IDLE_TIME; + + /* reset time */ + sched_pc->norm_time = jiffies; +} + + +/* + * recalculate the context's scheduling tokens + * + * ret > 0 : number of tokens available + * ret < 0 : on hold, check delta_min[] + * -1 only jiffies + * -2 also idle time + * + */ +int vx_tokens_recalc(struct _vx_sched_pc *sched_pc, + unsigned long *norm_time, unsigned long *idle_time, int delta_min[2]) +{ + long delta; + long tokens = 0; + int flags = sched_pc->flags; + + /* how much time did pass? */ + delta = *norm_time - sched_pc->norm_time; + vxd_check_range(delta, 0, INT_MAX); + + if (delta >= sched_pc->interval[0]) { + long tokens, integral; + + /* calc integral token part */ + tokens = delta / sched_pc->interval[0]; + integral = tokens * sched_pc->interval[0]; + tokens *= sched_pc->fill_rate[0]; +#ifdef CONFIG_VSERVER_HARDCPU + delta_min[0] = delta - integral; + vxd_check_range(delta_min[0], 0, sched_pc->interval[0]); +#endif + /* advance time */ + sched_pc->norm_time += delta; + + /* add tokens */ + sched_pc->tokens += tokens; + sched_pc->token_time += tokens; + } + else + delta_min[0] = delta; + +#ifdef CONFIG_VSERVER_IDLETIME + if (!(flags & VXSF_IDLE_TIME)) + goto skip_idle; + + /* how much was the idle skip? */ + delta = *idle_time - sched_pc->idle_time; + vxd_check_range(delta, 0, INT_MAX); + + if (delta >= sched_pc->interval[1]) { + long tokens, integral; + + /* calc fair share token part */ + tokens = delta / sched_pc->interval[1]; + integral = tokens * sched_pc->interval[1]; + tokens *= sched_pc->fill_rate[1]; + delta_min[1] = delta - integral; + vxd_check_range(delta_min[1], 0, sched_pc->interval[1]); + + /* advance idle time */ + sched_pc->idle_time += integral; + + /* add tokens */ + sched_pc->tokens += tokens; + sched_pc->token_time += tokens; + } + else + delta_min[1] = delta; +skip_idle: +#endif + + /* clip at maximum */ + if (sched_pc->tokens > sched_pc->tokens_max) + sched_pc->tokens = sched_pc->tokens_max; + tokens = sched_pc->tokens; + + if ((flags & VXSF_ONHOLD)) { + /* can we unhold? */ + if (tokens >= sched_pc->tokens_min) { + flags &= ~VXSF_ONHOLD; + sched_pc->hold_ticks += + *norm_time - sched_pc->onhold; + } + else + goto on_hold; + } else { + /* put on hold?
*/ + if (tokens <= 0) { + flags |= VXSF_ONHOLD; + sched_pc->onhold = *norm_time; + goto on_hold; + } + } + sched_pc->flags = flags; + return tokens; + +on_hold: + tokens = sched_pc->tokens_min - tokens; + sched_pc->flags = flags; + BUG_ON(tokens < 0); + +#ifdef CONFIG_VSERVER_HARDCPU + /* next interval? */ + if (!sched_pc->fill_rate[0]) + delta_min[0] = HZ; + else if (tokens > sched_pc->fill_rate[0]) + delta_min[0] += sched_pc->interval[0] * + tokens / sched_pc->fill_rate[0]; + else + delta_min[0] = sched_pc->interval[0] - delta_min[0]; + vxd_check_range(delta_min[0], 0, INT_MAX); + +#ifdef CONFIG_VSERVER_IDLETIME + if (!(flags & VXSF_IDLE_TIME)) + return -1; + + /* next interval? */ + if (!sched_pc->fill_rate[1]) + delta_min[1] = HZ; + else if (tokens > sched_pc->fill_rate[1]) + delta_min[1] += sched_pc->interval[1] * + tokens / sched_pc->fill_rate[1]; + else + delta_min[1] = sched_pc->interval[1] - delta_min[1]; + vxd_check_range(delta_min[1], 0, INT_MAX); + + return -2; +#else + return -1; +#endif /* CONFIG_VSERVER_IDLETIME */ +#else + return 0; +#endif /* CONFIG_VSERVER_HARDCPU */ +} + +static inline unsigned long msec_to_ticks(unsigned long msec) +{ + return msecs_to_jiffies(msec); +} + +static inline unsigned long ticks_to_msec(unsigned long ticks) +{ + return jiffies_to_msecs(ticks); +} + +static inline unsigned long ticks_to_usec(unsigned long ticks) +{ + return jiffies_to_usecs(ticks); +} + + +static int do_set_sched(struct vx_info *vxi, struct vcmd_sched_v5 *data) +{ + unsigned int set_mask = data->mask; + unsigned int update_mask; + int i, cpu; + + /* Sanity check data values */ + if (data->tokens_max <= 0) + data->tokens_max = HZ; + if (data->tokens_min < 0) + data->tokens_min = HZ/3; + if (data->tokens_min >= data->tokens_max) + data->tokens_min = data->tokens_max; + + if (data->prio_bias > MAX_PRIO_BIAS) + data->prio_bias = MAX_PRIO_BIAS; + if (data->prio_bias < MIN_PRIO_BIAS) + data->prio_bias = MIN_PRIO_BIAS; + + spin_lock(&vxi->sched.tokens_lock); + + /* sync up on delayed updates */ + for_each_cpu_mask(cpu, vxi->sched.update) + vx_update_sched_param(&vxi->sched, + &vx_per_cpu(vxi, sched_pc, cpu)); + + if (set_mask & VXSM_FILL_RATE) + vxi->sched.fill_rate[0] = data->fill_rate[0]; + if (set_mask & VXSM_FILL_RATE2) + vxi->sched.fill_rate[1] = data->fill_rate[1]; + if (set_mask & VXSM_INTERVAL) + vxi->sched.interval[0] = (set_mask & VXSM_MSEC) ? + msec_to_ticks(data->interval[0]) : data->interval[0]; + if (set_mask & VXSM_INTERVAL2) + vxi->sched.interval[1] = (set_mask & VXSM_MSEC) ? + msec_to_ticks(data->interval[1]) : data->interval[1]; + if (set_mask & VXSM_TOKENS) + vxi->sched.tokens = data->tokens; + if (set_mask & VXSM_TOKENS_MIN) + vxi->sched.tokens_min = data->tokens_min; + if (set_mask & VXSM_TOKENS_MAX) + vxi->sched.tokens_max = data->tokens_max; + if (set_mask & VXSM_PRIO_BIAS) + vxi->sched.prio_bias = data->prio_bias; + + /* Sanity check rate/interval */ + for (i=0; i<2; i++) { + if (data->fill_rate[i] < 0) + data->fill_rate[i] = 0; + if (data->interval[i] <= 0) + data->interval[i] = HZ; + } + + update_mask = vxi->sched.update_mask & VXSM_SET_MASK; + update_mask |= (set_mask & (VXSM_SET_MASK|VXSM_IDLE_TIME)); + vxi->sched.update_mask = update_mask; +#ifdef CONFIG_SMP + rmb(); + if (set_mask & VXSM_CPU_ID) { + vxi->sched.update = cpumask_of_cpu(data->cpu_id); + cpus_and(vxi->sched.update, cpu_online_map, + vxi->sched.update); + } + else + vxi->sched.update = cpu_online_map; + + /* forced reload? 
*/ + if (set_mask & VXSM_FORCE) { + for_each_cpu_mask(cpu, vxi->sched.update) + vx_update_sched_param(&vxi->sched, + &vx_per_cpu(vxi, sched_pc, cpu)); + vxi->sched.update = CPU_MASK_NONE; + } +#else + /* on UP we update immediately */ + vx_update_sched_param(&vxi->sched, + &vx_per_cpu(vxi, sched_pc, 0)); +#endif + + spin_unlock(&vxi->sched.tokens_lock); + return 0; +} + +#define COPY_IDS(C) C(cpu_id); C(bucket_id) +#define COPY_PRI(C) C(prio_bias) +#define COPY_TOK(C) C(tokens); C(tokens_min); C(tokens_max) +#define COPY_FRI(C) C(fill_rate[0]); C(interval[0]); \ + C(fill_rate[1]); C(interval[1]); + +#define COPY_VALUE(name) vc_data.name = data->name + +static int do_set_sched_v4(struct vx_info *vxi, struct vcmd_set_sched_v4 *data) +{ + struct vcmd_sched_v5 vc_data; + + vc_data.mask = data->set_mask; + COPY_IDS(COPY_VALUE); + COPY_PRI(COPY_VALUE); + COPY_TOK(COPY_VALUE); + vc_data.fill_rate[0] = vc_data.fill_rate[1] = data->fill_rate; + vc_data.interval[0] = vc_data.interval[1] = data->interval; + return do_set_sched(vxi, &vc_data); +} + +int vc_set_sched_v3(struct vx_info *vxi, void __user *data) +{ + struct vcmd_set_sched_v3 vc_data; + struct vcmd_set_sched_v4 vc_data_v4; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + /* structures are binary compatible */ + memcpy(&vc_data_v4, &vc_data, sizeof(vc_data)); + vc_data_v4.set_mask &= VXSM_V3_MASK; + vc_data_v4.bucket_id = 0; + + return do_set_sched_v4(vxi, &vc_data_v4); +} + +int vc_set_sched_v4(struct vx_info *vxi, void __user *data) +{ + struct vcmd_set_sched_v4 vc_data; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + return do_set_sched_v4(vxi, &vc_data); +} + + /* latest interface is v5 */ + +int vc_set_sched(struct vx_info *vxi, void __user *data) +{ + struct vcmd_sched_v5 vc_data; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + return do_set_sched(vxi, &vc_data); +} + + +int vc_get_sched(struct vx_info *vxi, void __user *data) +{ + struct vcmd_sched_v5 vc_data; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + if (vc_data.mask & VXSM_CPU_ID) { + int cpu = vc_data.cpu_id; + struct _vx_sched_pc *data; + + if (!cpu_possible(cpu)) + return -EINVAL; + + data = &vx_per_cpu(vxi, sched_pc, cpu); + COPY_TOK(COPY_VALUE); + COPY_PRI(COPY_VALUE); + COPY_FRI(COPY_VALUE); + + if (data->flags & VXSF_IDLE_TIME) + vc_data.mask |= VXSM_IDLE_TIME; + } else { + struct _vx_sched *data = &vxi->sched; + + COPY_TOK(COPY_VALUE); + COPY_PRI(COPY_VALUE); + COPY_FRI(COPY_VALUE); + } + + if (vc_data.mask & VXSM_MSEC) { + vc_data.interval[0] = ticks_to_msec(vc_data.interval[0]); + vc_data.interval[1] = ticks_to_msec(vc_data.interval[1]); + } + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + + +int vc_sched_info(struct vx_info *vxi, void __user *data) +{ + struct vcmd_sched_info vc_data; + int cpu; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + cpu = vc_data.cpu_id; + if (!cpu_possible(cpu)) + return -EINVAL; + + if (vxi) { + struct _vx_sched_pc *sched_pc = + &vx_per_cpu(vxi, sched_pc, cpu); + + vc_data.user_msec = ticks_to_msec(sched_pc->user_ticks); + vc_data.sys_msec = ticks_to_msec(sched_pc->sys_ticks); + vc_data.hold_msec = ticks_to_msec(sched_pc->hold_ticks); + vc_data.vavavoom = sched_pc->vavavoom; + } + vc_data.token_usec = ticks_to_usec(1); + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} +
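
Note on the hard-CPU accounting above: vx_tokens_recalc() implements a per-context token bucket. Roughly, fill_rate[0] tokens are added per elapsed interval[0] ticks and clipped at tokens_max; a context whose bucket runs dry is put on hold (VXSF_ONHOLD) and only becomes runnable again once it has refilled to tokens_min. The standalone sketch below models that behaviour in plain C so the refill/hold hysteresis can be tried outside the kernel. It is illustrative only, not part of the patch; it simplifies the real code (no idle-time bucket, no delta_min hints) and every name in it is made up for clarity.

/*
 * Simplified userspace model of the hard-CPU token bucket
 * (illustrative only, NOT part of the patch).
 *
 * Every "interval" ticks the context earns "fill_rate" tokens, capped at
 * tokens_max.  A running context consumes one token per tick.  Once it is
 * exhausted (tokens <= 0) it goes on hold and may only run again after
 * refilling to tokens_min, mirroring the hysteresis in vx_tokens_recalc().
 */
#include <stdio.h>

struct bucket {
	long tokens, tokens_min, tokens_max;
	long fill_rate, interval;
	unsigned long norm_time;	/* last refill time, in ticks */
	int on_hold;
};

/* refill from elapsed time and decide whether the context may run */
static int bucket_recalc(struct bucket *b, unsigned long now)
{
	unsigned long delta = now - b->norm_time;

	if (delta >= (unsigned long)b->interval) {
		long periods = delta / b->interval;

		b->tokens += periods * b->fill_rate;
		b->norm_time += periods * b->interval;
		if (b->tokens > b->tokens_max)
			b->tokens = b->tokens_max;
	}

	if (b->on_hold) {
		if (b->tokens >= b->tokens_min)
			b->on_hold = 0;	/* refilled enough, unhold */
	} else if (b->tokens <= 0) {
		b->on_hold = 1;		/* exhausted, put on hold */
	}
	return !b->on_hold;
}

int main(void)
{
	/* e.g. 3 tokens every 10 ticks => roughly 30% of one CPU */
	struct bucket b = { .tokens = 5, .tokens_min = 3, .tokens_max = 10,
			    .fill_rate = 3, .interval = 10 };
	unsigned long tick;

	for (tick = 0; tick < 40; tick++) {
		int runnable = bucket_recalc(&b, tick);

		if (runnable)
			b.tokens--;	/* one token per tick while running */
		printf("tick %2lu: tokens=%2ld %s\n", tick, b.tokens,
		       runnable ? "running" : "on hold");
	}
	return 0;
}

Compiled and run, the sketch prints alternating bursts of "running" and "on hold" ticks whose ratio follows fill_rate/interval, which is the same duty-cycle effect vx_schedule() and vx_try_unhold() produce with the real per-cpu hold_queue.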