--- linux-2.6.18.2/fs/proc/array.c 2006-09-20 16:58:35 +0200 +++ linux-2.6.18.2-vs2.1.1/fs/proc/array.c 2006-10-25 03:39:09 +0200 @@ -135,7 +137,9 @@ static const char *task_state_array[] = "T (stopped)", /* 4 */ "T (tracing stop)", /* 8 */ "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "X (dead)", /* 32 */ + "N (noninteractive)", /* 64 */ + "H (on hold)" /* 128 */ }; static inline const char * get_task_state(struct task_struct *tsk) --- linux-2.6.18.2/fs/proc/array.c 2006-09-20 16:58:35 +0200 +++ linux-2.6.18.2-vs2.1.1/fs/proc/array.c 2006-10-25 03:39:09 +0200 @@ -144,7 +148,8 @@ static inline const char * get_task_stat TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE | TASK_STOPPED | - TASK_TRACED)) | + TASK_TRACED | + TASK_ONHOLD)) | (tsk->exit_state & (EXIT_ZOMBIE | EXIT_DEAD)); const char **p = &task_state_array[0]; --- linux-2.6.18.2/include/linux/sched.h 2006-09-20 16:58:44 +0200 +++ linux-2.6.18.2-vs2.1.1/include/linux/sched.h 2006-10-06 23:09:03 +0200 @@ -148,6 +150,7 @@ extern unsigned long weighted_cpuload(co #define EXIT_DEAD 32 /* in tsk->state again */ #define TASK_NONINTERACTIVE 64 +#define TASK_ONHOLD 128 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) --- linux-2.6.18.2/include/linux/vserver/sched.h 1970-01-01 01:00:00 +0100 +++ linux-2.6.18.2-vs2.1.1/include/linux/vserver/sched.h 2006-09-25 15:40:02 +0200 @@ -0,0 +1,26 @@ +#ifndef _VX_SCHED_H +#define _VX_SCHED_H + + +#ifdef __KERNEL__ + +struct timespec; + +void vx_vsi_uptime(struct timespec *, struct timespec *); + + +struct vx_info; + +void vx_update_load(struct vx_info *); + + +int vx_tokens_recalc(struct _vx_sched_pc *, + unsigned long *, unsigned long *, int [2]); + +void vx_update_sched_param(struct _vx_sched *sched, + struct _vx_sched_pc *sched_pc); + +#endif /* __KERNEL__ */ +#else /* _VX_SCHED_H */ +#warning duplicate inclusion +#endif /* _VX_SCHED_H */ --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -243,6 +246,16 @@ struct rq { struct task_struct *migration_thread; struct list_head migration_queue; #endif + unsigned long norm_time; + unsigned long idle_time; +#ifdef CONFIG_VSERVER_IDLETIME + int idle_skip; +#endif +#ifdef CONFIG_VSERVER_HARDCPU + struct list_head hold_queue; + unsigned long nr_onhold; + int idle_tokens; +#endif #ifdef CONFIG_SCHEDSTATS /* latency stats */ --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -672,6 +685,7 @@ sched_info_switch(struct task_struct *pr */ static void dequeue_task(struct task_struct *p, struct prio_array *array) { + BUG_ON(p->state & TASK_ONHOLD); array->nr_active--; list_del(&p->run_list); if (list_empty(array->queue + p->prio)) --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -680,6 +694,7 @@ static void dequeue_task(struct task_str static void enqueue_task(struct task_struct *p, struct prio_array *array) { + BUG_ON(p->state & TASK_ONHOLD); sched_info_queued(p); list_add_tail(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -693,6 +708,7 @@ static void enqueue_task(struct task_str */ static void requeue_task(struct task_struct *p, struct prio_array *array) { + BUG_ON(p->state & TASK_ONHOLD); list_move_tail(&p->run_list, array->queue + 
p->prio); } --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -699,6 +715,7 @@ ***** static inline void enqueue_task_head(struct task_struct *p, struct prio_array *array) { + BUG_ON(p->state & TASK_ONHOLD); list_add(&p->run_list, array->queue + p->prio); __set_bit(p->prio, array->bitmap); array->nr_active++; --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -727,6 +744,10 @@ static inline int __normal_prio(struct t bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; prio = p->static_prio - bonus; + + /* adjust effective priority */ + prio = vx_adjust_prio(p, prio, MAX_USER_PRIO); + if (prio < MAX_RT_PRIO) prio = MAX_RT_PRIO; if (prio > MAX_PRIO-1) --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -836,6 +857,9 @@ static int effective_prio(struct task_st return p->prio; } +#include "sched_mon.h" + + /* * __activate_task - move a task to the runqueue. */ --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -845,6 +869,7 @@ static void __activate_task(struct task_ if (batch_task(p)) target = rq->expired; + vxm_activate_task(p, rq); enqueue_task(p, target); inc_nr_running(p, rq); } --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -854,6 +879,7 @@ static void __activate_task(struct task_ */ static inline void __activate_idle_task(struct task_struct *p, struct rq *rq) { + vxm_activate_idle(p, rq); enqueue_task_head(p, rq->active); inc_nr_running(p, rq); } --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -975,6 +1001,7 @@ static void activate_task(struct task_st } p->timestamp = now; + vx_activate_task(p); __activate_task(p, rq); } --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -981,7 +1008,7 @@ ***** /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, struct rq *rq) +static void __deactivate_task(struct task_struct *p, struct rq *rq) { dec_nr_running(p, rq); dequeue_task(p, p->array); --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -985,6 +1012,7 @@ ***** { dec_nr_running(p, rq); dequeue_task(p, p->array); + vxm_deactivate_task(p, rq); p->array = NULL; } --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -988,6 +1016,15 @@ ***** p->array = NULL; } +static inline +void deactivate_task(struct task_struct *p, struct rq *rq) +{ + vx_deactivate_task(p); + __deactivate_task(p, rq); +} + +#include "sched_hard.h" + /* * resched_task - mark a task 'to be rescheduled now'. * --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -1063,6 +1100,7 @@ migrate_task(struct task_struct *p, int { struct rq *rq = task_rq(p); + vxm_migrate_task(p, rq, dest_cpu); /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. 
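
The hunks above introduce the TASK_ONHOLD state bit (128) and guard every priority-array operation (dequeue_task, enqueue_task, requeue_task, enqueue_task_head) with BUG_ON(p->state & TASK_ONHOLD): a task that is on hold must never sit on a runqueue. The following standalone userspace sketch is illustrative only and is not part of the patch; struct task, struct runq, hold_task() and unhold_task() are simplified stand-ins for the kernel structures and the vx_hold_task()/vx_unhold_task() helpers defined later in sched_hard.h. It only models the invariant the BUG_ON()s enforce.

/*
 * Illustrative sketch, not kernel code: a held task lives only on the
 * hold queue and must be unheld before it may be enqueued again.
 */
#include <assert.h>
#include <stdio.h>

#define TASK_RUNNING	0
#define TASK_ONHOLD	128	/* matches the new state bit in sched.h */

struct task { int state; int on_runqueue; int on_holdqueue; };
struct runq { int nr_running; int nr_onhold; };

static void enqueue_task(struct task *p, struct runq *rq)
{
	assert(!(p->state & TASK_ONHOLD));	/* mirrors the BUG_ON() guards */
	p->on_runqueue = 1;
	rq->nr_running++;
}

static void hold_task(struct task *p, struct runq *rq)
{
	p->on_runqueue = 0;		/* taken off the runqueue ... */
	rq->nr_running--;
	p->state |= TASK_ONHOLD;	/* ... and parked on the hold queue */
	p->on_holdqueue = 1;
	rq->nr_onhold++;
}

static void unhold_task(struct task *p, struct runq *rq)
{
	p->on_holdqueue = 0;
	rq->nr_onhold--;
	p->state &= ~TASK_ONHOLD;	/* clear the bit first ... */
	enqueue_task(p, rq);		/* ... so the assert stays satisfied */
}

int main(void)
{
	struct runq rq = { 0, 0 };
	struct task p = { TASK_RUNNING, 0, 0 };

	enqueue_task(&p, &rq);
	hold_task(&p, &rq);	/* out of tokens: park the task */
	unhold_task(&p, &rq);	/* tokens available again: requeue it */
	printf("running=%d onhold=%d\n", rq.nr_running, rq.nr_onhold);
	return 0;
}

The try_to_wake_up() hunk that follows applies the same rule in the kernel: it calls vx_unhold_task() to clear the bit before the wakeup state mask is checked.
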
--- linux-2.6.18.2/kernel/sched.c	2006-11-04 19:43:24 +0100
+++ linux-2.6.18.2-vs2.1.1/kernel/sched.c	2006-10-28 19:04:30 +0200
@@ -1383,6 +1421,12 @@ static int try_to_wake_up(struct task_st
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
+
+	/* we need to unhold suspended tasks */
+	if (old_state & TASK_ONHOLD) {
+		vx_unhold_task(p, rq);
+		old_state = p->state;
+	}
 	if (!(old_state & state))
 		goto out;
--- linux-2.6.18.2/kernel/sched.c	2006-11-04 19:43:24 +0100
+++ linux-2.6.18.2-vs2.1.1/kernel/sched.c	2006-10-28 19:04:30 +0200
@@ -1488,6 +1532,7 @@ out_activate:
 #endif /* CONFIG_SMP */
 	if (old_state == TASK_UNINTERRUPTIBLE) {
 		rq->nr_uninterruptible--;
+		vx_uninterruptible_dec(p);
 		/*
 		 * Tasks on involuntary sleep don't earn
 		 * sleep_avg beyond just interactive state.
--- linux-2.6.18.2/kernel/sched.c	2006-11-04 19:43:24 +0100
+++ linux-2.6.18.2-vs2.1.1/kernel/sched.c	2006-10-28 19:04:30 +0200
@@ -1634,6 +1679,7 @@ void fastcall wake_up_new_task(struct ta
 	p->prio = effective_prio(p);
+	vx_activate_task(p);
 	if (likely(cpu == this_cpu)) {
 		if (!(clone_flags & CLONE_VM)) {
 			/*
--- linux-2.6.18.2/kernel/sched.c	2006-11-04 19:43:24 +0100
+++ linux-2.6.18.2-vs2.1.1/kernel/sched.c	2006-10-28 19:04:30 +0200
@@ -1645,6 +1691,7 @@ void fastcall wake_up_new_task(struct ta
 			__activate_task(p, rq);
 		else {
 			p->prio = current->prio;
+			BUG_ON(p->state & TASK_ONHOLD);
 			p->normal_prio = current->normal_prio;
 			list_add_tail(&p->run_list, &current->run_list);
 			p->array = current->array;
--- linux-2.6.18.2/kernel/sched.c	2006-11-04 19:43:24 +0100
+++ linux-2.6.18.2-vs2.1.1/kernel/sched.c	2006-10-28 19:04:30 +0200
@@ -2957,7 +3007,7 @@ *****
 	/* Add user time to cpustat. */
 	tmp = cputime_to_cputime64(cputime);
-	if (TASK_NICE(p) > 0)
+	if (nice)
 		cpustat->nice = cputime64_add(cpustat->nice, tmp);
 	else
 		cpustat->user = cputime64_add(cpustat->user, tmp);
--- linux-2.6.18.2/kernel/sched.c	2006-11-04 19:43:24 +0100
+++ linux-2.6.18.2-vs2.1.1/kernel/sched.c	2006-10-28 19:04:30 +0200
@@ -3036,6 +3089,7 @@ *****
 	if (p == rq->idle) {
 		if (wake_priority_sleeper(rq))
 			goto out;
+		vx_idle_resched(rq);
 		rebalance_tick(cpu, rq, SCHED_IDLE);
 		return;
 	}
--- linux-2.6.18.2/kernel/sched.c	2006-11-04 19:43:24 +0100
+++ linux-2.6.18.2-vs2.1.1/kernel/sched.c	2006-10-28 19:04:30 +0200
@@ -3068,7 +3122,7 @@ void scheduler_tick(void)
 		}
 		goto out_unlock;
 	}
-	if (!--p->time_slice) {
+	if (vx_need_resched(p, --p->time_slice, cpu)) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
 		p->prio = effective_prio(p);
--- linux-2.6.18.2/kernel/sched.c	2006-11-04 19:43:24 +0100
+++ linux-2.6.18.2-vs2.1.1/kernel/sched.c	2006-10-28 19:04:30 +0200
@@ -3358,8 +3412,10 @@ need_resched_nonpreemptible:
 				unlikely(signal_pending(prev))))
 			prev->state = TASK_RUNNING;
 		else {
-			if (prev->state == TASK_UNINTERRUPTIBLE)
+			if (prev->state == TASK_UNINTERRUPTIBLE) {
 				rq->nr_uninterruptible++;
+				vx_uninterruptible_inc(prev);
+			}
 			deactivate_task(prev, rq);
 		}
 	}
--- linux-2.6.18.2/kernel/sched.c	2006-11-04 19:43:24 +0100
+++ linux-2.6.18.2-vs2.1.1/kernel/sched.c	2006-10-28 19:04:30 +0200
@@ -3365,7 +3421,16 @@ *****
 	}
 	cpu = smp_processor_id();
+	vx_set_rq_time(rq, jiffies);
+try_unhold:
+	vx_try_unhold(rq, cpu);
+pick_next:
+	if (unlikely(!rq->nr_running)) {
+		/* can we skip idle time?
*/ + if (vx_try_skip(rq, cpu)) + goto try_unhold; + idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -3392,6 +3457,10 @@ need_resched_nonpreemptible: queue = array->queue + idx; next = list_entry(queue->next, struct task_struct, run_list); + /* check before we schedule this context */ + if (!vx_schedule(next, rq, cpu)) + goto pick_next; + if (!rt_task(next) && interactive_sleep(next->sleep_type)) { unsigned long long delta = now - next->timestamp; if (unlikely((long long)(now - next->timestamp) < 0)) --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -4161,6 +4230,7 @@ recheck: oldprio = p->prio; __setscheduler(p, policy, param->sched_priority); if (array) { + vx_activate_task(p); __activate_task(p, rq); /* * Reschedule if we are currently running on this runqueue and --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -4949,6 +5019,7 @@ static int __migrate_task(struct task_st p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); + vx_activate_task(p); __activate_task(p, rq_dest); if (TASK_PREEMPTS_CURR(p, rq_dest)) resched_task(rq_dest->curr); --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -6789,7 +6860,10 @@ void __init sched_init(void) INIT_LIST_HEAD(&rq->migration_queue); #endif atomic_set(&rq->nr_iowait, 0); - +#ifdef CONFIG_VSERVER_HARDCPU + INIT_LIST_HEAD(&rq->hold_queue); + rq->nr_onhold = 0; +#endif for (j = 0; j < 2; j++) { array = rq->arrays + j; for (k = 0; k < MAX_PRIO; k++) { --- linux-2.6.18.2/kernel/sched.c 2006-11-04 19:43:24 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched.c 2006-10-28 19:04:30 +0200 @@ -6865,6 +6939,7 @@ void normalize_rt_tasks(void) deactivate_task(p, task_rq(p)); __setscheduler(p, SCHED_NORMAL, 0); if (array) { + vx_activate_task(p); __activate_task(p, task_rq(p)); resched_task(rq->curr); } --- linux-2.6.18.2/kernel/sched_hard.h 1970-01-01 01:00:00 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/sched_hard.h 2006-09-25 18:55:03 +0200 @@ -0,0 +1,324 @@ + +#ifdef CONFIG_VSERVER_IDLELIMIT + +/* + * vx_idle_resched - reschedule after maxidle + */ +static inline +void vx_idle_resched(struct rq *rq) +{ + /* maybe have a better criterion for paused */ + if (!--rq->idle_tokens && !list_empty(&rq->hold_queue)) + set_need_resched(); +} + +#else /* !CONFIG_VSERVER_IDLELIMIT */ + +#define vx_idle_resched(rq) + +#endif /* CONFIG_VSERVER_IDLELIMIT */ + + + +#ifdef CONFIG_VSERVER_IDLETIME + +#define vx_set_rq_min_skip(rq, min) \ + (rq)->idle_skip = (min) + +#define vx_save_min_skip(ret, min, val) \ + __vx_save_min_skip(ret, min, val) + +static inline +void __vx_save_min_skip(int ret, int *min, int val) +{ + if (ret > -2) + return; + if ((*min > val) || !*min) + *min = val; +} + +static inline +int vx_try_skip(struct rq *rq, int cpu) +{ + /* artificially advance time */ + if (rq->idle_skip > 0) { + vxdprintk(list_empty(&rq->hold_queue), + "hold queue empty on cpu %d", cpu); + rq->idle_time += rq->idle_skip; + vxm_idle_skip(rq, cpu); + return 1; + } + return 0; +} + +#else /* !CONFIG_VSERVER_IDLETIME */ + +#define vx_set_rq_min_skip(rq, min) \ + ({ int dummy = (min); dummy; }) + +#define vx_save_min_skip(ret, min, val) + +static inline 
+int vx_try_skip(struct rq *rq, int cpu) +{ + return 0; +} + +#endif /* CONFIG_VSERVER_IDLETIME */ + + + +#ifdef CONFIG_VSERVER_HARDCPU + +#define vx_set_rq_max_idle(rq, max) \ + (rq)->idle_tokens = (max) + +#define vx_save_max_idle(ret, min, val) \ + __vx_save_max_idle(ret, min, val) + +static inline +void __vx_save_max_idle(int ret, int *min, int val) +{ + if (*min > val) + *min = val; +} + + +/* + * vx_hold_task - put a task on the hold queue + */ +static inline +void vx_hold_task(struct task_struct *p, struct rq *rq) +{ + __deactivate_task(p, rq); + p->state |= TASK_ONHOLD; + /* a new one on hold */ + rq->nr_onhold++; + vxm_hold_task(p, rq); + list_add_tail(&p->run_list, &rq->hold_queue); +} + +/* + * vx_unhold_task - put a task back to the runqueue + */ +static inline +void vx_unhold_task(struct task_struct *p, struct rq *rq) +{ + list_del(&p->run_list); + /* one less waiting */ + rq->nr_onhold--; + p->state &= ~TASK_ONHOLD; + enqueue_task(p, rq->expired); + inc_nr_running(p, rq); + vxm_unhold_task(p, rq); + + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; +} + +unsigned long nr_onhold(void) +{ + unsigned long i, sum = 0; + + for_each_online_cpu(i) + sum += cpu_rq(i)->nr_onhold; + + return sum; +} + + + +static inline +int __vx_tokens_avail(struct _vx_sched_pc *sched_pc) +{ + return sched_pc->tokens; +} + +static inline +void __vx_consume_token(struct _vx_sched_pc *sched_pc) +{ + sched_pc->tokens--; +} + +static inline +int vx_need_resched(struct task_struct *p, int slice, int cpu) +{ + struct vx_info *vxi = p->vx_info; + + if (vx_info_flags(vxi, VXF_SCHED_HARD|VXF_SCHED_PRIO, 0)) { + struct _vx_sched_pc *sched_pc = + &vx_per_cpu(vxi, sched_pc, cpu); + int tokens; + + /* maybe we can simplify that to decrement + the token counter unconditional? */ + + if ((tokens = __vx_tokens_avail(sched_pc)) > 0) + __vx_consume_token(sched_pc); + + /* for tokens > 0, one token was consumed */ + if (tokens < 2) + slice = 0; + } + vxm_need_resched(p, slice, cpu); + return (slice == 0); +} + + +#define vx_set_rq_time(rq, time) do { \ + rq->norm_time = time; \ +} while (0) + + +static inline +void vx_try_unhold(struct rq *rq, int cpu) +{ + struct vx_info *vxi = NULL; + struct list_head *l, *n; + int maxidle = HZ; + int minskip = 0; + + /* nothing to do? what about pause? 
*/ + if (list_empty(&rq->hold_queue)) + return; + + list_for_each_safe(l, n, &rq->hold_queue) { + int ret, delta_min[2]; + struct _vx_sched_pc *sched_pc; + struct task_struct *p; + + p = list_entry(l, struct task_struct, run_list); + /* don't bother with same context */ + if (vxi == p->vx_info) + continue; + + vxi = p->vx_info; + /* ignore paused contexts */ + if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0)) + continue; + + sched_pc = &vx_per_cpu(vxi, sched_pc, cpu); + + /* recalc tokens */ + vxm_sched_info(sched_pc, vxi, cpu); + ret = vx_tokens_recalc(sched_pc, + &rq->norm_time, &rq->idle_time, delta_min); + vxm_tokens_recalc(sched_pc, rq, vxi, cpu); + + if (ret > 0) { + /* we found a runable context */ + vx_unhold_task(p, rq); + break; + } + vx_save_max_idle(ret, &maxidle, delta_min[0]); + vx_save_min_skip(ret, &minskip, delta_min[1]); + } + vx_set_rq_max_idle(rq, maxidle); + vx_set_rq_min_skip(rq, minskip); + vxm_rq_max_min(rq, cpu); +} + + +static inline +int vx_schedule(struct task_struct *next, struct rq *rq, int cpu) +{ + struct vx_info *vxi = next->vx_info; + struct _vx_sched_pc *sched_pc; + int delta_min[2]; + int flags, ret; + + if (!vxi) + return 1; + + flags = vxi->vx_flags; + + if (unlikely(vx_check_flags(flags , VXF_SCHED_PAUSE, 0))) + goto put_on_hold; + if (!vx_check_flags(flags , VXF_SCHED_HARD|VXF_SCHED_PRIO, 0)) + return 1; + + sched_pc = &vx_per_cpu(vxi, sched_pc, cpu); +#ifdef CONFIG_SMP + /* update scheduler params */ + if (cpu_isset(cpu, vxi->sched.update)) { + vx_update_sched_param(&vxi->sched, sched_pc); + vxm_update_sched(sched_pc, vxi, cpu); + cpu_clear(cpu, vxi->sched.update); + } +#endif + vxm_sched_info(sched_pc, vxi, cpu); + ret = vx_tokens_recalc(sched_pc, + &rq->norm_time, &rq->idle_time, delta_min); + vxm_tokens_recalc(sched_pc, rq, vxi, cpu); + + if (!vx_check_flags(flags , VXF_SCHED_HARD, 0)) + return 1; + + if (unlikely(ret < 0)) { + vx_save_max_idle(ret, &rq->idle_tokens, delta_min[0]); + vx_save_min_skip(ret, &rq->idle_skip, delta_min[1]); + vxm_rq_max_min(rq, cpu); + put_on_hold: + vx_hold_task(next, rq); + return 0; + } + return 1; +} + + +#else /* CONFIG_VSERVER_HARDCPU */ + +static inline +void vx_hold_task(struct task_struct *p, struct rq *rq) +{ + return; +} + +static inline +void vx_unhold_task(struct task_struct *p, struct rq *rq) +{ + return; +} + +unsigned long nr_onhold(void) +{ + return 0; +} + + +static inline +int vx_need_resched(struct task_struct *p, int slice, int cpu) +{ + return (slice == 0); +} + + +#define vx_set_rq_time(rq, time) + +static inline +void vx_try_unhold(struct rq *rq, int cpu) +{ + return; +} + +static inline +int vx_schedule(struct task_struct *next, struct rq *rq, int cpu) +{ + struct vx_info *vxi = next->vx_info; + struct _vx_sched_pc *sched_pc; + int delta_min[2]; + int ret; + + if (!vx_info_flags(vxi, VXF_SCHED_PRIO, 0)) + return 1; + + sched_pc = &vx_per_cpu(vxi, sched_pc, cpu); + vxm_sched_info(sched_pc, vxi, cpu); + ret = vx_tokens_recalc(sched_pc, + &rq->norm_time, &rq->idle_time, delta_min); + vxm_tokens_recalc(sched_pc, rq, vxi, cpu); + return 1; +} + +#endif /* CONFIG_VSERVER_HARDCPU */ + --- linux-2.6.18.2/kernel/vserver/sched.c 1970-01-01 01:00:00 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/vserver/sched.c 2006-10-28 17:24:28 +0200 @@ -0,0 +1,318 @@ +/* + * linux/kernel/vserver/sched.c + * + * Virtual Server: Scheduler Support + * + * Copyright (C) 2004-2006 Herbert Pötzl + * + * V0.01 adapted Sam Vilains version to 2.6.3 + * V0.02 removed legacy interface + * V0.03 changed vcmds to vxi arg + * + */ + +#include 
+#include
+#include
+#include
+
+#include
+#include
+
+#define vxd_check_range(val, min, max) do {		\
+	vxlprintk((val < min) || (val > max),		\
+		"check_range(%ld,%ld,%ld)",		\
+		(long)val, (long)min, (long)max,	\
+		__FILE__, __LINE__);			\
+	} while (0)
+
+
+void vx_update_sched_param(struct _vx_sched *sched,
+	struct _vx_sched_pc *sched_pc)
+{
+	unsigned int set_mask = sched->update_mask;
+
+	if (set_mask & VXSM_FILL_RATE)
+		sched_pc->fill_rate[0] = sched->fill_rate[0];
+	if (set_mask & VXSM_INTERVAL)
+		sched_pc->interval[0] = sched->interval[0];
+	if (set_mask & VXSM_FILL_RATE2)
+		sched_pc->fill_rate[1] = sched->fill_rate[1];
+	if (set_mask & VXSM_INTERVAL2)
+		sched_pc->interval[1] = sched->interval[1];
+	if (set_mask & VXSM_TOKENS)
+		sched_pc->tokens = sched->tokens;
+	if (set_mask & VXSM_TOKENS_MIN)
+		sched_pc->tokens_min = sched->tokens_min;
+	if (set_mask & VXSM_TOKENS_MAX)
+		sched_pc->tokens_max = sched->tokens_max;
+
+	if (set_mask & VXSM_IDLE_TIME)
+		sched_pc->flags |= VXSF_IDLE_TIME;
+	else
+		sched_pc->flags &= ~VXSF_IDLE_TIME;
+
+	/* reset time */
+	sched_pc->norm_time = jiffies;
+}
+
+
+/*
+ * recalculate the context's scheduling tokens
+ *
+ * ret > 0 : number of tokens available
+ * ret < 0 : on hold, check delta_min[]
+ *           -1 only jiffies
+ *           -2 also idle time
+ *
+ */
+int vx_tokens_recalc(struct _vx_sched_pc *sched_pc,
+	unsigned long *norm_time, unsigned long *idle_time, int delta_min[2])
+{
+	long delta;
+	long tokens = 0;
+	int flags = sched_pc->flags;
+
+	/* how much time did pass? */
+	delta = *norm_time - sched_pc->norm_time;
+	vxd_check_range(delta, 0, INT_MAX);
+
+	if (delta >= sched_pc->interval[0]) {
+		long tokens, integral;
+
+		/* calc integral token part */
+		tokens = delta / sched_pc->interval[0];
+		integral = tokens * sched_pc->interval[0];
+		tokens *= sched_pc->fill_rate[0];
+#ifdef CONFIG_VSERVER_HARDCPU
+		delta_min[0] = delta - integral;
+		vxd_check_range(delta_min[0], 0, sched_pc->interval[0]);
+#endif
+		/* advance time */
+		sched_pc->norm_time += delta;
+
+		/* add tokens */
+		sched_pc->tokens += tokens;
+		sched_pc->token_time += tokens;
+	}
+	else
+		delta_min[0] = delta;
+
+#ifdef CONFIG_VSERVER_IDLETIME
+	if (!(flags & VXSF_IDLE_TIME))
+		goto skip_idle;
+
+	/* how much was the idle skip? */
+	delta = *idle_time - sched_pc->idle_time;
+	vxd_check_range(delta, 0, INT_MAX);
+
+	if (delta >= sched_pc->interval[1]) {
+		long tokens, integral;
+
+		/* calc fair share token part */
+		tokens = delta / sched_pc->interval[1];
+		integral = tokens * sched_pc->interval[1];
+		tokens *= sched_pc->fill_rate[1];
+		delta_min[1] = delta - integral;
+		vxd_check_range(delta_min[1], 0, sched_pc->interval[1]);
+
+		/* advance idle time */
+		sched_pc->idle_time += integral;
+
+		/* add tokens */
+		sched_pc->tokens += tokens;
+		sched_pc->token_time += tokens;
+	}
+	else
+		delta_min[1] = delta;
+skip_idle:
+#endif
+
+	/* clip at maximum */
+	if (sched_pc->tokens > sched_pc->tokens_max)
+		sched_pc->tokens = sched_pc->tokens_max;
+	tokens = sched_pc->tokens;
+
+	if ((flags & VXSF_ONHOLD)) {
+		/* can we unhold? */
+		if (tokens >= sched_pc->tokens_min) {
+			flags &= ~VXSF_ONHOLD;
+			sched_pc->hold_ticks +=
+				*norm_time - sched_pc->onhold;
+		}
+		else
+			goto on_hold;
+	} else {
+		/* put on hold? */
+		if (tokens <= 0) {
+			flags |= VXSF_ONHOLD;
+			sched_pc->onhold = *norm_time;
+			goto on_hold;
+		}
+	}
+	sched_pc->flags = flags;
+	return tokens;
+
+on_hold:
+	tokens = sched_pc->tokens_min - tokens;
+	sched_pc->flags = flags;
+	BUG_ON(tokens < 0);
+
+#ifdef CONFIG_VSERVER_HARDCPU
+	/* next interval?
*/ + if (!sched_pc->fill_rate[0]) + delta_min[0] = HZ; + else if (tokens > sched_pc->fill_rate[0]) + delta_min[0] += sched_pc->interval[0] * + tokens / sched_pc->fill_rate[0]; + else + delta_min[0] = sched_pc->interval[0] - delta_min[0]; + vxd_check_range(delta_min[0], 0, INT_MAX); + +#ifdef CONFIG_VSERVER_IDLETIME + if (!(flags & VXSF_IDLE_TIME)) + return -1; + + /* next interval? */ + if (!sched_pc->fill_rate[1]) + delta_min[1] = HZ; + else if (tokens > sched_pc->fill_rate[1]) + delta_min[1] += sched_pc->interval[1] * + tokens / sched_pc->fill_rate[1]; + else + delta_min[1] = sched_pc->interval[1] - delta_min[1]; + vxd_check_range(delta_min[1], 0, INT_MAX); + + return -2; +#else + return -1; +#endif /* CONFIG_VSERVER_IDLETIME */ +#else + return 0; +#endif /* CONFIG_VSERVER_HARDCPU */ +} + + +static int do_set_sched(struct vx_info *vxi, struct vcmd_set_sched_v4 *data) +{ + unsigned int set_mask = data->set_mask; + unsigned int update_mask; + + /* Sanity check data values */ + if (data->fill_rate < 0) + data->fill_rate = 1; + if (data->interval <= 0) + data->interval = HZ; + if (data->tokens_max <= 0) + data->tokens_max = HZ; + if (data->tokens_min < 0) + data->tokens_min = data->fill_rate*3; + if (data->tokens_min >= data->tokens_max) + data->tokens_min = data->tokens_max; + + if (data->prio_bias > MAX_PRIO_BIAS) + data->prio_bias = MAX_PRIO_BIAS; + if (data->prio_bias < MIN_PRIO_BIAS) + data->prio_bias = MIN_PRIO_BIAS; + + spin_lock(&vxi->sched.tokens_lock); + + if (set_mask & VXSM_FILL_RATE) + vxi->sched.fill_rate[0] = data->fill_rate; + if (set_mask & VXSM_INTERVAL) + vxi->sched.interval[0] = data->interval; + if (set_mask & VXSM_FILL_RATE2) + vxi->sched.fill_rate[1] = data->fill_rate; + if (set_mask & VXSM_INTERVAL2) + vxi->sched.interval[1] = data->interval; + if (set_mask & VXSM_TOKENS) + vxi->sched.tokens = data->tokens; + if (set_mask & VXSM_TOKENS_MIN) + vxi->sched.tokens_min = data->tokens_min; + if (set_mask & VXSM_TOKENS_MAX) + vxi->sched.tokens_max = data->tokens_max; + if (set_mask & VXSM_PRIO_BIAS) + vxi->sched.prio_bias = data->prio_bias; + + update_mask = vxi->sched.update_mask & VXSM_SET_MASK; + update_mask |= (set_mask & (VXSM_SET_MASK|VXSM_IDLE_TIME)); + vxi->sched.update_mask = update_mask; +#ifdef CONFIG_SMP + rmb(); + if (set_mask & VXSM_CPU_ID) + vxi->sched.update = cpumask_of_cpu(data->cpu_id); + else + vxi->sched.update = CPU_MASK_ALL; + /* forced reload? 
*/ + if (set_mask & VXSM_FORCE) { + int cpu; + + for_each_possible_cpu(cpu) + vx_update_sched_param(&vxi->sched, + &vx_per_cpu(vxi, sched_pc, cpu)); + } +#else + /* on UP we update immediately */ + vx_update_sched_param(&vxi->sched, + &vx_per_cpu(vxi, sched_pc, 0)); +#endif + + spin_unlock(&vxi->sched.tokens_lock); + return 0; +} + + +#ifdef CONFIG_VSERVER_LEGACY + +#define COPY_MASK_V2(name, mask) \ + if (vc_data.name != SCHED_KEEP) { \ + vc_data_v4.name = vc_data.name; \ + vc_data_v4.set_mask |= mask; \ + } + +int vc_set_sched_v2(struct vx_info *vxi, void __user *data) +{ + struct vcmd_set_sched_v2 vc_data; + struct vcmd_set_sched_v4 vc_data_v4 = { .set_mask = 0 }; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + COPY_MASK_V2(fill_rate, VXSM_FILL_RATE); + COPY_MASK_V2(interval, VXSM_INTERVAL); + COPY_MASK_V2(tokens, VXSM_TOKENS); + COPY_MASK_V2(tokens_min, VXSM_TOKENS_MIN); + COPY_MASK_V2(tokens_max, VXSM_TOKENS_MAX); + vc_data_v4.bucket_id = 0; + + do_set_sched(vxi, &vc_data_v4); + return 0; +} +#endif + +int vc_set_sched_v3(struct vx_info *vxi, void __user *data) +{ + struct vcmd_set_sched_v3 vc_data; + struct vcmd_set_sched_v4 vc_data_v4; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + /* structures are binary compatible */ + memcpy(&vc_data_v4, &vc_data, sizeof(vc_data)); + vc_data_v4.set_mask &= VXSM_V3_MASK; + vc_data_v4.bucket_id = 0; + + return do_set_sched(vxi, &vc_data_v4); +} + +int vc_set_sched(struct vx_info *vxi, void __user *data) +{ + struct vcmd_set_sched_v4 vc_data; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + return do_set_sched(vxi, &vc_data); +} +
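
Taken together, vc_set_sched()/do_set_sched() configure a per-CPU token bucket that vx_tokens_recalc() refills (fill_rate[0] tokens every interval[0] ticks, clipped at tokens_max) and vx_need_resched() drains by one token per timer tick; a context that runs dry goes on hold until the bucket has refilled to tokens_min. The following userspace sketch of that arithmetic is illustrative only: it uses plain ints instead of struct _vx_sched_pc, the struct bucket/refill() names and the HZ value are stand-ins, and the numbers in main() are made-up example parameters.

/*
 * Userspace sketch of the token-bucket arithmetic behind
 * vx_tokens_recalc()/vx_need_resched() -- not the kernel code above.
 */
#include <stdio.h>

#define HZ 1000	/* assumed tick rate for the simulation */

struct bucket {
	int fill_rate;	/* tokens added per interval */
	int interval;	/* refill period in ticks */
	int tokens;	/* current fill */
	int tokens_min;	/* refill level needed to leave hold */
	int tokens_max;	/* bucket size */
	int onhold;
	long last_refill;
};

/* refill for the time that passed, clip at tokens_max, update hold state */
static void refill(struct bucket *b, long now)
{
	long delta = now - b->last_refill;

	if (delta >= b->interval) {
		long periods = delta / b->interval;

		b->tokens += periods * b->fill_rate;
		if (b->tokens > b->tokens_max)
			b->tokens = b->tokens_max;
		b->last_refill += periods * b->interval;
	}
	if (b->onhold && b->tokens >= b->tokens_min)
		b->onhold = 0;		/* enough refill: leave hold */
	else if (!b->onhold && b->tokens <= 0)
		b->onhold = 1;		/* ran dry: go on hold */
}

int main(void)
{
	/* example values: 30 tokens every 100 ticks */
	struct bucket b = { 30, 100, 100, 30, 100, 0, 0 };
	long tick, ran = 0;

	for (tick = 0; tick < 10 * HZ; tick++) {
		refill(&b, tick);
		if (!b.onhold && b.tokens > 0) {
			b.tokens--;	/* one token per tick of CPU time */
			ran++;
		}
	}
	printf("ran %ld of %d ticks (~%ld%%)\n", ran, 10 * HZ,
	       ran * 100 / (10 * HZ));
	return 0;
}

With the example values (30 tokens per 100-tick interval) the simulated context gets roughly 30% of one CPU, which is the kind of hard share the VXF_SCHED_HARD/CONFIG_VSERVER_HARDCPU path above is designed to enforce.
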