diff -NurpP --minimal linux-2.6.16-vs2.1.1-rc14.10/include/linux/vserver/monitor.h linux-2.6.16-vs2.1.1-rc14.11/include/linux/vserver/monitor.h --- linux-2.6.16-vs2.1.1-rc14.10/include/linux/vserver/monitor.h 1970-01-01 01:00:00 +0100 +++ linux-2.6.16-vs2.1.1-rc14.11/include/linux/vserver/monitor.h 2006-04-03 02:10:30 +0200 @@ -0,0 +1,97 @@ +#ifndef _VX_MONITOR_H +#define _VX_MONITOR_H + +#include + + +enum { + VXM_UNUSED = 0, + + VXM_SYNC = 0x10, + + VXM_UPDATE = 0x20, + VXM_UPDATE_1, + VXM_UPDATE_2, + + VXM_RQINFO_1 = 0x24, + VXM_RQINFO_2, + + VXM_ACTIVATE = 0x40, + VXM_DEACTIVATE, + VXM_IDLE, + + VXM_HOLD = 0x44, + VXM_UNHOLD, + + VXM_MIGRATE = 0x48, + VXM_RESCHED, + + /* all other bits are flags */ + VXM_SCHED = 0x80, +}; + +struct _vxm_update_1 { + uint32_t tokens_max; + uint32_t fill_rate; + uint32_t interval; +}; + +struct _vxm_update_2 { + uint32_t tokens_min; + uint32_t fill_rate; + uint32_t interval; +}; + +struct _vxm_rqinfo_1 { + uint16_t running; + uint16_t onhold; + uint16_t iowait; + uint16_t uintr; + uint32_t idle_tokens; +}; + +struct _vxm_rqinfo_2 { + uint32_t norm_time; + uint32_t idle_time; + uint32_t idle_skip; +}; + +struct _vxm_sched { + uint32_t tokens; + uint32_t norm_time; + uint32_t idle_time; +}; + +struct _vxm_task { + uint16_t pid; + uint16_t state; +}; + +struct _vxm_event { + uint32_t jif; + union { + uint32_t seq; + uint32_t sec; + }; + union { + uint32_t tokens; + uint32_t nsec; + struct _vxm_task tsk; + }; +}; + +struct _vx_mon_entry { + uint16_t type; + uint16_t xid; + union { + struct _vxm_event ev; + struct _vxm_sched sd; + struct _vxm_update_1 u1; + struct _vxm_update_2 u2; + struct _vxm_rqinfo_1 q1; + struct _vxm_rqinfo_2 q2; + }; +}; + + +#endif /* _VX_MONITOR_H */ diff -NurpP --minimal linux-2.6.16-vs2.1.1-rc14.10/kernel/sched.c linux-2.6.16-vs2.1.1-rc14.11/kernel/sched.c --- linux-2.6.16-vs2.1.1-rc14.10/kernel/sched.c 2006-03-20 18:09:05 +0100 +++ linux-2.6.16-vs2.1.1-rc14.11/kernel/sched.c 2006-04-03 02:10:30 +0200 @@ 
-683,11 +683,15 @@ static int effective_prio(task_t *p) return prio; } +#include "sched_mon.h" + + /* * __activate_task - move a task to the runqueue. */ static inline void __activate_task(task_t *p, runqueue_t *rq) { + vxm_activate_task(p, rq); enqueue_task(p, rq->active); rq->nr_running++; } @@ -697,6 +701,7 @@ static inline void __activate_task(task_ */ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) { + vxm_activate_idle(p, rq); enqueue_task_head(p, rq->active); rq->nr_running++; } @@ -825,6 +830,7 @@ static void __deactivate_task(struct tas { rq->nr_running--; dequeue_task(p, p->array); + vxm_deactivate_task(p, rq); p->array = NULL; } @@ -901,6 +907,7 @@ static int migrate_task(task_t *p, int d { runqueue_t *rq = task_rq(p); + vxm_migrate_task(p, rq, dest_cpu); /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. @@ -2616,6 +2623,7 @@ void scheduler_tick(void) unsigned long long now = sched_clock(); update_cpu_clock(p, rq, now); + vxm_sync(now, cpu); rq->timestamp_last_tick = now; @@ -2996,7 +3004,7 @@ pick_next: if (unlikely(!rq->nr_running)) { go_idle: /* can we skip idle time? 
*/ - if (vx_try_skip(rq)) + if (vx_try_skip(rq, cpu)) goto try_unhold; idle_balance(cpu, rq); diff -NurpP --minimal linux-2.6.16-vs2.1.1-rc14.10/kernel/sched_hard.h linux-2.6.16-vs2.1.1-rc14.11/kernel/sched_hard.h --- linux-2.6.16-vs2.1.1-rc14.10/kernel/sched_hard.h 2006-03-23 20:36:40 +0100 +++ linux-2.6.16-vs2.1.1-rc14.11/kernel/sched_hard.h 2006-04-03 02:10:30 +0200 @@ -44,11 +44,12 @@ void __vx_save_min_skip(int ret, int *mi } static inline -int vx_try_skip(runqueue_t *rq) +int vx_try_skip(runqueue_t *rq, int cpu) { /* artificially advance time */ if (rq->idle_skip && !list_empty(&rq->hold_queue)) { rq->idle_time += rq->idle_skip; + vxm_idle_skip(rq, cpu); return 1; } return 0; @@ -62,7 +63,7 @@ int vx_try_skip(runqueue_t *rq) #define vx_save_min_skip(ret, min, val) static inline -int vx_try_skip(runqueue_t *rq) +int vx_try_skip(runqueue_t *rq, int cpu) { return 0; } @@ -97,6 +98,7 @@ void vx_hold_task(struct task_struct *p, p->state |= TASK_ONHOLD; /* a new one on hold */ rq->nr_onhold++; + vxm_hold_task(p, rq); list_add_tail(&p->run_list, &rq->hold_queue); } @@ -112,6 +114,7 @@ void vx_unhold_task(struct task_struct * p->state &= ~TASK_ONHOLD; enqueue_task(p, rq->expired); rq->nr_running++; + vxm_unhold_task(p, rq); if (p->static_prio < rq->best_expired_prio) rq->best_expired_prio = p->static_prio; @@ -159,8 +162,9 @@ int vx_need_resched(struct task_struct * /* for tokens > 0, one token was consumed */ if (tokens < 2) - return 1; + slice = 0; } + vxm_need_resched(p, slice, cpu); return (slice == 0); } @@ -199,8 +203,10 @@ void vx_try_unhold(runqueue_t *rq, int c sched_pc = &vx_per_cpu(vxi, sched_pc, cpu); /* recalc tokens */ + vxm_sched_info(sched_pc, vxi, cpu); ret = vx_tokens_recalc(sched_pc, &rq->norm_time, &rq->idle_time, delta_min); + vxm_tokens_recalc(sched_pc, rq, vxi, cpu); if (ret > 0) { /* we found a runable context */ @@ -212,6 +218,7 @@ void vx_try_unhold(runqueue_t *rq, int c } vx_set_rq_max_idle(rq, maxidle); vx_set_rq_min_skip(rq, minskip); + 
vxm_rq_max_min(rq, cpu); } @@ -238,11 +245,14 @@ int vx_schedule(struct task_struct *next /* update scheduler params */ if (cpu_isset(cpu, vxi->sched.update)) { vx_update_sched_param(&vxi->sched, sched_pc); + vxm_update_sched(sched_pc, vxi, cpu); cpu_clear(cpu, vxi->sched.update); } #endif + vxm_sched_info(sched_pc, vxi, cpu); ret = vx_tokens_recalc(sched_pc, &rq->norm_time, &rq->idle_time, delta_min); + vxm_tokens_recalc(sched_pc, rq, vxi, cpu); if (!vx_check_flags(flags , VXF_SCHED_HARD, 0)) return 1; @@ -250,6 +260,7 @@ int vx_schedule(struct task_struct *next if (unlikely(ret < 0)) { vx_save_max_idle(ret, &rq->idle_tokens, delta_min[0]); vx_save_min_skip(ret, &rq->idle_skip, delta_min[1]); + vxm_rq_max_min(rq, cpu); put_on_hold: vx_hold_task(next, rq); return 0; @@ -305,8 +316,10 @@ int vx_schedule(struct task_struct *next return 1; sched_pc = &vx_per_cpu(vxi, sched_pc, cpu); + vxm_sched_info(sched_pc, vxi, cpu); ret = vx_tokens_recalc(sched_pc, &rq->norm_time, &rq->idle_time, delta_min); + vxm_tokens_recalc(sched_pc, rq, vxi, cpu); return 1; } diff -NurpP --minimal linux-2.6.16-vs2.1.1-rc14.10/kernel/sched_mon.h linux-2.6.16-vs2.1.1-rc14.11/kernel/sched_mon.h --- linux-2.6.16-vs2.1.1-rc14.10/kernel/sched_mon.h 1970-01-01 01:00:00 +0100 +++ linux-2.6.16-vs2.1.1-rc14.11/kernel/sched_mon.h 2006-04-03 02:12:32 +0200 @@ -0,0 +1,189 @@ + +#include <linux/vserver/monitor.h> + +#ifdef CONFIG_VSERVER_MONITOR + +struct _vx_mon_entry *vxm_advance(int cpu); + + +static inline +void __vxm_basic(struct _vx_mon_entry *entry, xid_t xid, int type) +{ + entry->type = type; + entry->xid = xid; +} + +static inline +void __vxm_sync(int cpu) +{ + struct _vx_mon_entry *entry = vxm_advance(cpu); + struct timespec now = current_kernel_time(); + + __vxm_basic(entry, 0, VXM_SYNC); + entry->ev.sec = now.tv_sec; + entry->ev.nsec = now.tv_nsec; +} + +static inline +void __vxm_task(struct task_struct *p, int type) +{ + struct _vx_mon_entry *entry = vxm_advance(task_cpu(p)); + + __vxm_basic(entry, p->xid, type); + 
entry->ev.tsk.pid = p->pid; + entry->ev.tsk.state = p->state; +} + +static inline +void __vxm_sched(struct _vx_sched_pc *s, struct vx_info *vxi, int cpu) +{ + struct _vx_mon_entry *entry = vxm_advance(cpu); + + __vxm_basic(entry, vxi->vx_id, (VXM_SCHED | s->flags)); + entry->sd.tokens = s->tokens; + entry->sd.norm_time = s->norm_time; + entry->sd.idle_time = s->idle_time; +} + +static inline +void __vxm_rqinfo1(runqueue_t *q, int cpu) +{ + struct _vx_mon_entry *entry = vxm_advance(cpu); + + entry->type = VXM_RQINFO_1; + entry->xid = ((unsigned)(long)q >> 16) & 0xffff; + entry->q1.running = q->nr_running; + entry->q1.onhold = q->nr_onhold; + entry->q1.iowait = atomic_read(&q->nr_iowait); + entry->q1.uintr = q->nr_uninterruptible; + entry->q1.idle_tokens = q->idle_tokens; +} + +static inline +void __vxm_rqinfo2(runqueue_t *q, int cpu) +{ + struct _vx_mon_entry *entry = vxm_advance(cpu); + + entry->type = VXM_RQINFO_2; + entry->xid = (unsigned)(long)q & 0xffff; + entry->q2.norm_time = q->norm_time; + entry->q2.idle_time = q->idle_time; + entry->q2.idle_skip = q->idle_skip; +} + +static inline +void __vxm_update(struct _vx_sched_pc *s, struct vx_info *vxi, int cpu) +{ + struct _vx_mon_entry *entry = vxm_advance(cpu); + + __vxm_basic(entry, vxi->vx_id, VXM_UPDATE); + entry->ev.tokens = s->tokens; +} + +static inline +void __vxm_update1(struct _vx_sched_pc *s, struct vx_info *vxi, int cpu) +{ + struct _vx_mon_entry *entry = vxm_advance(cpu); + + __vxm_basic(entry, vxi->vx_id, VXM_UPDATE_1); + entry->u1.tokens_max = s->tokens_max; + entry->u1.fill_rate = s->fill_rate[0]; + entry->u1.interval = s->interval[0]; +} + +static inline +void __vxm_update2(struct _vx_sched_pc *s, struct vx_info *vxi, int cpu) +{ + struct _vx_mon_entry *entry = vxm_advance(cpu); + + __vxm_basic(entry, vxi->vx_id, VXM_UPDATE_2); + entry->u2.tokens_min = s->tokens_min; + entry->u2.fill_rate = s->fill_rate[1]; + entry->u2.interval = s->interval[1]; +} + + +#define vxm_activate_task(p,q) __vxm_task(p, 
VXM_ACTIVATE) +#define vxm_activate_idle(p,q) __vxm_task(p, VXM_IDLE) +#define vxm_deactivate_task(p,q) __vxm_task(p, VXM_DEACTIVATE) +#define vxm_hold_task(p,q) __vxm_task(p, VXM_HOLD) +#define vxm_unhold_task(p,q) __vxm_task(p, VXM_UNHOLD) + +static inline +void vxm_migrate_task(struct task_struct *p, runqueue_t *rq, int dest) +{ + __vxm_task(p, VXM_MIGRATE); + __vxm_rqinfo1(rq, task_cpu(p)); + __vxm_rqinfo2(rq, task_cpu(p)); +} + +static inline +void vxm_idle_skip(runqueue_t *rq, int cpu) +{ + __vxm_rqinfo1(rq, cpu); + __vxm_rqinfo2(rq, cpu); +} + +static inline +void vxm_need_resched(struct task_struct *p, int slice, int cpu) +{ + if (slice) + return; + + __vxm_task(p, VXM_RESCHED); +} + +static inline +void vxm_sync(unsigned long now, int cpu) +{ + if (!CONFIG_VSERVER_MONITOR_SYNC || + (now % CONFIG_VSERVER_MONITOR_SYNC)) + return; + + __vxm_sync(cpu); +} + +#define vxm_sched_info(s,v,c) __vxm_sched(s,v,c) + +static inline +void vxm_tokens_recalc(struct _vx_sched_pc *s, runqueue_t *rq, + struct vx_info *vxi, int cpu) +{ + __vxm_sched(s, vxi, cpu); + __vxm_rqinfo2(rq, cpu); +} + +static inline +void vxm_update_sched(struct _vx_sched_pc *s, struct vx_info *vxi, int cpu) +{ + __vxm_sched(s, vxi, cpu); + __vxm_update(s, vxi, cpu); + __vxm_update1(s, vxi, cpu); + __vxm_update2(s, vxi, cpu); +} + +static inline +void vxm_rq_max_min(runqueue_t *rq, int cpu) +{ + __vxm_rqinfo1(rq, cpu); + __vxm_rqinfo2(rq, cpu); +} + +#else /* CONFIG_VSERVER_MONITOR */ + +#define vxm_activate_task(t,q) do { } while (0) +#define vxm_activate_idle(t,q) do { } while (0) +#define vxm_deactivate_task(t,q) do { } while (0) +#define vxm_hold_task(t,q) do { } while (0) +#define vxm_unhold_task(t,q) do { } while (0) +#define vxm_migrate_task(t,q,d) do { } while (0) +#define vxm_idle_skip(q,c) do { } while (0) +#define vxm_need_resched(t,s,c) do { } while (0) +#define vxm_sync(n,c) do { } while (0) +#define vxm_sched_info(s,v,c) do { } while (0) +#define vxm_tokens_recalc(s,q,v,c) do { } 
while (0) +#define vxm_update_sched(s,v,c) do { } while (0) +#define vxm_rq_max_min(q,c) do { } while (0) + +#endif /* CONFIG_VSERVER_MONITOR */ + diff -NurpP --minimal linux-2.6.16-vs2.1.1-rc14.10/kernel/vserver/Kconfig linux-2.6.16-vs2.1.1-rc14.11/kernel/vserver/Kconfig --- linux-2.6.16-vs2.1.1-rc14.10/kernel/vserver/Kconfig 2006-03-23 23:10:54 +0100 +++ linux-2.6.16-vs2.1.1-rc14.11/kernel/vserver/Kconfig 2006-04-03 02:10:30 +0200 @@ -199,6 +199,33 @@ config VSERVER_HISTORY_SIZE This allows you to specify the number of entries in the per-CPU history buffer. +config VSERVER_MONITOR + bool "VServer Scheduling Monitor" + depends on VSERVER_DEBUG + default n + help + Set this to yes if you want to record the scheduling + decisions, so that they can be relayed to userspace + for detailed analysis. + +config VSERVER_MONITOR_SIZE + int "Per-CPU Monitor Queue Size (32-65536)" + depends on VSERVER_MONITOR + range 32 65536 + default 1024 + help + This allows you to specify the number of entries in + the per-CPU scheduling monitor buffer. + +config VSERVER_MONITOR_SYNC + int "Per-CPU Monitor Sync Interval (0-65536)" + depends on VSERVER_MONITOR + range 0 65536 + default 256 + help + This allows you to specify the interval in ticks + when a time sync entry is inserted. 
+ endmenu diff -NurpP --minimal linux-2.6.16-vs2.1.1-rc14.10/kernel/vserver/Makefile linux-2.6.16-vs2.1.1-rc14.11/kernel/vserver/Makefile --- linux-2.6.16-vs2.1.1-rc14.10/kernel/vserver/Makefile 2006-03-20 18:09:05 +0100 +++ linux-2.6.16-vs2.1.1-rc14.11/kernel/vserver/Makefile 2006-04-03 02:10:30 +0200 @@ -13,4 +13,5 @@ vserver-$(CONFIG_VSERVER_DEBUG) += sysct vserver-$(CONFIG_VSERVER_LEGACY) += legacy.o vserver-$(CONFIG_VSERVER_LEGACYNET) += legacynet.o vserver-$(CONFIG_VSERVER_HISTORY) += history.o +vserver-$(CONFIG_VSERVER_MONITOR) += monitor.o diff -NurpP --minimal linux-2.6.16-vs2.1.1-rc14.10/kernel/vserver/monitor.c linux-2.6.16-vs2.1.1-rc14.11/kernel/vserver/monitor.c --- linux-2.6.16-vs2.1.1-rc14.10/kernel/vserver/monitor.c 1970-01-01 01:00:00 +0100 +++ linux-2.6.16-vs2.1.1-rc14.11/kernel/vserver/monitor.c 2006-04-03 02:10:30 +0200 @@ -0,0 +1,64 @@ +/* + * kernel/vserver/monitor.c + * + * Virtual Context Scheduler Monitor + * + * Copyright (C) 2006 Herbert Pötzl + * + * V0.01 basic design + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + + +#ifdef CONFIG_VSERVER_MONITOR +#define VXM_SIZE CONFIG_VSERVER_MONITOR_SIZE +#else +#define VXM_SIZE 64 +#endif + +struct _vx_monitor { + unsigned int counter; + + struct _vx_mon_entry entry[VXM_SIZE+1]; +}; + + +DEFINE_PER_CPU(struct _vx_monitor, vx_monitor_buffer); + +unsigned volatile int vxm_active = 1; + +static atomic_t sequence = ATOMIC_INIT(0); + + +/* vxm_advance() + + * requires disabled preemption */ + +struct _vx_mon_entry *vxm_advance(int cpu) +{ + struct _vx_monitor *mon = &per_cpu(vx_monitor_buffer, cpu); + struct _vx_mon_entry *entry; + unsigned int index; + + index = vxm_active ? (mon->counter++ % VXM_SIZE) : VXM_SIZE; + entry = &mon->entry[index]; + + entry->ev.seq = atomic_inc_return(&sequence); + entry->ev.jif = jiffies; + return entry; +} + +EXPORT_SYMBOL_GPL(vxm_advance); +