--- linux-2.6.16-rc1/drivers/block/Kconfig	2006-01-26 22:34:51 +0100
+++ linux-2.6.16-rc1-vs2.1.0.9.4/drivers/block/Kconfig	2006-01-21 18:28:15 +0100
@@ -315,6 +315,13 @@ config BLK_DEV_CRYPTOLOOP
 	  instead, which can be configured to be on-disk compatible with the
 	  cryptoloop device.
 
+config BLK_DEV_VROOT
+	tristate "Virtual Root device support"
+	depends on QUOTACTL
+	---help---
+	  Saying Y here will allow you to use quota/fs ioctls on a shared
+	  partition within a virtual server without compromising security.
+
 config BLK_DEV_NBD
 	tristate "Network block device support"
 	depends on NET
--- linux-2.6.16-rc1/drivers/block/Makefile	2006-01-03 17:29:21 +0100
+++ linux-2.6.16-rc1-vs2.1.0.9.4/drivers/block/Makefile	2006-01-21 18:28:15 +0100
@@ -30,4 +30,5 @@ obj-$(CONFIG_BLK_DEV_CRYPTOLOOP) += cryp
 obj-$(CONFIG_VIODASD)		+= viodasd.o
 obj-$(CONFIG_BLK_DEV_SX8)	+= sx8.o
 obj-$(CONFIG_BLK_DEV_UB)	+= ub.o
+obj-$(CONFIG_BLK_DEV_VROOT)	+= vroot.o
--- linux-2.6.16-rc1/drivers/block/vroot.c	1970-01-01 01:00:00 +0100
+++ linux-2.6.16-rc1-vs2.1.0.9.4/drivers/block/vroot.c	2006-01-21 18:28:15 +0100
@@ -0,0 +1,289 @@
+/*
+ *  linux/drivers/block/vroot.c
+ *
+ *  written by Herbert Pötzl, 9/11/2002
+ *  ported to 2.6.10 by Herbert Pötzl, 30/12/2004
+ *
+ *  based on the loop.c code by Theodore Ts'o.
+ *
+ * Copyright (C) 2002-2005 by Herbert Pötzl.
+ * Redistribution of this file is permitted under the
+ * GNU General Public License.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/file.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/devfs_fs_kernel.h>
+#include <linux/slab.h>
+
+#include <linux/vroot.h>
+#include <linux/vserver/debug.h>
+
+
+static int max_vroot = 8;
+
+static struct vroot_device *vroot_dev;
+static struct gendisk **disks;
+
+
+static int vroot_set_dev(
+	struct vroot_device *vr,
+	struct file *vr_file,
+	struct block_device *bdev,
+	unsigned int arg)
+{
+	struct block_device *real_bdev;
+	struct file *file;
+	struct inode *inode;
+	int error;
+
+	error = -EBUSY;
+	if (vr->vr_state != Vr_unbound)
+		goto out;
+
+	error = -EBADF;
+	file = fget(arg);
+	if (!file)
+		goto out;
+
+	error = -EINVAL;
+	inode = file->f_dentry->d_inode;
+
+
+	if (S_ISBLK(inode->i_mode)) {
+		real_bdev = inode->i_bdev;
+		vr->vr_device = real_bdev;
+		__iget(real_bdev->bd_inode);
+	} else
+		goto out_fput;
+
+	vxdprintk(VXD_CBIT(misc, 0),
+		"vroot[%d]_set_dev: dev=" VXF_DEV,
+		vr->vr_number, VXD_DEV(real_bdev));
+
+	vr->vr_state = Vr_bound;
+	error = 0;
+
+ out_fput:
+	fput(file);
+ out:
+	return error;
+}
+
+static int vroot_clr_dev(
+	struct vroot_device *vr,
+	struct file *vr_file,
+	struct block_device *bdev)
+{
+	struct block_device *real_bdev;
+
+	if (vr->vr_state != Vr_bound)
+		return -ENXIO;
+	if (vr->vr_refcnt > 1)	/* we needed one fd for the ioctl */
+		return -EBUSY;
+
+	real_bdev = vr->vr_device;
+
+	vxdprintk(VXD_CBIT(misc, 0),
+		"vroot[%d]_clr_dev: dev=" VXF_DEV,
+		vr->vr_number, VXD_DEV(real_bdev));
+
+	bdput(real_bdev);
+	vr->vr_state = Vr_unbound;
+	vr->vr_device = NULL;
+	return 0;
+}
+
+
+static int vr_ioctl(struct inode * inode, struct file * file,
+	unsigned int cmd, unsigned long arg)
+{
+	struct vroot_device *vr = inode->i_bdev->bd_disk->private_data;
+	int err;
+
+	down(&vr->vr_ctl_mutex);
+	switch (cmd) {
+	case VROOT_SET_DEV:
+		err = vroot_set_dev(vr, file, inode->i_bdev, arg);
+		break;
+	case VROOT_CLR_DEV:
+		err = vroot_clr_dev(vr, file, inode->i_bdev);
+		break;
+	default:
+		err = -EINVAL;
+		break;
+	}
+	up(&vr->vr_ctl_mutex);
+	return err;
+}
+
+static int vr_open(struct inode *inode, struct file *file)
+{
+	struct vroot_device *vr = inode->i_bdev->bd_disk->private_data;
+
+	down(&vr->vr_ctl_mutex);
+	vr->vr_refcnt++;
+	up(&vr->vr_ctl_mutex);
+	return 0;
+}
+
+static int vr_release(struct inode *inode, struct file *file)
+{
+	struct vroot_device *vr = inode->i_bdev->bd_disk->private_data;
+
+	down(&vr->vr_ctl_mutex);
+	--vr->vr_refcnt;
+	up(&vr->vr_ctl_mutex);
+	return 0;
+}
+
+static struct block_device_operations vr_fops = {
+	.owner =	THIS_MODULE,
+	.open =		vr_open,
+	.release =	vr_release,
+	.ioctl =	vr_ioctl,
+};
+
+struct block_device *__vroot_get_real_bdev(struct block_device *bdev)
+{
+	struct inode *inode = bdev->bd_inode;
+	struct vroot_device *vr;
+	struct block_device *real_bdev;
+	int minor = iminor(inode);
+
+	vr = &vroot_dev[minor];
+	real_bdev = vr->vr_device;
+
+	vxdprintk(VXD_CBIT(misc, 0),
+		"vroot[%d]_get_real_bdev: dev=" VXF_DEV,
+		vr->vr_number, VXD_DEV(real_bdev));
+
+	if (vr->vr_state != Vr_bound)
+		return ERR_PTR(-ENXIO);
+
+	__iget(real_bdev->bd_inode);
+	return real_bdev;
+}
+
+/*
+ * And now the modules code and kernel interface.
+ */
+
+module_param(max_vroot, int, 0);
+
+MODULE_PARM_DESC(max_vroot, "Maximum number of vroot devices (1-256)");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(VROOT_MAJOR);
+
+MODULE_AUTHOR ("Herbert Pötzl");
+MODULE_DESCRIPTION ("Virtual Root Device Mapper");
+
+
+int __init vroot_init(void)
+{
+	int err, i;
+
+	if (max_vroot < 1 || max_vroot > 256) {
+		max_vroot = MAX_VROOT_DEFAULT;
+		printk(KERN_WARNING "vroot: invalid max_vroot "
+			"(must be between 1 and 256), "
+			"using default (%d)\n", max_vroot);
+	}
+
+	if (register_blkdev(VROOT_MAJOR, "vroot"))
+		return -EIO;
+
+	err = -ENOMEM;
+	vroot_dev = kmalloc(max_vroot * sizeof(struct vroot_device), GFP_KERNEL);
+	if (!vroot_dev)
+		goto out_mem1;
+	memset(vroot_dev, 0, max_vroot * sizeof(struct vroot_device));
+
+	disks = kmalloc(max_vroot * sizeof(struct gendisk *), GFP_KERNEL);
+	if (!disks)
+		goto out_mem2;
+
+	for (i = 0; i < max_vroot; i++) {
+		disks[i] = alloc_disk(1);
+		if (!disks[i])
+			goto out_mem3;
+	}
+
+	devfs_mk_dir("vroot");
+
+	for (i = 0; i < max_vroot; i++) {
+		struct vroot_device *vr = &vroot_dev[i];
+		struct gendisk *disk = disks[i];
+
+		memset(vr, 0, sizeof(*vr));
+		init_MUTEX(&vr->vr_ctl_mutex);
+		vr->vr_number = i;
+		disk->major = VROOT_MAJOR;
+		disk->first_minor = i;
+		disk->fops = &vr_fops;
+		sprintf(disk->disk_name, "vroot%d", i);
+		sprintf(disk->devfs_name, "vroot/%d", i);
+		disk->private_data = vr;
+	}
+
+	err = register_vroot_grb(&__vroot_get_real_bdev);
+	if (err)
+		goto out_reg;
+
+	for (i = 0; i < max_vroot; i++)
+		add_disk(disks[i]);
+	printk(KERN_INFO "vroot: loaded (max %d devices)\n", max_vroot);
+	return 0;
+
+out_reg:
+	devfs_remove("vroot");
+out_mem3:
+	while (i--)
+		put_disk(disks[i]);
+	kfree(disks);
+out_mem2:
+	kfree(vroot_dev);
+out_mem1:
+	unregister_blkdev(VROOT_MAJOR, "vroot");
+	printk(KERN_ERR "vroot: ran out of memory\n");
+	return err;
+}
+
+void vroot_exit(void)
+{
+	int i;
+
+	if (unregister_vroot_grb(&__vroot_get_real_bdev))
+		printk(KERN_WARNING "vroot: cannot unregister grb\n");
+
+	for (i = 0; i < max_vroot; i++) {
+		del_gendisk(disks[i]);
+		put_disk(disks[i]);
+	}
+	devfs_remove("vroot");
+	if (unregister_blkdev(VROOT_MAJOR, "vroot"))
+		printk(KERN_WARNING "vroot: cannot unregister blkdev\n");
+
+	kfree(disks);
+	kfree(vroot_dev);
+}
+
+module_init(vroot_init);
+module_exit(vroot_exit);
+
+#ifndef MODULE
+
+static int __init max_vroot_setup(char *str)
+{
+	max_vroot = simple_strtol(str, NULL, 0);
+	return 1;
+}
+
+__setup("max_vroot=", max_vroot_setup);
+
+#endif
+
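The driver above only implements the two control ioctls plus open/release bookkeeping; it never performs I/O itself. Binding happens from userspace: a process opens the vroot node, opens the real partition, and passes the partition's file descriptor as the ioctl argument, which is exactly what vroot_set_dev() resolves with fget(arg). The sketch below is illustrative only and not part of the patch (in Linux-VServer installations a helper such as vrsetup normally does this); the device paths depend on the local devfs/udev naming, and the ioctl numbers are taken from the include/linux/vroot.h hunk further down.

/* Illustrative only: binds /dev/hda1 to the first vroot device and
 * later unbinds it again.  Paths and node names are assumptions. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define VROOT_SET_DEV	0x5600	/* values from include/linux/vroot.h below */
#define VROOT_CLR_DEV	0x5601

int main(void)
{
	int vfd = open("/dev/vroot/0", O_RDONLY);	/* the vroot node */
	int rfd = open("/dev/hda1", O_RDONLY);		/* the real partition */

	if (vfd < 0 || rfd < 0) {
		perror("open");
		return 1;
	}
	/* vroot_set_dev() receives rfd as 'arg' and looks it up via fget() */
	if (ioctl(vfd, VROOT_SET_DEV, rfd) < 0)
		perror("VROOT_SET_DEV");

	/* ... while bound, the guest may issue quotactl() on /dev/vroot/0 ... */

	if (ioctl(vfd, VROOT_CLR_DEV, 0) < 0)
		perror("VROOT_CLR_DEV");

	close(rfd);
	close(vfd);
	return 0;
}

Note that vroot_clr_dev() refuses to unbind while vr_refcnt is greater than one, so the unbind has to come from the only process holding the vroot node open, as in the sketch.
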
--- linux-2.6.16-rc1/fs/quota.c	2006-01-26 22:35:12 +0100
+++ linux-2.6.16-rc1-vs2.1.0.9.4/fs/quota.c	2006-01-21 18:28:18 +0100
@@ -337,6 +343,43 @@ static int do_quotactl(struct super_bloc
 	return 0;
 }
 
+#if defined(CONFIG_BLK_DEV_VROOT) || defined(CONFIG_BLK_DEV_VROOT_MODULE)
+
+#include <linux/vroot.h>
+#include <linux/major.h>
+
+static vroot_grb_func *vroot_get_real_bdev = NULL;
+
+static spinlock_t vroot_grb_lock = SPIN_LOCK_UNLOCKED;
+
+int register_vroot_grb(vroot_grb_func *func) {
+	int ret = -EBUSY;
+
+	spin_lock(&vroot_grb_lock);
+	if (!vroot_get_real_bdev) {
+		vroot_get_real_bdev = func;
+		ret = 0;
+	}
+	spin_unlock(&vroot_grb_lock);
+	return ret;
+}
+EXPORT_SYMBOL(register_vroot_grb);
+
+int unregister_vroot_grb(vroot_grb_func *func) {
+	int ret = -EINVAL;
+
+	spin_lock(&vroot_grb_lock);
+	if (vroot_get_real_bdev) {
+		vroot_get_real_bdev = NULL;
+		ret = 0;
+	}
+	spin_unlock(&vroot_grb_lock);
+	return ret;
+}
+EXPORT_SYMBOL(unregister_vroot_grb);
+
+#endif
+
 /*
  * This is the system call interface. This communicates with
  * the user-level programs. Currently this only supports diskquota
--- linux-2.6.16-rc1/fs/quota.c	2006-01-26 22:35:12 +0100
+++ linux-2.6.16-rc1-vs2.1.0.9.4/fs/quota.c	2006-01-21 18:28:18 +0100
@@ -362,6 +406,23 @@ asmlinkage long sys_quotactl(unsigned in
 		putname(tmp);
 		if (IS_ERR(bdev))
 			return PTR_ERR(bdev);
+#if defined(CONFIG_BLK_DEV_VROOT) || defined(CONFIG_BLK_DEV_VROOT_MODULE)
+		if (bdev && bdev->bd_inode &&
+			imajor(bdev->bd_inode) == VROOT_MAJOR) {
+			struct block_device *bdnew = (void *)-EINVAL;
+
+			if (vroot_get_real_bdev)
+				bdnew = vroot_get_real_bdev(bdev);
+			else
+				vxdprintk(VXD_CBIT(misc, 0),
+					"vroot_get_real_bdev not set");
+
+			bdput(bdev);
+			if (IS_ERR(bdnew))
+				return PTR_ERR(bdnew);
+			bdev = bdnew;
+		}
+#endif
 		sb = get_super(bdev);
 		bdput(bdev);
 		if (!sb)
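With the redirect above in place, an ordinary quotactl() issued against a bound vroot node inside a guest is answered from the superblock of the real partition, while the guest itself never needs a device node for that partition. A minimal illustrative caller (not part of the patch) might look like the following; it assumes a glibc whose <sys/quota.h> exposes the modern Q_GETQUOTA/struct dqblk interface, that quota is enabled on the underlying filesystem, that /dev/vroot/0 has already been bound, and that the caller has sufficient privileges:

/* Illustrative only: query the block quota of a user through the
 * bound vroot device.  The uid and device path are made up. */
#include <stdio.h>
#include <sys/quota.h>
#include <sys/types.h>

int main(void)
{
	struct dqblk dq;
	uid_t uid = 1000;	/* example uid inside the guest */

	if (quotactl(QCMD(Q_GETQUOTA, USRQUOTA), "/dev/vroot/0", uid,
		     (caddr_t)&dq) < 0) {
		perror("quotactl");
		return 1;
	}
	printf("uid %d: %llu bytes used, hard limit %llu blocks\n",
	       (int)uid,
	       (unsigned long long)dq.dqb_curspace,
	       (unsigned long long)dq.dqb_bhardlimit);
	return 0;
}
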
--- linux-2.6.16-rc1/include/linux/major.h	2005-08-29 22:25:41 +0200
+++ linux-2.6.16-rc1-vs2.1.0.9.4/include/linux/major.h	2006-01-21 18:28:15 +0100
@@ -15,6 +15,7 @@
 #define HD_MAJOR		IDE0_MAJOR
 #define PTY_SLAVE_MAJOR		3
 #define TTY_MAJOR		4
+#define VROOT_MAJOR		4
 #define TTYAUX_MAJOR		5
 #define LP_MAJOR		6
 #define VCS_MAJOR		7
--- linux-2.6.16-rc1/include/linux/vroot.h	1970-01-01 01:00:00 +0100
+++ linux-2.6.16-rc1-vs2.1.0.9.4/include/linux/vroot.h	2006-01-21 18:28:15 +0100
@@ -0,0 +1,51 @@
+
+/*
+ * include/linux/vroot.h
+ *
+ * written by Herbert Pötzl, 9/11/2002
+ * ported to 2.6 by Herbert Pötzl, 30/12/2004
+ *
+ * Copyright (C) 2002-2005 by Herbert Pötzl.
+ * Redistribution of this file is permitted under the
+ * GNU General Public License.
+ */
+
+#ifndef _LINUX_VROOT_H
+#define _LINUX_VROOT_H
+
+
+#ifdef __KERNEL__
+
+/* Possible states of device */
+enum {
+	Vr_unbound,
+	Vr_bound,
+};
+
+struct vroot_device {
+	int			vr_number;
+	int			vr_refcnt;
+
+	struct semaphore	vr_ctl_mutex;
+	struct block_device	*vr_device;
+	int			vr_state;
+};
+
+
+typedef struct block_device *(vroot_grb_func)(struct block_device *);
+
+extern int register_vroot_grb(vroot_grb_func *);
+extern int unregister_vroot_grb(vroot_grb_func *);
+
+#endif /* __KERNEL__ */
+
+#define MAX_VROOT_DEFAULT	8
+
+/*
+ * IOCTL commands --- we will commandeer 0x56 ('V')
+ */
+
+#define VROOT_SET_DEV		0x5600
+#define VROOT_CLR_DEV		0x5601
+
+#endif /* _LINUX_VROOT_H */
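Note that VROOT_MAJOR reuses the number 4: block and character device majors live in separate namespaces, so the new block major does not clash with TTY_MAJOR on the character side. From userspace a vroot node is therefore recognised as a block special file with major 4, which is the same test sys_quotactl() performs above with imajor(). A small illustrative check, not part of the patch (the default path is an assumption):

/* Illustrative only: report whether a path names a vroot device node. */
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>	/* major() */

#define VROOT_MAJOR	4	/* from include/linux/major.h above */

static int is_vroot_node(const char *path)
{
	struct stat st;

	if (stat(path, &st) < 0)
		return 0;
	/* must be a *block* special file; char major 4 is the TTY driver */
	return S_ISBLK(st.st_mode) && major(st.st_rdev) == VROOT_MAJOR;
}

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/dev/vroot/0";

	printf("%s %s a vroot device\n", path,
	       is_vroot_node(path) ? "is" : "is not");
	return 0;
}
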
--- linux-2.6.16-rc1/kernel/sched_hard.h	1970-01-01 01:00:00 +0100
+++ linux-2.6.16-rc1-vs2.1.0.9.4/kernel/sched_hard.h	2006-01-21 18:28:15 +0100
@@ -0,0 +1,315 @@
+#ifndef _VX_SCHED_HARD_H
+#define _VX_SCHED_HARD_H
+
+#ifndef CONFIG_VSERVER
+#warning config options missing
+#endif
+
+
+#ifdef CONFIG_VSERVER_IDLELIMIT
+
+/*
+ * vx_idle_resched - reschedule after maxidle
+ */
+static inline
+void vx_idle_resched(runqueue_t *rq)
+{
+	if (!--rq->idle_tokens && !list_empty(&rq->hold_queue))
+		set_need_resched();
+}
+
+#else /* !CONFIG_VSERVER_IDLELIMIT */
+
+#define vx_idle_resched(rq)
+
+#endif /* CONFIG_VSERVER_IDLELIMIT */
+
+
+#ifdef CONFIG_VSERVER_IDLETIME
+
+#define vx_set_rq_min_skip(rq, min)		\
+	(rq)->idle_skip = (min)
+
+#define vx_save_min_skip(ret, min, val)		\
+	__vx_save_min_skip(ret, min, val)
+
+static inline
+void __vx_save_min_skip(int ret, int *min, int val)
+{
+	if (ret > -2)
+		return;
+	if ((*min > val) || !*min)
+		*min = val;
+}
+
+static inline
+int vx_try_skip(runqueue_t *rq)
+{
+	/* artificially advance time */
+	if (rq->idle_skip && !list_empty(&rq->hold_queue)) {
+		rq->idle_time += rq->idle_skip;
+		return 1;
+	}
+	return 0;
+}
+
+#else /* !CONFIG_VSERVER_IDLETIME */
+
+#define vx_set_rq_min_skip(rq, min)		\
+	({ int dummy = (min); dummy; })
+
+#define vx_save_min_skip(ret, min, val)
+
+static inline
+int vx_try_skip(runqueue_t *rq)
+{
+	return 0;
+}
+
+#endif /* CONFIG_VSERVER_IDLETIME */
+
+
+#ifdef CONFIG_VSERVER_HARDCPU
+
+#define vx_set_rq_max_idle(rq, max)		\
+	(rq)->idle_tokens = (max)
+
+#define vx_save_max_idle(ret, min, val)		\
+	__vx_save_max_idle(ret, min, val)
+
+static inline
+void __vx_save_max_idle(int ret, int *min, int val)
+{
+	if (*min > val)
+		*min = val;
+}
+
+
+/*
+ * vx_hold_task - put a task on the hold queue
+ */
+static inline
+void vx_hold_task(struct task_struct *p, runqueue_t *rq)
+{
+	__deactivate_task(p, rq);
+	p->state |= TASK_ONHOLD;
+	/* a new one on hold */
+	rq->nr_onhold++;
+	list_add_tail(&p->run_list, &rq->hold_queue);
+}
+
+/*
+ * vx_unhold_task - put a task back to the runqueue
+ */
+static inline
+void vx_unhold_task(struct task_struct *p, runqueue_t *rq)
+{
+	list_del(&p->run_list);
+	/* one less waiting */
+	rq->nr_onhold--;
+	p->state &= ~TASK_ONHOLD;
+	enqueue_task(p, rq->expired);
+	rq->nr_running++;
+
+	if (p->static_prio < rq->best_expired_prio)
+		rq->best_expired_prio = p->static_prio;
+}
+
+unsigned long nr_onhold(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_online_cpu(i)
+		sum += cpu_rq(i)->nr_onhold;
+
+	return sum;
+}
+
+
+static inline
+int __vx_tokens_avail(struct _vx_sched_pc *sched_pc)
+{
+	return sched_pc->tokens;
+}
+
+static inline
+void __vx_consume_token(struct _vx_sched_pc *sched_pc)
+{
+	sched_pc->tokens--;
+}
+
+static inline
+int vx_need_resched(struct task_struct *p, int slice, int cpu)
+{
+	struct vx_info *vxi = p->vx_info;
+
+	if (vxi) {
+		struct _vx_sched_pc *sched_pc =
+			&vx_per_cpu(vxi, sched_pc, cpu);
+		int tokens;
+
+		/* maybe we can simplify that to decrement
+		   the token counter unconditional? */
+
+		if ((tokens = __vx_tokens_avail(sched_pc)) > 0)
+			__vx_consume_token(sched_pc);
+
+		/* for tokens > 0, one token was consumed */
+		if (tokens < 2)
+			return 1;
+	}
+	return (slice == 0);
+}
+
+
+#define vx_set_rq_time(rq, time) do {	\
+	rq->norm_time = time;		\
+} while (0)
+
+
+static inline
+void vx_try_unhold(runqueue_t *rq, int cpu)
+{
+	struct vx_info *vxi = NULL;
+	struct list_head *l, *n;
+	int maxidle = HZ;
+	int minskip = 0;
+
+	/* nothing to do? */
+	if (list_empty(&rq->hold_queue))
+		return;
+
+	list_for_each_safe(l, n, &rq->hold_queue) {
+		int ret, delta_min[2];
+		struct _vx_sched_pc *sched_pc;
+		struct task_struct *p;
+
+		p = list_entry(l, task_t, run_list);
+		if (vxi == p->vx_info)
+			continue;
+
+		vxi = p->vx_info;
+		/* ignore paused contexts */
+		if (vx_info_flags(vxi, VXF_SCHED_PAUSE, 0))
+			continue;
+
+		sched_pc = &vx_per_cpu(vxi, sched_pc, cpu);
+
+		/* recalc tokens */
+		ret = vx_tokens_recalc(sched_pc,
+			&rq->norm_time, &rq->idle_time, delta_min);
+
+		if (ret > 0) {
+			/* we found a runable context */
+			vx_unhold_task(p, rq);
+			break;
+		}
+		vx_save_max_idle(ret, &maxidle, delta_min[0]);
+		vx_save_min_skip(ret, &minskip, delta_min[1]);
+	}
+	vx_set_rq_max_idle(rq, maxidle);
+	vx_set_rq_min_skip(rq, minskip);
+}
+
+
+static inline
+int vx_schedule(struct task_struct *next, runqueue_t *rq, int cpu)
+{
+	struct vx_info *vxi = next->vx_info;
+	struct _vx_sched_pc *sched_pc;
+	int delta_min[2];
+	int flags, ret;
+
+	if (!vxi)
+		return 1;
+
+	flags = vxi->vx_flags;
+
+	if (unlikely(vx_check_flags(flags , VXF_SCHED_PAUSE, 0)))
+		goto put_on_hold;
+	if (!vx_check_flags(flags , VXF_SCHED_HARD|VXF_SCHED_PRIO, 0))
+		return 1;
+
+	sched_pc = &vx_per_cpu(vxi, sched_pc, cpu);
+#ifdef CONFIG_SMP
+	/* update scheduler params */
+	if (cpu_isset(cpu, vxi->sched.update)) {
+		vx_update_sched_param(&vxi->sched, sched_pc);
+		cpu_clear(cpu, vxi->sched.update);
+	}
+#endif
+	ret = vx_tokens_recalc(sched_pc,
+		&rq->norm_time, &rq->idle_time, delta_min);
+
+	if (!vx_check_flags(flags , VXF_SCHED_HARD, 0))
+		return 1;
+
+	if (unlikely(ret < 0)) {
+		vx_save_max_idle(ret, &rq->idle_tokens, delta_min[0]);
+		vx_save_min_skip(ret, &rq->idle_skip, delta_min[1]);
+	put_on_hold:
+		vx_hold_task(next, rq);
+		return 0;
+	}
+	return 1;
+}
+
+
+#else /* CONFIG_VSERVER_HARDCPU */
+
+static inline
+void vx_hold_task(struct task_struct *p, runqueue_t *rq)
+{
+	return;
+}
+
+static inline
+void vx_unhold_task(struct task_struct *p, runqueue_t *rq)
+{
+	return;
+}
+
+unsigned long nr_onhold(void)
+{
+	return 0;
+}
+
+
+static inline
+int vx_need_resched(struct task_struct *p, int slice, int cpu)
+{
+	return (slice == 0);
+}
+
+
+#define vx_set_rq_time(rq, time)
+
+static inline
+void vx_try_unhold(runqueue_t *rq, int cpu)
+{
+	return;
+}
+
+static inline
+int vx_schedule(struct task_struct *next, runqueue_t *rq, int cpu)
+{
+	struct vx_info *vxi = next->vx_info;
+	struct _vx_sched_pc *sched_pc;
+	int delta_min[2];
+	int ret;
+
+	if (!vx_info_flags(vxi, VXF_SCHED_PRIO, 0))
+		return 1;
+
+	sched_pc = &vx_per_cpu(vxi, sched_pc, cpu);
+	ret = vx_tokens_recalc(sched_pc,
+		&rq->norm_time, &rq->idle_time, delta_min);
+	return 1;
+}
+
+#endif /* CONFIG_VSERVER_HARDCPU */
+
+#endif /* _VX_SCHED_HARD_H */
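vx_tokens_recalc() and struct _vx_sched_pc are defined elsewhere in this patch, so the file above only shows the consume/hold/unhold half of the hard CPU limit. Conceptually each context owns a token bucket: it burns one token per timer tick while running (vx_need_resched), is moved to the per-runqueue hold queue when the bucket runs dry (vx_hold_task), and is put back once enough tokens have accumulated again (vx_try_unhold). The self-contained model below is illustrative only and not part of the patch; fill_rate, interval and tokens_max are made-up stand-ins for the per-CPU scheduler parameters, and the refill/unhold thresholds are simplified compared to the real vx_tokens_recalc():

/* Illustrative only: a userspace model of the token-bucket behaviour the
 * hold queue relies on, with invented parameters. */
#include <stdio.h>

struct bucket {
	int tokens;	/* current fill */
	int tokens_max;	/* bucket size */
	int fill_rate;	/* tokens added every 'interval' ticks */
	int interval;
	int on_hold;	/* 1 while the context sits on the hold queue */
};

static void tick(struct bucket *b, int t)
{
	/* refill: fill_rate tokens every interval ticks, capped at max */
	if (t % b->interval == 0) {
		b->tokens += b->fill_rate;
		if (b->tokens > b->tokens_max)
			b->tokens = b->tokens_max;
	}
	if (b->on_hold) {
		/* vx_try_unhold: runnable again once tokens are back */
		if (b->tokens > 0)
			b->on_hold = 0;
	} else {
		/* vx_need_resched: consume one token while running */
		if (b->tokens > 0)
			b->tokens--;
		/* vx_hold_task: an empty bucket puts the context on hold */
		if (b->tokens == 0)
			b->on_hold = 1;
	}
	printf("tick %3d: tokens=%2d %s\n", t, b->tokens,
	       b->on_hold ? "(on hold)" : "(running)");
}

int main(void)
{
	struct bucket b = { 10, 10, 2, 4, 0 };	/* 2 tokens every 4 ticks */
	int t;

	for (t = 1; t <= 40; t++)
		tick(&b, t);
	return 0;
}

With these made-up numbers the context receives 2 tokens per 4 ticks but burns 1 per tick while running, so after the initial burst it settles into running roughly half the time, which is the effect the hard CPU limit is meant to enforce.
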