/*
 *  linux/kernel/vcontext.c
 *
 *  Virtual Context Support
 *
 *  Copyright (C) 2003  Herbert Pötzl
 *
 *  V0.01  context helper
 *  V0.02  vx_ctx_kill syscall command
 *  V0.03  replaced context_info calls
 *  V0.04  redesign of struct (de)alloc
 *  V0.05  added O(1) scheduler stuff
 *
 */

#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/utsname.h>
#include <linux/vcontext.h>	/* struct vx_info, vcmd_* argument structs */
#include <linux/vinline.h>	/* vx_check(), vx_task_id(), get/put_vx_info() */
#include <asm/uaccess.h>


int vc_ctx_kill(uint32_t id, void *data)
{
	int retval, count = 0;
	struct vcmd_ctx_kill_v0 vc_data;
	struct siginfo info;
	struct task_struct *p;
	pid_t initpid = 0;

	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;
	if (!vx_check(0, VX_ADMIN))
		return -EPERM;

	info.si_signo = vc_data.sig;
	info.si_errno = 0;
	info.si_code = SI_USER;
	info.si_pid = current->pid;
	info.si_uid = current->uid;

	retval = -ESRCH;
	read_lock(&tasklist_lock);
	switch (vc_data.pid) {
	case -1:
	case 0:
		/* signal every thread group leader in the context,
		   optionally sparing the context's init process */
		for_each_task(p) {
			if (!initpid && vx_task_id(p) == id && p->vx_info)
				initpid = p->vx_info->vx_initpid;
			if (vx_task_id(p) == id && p->pid > 1 &&
				(!vc_data.pid || initpid != p->pid) &&
				thread_group_leader(p)) {
				int err = send_sig_info(vc_data.sig, &info, p);

				++count;
				if (err != -EPERM)
					retval = err;
			}
		}
		break;

	default:
		/* signal a single pid, redirected to its thread group leader */
		p = find_task_by_pid(vc_data.pid);
		if (p) {
			if (!thread_group_leader(p)) {
				struct task_struct *tg;

				tg = find_task_by_pid(p->tgid);
				if (tg)
					p = tg;
			}
			if ((id == -1) || (vx_task_id(p) == id))
				retval = send_sig_info(vc_data.sig, &info, p);
		}
		break;
	}
	read_unlock(&tasklist_lock);
	return retval;
}


int vc_get_rlimit(uint32_t id, void *data)
{
	return -ENOSYS;
}

int vc_set_rlimit(uint32_t id, void *data)
{
	return -ENOSYS;
}

int vc_get_rlimit_mask(uint32_t id, void *data)
{
	return -ENOSYS;
}


static struct vx_info *find_vx_info(int);

/*
 *	vc_set_sched - switched syscall to alter a context's scheduling priority
 *
 *	Negative values indicate to leave the value as is
 */
int vc_set_sched(uint32_t ctx, void *data)
{
	struct vcmd_set_sched_v1 vc_data;
	struct vx_info *s;

	if (!vx_check(0, VX_ADMIN))
		return -ENOSYS;
	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	s = find_vx_info(ctx);
	if (!s)
		return -EINVAL;

	spin_lock(&s->sched.tokens_lock);

	if (vc_data.fill_rate > -1)
		s->sched.tokens_fr = vc_data.fill_rate;
	if (vc_data.period > -1)
		s->sched.tokens_div = vc_data.period;
	if (vc_data.fill_level > -1)
		s->sched.tokens = vc_data.fill_level;
	if (vc_data.bucket_size > -1)
		s->sched.tokens_max = vc_data.bucket_size;

	/* sanity check the resulting values */
	if (s->sched.tokens_fr == 0)
		s->sched.tokens_fr = 1;
	if (s->sched.tokens_div == 0)
		s->sched.tokens_div = HZ;	/* arbitrary large number */
	if (s->sched.tokens_max == 0)
		s->sched.tokens_max = 1;
	if (s->sched.tokens > s->sched.tokens_max)
		s->sched.tokens = s->sched.tokens_max;

	if (vc_data.options & TBF_SCHED_ENABLE)
		s->vx_flags |= VX_INFO_SCHED;
	if (vc_data.options & TBF_SCHED_DISABLE)
		s->vx_flags &= ~VX_INFO_SCHED;

	spin_unlock(&s->sched.tokens_lock);
	put_vx_info(s);
	return 0;
}
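/*
 * Token bucket in a nutshell: tokens_fr tokens are added to the bucket
 * every tokens_div jiffies, the fill level is capped at tokens_max, and
 * running tasks of a context consume tokens while they are scheduled.
 * The actual refill is done by the O(1) scheduler hooks, not in this
 * file; the fragment below is only an illustrative sketch of that
 * calculation (vx_refill_tokens() is a hypothetical helper).
 */
#if 0
static inline void vx_refill_tokens(struct vx_info *vxi)
{
	unsigned long delta;

	spin_lock(&vxi->sched.tokens_lock);
	delta = jiffies - vxi->sched.tokens_jfy;
	if (delta >= vxi->sched.tokens_div) {
		/* add tokens_fr tokens for every full tokens_div interval */
		vxi->sched.tokens += (delta / vxi->sched.tokens_div)
			* vxi->sched.tokens_fr;
		if (vxi->sched.tokens > vxi->sched.tokens_max)
			vxi->sched.tokens = vxi->sched.tokens_max;
		vxi->sched.tokens_jfy = jiffies;
	}
	spin_unlock(&vxi->sched.tokens_lock);
}
#endif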
/* system functions */

LIST_HEAD(vx_infos);

spinlock_t vxlist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;


/*
 *	struct vx_info allocation and deallocation
 */

static struct vx_info *alloc_vx_info(int id)
{
	struct vx_info *new = NULL;

	vxdprintk("alloc_vx_info(%d)\n", id);
	/* would this benefit from a slab cache? */
	new = kmalloc(sizeof(struct vx_info), GFP_KERNEL);
	if (!new)
		return 0;

	memset(new, 0, sizeof(struct vx_info));
	new->vx_id = id;
	/* rest of init goes here */

	/* scheduling; hard code starting values as constants */
	new->sched.tokens_fr = 1;
	new->sched.tokens_div = 4;
	new->sched.tokens = HZ * 5;
	new->sched.tokens_max = HZ * 10;
	new->sched.tokens_jfy = jiffies;
	new->sched.tokens_lock = SPIN_LOCK_UNLOCKED;

	new->virt.nr_threads = 1;
//	new->virt.bias_cswtch = kstat.context_swtch;
	new->virt.bias_jiffies = jiffies;
/*	new->virt.bias_idle = init_tasks[0]->times.tms_utime +
		init_tasks[0]->times.tms_stime;		*/

	down_read(&uts_sem);
	new->virt.utsname = system_utsname;
	up_read(&uts_sem);

	vxdprintk("alloc_vx_info(%d) = %p\n", id, new);
	return new;
}

extern int vx_proc_destroy(struct vx_info *);

void free_vx_info(struct vx_info *vxi)
{
	vxdprintk("free_vx_info(%p)\n", vxi);
	vx_proc_destroy(vxi);
	kfree(vxi);
}


/*
 *	struct vx_info search by id
 *	assumes vxlist_lock is held
 */

static __inline__ struct vx_info *__find_vx_info(int id)
{
	struct vx_info *vxi;

	list_for_each_entry(vxi, &vx_infos, vx_list)
		if (vxi->vx_id == id)
			return vxi;
	return 0;
}

/*
 *	struct vx_info ref stuff
 */

static struct vx_info *find_vx_info(int id)
{
	struct vx_info *vxi;

	spin_lock(&vxlist_lock);
	if ((vxi = __find_vx_info(id)))
		get_vx_info(vxi);
	spin_unlock(&vxlist_lock);
	return vxi;
}

/*
 *	next available dynamic context id
 *	assumes vxlist_lock is held
 */

static __inline__ xid_t __vx_dynamic_id(void)
{
	static xid_t seq = MAX_S_CONTEXT;
	xid_t barrier = seq;

	do {
		if (++seq > MAX_S_CONTEXT)
			seq = MIN_D_CONTEXT;
		if (!__find_vx_info(seq))
			return seq;
	} while (barrier != seq);
	return 0;
}

extern int vx_proc_create(struct vx_info *);

static struct vx_info *find_or_create_vx_info(int id)
{
	struct vx_info *new, *vxi = NULL;

	vxdprintk("find_or_create_vx_info(%d)\n", id);

	if (!(new = alloc_vx_info(id)))
		return 0;

	spin_lock(&vxlist_lock);

	/* dynamic context requested */
	if (id == VX_DYNAMIC_ID) {
		id = __vx_dynamic_id();
		if (!id) {
			printk(KERN_ERR "no dynamic context available.\n");
			goto out_unlock;
		}
		new->vx_id = id;
	}
	/* existing context requested */
	else if ((vxi = __find_vx_info(id))) {
		vxdprintk("find_or_create_vx_info(%d) = %p (found)\n", id, vxi);
		get_vx_info(vxi);
		goto out_unlock;
	}

	/* new context requested */
	vxdprintk("find_or_create_vx_info(%d) = %p (new)\n", id, new);
	atomic_set(&new->vx_refcount, 1);
	list_add(&new->vx_list, &vx_infos);
	vx_proc_create(new);
	vxi = new, new = NULL;

out_unlock:
	spin_unlock(&vxlist_lock);
	if (new)
		free_vx_info(new);
	return vxi;
}


static int vx_migrate_user(struct task_struct *p, struct vx_info *vxi)
{
	struct user_struct *new_user, *old_user;

	if (!p || !vxi)
		BUG();
	new_user = alloc_uid(vxi->vx_id, p->uid);
	if (!new_user)
		return -ENOMEM;

	old_user = p->user;
	if (new_user != old_user) {
		atomic_inc(&new_user->processes);
		atomic_dec(&old_user->processes);
		p->user = new_user;
	}
	free_uid(old_user);
	return 0;
}

/*
 *	migrate task to new context
 *	gets vxi, puts old_vxi on change
 */

static int vx_migrate_task(struct task_struct *p, struct vx_info *vxi)
{
	struct vx_info *old_vxi;
	int ret = 0;

	if (!p || !vxi)
		BUG();

	vxdprintk("vx_migrate_task(%p,%p[#%d.%d])\n",
		p, vxi, vxi->vx_id, atomic_read(&vxi->vx_refcount));

	spin_lock(&p->alloc_lock);
	if ((old_vxi = p->vx_info) == vxi)
		goto out;

	if (!(ret = vx_migrate_user(p, vxi))) {
		if (old_vxi)
			old_vxi->virt.nr_threads--;
		vxi->virt.nr_threads++;
		p->vx_info = get_vx_info(vxi);
		p->vx_id = vxi->vx_id;
		if (old_vxi)
			put_vx_info(old_vxi);
	}
out:
	spin_unlock(&p->alloc_lock);
	return ret;
}
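/*
 * Reference counting convention used above: find_vx_info() and
 * find_or_create_vx_info() return the context with one reference
 * already taken for the caller, vx_migrate_task() takes its own
 * reference for p->vx_info and drops the old context's reference,
 * so the caller still has to put_vx_info() its lookup reference
 * once it is done (as vc_new_s_context() below does).
 */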
static int vx_set_initpid(struct vx_info *vxi, int pid)
{
	int ret = 0;

	if (vxi->vx_initpid)
		ret = -EPERM;
	else
		vxi->vx_initpid = pid;
	return ret;
}

int vc_new_s_context(uint32_t ctx, void *data)
{
	int ret = -EPERM;
	struct vcmd_new_s_context_v1 vc_data;
	struct vx_info *new_vxi;

	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	/* legacy hack, will be removed soon */
	if (ctx == -2) {
		/* assign flags and initpid */
		if (!current->vx_info)
			return -EINVAL;
		ret = 0;
		if (vc_data.flags & VX_INFO_INIT)
			ret = vx_set_initpid(current->vx_info, current->tgid);
		if (ret == 0) {
			/* We keep the same vx_id, but lower the capabilities */
			current->cap_bset &= (~vc_data.remove_cap);
			ret = vx_current_id();
			current->vx_info->vx_flags |= vc_data.flags;
		}
		return ret;
	}

	if (!vx_check(0, VX_ADMIN) || !capable(CAP_SYS_ADMIN) ||
		(current->vx_info &&
		(current->vx_info->vx_flags & VX_INFO_LOCK)))
		return -EPERM;

	if (((ctx > MAX_S_CONTEXT) && (ctx != VX_DYNAMIC_ID)) ||
		(ctx == 0))
		return -EINVAL;

	if ((ctx == VX_DYNAMIC_ID) || (ctx < MIN_D_CONTEXT))
		new_vxi = find_or_create_vx_info(ctx);
	else
		new_vxi = find_vx_info(ctx);
	if (!new_vxi)
		return -EINVAL;

	ret = vx_migrate_task(current, new_vxi);
	if (ret == 0) {
		current->cap_bset &= (~vc_data.remove_cap);
		new_vxi->vx_flags |= vc_data.flags;
		if (vc_data.flags & VX_INFO_INIT)
			vx_set_initpid(new_vxi, current->tgid);
		ret = new_vxi->vx_id;
	}
	put_vx_info(new_vxi);
	return ret;
}


LIST_HEAD(ip_infos);

spinlock_t iplist_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;


/*
 *	struct ip_info allocation and deallocation
 */

static struct ip_info *alloc_ip_info(void)
{
	struct ip_info *new = NULL;

	vxdprintk("alloc_ip_info()\n");
	/* would this benefit from a slab cache? */
	new = kmalloc(sizeof(struct ip_info), GFP_KERNEL);
	if (!new)
		return 0;

	memset(new, 0, sizeof(struct ip_info));
	/* rest of init goes here */

	vxdprintk("alloc_ip_info() = %p\n", new);
	return new;
}

// extern int ip_proc_destroy(struct ip_info *);

void free_ip_info(struct ip_info *ipi)
{
	vxdprintk("free_ip_info(%p)\n", ipi);
//	ip_proc_destroy(ipi);
	kfree(ipi);
}


static struct ip_info *create_ip_info(void)
{
	struct ip_info *new;

	vxdprintk("create_ip_info()\n");
	if (!(new = alloc_ip_info()))
		return 0;

	spin_lock(&iplist_lock);

	/* new ip info */
	atomic_set(&new->ip_refcount, 1);
	list_add(&new->ip_list, &ip_infos);
//	ip_proc_create(new);

	spin_unlock(&iplist_lock);
	return new;
}


/* set ipv4 root (syscall) */

int vc_set_ipv4root(uint32_t nbip, void *data)
{
	int i, err = -EPERM;
	struct vcmd_set_ipv4root_v3 vc_data;
	struct ip_info *new_ipi, *ipi = current->ip_info;

	if (nbip < 0 || nbip > NB_IPV4ROOT)
		return -EINVAL;
	if (copy_from_user(&vc_data, data, sizeof(vc_data)))
		return -EFAULT;

	if (!ipi || ipi->ipv4[0] == 0 || capable(CAP_NET_ADMIN))
		// We are allowed to change everything
		err = 0;
	else if (ipi) {
		int found = 0;

		// We are allowed to select a subset of the currently
		// installed IP numbers. No new one allowed
		// We can't change the broadcast address though
		for (i=0; i<nbip; i++) {
			int j;
			uint32_t ipip = vc_data.ip_mask_pair[i].ip;

			for (j=0; j<ipi->nbipv4; j++) {
				if (ipip == ipi->ipv4[j]) {
					found++;
					break;
				}
			}
		}
		if ((found == nbip) &&
			(vc_data.broadcast == ipi->v4_bcast))
			err = 0;
	}
	if (err)
		return err;

	new_ipi = create_ip_info();
	if (!new_ipi)
		return -EINVAL;

	new_ipi->nbipv4 = nbip;
	for (i=0; i<nbip; i++) {
		new_ipi->ipv4[i] = vc_data.ip_mask_pair[i].ip;
		new_ipi->mask[i] = vc_data.ip_mask_pair[i].mask;
	}
	new_ipi->v4_bcast = vc_data.broadcast;

	current->ip_info = new_ipi;
	put_ip_info(ipi);
	return 0;
}
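/*
 * Illustrative sketch (hypothetical helper, not part of this file): how
 * code elsewhere could test whether an address belongs to a context's
 * ipv4 root as installed by vc_set_ipv4root() above.  The real checks
 * live in the networking parts of the patch, so this is only meant to
 * show how the ip_info fields are intended to be read.
 */
#if 0
static inline int ip_info_has_addr(struct ip_info *ipi, uint32_t addr)
{
	int i;

	if (!ipi || !ipi->nbipv4)	/* no ipv4 root set: unrestricted */
		return 1;
	for (i = 0; i < ipi->nbipv4; i++)
		if (ipi->ipv4[i] == addr)
			return 1;
	return 0;
}
#endif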