--- linux-2.6.18.2/include/linux/vserver/network.h 1970-01-01 01:00:00 +0100 +++ linux-2.6.18.2-vs2.1.1/include/linux/vserver/network.h 2006-09-25 15:40:02 +0200 @@ -0,0 +1,142 @@ +#ifndef _VX_NETWORK_H +#define _VX_NETWORK_H + +#include + + +#define MAX_N_CONTEXT 65535 /* Arbitrary limit */ + +#define NX_DYNAMIC_ID ((uint32_t)-1) /* id for dynamic context */ + +#define NB_IPV4ROOT 16 + + +/* network flags */ + +#define NXF_INFO_LOCK 0x00000001 + +#define NXF_STATE_SETUP (1ULL<<32) +#define NXF_STATE_ADMIN (1ULL<<34) + +#define NXF_SC_HELPER (1ULL<<36) +#define NXF_PERSISTENT (1ULL<<38) + +#define NXF_ONE_TIME (0x0005ULL<<32) + +#define NXF_INIT_SET (NXF_STATE_ADMIN) + + +/* address types */ + +#define NXA_TYPE_IPV4 1 +#define NXA_TYPE_IPV6 2 + +#define NXA_MOD_BCAST (1<<8) + +#define NXA_TYPE_ANY ((uint16_t)-1) + + +#ifdef __KERNEL__ + +#include +#include +#include +#include + + +struct nx_info { + struct hlist_node nx_hlist; /* linked list of nxinfos */ + nid_t nx_id; /* vnet id */ + atomic_t nx_usecnt; /* usage count */ + atomic_t nx_tasks; /* tasks count */ + int nx_state; /* context state */ + + uint64_t nx_flags; /* network flag word */ + uint64_t nx_ncaps; /* network capabilities */ + + int nbipv4; + __u32 ipv4[NB_IPV4ROOT]; /* Process can only bind to these IPs */ + /* The first one is used to connect */ + /* and for bind any service */ + /* The other must be used explicity */ + __u32 mask[NB_IPV4ROOT]; /* Netmask for each ipv4 */ + /* Used to select the proper source */ + /* address for sockets */ + __u32 v4_bcast; /* Broadcast address to receive UDP */ + + char nx_name[65]; /* network context name */ +}; + + +/* status flags */ + +#define NXS_HASHED 0x0001 +#define NXS_SHUTDOWN 0x0100 +#define NXS_RELEASED 0x8000 + +/* check conditions */ + +#define NX_ADMIN 0x0001 +#define NX_WATCH 0x0002 +#define NX_BLEND 0x0004 +#define NX_HOSTID 0x0008 + +#define NX_IDENT 0x0010 +#define NX_EQUIV 0x0020 +#define NX_PARENT 0x0040 +#define NX_CHILD 0x0080 + +#define NX_ARG_MASK 0x00F0 + +#define NX_DYNAMIC 0x0100 +#define NX_STATIC 0x0200 + +#define NX_ATR_MASK 0x0F00 + + +extern struct nx_info *lookup_nx_info(int); + +extern int get_nid_list(int, unsigned int *, int); +extern int nid_is_hashed(nid_t); + +extern int nx_migrate_task(struct task_struct *, struct nx_info *); + +extern long vs_net_change(struct nx_info *, unsigned int); + +struct in_ifaddr; +struct net_device; + +#ifdef CONFIG_INET +int ifa_in_nx_info(struct in_ifaddr *, struct nx_info *); +int dev_in_nx_info(struct net_device *, struct nx_info *); + +#else /* CONFIG_INET */ +static inline +int ifa_in_nx_info(struct in_ifaddr *a, struct nx_info *n) +{ + return 1; +} + +static inline +int dev_in_nx_info(struct net_device *d, struct nx_info *n) +{ + return 1; +} +#endif /* CONFIG_INET */ + +struct sock; + +#ifdef CONFIG_INET +int nx_addr_conflict(struct nx_info *, uint32_t, struct sock *); +#else /* CONFIG_INET */ +static inline +int nx_addr_conflict(struct nx_info *n, uint32_t a, struct sock *s) +{ + return 1; +} +#endif /* CONFIG_INET */ + +#endif /* __KERNEL__ */ +#else /* _VX_NETWORK_H */ +#warning duplicate inclusion +#endif /* _VX_NETWORK_H */ --- linux-2.6.18.2/include/net/inet_hashtables.h 2006-09-20 16:58:44 +0200 +++ linux-2.6.18.2-vs2.1.1/include/net/inet_hashtables.h 2006-09-20 17:01:45 +0200 @@ -271,6 +271,25 @@ static inline int inet_iif(const struct return ((struct rtable *)skb->dst)->rt_iif; } +/* + * Check if a given address matches for an inet socket + * + * nxi: the socket's nx_info if any + * addr: to be verified address + * saddr: socket addresses + */ +static inline int inet_addr_match ( + struct nx_info *nxi, + uint32_t addr, + uint32_t saddr) +{ + if (addr && (saddr == addr)) + return 1; + if (!saddr) + return addr_in_nx_info(nxi, addr); + return 0; +} + extern struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 daddr, const unsigned short hnum, --- linux-2.6.18.2/include/net/inet_hashtables.h 2006-09-20 16:58:44 +0200 +++ linux-2.6.18.2-vs2.1.1/include/net/inet_hashtables.h 2006-09-20 17:01:45 +0200 @@ -291,7 +310,7 @@ static inline struct sock * const struct inet_sock *inet = inet_sk((sk = __sk_head(head))); if (inet->num == hnum && !sk->sk_node.next && - (!inet->rcv_saddr || inet->rcv_saddr == daddr) && + inet_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) && (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && !sk->sk_bound_dev_if) goto sherry_cache; --- linux-2.6.18.2/include/net/inet_sock.h 2006-09-20 16:58:44 +0200 +++ linux-2.6.18.2-vs2.1.1/include/net/inet_sock.h 2006-09-20 17:01:45 +0200 @@ -114,6 +114,7 @@ struct inet_sock { /* Socket demultiplex comparisons on incoming packets. */ __u32 daddr; __u32 rcv_saddr; + __u32 rcv_saddr2; /* Second bound ipv4 addr, for ipv4root */ __u16 dport; __u16 num; __u32 saddr; --- linux-2.6.18.2/include/net/inet_timewait_sock.h 2006-09-20 16:58:44 +0200 +++ linux-2.6.18.2-vs2.1.1/include/net/inet_timewait_sock.h 2006-09-20 17:01:45 +0200 @@ -115,6 +115,10 @@ struct inet_timewait_sock { #define tw_refcnt __tw_common.skc_refcnt #define tw_hash __tw_common.skc_hash #define tw_prot __tw_common.skc_prot +#define tw_xid __tw_common.skc_xid +#define tw_vx_info __tw_common.skc_vx_info +#define tw_nid __tw_common.skc_nid +#define tw_nx_info __tw_common.skc_nx_info volatile unsigned char tw_substate; /* 3 bits hole, try to pack */ unsigned char tw_rcv_wscale; --- linux-2.6.18.2/include/net/route.h 2006-09-20 16:58:44 +0200 +++ linux-2.6.18.2-vs2.1.1/include/net/route.h 2006-10-18 04:06:32 +0200 @@ -143,6 +146,59 @@ static inline char rt_tos2priority(u8 to return ip_tos2prio[IPTOS_TOS(tos)>>1]; } +#define IPI_LOOPBACK htonl(INADDR_LOOPBACK) + +static inline int ip_find_src(struct nx_info *nxi, struct rtable **rp, struct flowi *fl) +{ + int err; + int i, n = nxi->nbipv4; + u32 ipv4root = nxi->ipv4[0]; + + if (ipv4root == 0) + return 0; + + if (fl->fl4_src == 0) { + if (n > 1) { + u32 foundsrc; + + err = __ip_route_output_key(rp, fl); + if (err) { + fl->fl4_src = ipv4root; + err = __ip_route_output_key(rp, fl); + } + if (err) + return err; + + foundsrc = (*rp)->rt_src; + ip_rt_put(*rp); + + for (i=0; imask[i]; + u32 ipv4 = nxi->ipv4[i]; + u32 net4 = ipv4 & mask; + + if (foundsrc == ipv4) { + fl->fl4_src = ipv4; + break; + } + if (!fl->fl4_src && (foundsrc & mask) == net4) + fl->fl4_src = ipv4; + } + } + if (fl->fl4_src == 0) + fl->fl4_src = (fl->fl4_dst == IPI_LOOPBACK) + ? IPI_LOOPBACK : ipv4root; + } else { + for (i=0; iipv4[i] == fl->fl4_src) + break; + } + if (i == n) + return -EPERM; + } + return 0; +} + static inline int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos, int oif, u8 protocol, u16 sport, u16 dport, struct sock *sk) --- linux-2.6.18.2/include/net/route.h 2006-09-20 16:58:44 +0200 +++ linux-2.6.18.2-vs2.1.1/include/net/route.h 2006-10-18 04:06:32 +0200 @@ -157,7 +213,27 @@ static inline int ip_route_connect(struc .dport = dport } } }; int err; - if (!dst || !src) { + struct nx_info *nx_info = current->nx_info; + + if (sk) + nx_info = sk->sk_nx_info; + vxdprintk(VXD_CBIT(net, 4), + "ip_route_connect(%p) %p,%p;%lx", + sk, nx_info, sk->sk_socket, + (sk->sk_socket?sk->sk_socket->flags:0)); + + if (nx_info) { + err = ip_find_src(nx_info, rp, &fl); + if (err) + return err; + if (fl.fl4_dst == IPI_LOOPBACK && !vx_check(0, VX_ADMIN)) + fl.fl4_dst = nx_info->ipv4[0]; +#ifdef CONFIG_VSERVER_REMAP_SADDR + if (fl.fl4_src == IPI_LOOPBACK && !vx_check(0, VX_ADMIN)) + fl.fl4_src = nx_info->ipv4[0]; +#endif + } + if (!fl.fl4_dst || !fl.fl4_src) { err = __ip_route_output_key(rp, &fl); if (err) return err; --- linux-2.6.18.2/kernel/vserver/network.c 1970-01-01 01:00:00 +0100 +++ linux-2.6.18.2-vs2.1.1/kernel/vserver/network.c 2006-09-25 15:40:02 +0200 @@ -0,0 +1,752 @@ +/* + * linux/kernel/vserver/network.c + * + * Virtual Server: Network Support + * + * Copyright (C) 2003-2006 Herbert Pötzl + * + * V0.01 broken out from vcontext V0.05 + * V0.02 cleaned up implementation + * V0.03 added equiv nx commands + * V0.04 switch to RCU based hash + * V0.05 and back to locking again + * V0.06 changed vcmds to nxi arg + * + */ + +#include +#include +#include + +#include +#include + + +/* __alloc_nx_info() + + * allocate an initialized nx_info struct + * doesn't make it visible (hash) */ + +static struct nx_info *__alloc_nx_info(nid_t nid) +{ + struct nx_info *new = NULL; + + vxdprintk(VXD_CBIT(nid, 1), "alloc_nx_info(%d)*", nid); + + /* would this benefit from a slab cache? */ + new = kmalloc(sizeof(struct nx_info), GFP_KERNEL); + if (!new) + return 0; + + memset (new, 0, sizeof(struct nx_info)); + new->nx_id = nid; + INIT_HLIST_NODE(&new->nx_hlist); + atomic_set(&new->nx_usecnt, 0); + atomic_set(&new->nx_tasks, 0); + new->nx_state = 0; + + new->nx_flags = NXF_INIT_SET; + + /* rest of init goes here */ + + vxdprintk(VXD_CBIT(nid, 0), + "alloc_nx_info(%d) = %p", nid, new); + return new; +} + +/* __dealloc_nx_info() + + * final disposal of nx_info */ + +static void __dealloc_nx_info(struct nx_info *nxi) +{ + vxdprintk(VXD_CBIT(nid, 0), + "dealloc_nx_info(%p)", nxi); + + nxi->nx_hlist.next = LIST_POISON1; + nxi->nx_id = -1; + + BUG_ON(atomic_read(&nxi->nx_usecnt)); + BUG_ON(atomic_read(&nxi->nx_tasks)); + + nxi->nx_state |= NXS_RELEASED; + kfree(nxi); +} + +static void __shutdown_nx_info(struct nx_info *nxi) +{ + nxi->nx_state |= NXS_SHUTDOWN; + vs_net_change(nxi, VSC_NETDOWN); +} + +/* exported stuff */ + +void free_nx_info(struct nx_info *nxi) +{ + /* context shutdown is mandatory */ + BUG_ON(nxi->nx_state != NXS_SHUTDOWN); + + /* context must not be hashed */ + BUG_ON(nxi->nx_state & NXS_HASHED); + + BUG_ON(atomic_read(&nxi->nx_usecnt)); + BUG_ON(atomic_read(&nxi->nx_tasks)); + + __dealloc_nx_info(nxi); +} + + +/* hash table for nx_info hash */ + +#define NX_HASH_SIZE 13 + +struct hlist_head nx_info_hash[NX_HASH_SIZE]; + +static spinlock_t nx_info_hash_lock = SPIN_LOCK_UNLOCKED; + + +static inline unsigned int __hashval(nid_t nid) +{ + return (nid % NX_HASH_SIZE); +} + + + +/* __hash_nx_info() + + * add the nxi to the global hash table + * requires the hash_lock to be held */ + +static inline void __hash_nx_info(struct nx_info *nxi) +{ + struct hlist_head *head; + + vxd_assert_lock(&nx_info_hash_lock); + vxdprintk(VXD_CBIT(nid, 4), + "__hash_nx_info: %p[#%d]", nxi, nxi->nx_id); + + /* context must not be hashed */ + BUG_ON(nx_info_state(nxi, NXS_HASHED)); + + nxi->nx_state |= NXS_HASHED; + head = &nx_info_hash[__hashval(nxi->nx_id)]; + hlist_add_head(&nxi->nx_hlist, head); +} + +/* __unhash_nx_info() + + * remove the nxi from the global hash table + * requires the hash_lock to be held */ + +static inline void __unhash_nx_info(struct nx_info *nxi) +{ + vxd_assert_lock(&nx_info_hash_lock); + vxdprintk(VXD_CBIT(nid, 4), + "__unhash_nx_info: %p[#%d]", nxi, nxi->nx_id); + + /* context must be hashed */ + BUG_ON(!nx_info_state(nxi, NXS_HASHED)); + + nxi->nx_state &= ~NXS_HASHED; + hlist_del(&nxi->nx_hlist); +} + + +/* __lookup_nx_info() + + * requires the hash_lock to be held + * doesn't increment the nx_refcnt */ + +static inline struct nx_info *__lookup_nx_info(nid_t nid) +{ + struct hlist_head *head = &nx_info_hash[__hashval(nid)]; + struct hlist_node *pos; + struct nx_info *nxi; + + vxd_assert_lock(&nx_info_hash_lock); + hlist_for_each(pos, head) { + nxi = hlist_entry(pos, struct nx_info, nx_hlist); + + if (nxi->nx_id == nid) + goto found; + } + nxi = NULL; +found: + vxdprintk(VXD_CBIT(nid, 0), + "__lookup_nx_info(#%u): %p[#%u]", + nid, nxi, nxi?nxi->nx_id:0); + return nxi; +} + + +/* __nx_dynamic_id() + + * find unused dynamic nid + * requires the hash_lock to be held */ + +static inline nid_t __nx_dynamic_id(void) +{ + static nid_t seq = MAX_N_CONTEXT; + nid_t barrier = seq; + + vxd_assert_lock(&nx_info_hash_lock); + do { + if (++seq > MAX_N_CONTEXT) + seq = MIN_D_CONTEXT; + if (!__lookup_nx_info(seq)) { + vxdprintk(VXD_CBIT(nid, 4), + "__nx_dynamic_id: [#%d]", seq); + return seq; + } + } while (barrier != seq); + return 0; +} + +/* __create_nx_info() + + * create the requested context + * get() and hash it */ + +static struct nx_info * __create_nx_info(int id) +{ + struct nx_info *new, *nxi = NULL; + + vxdprintk(VXD_CBIT(nid, 1), "create_nx_info(%d)*", id); + + if (!(new = __alloc_nx_info(id))) + return ERR_PTR(-ENOMEM); + + /* required to make dynamic xids unique */ + spin_lock(&nx_info_hash_lock); + + /* dynamic context requested */ + if (id == NX_DYNAMIC_ID) { +#ifdef CONFIG_VSERVER_DYNAMIC_IDS + id = __nx_dynamic_id(); + if (!id) { + printk(KERN_ERR "no dynamic context available.\n"); + nxi = ERR_PTR(-EAGAIN); + goto out_unlock; + } + new->nx_id = id; +#else + printk(KERN_ERR "dynamic contexts disabled.\n"); + nxi = ERR_PTR(-EINVAL); + goto out_unlock; +#endif + } + /* static context requested */ + else if ((nxi = __lookup_nx_info(id))) { + vxdprintk(VXD_CBIT(nid, 0), + "create_nx_info(%d) = %p (already there)", id, nxi); + if (nx_info_flags(nxi, NXF_STATE_SETUP, 0)) + nxi = ERR_PTR(-EBUSY); + else + nxi = ERR_PTR(-EEXIST); + goto out_unlock; + } + /* dynamic nid creation blocker */ + else if (id >= MIN_D_CONTEXT) { + vxdprintk(VXD_CBIT(nid, 0), + "create_nx_info(%d) (dynamic rejected)", id); + nxi = ERR_PTR(-EINVAL); + goto out_unlock; + } + + /* new context */ + vxdprintk(VXD_CBIT(nid, 0), + "create_nx_info(%d) = %p (new)", id, new); + __hash_nx_info(get_nx_info(new)); + nxi = new, new = NULL; + +out_unlock: + spin_unlock(&nx_info_hash_lock); + if (new) + __dealloc_nx_info(new); + return nxi; +} + + + +/* exported stuff */ + + +void unhash_nx_info(struct nx_info *nxi) +{ + __shutdown_nx_info(nxi); + spin_lock(&nx_info_hash_lock); + __unhash_nx_info(nxi); + spin_unlock(&nx_info_hash_lock); +} + +#ifdef CONFIG_VSERVER_LEGACYNET + +struct nx_info *create_nx_info(void) +{ + return __create_nx_info(NX_DYNAMIC_ID); +} + +#endif + +/* lookup_nx_info() + + * search for a nx_info and get() it + * negative id means current */ + +struct nx_info *lookup_nx_info(int id) +{ + struct nx_info *nxi = NULL; + + if (id < 0) { + nxi = get_nx_info(current->nx_info); + } else if (id > 1) { + spin_lock(&nx_info_hash_lock); + nxi = get_nx_info(__lookup_nx_info(id)); + spin_unlock(&nx_info_hash_lock); + } + return nxi; +} + +/* nid_is_hashed() + + * verify that nid is still hashed */ + +int nid_is_hashed(nid_t nid) +{ + int hashed; + + spin_lock(&nx_info_hash_lock); + hashed = (__lookup_nx_info(nid) != NULL); + spin_unlock(&nx_info_hash_lock); + return hashed; +} + + +#ifdef CONFIG_PROC_FS + +/* get_nid_list() + + * get a subset of hashed nids for proc + * assumes size is at least one */ + +int get_nid_list(int index, unsigned int *nids, int size) +{ + int hindex, nr_nids = 0; + + /* only show current and children */ + if (!nx_check(0, VX_ADMIN|VX_WATCH)) { + if (index > 0) + return 0; + nids[nr_nids] = nx_current_nid(); + return 1; + } + + for (hindex = 0; hindex < NX_HASH_SIZE; hindex++) { + struct hlist_head *head = &nx_info_hash[hindex]; + struct hlist_node *pos; + + spin_lock(&nx_info_hash_lock); + hlist_for_each(pos, head) { + struct nx_info *nxi; + + if (--index > 0) + continue; + + nxi = hlist_entry(pos, struct nx_info, nx_hlist); + nids[nr_nids] = nxi->nx_id; + if (++nr_nids >= size) { + spin_unlock(&nx_info_hash_lock); + goto out; + } + } + /* keep the lock time short */ + spin_unlock(&nx_info_hash_lock); + } +out: + return nr_nids; +} +#endif + + +/* + * migrate task to new network + * gets nxi, puts old_nxi on change + */ + +int nx_migrate_task(struct task_struct *p, struct nx_info *nxi) +{ + struct nx_info *old_nxi; + int ret = 0; + + if (!p || !nxi) + BUG(); + + vxdprintk(VXD_CBIT(nid, 5), + "nx_migrate_task(%p,%p[#%d.%d.%d])", + p, nxi, nxi->nx_id, + atomic_read(&nxi->nx_usecnt), + atomic_read(&nxi->nx_tasks)); + + if (nx_info_flags(nxi, NXF_INFO_LOCK, 0)) + return -EACCES; + + /* maybe disallow this completely? */ + old_nxi = task_get_nx_info(p); + if (old_nxi == nxi) + goto out; + + task_lock(p); + if (old_nxi) + clr_nx_info(&p->nx_info); + claim_nx_info(nxi, p); + set_nx_info(&p->nx_info, nxi); + p->nid = nxi->nx_id; + task_unlock(p); + + vxdprintk(VXD_CBIT(nid, 5), + "moved task %p into nxi:%p[#%d]", + p, nxi, nxi->nx_id); + + if (old_nxi) + release_nx_info(old_nxi, p); +out: + put_nx_info(old_nxi); + return ret; +} + + +#ifdef CONFIG_INET + +#include +#include + +int ifa_in_nx_info(struct in_ifaddr *ifa, struct nx_info *nxi) +{ + if (!nxi) + return 1; + if (!ifa) + return 0; + return addr_in_nx_info(nxi, ifa->ifa_local); +} + +int dev_in_nx_info(struct net_device *dev, struct nx_info *nxi) +{ + struct in_device *in_dev; + struct in_ifaddr **ifap; + struct in_ifaddr *ifa; + int ret = 0; + + if (!nxi) + return 1; + + in_dev = in_dev_get(dev); + if (!in_dev) + goto out; + + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) { + if (addr_in_nx_info(nxi, ifa->ifa_local)) { + ret = 1; + break; + } + } + in_dev_put(in_dev); +out: + return ret; +} + +/* + * check if address is covered by socket + * + * sk: the socket to check against + * addr: the address in question (must be != 0) + */ +static inline int __addr_in_socket(struct sock *sk, uint32_t addr) +{ + struct nx_info *nxi = sk->sk_nx_info; + uint32_t saddr = inet_rcv_saddr(sk); + + vxdprintk(VXD_CBIT(net, 5), + "__addr_in_socket(%p,%d.%d.%d.%d) %p:%d.%d.%d.%d %p;%lx", + sk, VXD_QUAD(addr), nxi, VXD_QUAD(saddr), sk->sk_socket, + (sk->sk_socket?sk->sk_socket->flags:0)); + + if (saddr) { + /* direct address match */ + return (saddr == addr); + } else if (nxi) { + /* match against nx_info */ + return addr_in_nx_info(nxi, addr); + } else { + /* unrestricted any socket */ + return 1; + } +} + + +int nx_addr_conflict(struct nx_info *nxi, uint32_t addr, struct sock *sk) +{ + vxdprintk(VXD_CBIT(net, 2), + "nx_addr_conflict(%p,%p) %d.%d,%d.%d", + nxi, sk, VXD_QUAD(addr)); + + if (addr) { + /* check real address */ + return __addr_in_socket(sk, addr); + } else if (nxi) { + /* check against nx_info */ + int i, n = nxi->nbipv4; + + for (i=0; iipv4[i])) + return 1; + return 0; + } else { + /* check against any */ + return 1; + } +} + +#endif /* CONFIG_INET */ + +void nx_set_persistent(struct nx_info *nxi) +{ + get_nx_info(nxi); + claim_nx_info(nxi, current); +} + +void nx_clear_persistent(struct nx_info *nxi) +{ + vxdprintk(VXD_CBIT(nid, 6), + "nx_clear_persistent(%p[#%d])", nxi, nxi->nx_id); + + release_nx_info(nxi, current); + put_nx_info(nxi); +} + +void nx_update_persistent(struct nx_info *nxi) +{ + if (nx_info_flags(nxi, NXF_PERSISTENT, 0)) + nx_set_persistent(nxi); + else + nx_clear_persistent(nxi); +} + +/* vserver syscall commands below here */ + +/* taks nid and nx_info functions */ + +#include + + +int vc_task_nid(uint32_t id, void __user *data) +{ + nid_t nid; + + if (id) { + struct task_struct *tsk; + + if (!vx_check(0, VX_ADMIN|VX_WATCH)) + return -EPERM; + + read_lock(&tasklist_lock); + tsk = find_task_by_real_pid(id); + nid = (tsk) ? tsk->nid : -ESRCH; + read_unlock(&tasklist_lock); + } + else + nid = nx_current_nid(); + return nid; +} + + +int vc_nx_info(struct nx_info *nxi, void __user *data) +{ + struct vcmd_nx_info_v0 vc_data; + + vc_data.nid = nxi->nx_id; + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + + +/* network functions */ + +int vc_net_create(uint32_t nid, void __user *data) +{ + struct vcmd_net_create vc_data = { .flagword = NXF_INIT_SET }; + struct nx_info *new_nxi; + int ret; + + if (data && copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + if ((nid > MAX_S_CONTEXT) && (nid != VX_DYNAMIC_ID)) + return -EINVAL; + if (nid < 2) + return -EINVAL; + + new_nxi = __create_nx_info(nid); + if (IS_ERR(new_nxi)) + return PTR_ERR(new_nxi); + + /* initial flags */ + new_nxi->nx_flags = vc_data.flagword; + + /* get a reference for persistent contexts */ + if ((vc_data.flagword & NXF_PERSISTENT)) + nx_set_persistent(new_nxi); + + ret = -ENOEXEC; + if (vs_net_change(new_nxi, VSC_NETUP)) + goto out_unhash; + ret = nx_migrate_task(current, new_nxi); + if (!ret) { + /* return context id on success */ + ret = new_nxi->nx_id; + goto out; + } +out_unhash: + /* prepare for context disposal */ + new_nxi->nx_state |= NXS_SHUTDOWN; + if ((vc_data.flagword & NXF_PERSISTENT)) + nx_clear_persistent(new_nxi); + __unhash_nx_info(new_nxi); +out: + put_nx_info(new_nxi); + return ret; +} + + +int vc_net_migrate(struct nx_info *nxi, void __user *data) +{ + return nx_migrate_task(current, nxi); +} + +int vc_net_add(struct nx_info *nxi, void __user *data) +{ + struct vcmd_net_addr_v0 vc_data; + int index, pos, ret = 0; + + if (data && copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + switch (vc_data.type) { + case NXA_TYPE_IPV4: + if ((vc_data.count < 1) || (vc_data.count > 4)) + return -EINVAL; + break; + + default: + break; + } + + switch (vc_data.type) { + case NXA_TYPE_IPV4: + index = 0; + while ((index < vc_data.count) && + ((pos = nxi->nbipv4) < NB_IPV4ROOT)) { + nxi->ipv4[pos] = vc_data.ip[index]; + nxi->mask[pos] = vc_data.mask[index]; + index++; + nxi->nbipv4++; + } + ret = index; + break; + + case NXA_TYPE_IPV4|NXA_MOD_BCAST: + nxi->v4_bcast = vc_data.ip[0]; + ret = 1; + break; + + default: + ret = -EINVAL; + break; + } + return ret; +} + +int vc_net_remove(struct nx_info * nxi, void __user *data) +{ + struct vcmd_net_addr_v0 vc_data; + + if (data && copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + switch (vc_data.type) { + case NXA_TYPE_ANY: + nxi->nbipv4 = 0; + break; + + default: + return -EINVAL; + } + return 0; +} + +int vc_get_nflags(struct nx_info *nxi, void __user *data) +{ + struct vcmd_net_flags_v0 vc_data; + + vc_data.flagword = nxi->nx_flags; + + /* special STATE flag handling */ + vc_data.mask = vx_mask_flags(~0UL, nxi->nx_flags, NXF_ONE_TIME); + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +int vc_set_nflags(struct nx_info *nxi, void __user *data) +{ + struct vcmd_net_flags_v0 vc_data; + uint64_t mask, trigger; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + /* special STATE flag handling */ + mask = vx_mask_mask(vc_data.mask, nxi->nx_flags, NXF_ONE_TIME); + trigger = (mask & nxi->nx_flags) ^ (mask & vc_data.flagword); + + nxi->nx_flags = vx_mask_flags(nxi->nx_flags, + vc_data.flagword, mask); + if (trigger & NXF_PERSISTENT) + nx_update_persistent(nxi); + + return 0; +} + +int vc_get_ncaps(struct nx_info *nxi, void __user *data) +{ + struct vcmd_net_caps_v0 vc_data; + + vc_data.ncaps = nxi->nx_ncaps; + vc_data.cmask = ~0UL; + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +int vc_set_ncaps(struct nx_info *nxi, void __user *data) +{ + struct vcmd_net_caps_v0 vc_data; + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + nxi->nx_ncaps = vx_mask_flags(nxi->nx_ncaps, + vc_data.ncaps, vc_data.cmask); + return 0; +} + + +#include + +EXPORT_SYMBOL_GPL(free_nx_info); +EXPORT_SYMBOL_GPL(unhash_nx_info); + --- linux-2.6.18.2/net/ipv4/af_inet.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/af_inet.c 2006-10-18 04:06:32 +0200 @@ -402,6 +405,10 @@ int inet_bind(struct socket *sock, struc unsigned short snum; int chk_addr_ret; int err; + __u32 s_addr; /* Address used for validation */ + __u32 s_addr1; /* Address used for socket */ + __u32 s_addr2; /* Broadcast address for the socket */ + struct nx_info *nxi = sk->sk_nx_info; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { --- linux-2.6.18.2/net/ipv4/af_inet.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/af_inet.c 2006-10-18 04:06:32 +0200 @@ -412,7 +419,40 @@ int inet_bind(struct socket *sock, struc if (addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + s_addr = addr->sin_addr.s_addr; + s_addr1 = s_addr; + s_addr2 = 0xffffffffl; + + vxdprintk(VXD_CBIT(net, 3), + "inet_bind(%p)* %p,%p;%lx %d.%d.%d.%d", + sk, sk->sk_nx_info, sk->sk_socket, + (sk->sk_socket?sk->sk_socket->flags:0), + VXD_QUAD(s_addr)); + if (nxi) { + __u32 v4_bcast = nxi->v4_bcast; + __u32 ipv4root = nxi->ipv4[0]; + int nbipv4 = nxi->nbipv4; + + if (s_addr == 0) { + /* bind to any for 1-n */ + s_addr = ipv4root; + s_addr1 = (nbipv4 > 1) ? 0 : s_addr; + s_addr2 = v4_bcast; + } else if (s_addr == IPI_LOOPBACK) { + /* rewrite localhost to ipv4root */ + s_addr = ipv4root; + s_addr1 = ipv4root; + } else if (s_addr != v4_bcast) { + /* normal address bind */ + if (!addr_in_nx_info(nxi, s_addr)) + return -EADDRNOTAVAIL; + } + } + chk_addr_ret = inet_addr_type(s_addr); + + vxdprintk(VXD_CBIT(net, 3), + "inet_bind(%p) %d.%d.%d.%d, %d.%d.%d.%d, %d.%d.%d.%d", + sk, VXD_QUAD(s_addr), VXD_QUAD(s_addr1), VXD_QUAD(s_addr2)); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since --- linux-2.6.18.2/net/ipv4/af_inet.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/af_inet.c 2006-10-18 04:06:32 +0200 @@ -424,7 +464,7 @@ int inet_bind(struct socket *sock, struc err = -EADDRNOTAVAIL; if (!sysctl_ip_nonlocal_bind && !inet->freebind && - addr->sin_addr.s_addr != INADDR_ANY && + s_addr != INADDR_ANY && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) --- linux-2.6.18.2/net/ipv4/af_inet.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/af_inet.c 2006-10-18 04:06:32 +0200 @@ -449,7 +489,8 @@ int inet_bind(struct socket *sock, struc if (sk->sk_state != TCP_CLOSE || inet->num) goto out_release_sock; - inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; + inet->rcv_saddr = inet->saddr = s_addr1; + inet->rcv_saddr2 = s_addr2; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->saddr = 0; /* Use device */ --- linux-2.6.18.2/net/ipv4/inet_connection_sock.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/inet_connection_sock.c 2006-09-20 17:01:45 +0200 @@ -39,7 +39,6 @@ int sysctl_local_port_range[2] = { 1024, int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) { - const u32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; struct hlist_node *node; int reuse = sk->sk_reuse; --- linux-2.6.18.2/net/ipv4/inet_connection_sock.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/inet_connection_sock.c 2006-09-20 17:01:45 +0200 @@ -52,9 +51,8 @@ int inet_csk_bind_conflict(const struct sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { if (!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) { - const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); - if (!sk2_rcv_saddr || !sk_rcv_saddr || - sk2_rcv_saddr == sk_rcv_saddr) + if (nx_addr_conflict(sk->sk_nx_info, + inet_rcv_saddr(sk), sk2)) break; } } --- linux-2.6.18.2/net/ipv4/inet_hashtables.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/inet_hashtables.c 2006-09-20 17:01:45 +0200 @@ -138,11 +138,10 @@ struct sock *__inet_lookup_listener(cons const __u32 rcv_saddr = inet->rcv_saddr; int score = sk->sk_family == PF_INET ? 1 : 0; - if (rcv_saddr) { - if (rcv_saddr != daddr) - continue; + if (inet_addr_match(sk->sk_nx_info, daddr, rcv_saddr)) score += 2; - } + else + continue; if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) continue; --- linux-2.6.18.2/net/ipv4/raw.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/raw.c 2006-09-25 15:40:02 +0200 @@ -102,6 +102,27 @@ static void raw_v4_unhash(struct sock *s write_unlock_bh(&raw_v4_lock); } + +/* + * Check if a given address matches for a socket + * + * nxi: the socket's nx_info if any + * addr: to be verified address + * saddr/baddr: socket addresses + */ +static inline int raw_addr_match ( + struct nx_info *nxi, + uint32_t addr, + uint32_t saddr, + uint32_t baddr) +{ + if (addr && (saddr == addr || baddr == addr)) + return 1; + if (!saddr) + return addr_in_nx_info(nxi, addr); + return 0; +} + struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, __be32 raddr, __be32 laddr, int dif) --- linux-2.6.18.2/net/ipv4/raw.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/raw.c 2006-09-25 15:40:02 +0200 @@ -113,7 +134,8 @@ struct sock *__raw_v4_lookup(struct sock if (inet->num == num && !(inet->daddr && inet->daddr != raddr) && - !(inet->rcv_saddr && inet->rcv_saddr != laddr) && + raw_addr_match(sk->sk_nx_info, laddr, + inet->rcv_saddr, inet->rcv_saddr2) && !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) goto found; /* gotcha */ } --- linux-2.6.18.2/net/ipv4/raw.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/raw.c 2006-09-25 15:40:02 +0200 @@ -313,6 +335,11 @@ static int raw_send_hdrinc(struct sock * iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } + err = -EPERM; + if (!vx_check(0, VX_ADMIN) && !capable(CAP_NET_RAW) + && (!addr_in_nx_info(sk->sk_nx_info, iph->saddr))) + goto error_free; + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); if (err > 0) --- linux-2.6.18.2/net/ipv4/raw.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/raw.c 2006-09-25 15:40:02 +0200 @@ -324,6 +351,7 @@ out: error_fault: err = -EFAULT; +error_free: kfree_skb(skb); error: IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); --- linux-2.6.18.2/net/ipv4/raw.c 2006-09-20 16:58:50 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/raw.c 2006-09-25 15:40:02 +0200 @@ -484,6 +512,12 @@ static int raw_sendmsg(struct kiocb *ioc if (!inet->hdrincl) raw_probe_proto_opt(&fl, msg); + if (sk->sk_nx_info) { + err = ip_find_src(sk->sk_nx_info, &rt, &fl); + + if (err) + goto done; + } err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); } if (err) --- linux-2.6.18.2/net/ipv4/udp.c 2006-09-20 16:58:51 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/udp.c 2006-10-18 04:06:32 +0200 @@ -175,8 +175,7 @@ gotit: struct inet_sock *inet2 = inet_sk(sk2); if (inet2->num == snum && - sk2 != sk && - !ipv6_only_sock(sk2) && + sk2 != sk && !ipv6_only_sock(sk2) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && --- linux-2.6.18.2/net/ipv4/udp.c 2006-09-20 16:58:51 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/udp.c 2006-10-18 04:06:32 +0200 @@ -180,9 +179,8 @@ ***** (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - (!inet2->rcv_saddr || - !inet->rcv_saddr || - inet2->rcv_saddr == inet->rcv_saddr) && + nx_addr_conflict(sk->sk_nx_info, + inet_rcv_saddr(sk), sk2) && (!sk2->sk_reuse || !sk->sk_reuse)) goto fail; } --- linux-2.6.18.2/net/ipv4/udp.c 2006-09-20 16:58:51 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/udp.c 2006-10-18 04:06:32 +0200 @@ -237,6 +235,11 @@ static struct sock *udp_v4_lookup_longwa if (inet->rcv_saddr != daddr) continue; score+=2; + } else if (sk->sk_nx_info) { + if (addr_in_nx_info(sk->sk_nx_info, daddr)) + score+=2; + else + continue; } if (inet->daddr) { if (inet->daddr != saddr) --- linux-2.6.18.2/net/ipv4/udp.c 2006-09-20 16:58:51 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/udp.c 2006-10-18 04:06:32 +0200 @@ -293,7 +296,8 @@ static inline struct sock *udp_v4_mcast_ if (inet->num != hnum || (inet->daddr && inet->daddr != rmt_addr) || (inet->dport != rmt_port && inet->dport) || - (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || + (inet->rcv_saddr && inet->rcv_saddr != loc_addr && + inet->rcv_saddr2 && inet->rcv_saddr2 != loc_addr) || ipv6_only_sock(s) || (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) continue; --- linux-2.6.18.2/net/ipv4/udp.c 2006-09-20 16:58:51 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv4/udp.c 2006-10-18 04:06:32 +0200 @@ -603,6 +607,19 @@ int udp_sendmsg(struct kiocb *iocb, stru .uli_u = { .ports = { .sport = inet->sport, .dport = dport } } }; + struct nx_info *nxi = sk->sk_nx_info; + + if (nxi) { + err = ip_find_src(nxi, &rt, &fl); + if (err) + goto out; + if (daddr == IPI_LOOPBACK && !vx_check(0, VX_ADMIN)) + daddr = fl.fl4_dst = nxi->ipv4[0]; +#ifdef CONFIG_VSERVER_REMAP_SADDR + if (saddr == IPI_LOOPBACK && !vx_check(0, VX_ADMIN)) + saddr = fl.fl4_src = nxi->ipv4[0]; +#endif + } err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); if (err) goto out; --- linux-2.6.18.2/net/ipv6/addrconf.c 2006-09-20 16:58:51 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv6/addrconf.c 2006-09-20 17:01:45 +0200 @@ -2698,7 +2698,10 @@ static void if6_seq_stop(struct seq_file static int if6_seq_show(struct seq_file *seq, void *v) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; - seq_printf(seq, + + /* no ipv6 inside a vserver for now */ + if (vx_check(0, VX_ADMIN|VX_WATCH)) + seq_printf(seq, NIP6_SEQFMT " %02x %02x %02x %02x %8s\n", NIP6(ifp->addr), ifp->idev->dev->ifindex, --- linux-2.6.18.2/net/ipv6/addrconf.c 2006-09-20 16:58:51 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv6/addrconf.c 2006-09-20 17:01:45 +0200 @@ -3133,6 +3136,10 @@ static int inet6_dump_addr(struct sk_buf struct ifmcaddr6 *ifmca; struct ifacaddr6 *ifaca; + /* no ipv6 inside a vserver for now */ + if (skb->sk && skb->sk->sk_vx_info) + return skb->len; + s_idx = cb->args[0]; s_ip_idx = ip_idx = cb->args[1]; read_lock(&dev_base_lock); --- linux-2.6.18.2/net/ipv6/addrconf.c 2006-09-20 16:58:51 +0200 +++ linux-2.6.18.2-vs2.1.1/net/ipv6/addrconf.c 2006-09-20 17:01:45 +0200 @@ -3414,6 +3421,10 @@ static int inet6_dump_ifinfo(struct sk_b struct net_device *dev; struct inet6_dev *idev; + /* no ipv6 inside a vserver for now */ + if (skb->sk && skb->sk->sk_vx_info) + return skb->len; + read_lock(&dev_base_lock); for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { if (idx < s_idx)