--- linux-2.6.11.11/include/linux/ip.h	2005-03-02 12:38:52 +0100
+++ linux-2.6.11.11-vs2.0-rc3/include/linux/ip.h	2005-06-01 14:34:17 +0200
@@ -118,6 +118,7 @@ struct inet_sock {
 	/* Socket demultiplex comparisons on incoming packets. */
 	__u32			daddr;		/* Foreign IPv4 addr */
 	__u32			rcv_saddr;	/* Bound local IPv4 addr */
+	__u32			rcv_saddr2;	/* Second bound ipv4 addr, for ipv4root */
 	__u16			dport;		/* Destination port */
 	__u16			num;		/* Local port */
 	__u32			saddr;		/* Sending source */
--- linux-2.6.11.11/include/linux/vserver/network.h	1970-01-01 01:00:00 +0100
+++ linux-2.6.11.11-vs2.0-rc3/include/linux/vserver/network.h	2005-06-01 14:34:17 +0200
@@ -0,0 +1,78 @@
+#ifndef _VX_NETWORK_H
+#define _VX_NETWORK_H
+
+#include
+
+
+#define MAX_N_CONTEXT	65535	/* Arbitrary limit */
+
+#define NX_DYNAMIC_ID	((uint32_t)-1)	/* id for dynamic context */
+
+#define NB_IPV4ROOT	16
+
+
+/* context flags */
+
+#define NXF_STATE_SETUP		(1ULL<<32)
+
+
+#ifdef __KERNEL__
+
+#include
+#include
+#include
+#include
+
+
+struct nx_info {
+	struct hlist_node nx_hlist;	/* linked list of nxinfos */
+	nid_t nx_id;			/* vnet id */
+	atomic_t nx_usecnt;		/* usage count */
+	atomic_t nx_tasks;		/* tasks count */
+	int nx_state;			/* context state */
+
+	uint64_t nx_flags;		/* network flag word */
+	uint64_t nx_ncaps;		/* network capabilities */
+
+	int nbipv4;
+	__u32 ipv4[NB_IPV4ROOT];	/* Process can only bind to these IPs */
+					/* The first one is used to connect */
+					/* and for bind any service */
+					/* The other must be used explicity */
+	__u32 mask[NB_IPV4ROOT];	/* Netmask for each ipv4 */
+					/* Used to select the proper source */
+					/* address for sockets */
+	__u32 v4_bcast;			/* Broadcast address to receive UDP */
+
+	char nx_name[65];		/* network context name */
+};
+
+
+/* status flags */
+
+#define NXS_HASHED	0x0001
+#define NXS_SHUTDOWN	0x0100
+#define NXS_RELEASED	0x8000
+
+extern struct nx_info *locate_nx_info(int);
+extern struct nx_info *locate_or_create_nx_info(int);
+
+extern int get_nid_list(int, unsigned int *, int);
+extern int nid_is_hashed(nid_t);
+
+extern int nx_migrate_task(struct task_struct *, struct nx_info *);
+
+struct in_ifaddr;
+struct net_device;
+
+int ifa_in_nx_info(struct in_ifaddr *, struct nx_info *);
+int dev_in_nx_info(struct net_device *, struct nx_info *);
+
+struct sock;
+
+int nx_addr_conflict(struct nx_info *, uint32_t, struct sock *);
+
+#endif /* __KERNEL__ */
+#else	/* _VX_NETWORK_H */
+#warning duplicate inclusion
+#endif	/* _VX_NETWORK_H */
--- linux-2.6.11.11/include/net/route.h	2005-03-02 12:38:54 +0100
+++ linux-2.6.11.11-vs2.0-rc3/include/net/route.h	2005-06-01 14:34:17 +0200
@@ -143,6 +144,59 @@ static inline char rt_tos2priority(u8 to
 	return ip_tos2prio[IPTOS_TOS(tos)>>1];
 }
 
+#define IPI_LOOPBACK	0x0100007f
+
+static inline int ip_find_src(struct nx_info *nxi, struct rtable **rp, struct flowi *fl)
+{
+	int err;
+	int i, n = nxi->nbipv4;
+	u32 ipv4root = nxi->ipv4[0];
+
+	if (ipv4root == 0)
+		return 0;
+
+	if (fl->fl4_src == 0) {
+		if (n > 1) {
+			u32 foundsrc;
+
+			err = __ip_route_output_key(rp, fl);
+			if (err) {
+				fl->fl4_src = ipv4root;
+				err = __ip_route_output_key(rp, fl);
+			}
+			if (err)
+				return err;
+
+			foundsrc = (*rp)->rt_src;
+			ip_rt_put(*rp);
+
+			for (i=0; i<n; i++) {
+				u32 mask = nxi->mask[i];
+				u32 ipv4 = nxi->ipv4[i];
+				u32 net4 = ipv4 & mask;
+
+				if (foundsrc == ipv4) {
+					fl->fl4_src = ipv4;
+					break;
+				}
+				if (!fl->fl4_src && (foundsrc & mask) == net4)
+					fl->fl4_src = ipv4;
+			}
+		}
+		if (fl->fl4_src == 0)
+			fl->fl4_src = (fl->fl4_dst == IPI_LOOPBACK)
+				? IPI_LOOPBACK : ipv4root;
+	} else {
+		for (i=0; i<n; i++) {
+			if (nxi->ipv4[i] == fl->fl4_src)
+				break;
+		}
+		if (i == n)
+			return -EPERM;
+	}
+	return 0;
+}
+
 static inline int ip_route_connect(struct rtable **rp, u32 dst,
 				   u32 src, u32 tos, int oif, u8 protocol,
 				   u16 sport, u16 dport, struct sock *sk)
--- linux-2.6.11.11/include/net/route.h	2005-03-02 12:38:54 +0100
+++ linux-2.6.11.11-vs2.0-rc3/include/net/route.h	2005-06-01 14:34:17 +0200
@@ -157,7 +211,23 @@ static inline int ip_route_connect(struc
 					 .dport = dport } } };
 	int err;
-	if (!dst || !src) {
+	struct nx_info *nx_info = current->nx_info;
+
+	if (sk)
+		nx_info = sk->sk_nx_info;
+	vxdprintk(VXD_CBIT(net, 4),
+		"ip_route_connect(%p) %p,%p;%lx",
+		sk, nx_info, sk->sk_socket,
+		(sk->sk_socket?sk->sk_socket->flags:0));
+
+	if (nx_info) {
+		err = ip_find_src(nx_info, rp, &fl);
+		if (err)
+			return err;
+		if (fl.fl4_dst == IPI_LOOPBACK && !vx_check(0, VX_ADMIN))
+			fl.fl4_dst = nx_info->ipv4[0];
+	}
+	if (!fl.fl4_dst || !fl.fl4_src) {
 		err = __ip_route_output_key(rp, &fl);
 		if (err)
 			return err;
--- linux-2.6.11.11/include/net/tcp.h	2005-03-02 12:38:54 +0100
+++ linux-2.6.11.11-vs2.0-rc3/include/net/tcp.h	2005-06-01 14:34:17 +0200
@@ -192,6 +192,10 @@ struct tcp_tw_bucket {
 #define tw_node			__tw_common.skc_node
 #define tw_bind_node		__tw_common.skc_bind_node
 #define tw_refcnt		__tw_common.skc_refcnt
+#define tw_xid			__tw_common.skc_xid
+#define tw_vx_info		__tw_common.skc_vx_info
+#define tw_nid			__tw_common.skc_nid
+#define tw_nx_info		__tw_common.skc_nx_info
 	volatile unsigned char	tw_substate;
 	unsigned char		tw_rcv_wscale;
 	__u16			tw_sport;
--- linux-2.6.11.11/kernel/vserver/network.c	1970-01-01 01:00:00 +0100
+++ linux-2.6.11.11-vs2.0-rc3/kernel/vserver/network.c	2005-06-01 14:34:17 +0200
@@ -0,0 +1,677 @@
+/*
+ *  linux/kernel/vserver/network.c
+ *
+ *  Virtual Server: Network Support
+ *
+ *  Copyright (C) 2003-2005  Herbert Pötzl
+ *
+ *  V0.01  broken out from vcontext V0.05
+ *  V0.02  cleaned up implementation
+ *  V0.03  added equiv nx commands
+ *  V0.04  switch to RCU based hash
+ *  V0.05  and back to locking again
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+
+/*	__alloc_nx_info()
+
+	* allocate an initialized nx_info struct
+	* doesn't make it visible (hash)		*/
+
+static struct nx_info *__alloc_nx_info(nid_t nid)
+{
+	struct nx_info *new = NULL;
+
+	vxdprintk(VXD_CBIT(nid, 1), "alloc_nx_info(%d)*", nid);
+
+	/* would this benefit from a slab cache?
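Stepping back to the ip_find_src() helper added to net/route.h above: its address matching can be condensed into the stand-alone sketch below. The names struct nx_view and pick_src() are invented for illustration; the real helper additionally performs the __ip_route_output_key() lookup that produces the routed source and rejects (-EPERM) an explicitly bound source that is not in the context.

	#include <stdint.h>

	#define NB_IPV4ROOT	16
	#define IPI_LOOPBACK	0x0100007f	/* 127.0.0.1, spelled as in the patch */

	struct nx_view {			/* illustrative subset of struct nx_info */
		int		nbipv4;
		uint32_t	ipv4[NB_IPV4ROOT];
		uint32_t	mask[NB_IPV4ROOT];
	};

	/*
	 * Given the source the routing table would have chosen ("routed"), pick
	 * the context address the flow will actually use: an exact match wins,
	 * otherwise the first address whose netmask covers the routed source,
	 * otherwise ipv4[0]; loopback destinations keep a loopback source.
	 */
	static uint32_t pick_src(const struct nx_view *nxi, uint32_t routed, uint32_t dst)
	{
		int i;

		for (i = 0; i < nxi->nbipv4; i++)
			if (routed == nxi->ipv4[i])
				return nxi->ipv4[i];
		for (i = 0; i < nxi->nbipv4; i++)
			if ((routed & nxi->mask[i]) == (nxi->ipv4[i] & nxi->mask[i]))
				return nxi->ipv4[i];
		return (dst == IPI_LOOPBACK) ? IPI_LOOPBACK : nxi->ipv4[0];
	}
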
*/ + new = kmalloc(sizeof(struct nx_info), GFP_KERNEL); + if (!new) + return 0; + + memset (new, 0, sizeof(struct nx_info)); + new->nx_id = nid; + INIT_HLIST_NODE(&new->nx_hlist); + atomic_set(&new->nx_usecnt, 0); + atomic_set(&new->nx_tasks, 0); + new->nx_state = 0; + + /* rest of init goes here */ + + vxdprintk(VXD_CBIT(nid, 0), + "alloc_nx_info(%d) = %p", nid, new); + return new; +} + +/* __dealloc_nx_info() + + * final disposal of nx_info */ + +static void __dealloc_nx_info(struct nx_info *nxi) +{ + vxdprintk(VXD_CBIT(nid, 0), + "dealloc_nx_info(%p)", nxi); + + nxi->nx_hlist.next = LIST_POISON1; + nxi->nx_id = -1; + + BUG_ON(atomic_read(&nxi->nx_usecnt)); + BUG_ON(atomic_read(&nxi->nx_tasks)); + + nxi->nx_state |= NXS_RELEASED; + kfree(nxi); +} + +static void __shutdown_nx_info(struct nx_info *nxi) +{ + nxi->nx_state |= NXS_SHUTDOWN; +} + +/* exported stuff */ + +void free_nx_info(struct nx_info *nxi) +{ + /* context shutdown is mandatory */ + BUG_ON(nxi->nx_state != NXS_SHUTDOWN); + + /* context must not be hashed */ + BUG_ON(nxi->nx_state & NXS_HASHED); + + BUG_ON(atomic_read(&nxi->nx_usecnt)); + BUG_ON(atomic_read(&nxi->nx_tasks)); + + __dealloc_nx_info(nxi); +} + + +/* hash table for nx_info hash */ + +#define NX_HASH_SIZE 13 + +struct hlist_head nx_info_hash[NX_HASH_SIZE]; + +static spinlock_t nx_info_hash_lock = SPIN_LOCK_UNLOCKED; + + +static inline unsigned int __hashval(nid_t nid) +{ + return (nid % NX_HASH_SIZE); +} + + + +/* __hash_nx_info() + + * add the nxi to the global hash table + * requires the hash_lock to be held */ + +static inline void __hash_nx_info(struct nx_info *nxi) +{ + struct hlist_head *head; + + vxd_assert_lock(&nx_info_hash_lock); + vxdprintk(VXD_CBIT(nid, 4), + "__hash_nx_info: %p[#%d]", nxi, nxi->nx_id); + + /* context must not be hashed */ + BUG_ON(nx_info_state(nxi, NXS_HASHED)); + + nxi->nx_state |= NXS_HASHED; + head = &nx_info_hash[__hashval(nxi->nx_id)]; + hlist_add_head(&nxi->nx_hlist, head); +} + +/* __unhash_nx_info() + + * remove the nxi from the global hash table + * requires the hash_lock to be held */ + +static inline void __unhash_nx_info(struct nx_info *nxi) +{ + vxd_assert_lock(&nx_info_hash_lock); + vxdprintk(VXD_CBIT(nid, 4), + "__unhash_nx_info: %p[#%d]", nxi, nxi->nx_id); + + /* context must be hashed */ + BUG_ON(!nx_info_state(nxi, NXS_HASHED)); + + nxi->nx_state &= ~NXS_HASHED; + hlist_del(&nxi->nx_hlist); +} + + +/* __lookup_nx_info() + + * requires the hash_lock to be held + * doesn't increment the nx_refcnt */ + +static inline struct nx_info *__lookup_nx_info(nid_t nid) +{ + struct hlist_head *head = &nx_info_hash[__hashval(nid)]; + struct hlist_node *pos; + struct nx_info *nxi; + + vxd_assert_lock(&nx_info_hash_lock); + hlist_for_each(pos, head) { + nxi = hlist_entry(pos, struct nx_info, nx_hlist); + + if (nxi->nx_id == nid) + goto found; + } + nxi = NULL; +found: + vxdprintk(VXD_CBIT(nid, 0), + "__lookup_nx_info(#%u): %p[#%u]", + nid, nxi, nxi?nxi->nx_id:0); + return nxi; +} + + +/* __nx_dynamic_id() + + * find unused dynamic nid + * requires the hash_lock to be held */ + +static inline nid_t __nx_dynamic_id(void) +{ + static nid_t seq = MAX_N_CONTEXT; + nid_t barrier = seq; + + vxd_assert_lock(&nx_info_hash_lock); + do { + if (++seq > MAX_N_CONTEXT) + seq = MIN_D_CONTEXT; + if (!__lookup_nx_info(seq)) { + vxdprintk(VXD_CBIT(nid, 4), + "__nx_dynamic_id: [#%d]", seq); + return seq; + } + } while (barrier != seq); + return 0; +} + +/* __create_nx_info() + + * create the requested context + * get() and hash it */ + +static struct 
nx_info * __create_nx_info(int id) +{ + struct nx_info *new, *nxi = NULL; + + vxdprintk(VXD_CBIT(nid, 1), "create_nx_info(%d)*", id); + + if (!(new = __alloc_nx_info(id))) + return ERR_PTR(-ENOMEM); + + /* required to make dynamic xids unique */ + spin_lock(&nx_info_hash_lock); + + /* dynamic context requested */ + if (id == NX_DYNAMIC_ID) { + id = __nx_dynamic_id(); + if (!id) { + printk(KERN_ERR "no dynamic context available.\n"); + nxi = ERR_PTR(-EAGAIN); + goto out_unlock; + } + new->nx_id = id; + } + /* static context requested */ + else if ((nxi = __lookup_nx_info(id))) { + vxdprintk(VXD_CBIT(nid, 0), + "create_nx_info(%d) = %p (already there)", id, nxi); + if (nx_info_flags(nxi, NXF_STATE_SETUP, 0)) + nxi = ERR_PTR(-EBUSY); + else + nxi = ERR_PTR(-EEXIST); + goto out_unlock; + } + /* dynamic nid creation blocker */ + else if (id >= MIN_D_CONTEXT) { + vxdprintk(VXD_CBIT(nid, 0), + "create_nx_info(%d) (dynamic rejected)", id); + nxi = ERR_PTR(-EINVAL); + goto out_unlock; + } + + /* new context */ + vxdprintk(VXD_CBIT(nid, 0), + "create_nx_info(%d) = %p (new)", id, new); + __hash_nx_info(get_nx_info(new)); + nxi = new, new = NULL; + +out_unlock: + spin_unlock(&nx_info_hash_lock); + if (new) + __dealloc_nx_info(new); + return nxi; +} + + + +/* exported stuff */ + + +void unhash_nx_info(struct nx_info *nxi) +{ + __shutdown_nx_info(nxi); + spin_lock(&nx_info_hash_lock); + __unhash_nx_info(nxi); + spin_unlock(&nx_info_hash_lock); +} + +#ifdef CONFIG_VSERVER_LEGACYNET + +struct nx_info *create_nx_info(void) +{ + return __create_nx_info(NX_DYNAMIC_ID); +} + +#endif + +/* locate_nx_info() + + * search for a nx_info and get() it + * negative id means current */ + +struct nx_info *locate_nx_info(int id) +{ + struct nx_info *nxi = NULL; + + if (id < 0) { + nxi = get_nx_info(current->nx_info); + } else if (id > 1) { + spin_lock(&nx_info_hash_lock); + nxi = get_nx_info(__lookup_nx_info(id)); + spin_unlock(&nx_info_hash_lock); + } + return nxi; +} + +/* nid_is_hashed() + + * verify that nid is still hashed */ + +int nid_is_hashed(nid_t nid) +{ + int hashed; + + spin_lock(&nx_info_hash_lock); + hashed = (__lookup_nx_info(nid) != NULL); + spin_unlock(&nx_info_hash_lock); + return hashed; +} + + +#ifdef CONFIG_PROC_FS + +int get_nid_list(int index, unsigned int *nids, int size) +{ + int hindex, nr_nids = 0; + + for (hindex = 0; hindex < NX_HASH_SIZE; hindex++) { + struct hlist_head *head = &nx_info_hash[hindex]; + struct hlist_node *pos; + + spin_lock(&nx_info_hash_lock); + hlist_for_each(pos, head) { + struct nx_info *nxi; + + if (--index > 0) + continue; + + nxi = hlist_entry(pos, struct nx_info, nx_hlist); + nids[nr_nids] = nxi->nx_id; + if (++nr_nids >= size) { + spin_unlock(&nx_info_hash_lock); + goto out; + } + } + /* keep the lock time short */ + spin_unlock(&nx_info_hash_lock); + } +out: + return nr_nids; +} +#endif + + +/* + * migrate task to new network + * gets nxi, puts old_nxi on change + */ + +int nx_migrate_task(struct task_struct *p, struct nx_info *nxi) +{ + struct nx_info *old_nxi; + int ret = 0; + + if (!p || !nxi) + BUG(); + + vxdprintk(VXD_CBIT(nid, 5), + "nx_migrate_task(%p,%p[#%d.%d.%d])", + p, nxi, nxi->nx_id, + atomic_read(&nxi->nx_usecnt), + atomic_read(&nxi->nx_tasks)); + + /* maybe disallow this completely? 
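The lookup helpers above hand back referenced objects: locate_nx_info() takes a reference on success, so every caller has to balance it with put_nx_info() (a helper from the VServer headers that are not part of this excerpt). A hypothetical caller, mirroring what the vc_*() command handlers further down do:

	/* Sketch only; example_query_nid() is not part of the patch. */
	static int example_query_nid(int id, nid_t *nid)
	{
		struct nx_info *nxi = locate_nx_info(id);	/* takes a reference */

		if (!nxi)
			return -ESRCH;
		*nid = nxi->nx_id;
		put_nx_info(nxi);				/* and drops it again */
		return 0;
	}
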
*/ + old_nxi = task_get_nx_info(p); + if (old_nxi == nxi) + goto out; + + task_lock(p); + if (old_nxi) + clr_nx_info(&p->nx_info); + claim_nx_info(nxi, p); + set_nx_info(&p->nx_info, nxi); + p->nid = nxi->nx_id; + task_unlock(p); + + vxdprintk(VXD_CBIT(nid, 5), + "moved task %p into nxi:%p[#%d]", + p, nxi, nxi->nx_id); + + if (old_nxi) + release_nx_info(old_nxi, p); +out: + put_nx_info(old_nxi); + return ret; +} + + +#include +#include + + +int ifa_in_nx_info(struct in_ifaddr *ifa, struct nx_info *nxi) +{ + if (!nxi) + return 1; + if (!ifa) + return 0; + return addr_in_nx_info(nxi, ifa->ifa_address); +} + +int dev_in_nx_info(struct net_device *dev, struct nx_info *nxi) +{ + struct in_device *in_dev = __in_dev_get(dev); + struct in_ifaddr **ifap = NULL; + struct in_ifaddr *ifa = NULL; + + if (!nxi) + return 1; + if (!in_dev) + return 0; + + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) { + if (addr_in_nx_info(nxi, ifa->ifa_address)) + return 1; + } + return 0; +} + +/* + * check if address is covered by socket + * + * sk: the socket to check against + * addr: the address in question (must be != 0) + */ +static inline int __addr_in_socket(struct sock *sk, uint32_t addr) +{ + struct nx_info *nxi = sk->sk_nx_info; + uint32_t saddr = tcp_v4_rcv_saddr(sk); + + vxdprintk(VXD_CBIT(net, 5), + "__addr_in_socket(%p,%d.%d.%d.%d) %p:%d.%d.%d.%d %p;%lx", + sk, VXD_QUAD(addr), nxi, VXD_QUAD(saddr), sk->sk_socket, + (sk->sk_socket?sk->sk_socket->flags:0)); + + if (saddr) { + /* direct address match */ + return (saddr == addr); + } else if (nxi) { + /* match against nx_info */ + return addr_in_nx_info(nxi, addr); + } else { + /* unrestricted any socket */ + return 1; + } +} + + +int nx_addr_conflict(struct nx_info *nxi, uint32_t addr, struct sock *sk) +{ + vxdprintk(VXD_CBIT(net, 2), + "nx_addr_conflict(%p,%p) %d.%d,%d.%d", + nxi, sk, VXD_QUAD(addr)); + + if (addr) { + /* check real address */ + return __addr_in_socket(sk, addr); + } else if (nxi) { + /* check against nx_info */ + int i, n = nxi->nbipv4; + + for (i=0; iipv4[i])) + return 1; + return 0; + } else { + /* check against any */ + return 1; + } +} + + +/* vserver syscall commands below here */ + +/* taks nid and nx_info functions */ + +#include + + +int vc_task_nid(uint32_t id, void __user *data) +{ + nid_t nid; + + if (id) { + struct task_struct *tsk; + + if (!vx_check(0, VX_ADMIN|VX_WATCH)) + return -EPERM; + + read_lock(&tasklist_lock); + tsk = find_task_by_real_pid(id); + nid = (tsk) ? 
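Nearly every check in this file funnels into addr_in_nx_info(), which is not shown in this excerpt; judging from its callers it is presumably the same linear scan over the context's address list that udp_in_list() performs later in the patch, roughly:

	/* Presumed shape of addr_in_nx_info() (sketch; the callers above handle
	 * the "no context" case themselves before calling it). */
	static inline int addr_in_nx_info_sketch(struct nx_info *nxi, uint32_t addr)
	{
		int i;

		for (i = 0; i < nxi->nbipv4; i++)
			if (nxi->ipv4[i] == addr)
				return 1;
		return 0;
	}
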
tsk->nid : -ESRCH; + read_unlock(&tasklist_lock); + } + else + nid = current->nid; + return nid; +} + + +int vc_nx_info(uint32_t id, void __user *data) +{ + struct nx_info *nxi; + struct vcmd_nx_info_v0 vc_data; + + if (!vx_check(0, VX_ADMIN)) + return -ENOSYS; + if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SYS_RESOURCE)) + return -EPERM; + + nxi = locate_nx_info(id); + if (!nxi) + return -ESRCH; + + vc_data.nid = nxi->nx_id; + put_nx_info(nxi); + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + + +/* network functions */ + +int vc_net_create(uint32_t nid, void __user *data) +{ + struct nx_info *new_nxi; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if ((nid > MAX_S_CONTEXT) && (nid != VX_DYNAMIC_ID)) + return -EINVAL; + + if (nid < 2) + return -EINVAL; + + new_nxi = __create_nx_info(nid); + if (IS_ERR(new_nxi)) + return PTR_ERR(new_nxi); + +/* + new_nxi = __loc_nx_info(nid, &ret); + if (!new_nxi) + return ret; + if (!(new_nxi->nx_flags & VXF_STATE_SETUP)) { + ret = -EEXIST; + goto out_put; + } +*/ + ret = new_nxi->nx_id; + nx_migrate_task(current, new_nxi); + /* if this fails, we might end up with a hashed nx_info */ + put_nx_info(new_nxi); + return ret; +} + + +int vc_net_migrate(uint32_t id, void __user *data) +{ + struct nx_info *nxi; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + nxi = locate_nx_info(id); + if (!nxi) + return -ESRCH; + nx_migrate_task(current, nxi); + put_nx_info(nxi); + return 0; +} + + +int vc_get_nflags(uint32_t id, void __user *data) +{ + struct nx_info *nxi; + struct vcmd_net_flags_v0 vc_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + nxi = locate_nx_info(id); + if (!nxi) + return -ESRCH; + + vc_data.flagword = nxi->nx_flags; + + /* special STATE flag handling */ + vc_data.mask = vx_mask_flags(~0UL, nxi->nx_flags, IPF_ONE_TIME); + + put_nx_info(nxi); + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +int vc_set_nflags(uint32_t id, void __user *data) +{ + struct nx_info *nxi; + struct vcmd_net_flags_v0 vc_data; + uint64_t mask, trigger; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + nxi = locate_nx_info(id); + if (!nxi) + return -ESRCH; + + /* special STATE flag handling */ + mask = vx_mask_mask(vc_data.mask, nxi->nx_flags, IPF_ONE_TIME); + trigger = (mask & nxi->nx_flags) ^ (mask & vc_data.flagword); + + nxi->nx_flags = vx_mask_flags(nxi->nx_flags, + vc_data.flagword, mask); + put_nx_info(nxi); + return 0; +} + +int vc_get_ncaps(uint32_t id, void __user *data) +{ + struct nx_info *nxi; + struct vcmd_net_caps_v0 vc_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + nxi = locate_nx_info(id); + if (!nxi) + return -ESRCH; + + vc_data.ncaps = nxi->nx_ncaps; + vc_data.cmask = ~0UL; + put_nx_info(nxi); + + if (copy_to_user (data, &vc_data, sizeof(vc_data))) + return -EFAULT; + return 0; +} + +int vc_set_ncaps(uint32_t id, void __user *data) +{ + struct nx_info *nxi; + struct vcmd_net_caps_v0 vc_data; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + nxi = locate_nx_info(id); + if (!nxi) + return -ESRCH; + + nxi->nx_ncaps = vx_mask_flags(nxi->nx_ncaps, + vc_data.ncaps, vc_data.cmask); + put_nx_info(nxi); + return 0; +} + + +#include + +EXPORT_SYMBOL_GPL(free_nx_info); +EXPORT_SYMBOL_GPL(unhash_nx_info); + --- linux-2.6.11.11/net/ipv4/af_inet.c 2005-03-02 12:39:09 +0100 +++ 
linux-2.6.11.11-vs2.0-rc3/net/ipv4/af_inet.c 2005-06-01 14:34:17 +0200 @@ -393,6 +397,10 @@ int inet_bind(struct socket *sock, struc unsigned short snum; int chk_addr_ret; int err; + __u32 s_addr; /* Address used for validation */ + __u32 s_addr1; /* Address used for socket */ + __u32 s_addr2; /* Broadcast address for the socket */ + struct nx_info *nxi = sk->sk_nx_info; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { --- linux-2.6.11.11/net/ipv4/af_inet.c 2005-03-02 12:39:09 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/af_inet.c 2005-06-01 14:34:17 +0200 @@ -403,7 +411,40 @@ int inet_bind(struct socket *sock, struc if (addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + s_addr = addr->sin_addr.s_addr; + s_addr1 = s_addr; + s_addr2 = 0xffffffffl; + + vxdprintk(VXD_CBIT(net, 3), + "inet_bind(%p)* %p,%p;%lx %d.%d.%d.%d", + sk, sk->sk_nx_info, sk->sk_socket, + (sk->sk_socket?sk->sk_socket->flags:0), + VXD_QUAD(s_addr)); + if (nxi) { + __u32 v4_bcast = nxi->v4_bcast; + __u32 ipv4root = nxi->ipv4[0]; + int nbipv4 = nxi->nbipv4; + + if (s_addr == 0) { + /* bind to any for 1-n */ + s_addr = ipv4root; + s_addr1 = (nbipv4 > 1) ? 0 : s_addr; + s_addr2 = v4_bcast; + } else if (s_addr == 0x0100007f) { + /* rewrite localhost to ipv4root */ + s_addr = ipv4root; + s_addr1 = ipv4root; + } else if (s_addr != v4_bcast) { + /* normal address bind */ + if (!addr_in_nx_info(nxi, s_addr)) + return -EADDRNOTAVAIL; + } + } + chk_addr_ret = inet_addr_type(s_addr); + + vxdprintk(VXD_CBIT(net, 3), + "inet_bind(%p) %d.%d.%d.%d, %d.%d.%d.%d, %d.%d.%d.%d", + sk, VXD_QUAD(s_addr), VXD_QUAD(s_addr1), VXD_QUAD(s_addr2)); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since --- linux-2.6.11.11/net/ipv4/af_inet.c 2005-03-02 12:39:09 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/af_inet.c 2005-06-01 14:34:17 +0200 @@ -415,7 +456,7 @@ int inet_bind(struct socket *sock, struc err = -EADDRNOTAVAIL; if (!sysctl_ip_nonlocal_bind && !inet->freebind && - addr->sin_addr.s_addr != INADDR_ANY && + s_addr != INADDR_ANY && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) --- linux-2.6.11.11/net/ipv4/af_inet.c 2005-03-02 12:39:09 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/af_inet.c 2005-06-01 14:34:17 +0200 @@ -440,7 +481,8 @@ int inet_bind(struct socket *sock, struc if (sk->sk_state != TCP_CLOSE || inet->num) goto out_release_sock; - inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; + inet->rcv_saddr = inet->saddr = s_addr1; + inet->rcv_saddr2 = s_addr2; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->saddr = 0; /* Use device */ --- linux-2.6.11.11/net/ipv4/devinet.c 2005-03-02 12:39:09 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/devinet.c 2005-06-01 14:34:17 +0200 @@ -489,6 +489,33 @@ static __inline__ int inet_abc_len(u32 a return rc; } +/* + Check that a device is not member of the ipv4root assigned to the process + Return true if this is the case + + If the process is not bound to specific IP, then it returns 0 (all + interface are fine). 
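The address classification performed by the modified inet_bind() above can be restated compactly. In the sketch below all names are invented for illustration; check, bind and bcast correspond to the hunk's s_addr, s_addr1 and s_addr2, and a -1 result stands for the -EADDRNOTAVAIL case:

	#include <stdint.h>

	#define CTX_LOOPBACK	0x0100007f		/* 127.0.0.1 as written in the hunk */

	struct nx_addrs {				/* illustrative subset of nx_info */
		int		nbipv4;
		uint32_t	ipv4[16];
		uint32_t	v4_bcast;
	};

	static int in_ctx(const struct nx_addrs *nxi, uint32_t a)
	{
		int i;

		for (i = 0; i < nxi->nbipv4; i++)
			if (nxi->ipv4[i] == a)
				return 1;
		return 0;
	}

	static int choose_bind_addr(const struct nx_addrs *nxi, uint32_t req,
				    uint32_t *check, uint32_t *bind, uint32_t *bcast)
	{
		*check = *bind = req;
		*bcast = 0xffffffff;			/* default: no second address */

		if (req == 0) {				/* INADDR_ANY */
			*check = nxi->ipv4[0];
			*bind  = (nxi->nbipv4 > 1) ? 0 : nxi->ipv4[0];
			*bcast = nxi->v4_bcast;
		} else if (req == CTX_LOOPBACK) {	/* localhost -> ipv4root */
			*check = *bind = nxi->ipv4[0];
		} else if (req != nxi->v4_bcast && !in_ctx(nxi, req)) {
			return -1;			/* not one of the context's addresses */
		}
		return 0;
	}
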
+*/ +static inline int devinet_notiproot (struct in_ifaddr *ifa) +{ + int ret = 0; + struct nx_info *nxi; + + if ((nxi = current->nx_info)) { + int i; + int nbip = nxi->nbipv4; + __u32 addr = ifa->ifa_local; + ret = 1; + for (i=0; iipv4[i] == addr) { + ret = 0; + break; + } + } + } + return ret; +} + int devinet_ioctl(unsigned int cmd, void __user *arg) { --- linux-2.6.11.11/net/ipv4/raw.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/raw.c 2005-06-01 14:34:17 +0200 @@ -102,6 +102,27 @@ static void raw_v4_unhash(struct sock *s write_unlock_bh(&raw_v4_lock); } + +/* + * Check if a given address matches for a socket + * + * nxi: the socket's nx_info if any + * addr: to be verified address + * saddr/baddr: socket addresses + */ +static inline int raw_addr_match ( + struct nx_info *nxi, + uint32_t addr, + uint32_t saddr, + uint32_t baddr) +{ + if (addr && (saddr == addr || baddr == addr)) + return 1; + if (!saddr) + return addr_in_nx_info(nxi, addr); + return 0; +} + struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, unsigned long raddr, unsigned long laddr, int dif) --- linux-2.6.11.11/net/ipv4/raw.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/raw.c 2005-06-01 14:34:17 +0200 @@ -113,7 +134,8 @@ struct sock *__raw_v4_lookup(struct sock if (inet->num == num && !(inet->daddr && inet->daddr != raddr) && - !(inet->rcv_saddr && inet->rcv_saddr != laddr) && + raw_addr_match(sk->sk_nx_info, laddr, + inet->rcv_saddr, inet->rcv_saddr2) && !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) goto found; /* gotcha */ } --- linux-2.6.11.11/net/ipv4/raw.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/raw.c 2005-06-01 14:34:17 +0200 @@ -308,6 +330,10 @@ static int raw_send_hdrinc(struct sock * iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } + err = -EPERM; + if (!vx_check(0, VX_ADMIN) && !capable(CAP_NET_RAW) + && (!addr_in_nx_info(sk->sk_nx_info, iph->saddr))) + goto error; err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); --- linux-2.6.11.11/net/ipv4/raw.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/raw.c 2005-06-01 14:34:17 +0200 @@ -480,6 +506,12 @@ static int raw_sendmsg(struct kiocb *ioc if (!inet->hdrincl) raw_probe_proto_opt(&fl, msg); + if (sk->sk_nx_info) { + err = ip_find_src(sk->sk_nx_info, &rt, &fl); + + if (err) + goto done; + } err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); } if (err) --- linux-2.6.11.11/net/ipv4/tcp_ipv4.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/tcp_ipv4.c 2005-06-01 14:34:17 +0200 @@ -181,7 +182,6 @@ void tcp_bind_hash(struct sock *sk, stru static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) { - const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); struct sock *sk2; struct hlist_node *node; int reuse = sk->sk_reuse; --- linux-2.6.11.11/net/ipv4/tcp_ipv4.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/tcp_ipv4.c 2005-06-01 14:34:17 +0200 @@ -194,9 +194,8 @@ static inline int tcp_bind_conflict(stru sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { if (!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) { - const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); - if (!sk2_rcv_saddr || !sk_rcv_saddr || - sk2_rcv_saddr == sk_rcv_saddr) + if (nx_addr_conflict(sk->sk_nx_info, + tcp_v4_rcv_saddr(sk), sk2)) break; } } --- linux-2.6.11.11/net/ipv4/tcp_ipv4.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/tcp_ipv4.c 2005-06-01 14:34:17 +0200 
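The sender-side restriction added to raw_send_hdrinc() above reduces to a one-line policy. In this sketch the three flags stand for vx_check(0, VX_ADMIN), capable(CAP_NET_RAW) and the result of addr_in_nx_info() on the packet's source address:

	/* A raw packet with a hand-crafted IP header may only leave the box if
	 * the sender is the admin context, has CAP_NET_RAW, or owns the source. */
	static int raw_source_allowed(int is_admin, int has_cap_net_raw, int owns_saddr)
	{
		return is_admin || has_cap_net_raw || owns_saddr;
	}
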
@@ -405,6 +404,26 @@ void tcp_unhash(struct sock *sk) wake_up(&tcp_lhash_wait); } + +/* + * Check if a given address matches for a tcp socket + * + * nxi: the socket's nx_info if any + * addr: to be verified address + * saddr: socket addresses + */ +static inline int tcp_addr_match ( + struct nx_info *nxi, + uint32_t addr, + uint32_t saddr) +{ + if (addr && (saddr == addr)) + return 1; + if (!saddr) + return addr_in_nx_info(nxi, addr); + return 0; +} + /* Don't inline this cruft. Here are some nice properties to * exploit here. The BSD API does not allow a listening TCP * to specify the remote port nor the remote address for the --- linux-2.6.11.11/net/ipv4/tcp_ipv4.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/tcp_ipv4.c 2005-06-01 14:34:17 +0200 @@ -426,11 +445,10 @@ static struct sock *__tcp_v4_lookup_list __u32 rcv_saddr = inet->rcv_saddr; score = (sk->sk_family == PF_INET ? 1 : 0); - if (rcv_saddr) { - if (rcv_saddr != daddr) - continue; + if (tcp_addr_match(sk->sk_nx_info, daddr, rcv_saddr)) score+=2; - } + else + continue; if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) continue; --- linux-2.6.11.11/net/ipv4/tcp_ipv4.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/tcp_ipv4.c 2005-06-01 14:34:17 +0200 @@ -460,8 +478,8 @@ static inline struct sock *tcp_v4_lookup struct inet_sock *inet = inet_sk((sk = __sk_head(head))); if (inet->num == hnum && !sk->sk_node.next && - (!inet->rcv_saddr || inet->rcv_saddr == daddr) && (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && + tcp_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) && !sk->sk_bound_dev_if) goto sherry_cache; sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif); --- linux-2.6.11.11/net/ipv4/udp.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/udp.c 2005-06-01 14:34:17 +0200 @@ -174,14 +174,12 @@ gotit: struct inet_sock *inet2 = inet_sk(sk2); if (inet2->num == snum && - sk2 != sk && - !ipv6_only_sock(sk2) && + sk2 != sk && !ipv6_only_sock(sk2) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - (!inet2->rcv_saddr || - !inet->rcv_saddr || - inet2->rcv_saddr == inet->rcv_saddr) && + nx_addr_conflict(sk->sk_nx_info, + tcp_v4_rcv_saddr(sk), sk2) && (!sk2->sk_reuse || !sk->sk_reuse)) goto fail; } --- linux-2.6.11.11/net/ipv4/udp.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/udp.c 2005-06-01 14:34:17 +0200 @@ -216,6 +214,17 @@ static void udp_v4_unhash(struct sock *s write_unlock_bh(&udp_hash_lock); } +static inline int udp_in_list(struct nx_info *nx_info, u32 addr) +{ + int n = nx_info->nbipv4; + int i; + + for (i=0; iipv4[i] == addr) + return 1; + return 0; +} + /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. 
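udp_in_list() above is the same linear scan over the context's address list that the rest of the patch relies on; written out as an independent function it amounts to:

	/* Equivalent of udp_in_list(): is addr one of the nbipv4 context addresses? */
	static int addr_in_list(const uint32_t *ipv4, int nbipv4, uint32_t addr)
	{
		int i;

		for (i = 0; i < nbipv4; i++)
			if (ipv4[i] == addr)
				return 1;
		return 0;
	}
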
-DaveM */ --- linux-2.6.11.11/net/ipv4/udp.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/udp.c 2005-06-01 14:34:17 +0200 @@ -236,6 +245,11 @@ static struct sock *udp_v4_lookup_longwa if (inet->rcv_saddr != daddr) continue; score+=2; + } else if (sk->sk_nx_info) { + if (udp_in_list(sk->sk_nx_info, daddr)) + score+=2; + else + continue; } if (inet->daddr) { if (inet->daddr != saddr) --- linux-2.6.11.11/net/ipv4/udp.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/udp.c 2005-06-01 14:34:17 +0200 @@ -292,7 +306,8 @@ static inline struct sock *udp_v4_mcast_ if (inet->num != hnum || (inet->daddr && inet->daddr != rmt_addr) || (inet->dport != rmt_port && inet->dport) || - (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || + (inet->rcv_saddr && inet->rcv_saddr != loc_addr && + inet->rcv_saddr2 && inet->rcv_saddr2 != loc_addr) || ipv6_only_sock(s) || (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) continue; --- linux-2.6.11.11/net/ipv4/udp.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv4/udp.c 2005-06-01 14:34:17 +0200 @@ -601,6 +616,15 @@ int udp_sendmsg(struct kiocb *iocb, stru .uli_u = { .ports = { .sport = inet->sport, .dport = dport } } }; + struct nx_info *nxi = sk->sk_nx_info; + + if (nxi) { + err = ip_find_src(nxi, &rt, &fl); + if (err) + goto out; + if (daddr == IPI_LOOPBACK && !vx_check(0, VX_ADMIN)) + daddr = fl.fl4_dst = nxi->ipv4[0]; + } err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); if (err) goto out; --- linux-2.6.11.11/net/ipv6/addrconf.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv6/addrconf.c 2005-06-01 14:34:17 +0200 @@ -2339,7 +2339,10 @@ static void if6_seq_stop(struct seq_file static int if6_seq_show(struct seq_file *seq, void *v) { struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; - seq_printf(seq, + + /* no ipv6 inside a vserver for now */ + if (vx_check(0, VX_ADMIN|VX_WATCH)) + seq_printf(seq, "%04x%04x%04x%04x%04x%04x%04x%04x %02x %02x %02x %02x %8s\n", NIP6(ifp->addr), ifp->idev->dev->ifindex, --- linux-2.6.11.11/net/ipv6/addrconf.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv6/addrconf.c 2005-06-01 14:34:17 +0200 @@ -2695,6 +2698,10 @@ static int inet6_dump_addr(struct sk_buf struct ifmcaddr6 *ifmca; struct ifacaddr6 *ifaca; + /* no ipv6 inside a vserver for now */ + if (skb->sk && skb->sk->sk_vx_info) + return skb->len; + s_idx = cb->args[0]; s_ip_idx = ip_idx = cb->args[1]; read_lock(&dev_base_lock); --- linux-2.6.11.11/net/ipv6/addrconf.c 2005-03-02 12:39:10 +0100 +++ linux-2.6.11.11-vs2.0-rc3/net/ipv6/addrconf.c 2005-06-01 14:34:17 +0200 @@ -2914,6 +2921,10 @@ static int inet6_dump_ifinfo(struct sk_b struct net_device *dev; struct inet6_dev *idev; + /* no ipv6 inside a vserver for now */ + if (skb->sk && skb->sk->sk_vx_info) + return skb->len; + read_lock(&dev_base_lock); for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { if (idx < s_idx)
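Taken together, the ip_route_connect() and udp_sendmsg() hunks mean that loopback traffic from a non-admin network context is redirected to the context's first address. A user-space illustration of what a guest process would observe (192.0.2.10 is a documentation address standing in for ipv4[0]):

	#include <arpa/inet.h>
	#include <stdio.h>
	#include <sys/socket.h>
	#include <unistd.h>

	/* Inside a non-admin network context whose ipv4[0] is, say, 192.0.2.10,
	 * this connect() to 127.0.0.1 is rerouted by the kernel to 192.0.2.10,
	 * so the guest talks to its own address rather than the shared host
	 * loopback (illustration of the behaviour, not part of the patch). */
	int main(void)
	{
		struct sockaddr_in sa = { .sin_family = AF_INET, .sin_port = htons(80) };
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0)
			return 1;
		inet_pton(AF_INET, "127.0.0.1", &sa.sin_addr);
		if (connect(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0)
			perror("connect");
		close(fd);
		return 0;
	}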