Add lwip: a new lwIP-based TCP/IP service

This commit adds a new TCP/IP service to MINIX 3.  At its core, the
service uses the lwIP TCP/IP stack, chosen for maintainability
reasons.  The service aims to be compatible with NetBSD userland,
including its low-level network management utilities.  It also aims
to support modern features such as IPv6.  In summary, the new LWIP
service supports the following main features:

- TCP, UDP, RAW sockets with mostly standard BSD API semantics;
- IPv6 support: host mode (complete) and router mode (partial);
- most of the standard BSD API socket options (SO_);
- all of the standard BSD API message flags (MSG_);
- the most used protocol-specific socket and control options;
- a default loopback interface and the ability to create one more;
- configuration-free ethernet interfaces and driver tracking;
- queuing and multiple concurrent requests to each ethernet driver;
- standard ioctl(2)-based BSD interface management;
- radix tree backed, destination-based routing;
- routing sockets for standard BSD route reporting and management;
- multicast traffic and multicast group membership tracking;
- Berkeley Packet Filter (BPF) devices;
- standard and custom sysctl(7) nodes for many internals;
- a slab allocation based, hybrid static/dynamic memory pool model.

Many of its modules come with fairly elaborate comments that cover
their design and operation.  The service is primarily a socket
driver built on top of the libsockdriver library, but for BPF devices
it is at the same time also a character driver.

Change-Id: Ib0c02736234b21143915e5fcc0fda8fe408f046f
Author: David van Moolenbroek, 2016-09-29 23:07:07 +00:00
commit ef8d499e2d (parent 0f03189a6a)
60 changed files with 25803 additions and 11 deletions


@@ -194,7 +194,7 @@
./etc/system.conf.d/hello minix-base
./etc/system.conf.d/inet minix-base obsolete
./etc/system.conf.d/ipc minix-base
./etc/system.conf.d/lwip minix-base obsolete
./etc/system.conf.d/lwip minix-base
./etc/system.conf.d/random minix-base
./etc/system.conf.d/uds minix-base
./etc/system.conf.d/usb_hub minix-base
@@ -277,7 +277,7 @@
./service/is minix-base
./service/isofs minix-base
./service/log minix-base
./service/lwip minix-base obsolete
./service/lwip minix-base
./service/memory minix-base
./service/mfs minix-base
./service/mib minix-base


@@ -1182,6 +1182,7 @@
./usr/include/minix/blockdriver.h minix-comp
./usr/include/minix/blockdriver_mt.h minix-comp
./usr/include/minix/board.h minix-comp
./usr/include/minix/bpf.h minix-comp
./usr/include/minix/btrace.h minix-comp
./usr/include/minix/callnr.h minix-comp
./usr/include/minix/chardriver.h minix-comp
@@ -1208,6 +1209,7 @@
./usr/include/minix/hgfs.h minix-comp
./usr/include/minix/i2c.h minix-comp
./usr/include/minix/i2cdriver.h minix-comp
./usr/include/minix/if.h minix-comp
./usr/include/minix/input.h minix-comp
./usr/include/minix/inputdriver.h minix-comp
./usr/include/minix/ioctl.h minix-comp


@@ -200,7 +200,7 @@
./usr/libdata/debug/service/is.debug minix-debug debug
./usr/libdata/debug/service/isofs.debug minix-debug debug
./usr/libdata/debug/service/log.debug minix-debug debug
./usr/libdata/debug/service/lwip.debug minix-debug debug,obsolete
./usr/libdata/debug/service/lwip.debug minix-debug debug
./usr/libdata/debug/service/memory.debug minix-debug debug
./usr/libdata/debug/service/mfs.debug minix-debug debug
./usr/libdata/debug/service/mib.debug minix-debug debug


@@ -130,6 +130,9 @@ do
;;
6,0) des="line printer, parallel port" dev=lp
;;
7,0)
des="Berkeley Packet Filter device" dev=bpf
;;
9,0)
des="unix98 pseudoterminal master" dev=ptmx
;;


@@ -33,6 +33,7 @@ RAMDISK_DEVICES="
STD_DEVICES="
${RAMDISK_DEVICES}
bmp085b1s77 bmp085b2s77 bmp085b3s77
bpf
eepromb1s50 eepromb1s51 eepromb1s52 eepromb1s53
eepromb1s54 eepromb1s55 eepromb1s56 eepromb1s57
eepromb2s50 eepromb2s51 eepromb2s52 eepromb2s53
@@ -128,6 +129,7 @@ Where key is one of the following:
tty00 ... tty03 # Make serial lines
ttyp0 ... ttyq0 ... # Make tty, pty pairs
audio mixer # Make audio devices
bpf # Make /dev/bpf
klog # Make /dev/klog
ptmx # Make /dev/ptmx
random # Make /dev/random, /dev/urandom
@@ -215,6 +217,13 @@ do
makedev bmp085b${bus}s77 c ${major} 0 ${uname} ${gname} 444
;;
bpf)
# Berkeley Packet Filter device, for the LWIP service
# This is a cloning device, but some programs (e.g., dhclient)
# assume individual devices are numbered, so also create bpf0.
makedev ${dev} c 7 0 ${uname} ${gname} 600
makedev ${dev}0 c 7 0 ${uname} ${gname} 600
;;
c[0-3]d[0-7])
# Whole disk devices.
disk=`expr ${dev} : '...\\(.\\)'`


@@ -125,7 +125,7 @@ service_get_policies(struct policies * pol, index_t slot)
{ .label = "ptyfs", .policy_str = "" },
{ .label = "vbfs", .policy_str = "" },
/* net */
{ .label = "lwip", .policy_str = "" },
{ .label = "lwip", .policy_str = "reset" },
/* servers */
{ .label = "devman", .policy_str = "restart" },
{ .label = "ds", .policy_str = "restart" },


@@ -5,14 +5,14 @@ INCSDIR= /usr/include/minix
INCS+= paths.h param.h
INCS+= acpi.h audio_fw.h bitmap.h \
bdev.h blockdriver.h blockdriver_mt.h \
board.h btrace.h \
board.h bpf.h btrace.h \
callnr.h chardriver.h clkconf.h com.h \
config.h const.h cpufeature.h \
debug.h devio.h devman.h dmap.h \
driver.h drivers.h drvlib.h ds.h \
endpoint.h fb.h fsdriver.h fslib.h gpio.h gcov.h hash.h \
hgfs.h i2c.h i2cdriver.h ioctl.h input.h \
inputdriver.h ipc.h ipc_filter.h ipcconst.h \
hgfs.h i2c.h i2cdriver.h if.h input.h inputdriver.h \
ioctl.h ipc.h ipc_filter.h ipcconst.h \
keymap.h log.h mmio.h mthread.h minlib.h \
netdriver.h optset.h padconf.h partition.h portio.h \
priv.h procfs.h profile.h \

minix/include/minix/bpf.h Normal file (42 lines added)

@@ -0,0 +1,42 @@
#ifndef _MINIX_BPF_H
#define _MINIX_BPF_H
#include <net/bpf.h>
/*
* MINIX3-specific extensions to the NetBSD Berkeley Packet Filter header.
* These extensions are necessary because NetBSD BPF uses a few ioctl(2)
* structure formats that contain pointers--something that MINIX3 has to avoid,
* due to its memory granting mechanisms. Thus, those ioctl(2) calls have to
* be converted from NetBSD to MINIX3 format. We currently do that in libc.
* This header specifies the numbers and formats for the MINIX3 versions.
*
* See <minix/if.h> for details on how things work here.
*/
/* BIOCSETF: set BPF filter program. */
/*
* This ioctl is an exception, as it is write-only, so we do not need the
* original structure. Also, the size of this structure is currently slightly
* over 4KB, which makes it too big for a regular ioctl call. Thus, we have to
* use a big ioctl call. Note that future changes of BPF_MAXINSNS will
* unfortunately (necessarily) change the ioctl call number.
*/
struct minix_bpf_program {
u_int mbf_len;
struct bpf_insn mbf_insns[BPF_MAXINSNS];
};
#define MINIX_BIOCSETF _IOW_BIG(2, struct minix_bpf_program)
/* BIOCGDLTLIST: retrieve list of possible data link types. */
#define MINIX_BPF_MAXDLT 256
struct minix_bpf_dltlist {
struct bpf_dltlist mbfl_dltlist; /* MUST be first */
u_int mbfl_list[MINIX_BPF_MAXDLT];
};
#define MINIX_BIOCGDLTLIST _IOWR('B', 119, struct minix_bpf_dltlist)
#endif /* !_MINIX_BPF_H */
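
The intent is that userland keeps using the standard NetBSD BPF API
unchanged.  A minimal sketch (the descriptor and filter program are
hypothetical; the libc ioctl(2) wrapper shown later in this commit
rewrites the request transparently):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <net/bpf.h>

/* Attach a filter program to an open /dev/bpf descriptor 'fd'. */
static int
attach_filter(int fd, struct bpf_insn * insns, u_int ninsns)
{
	struct bpf_program bf;

	bf.bf_len = ninsns;
	bf.bf_insns = insns;

	/* libc rewrites this call to MINIX_BIOCSETF behind the scenes. */
	return ioctl(fd, BIOCSETF, &bf);
}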


@@ -25,7 +25,7 @@
#define TTY_MAJOR 4 /* 4 = /dev/tty00 (ttys) */
#define CTTY_MAJOR 5 /* 5 = /dev/tty */
#define PRINTER_MAJOR 6 /* 6 = /dev/lp (printer driver) */
/* 7 = (unused) */
#define TCPIP_MAJOR 7 /* 7 = /dev/bpf (TCP/IP service) */
/* 8 = /dev/c1 */
#define PTY_MAJOR 9 /* 9 = /dev/ptyp0 (pty driver) */
/* 10 = /dev/c2 */

minix/include/minix/if.h Normal file (51 lines added)

@@ -0,0 +1,51 @@
#ifndef _MINIX_IF_H
#define _MINIX_IF_H
#include <net/if.h>
#include <net/if_media.h>
/*
* MINIX3-specific extensions to the network interface headers. These
* extensions are necessary because NetBSD IF uses a few ioctl(2) structure
* formats that contain pointers--something that MINIX3 has to avoid, due to
* its memory granting mechanisms. Thus, those ioctl(2) calls have to be
* converted from NetBSD to MINIX3 format. We currently do that in libc.
* This header specifies the numbers and formats for the MINIX3 versions.
*
* The general idea is that we rewrite the ioctl request data to include both
* the original structure and a buffer for the array of values to which the
* original structure uses a pointer. Important: in those cases, the original
* structure is expected to be the first element of the replacement structure.
*
* There is typically no configured upper bound for the maximum number of
* values in the array, and so we pick size values that are hopefully always
* oversized and yet keep the ioctl sizes within the range of regular ioctls
* (4095 bytes, as per sys/ioccom.h). If there may be larger amounts of data,
* we have to use "big" ioctls.
*
* For the replacement ioctl codes, we use the original ioctl class and number
* with a different size. That should virtually eliminate the possibility of
* accidental collisions.
*/
/* SIOCGIFMEDIA: retrieve interface media status and types. */
#define MINIX_IF_MAXMEDIA 256
struct minix_ifmediareq {
struct ifmediareq mifm_ifm; /* MUST be first */
int mifm_list[MINIX_IF_MAXMEDIA];
};
#define MINIX_SIOCGIFMEDIA _IOWR('i', 54, struct minix_ifmediareq)
/* SIOCIFGCLONERS: retrieve interface "cloners" (virtual types). */
#define MINIX_IF_MAXCLONERS 128
struct minix_if_clonereq {
struct if_clonereq mifcr_ifcr; /* MUST be first */
char mifcr_buffer[MINIX_IF_MAXCLONERS * IFNAMSIZ];
};
#define MINIX_SIOCIFGCLONERS _IOWR('i', 120, struct minix_if_clonereq)
#endif /* !_MINIX_IF_H */
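
For example, a standard NetBSD-style cloner query keeps working as-is;
only the request code and data layout change under the hood.  A sketch,
assuming 's' is any open socket descriptor:

#include <sys/ioctl.h>
#include <string.h>
#include <minix/if.h>

/* Retrieve up to 'count' cloner names into 'buf' (count * IFNAMSIZ bytes). */
static int
get_cloners(int s, char * buf, int count)
{
	struct if_clonereq ifcr;

	memset(&ifcr, 0, sizeof(ifcr));
	ifcr.ifcr_count = count;
	ifcr.ifcr_buffer = buf;

	/* libc rewrites this call to MINIX_SIOCIFGCLONERS. */
	if (ioctl(s, SIOCIFGCLONERS, &ifcr) != 0)
		return -1;
	return ifcr.ifcr_total;	/* total number of cloners available */
}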


@@ -28,6 +28,7 @@
#define MINIX_TEST 0
#define MINIX_MIB 1
#define MINIX_PROC 2
#define MINIX_LWIP 3
/*
* These identifiers, under MINIX_TEST, are used by test87 to test the MIB


@@ -9,6 +9,10 @@
#include <sys/ioccom.h>
#include <stdarg.h>
#include <fcntl.h>
#include <stdlib.h>
#include <minix/if.h>
#include <minix/bpf.h>
#include <assert.h>
static void rewrite_i2c_netbsd_to_minix(minix_i2c_ioctl_exec_t *out,
i2c_ioctl_exec_t *in);
@@ -45,6 +49,199 @@ static void rewrite_i2c_minix_to_netbsd(i2c_ioctl_exec_t *out,
}
}
/*
* Convert a network interface related IOCTL with pointers to a flat format
* suitable for MINIX3. Return a pointer to the new data on success, or zero
* (with errno set) on failure. The original request code is given in
* 'request' and must be replaced by the new request code to be used.
*/
static vir_bytes
ioctl_convert_if_to_minix(void * data, unsigned long * request)
{
struct minix_ifmediareq *mifm;
struct ifmediareq *ifm;
struct minix_if_clonereq *mifcr;
struct if_clonereq *ifcr;
switch (*request) {
case SIOCGIFMEDIA:
ifm = (struct ifmediareq *)data;
mifm = (struct minix_ifmediareq *)malloc(sizeof(*mifm));
if (mifm != NULL) {
/*
* The count may exceed MINIX_IF_MAXMEDIA, and should
* be truncated as needed by the IF implementation.
*/
memcpy(&mifm->mifm_ifm, ifm, sizeof(*ifm));
*request = MINIX_SIOCGIFMEDIA;
} else
errno = ENOMEM;
return (vir_bytes)mifm;
case SIOCIFGCLONERS:
ifcr = (struct if_clonereq *)data;
mifcr = (struct minix_if_clonereq *)malloc(sizeof(*mifcr));
if (mifcr != NULL) {
/*
* The count may exceed MINIX_IF_MAXCLONERS, and should
* be truncated as needed by the IF implementation.
*/
memcpy(&mifcr->mifcr_ifcr, ifcr, sizeof(*ifcr));
*request = MINIX_SIOCIFGCLONERS;
} else
errno = ENOMEM;
return (vir_bytes)mifcr;
default:
assert(0);
errno = ENOTTY;
return 0;
}
}
/*
Convert the result of a network interface related IOCTL with pointers from
* the flat format used to make the call to MINIX3. Called on success only.
* The given request code is that of the (NetBSD-type) original.
*/
static void
ioctl_convert_if_from_minix(vir_bytes addr, void * data, unsigned long request)
{
struct minix_ifmediareq *mifm;
struct ifmediareq *ifm;
struct minix_if_clonereq *mifcr;
struct if_clonereq *ifcr;
int count;
switch (request) {
case SIOCGIFMEDIA:
mifm = (struct minix_ifmediareq *)addr;
ifm = (struct ifmediareq *)data;
memcpy(ifm, &mifm->mifm_ifm, sizeof(*ifm));
if (ifm->ifm_ulist != NULL && ifm->ifm_count > 0)
memcpy(ifm->ifm_ulist, mifm->mifm_list,
ifm->ifm_count * sizeof(ifm->ifm_ulist[0]));
break;
case SIOCIFGCLONERS:
mifcr = (struct minix_if_clonereq *)addr;
ifcr = (struct if_clonereq *)data;
memcpy(ifcr, &mifcr->mifcr_ifcr, sizeof(*ifcr));
count = (ifcr->ifcr_count < ifcr->ifcr_total) ?
ifcr->ifcr_count : ifcr->ifcr_total;
if (ifcr->ifcr_buffer != NULL && count > 0)
memcpy(ifcr->ifcr_buffer, mifcr->mifcr_buffer,
count * IFNAMSIZ);
break;
default:
assert(0);
}
}
/*
* Convert a BPF (Berkeley Packet Filter) related IOCTL with pointers to a flat
* format suitable for MINIX3. Return a pointer to the new data on success, or
* zero (with errno set) on failure. The original request code is given in
* 'request' and must be replaced by the new request code to be used.
*/
static vir_bytes
ioctl_convert_bpf_to_minix(void * data, unsigned long * request)
{
struct minix_bpf_program *mbf;
struct bpf_program *bf;
struct minix_bpf_dltlist *mbfl;
struct bpf_dltlist *bfl;
switch (*request) {
case BIOCSETF:
bf = (struct bpf_program *)data;
if (bf->bf_len > __arraycount(mbf->mbf_insns)) {
errno = EINVAL;
return 0;
}
mbf = (struct minix_bpf_program *)malloc(sizeof(*mbf));
if (mbf != NULL) {
mbf->mbf_len = bf->bf_len;
memcpy(mbf->mbf_insns, bf->bf_insns,
bf->bf_len * sizeof(mbf->mbf_insns[0]));
*request = MINIX_BIOCSETF;
} else
errno = ENOMEM;
return (vir_bytes)mbf;
case BIOCGDLTLIST:
bfl = (struct bpf_dltlist *)data;
mbfl = (struct minix_bpf_dltlist *)malloc(sizeof(*mbfl));
if (mbfl != NULL) {
/*
* The length may exceed MINIX_BPF_MAXDLT, and should
* be truncated as needed by the BPF implementation.
*/
memcpy(&mbfl->mbfl_dltlist, bfl, sizeof(*bfl));
*request = MINIX_BIOCGDLTLIST;
} else
errno = ENOMEM;
return (vir_bytes)mbfl;
default:
assert(0);
errno = ENOTTY;
return 0;
}
}
/*
Convert the result of a BPF (Berkeley Packet Filter) related IOCTL with
* pointers from the flat format used to make the call to MINIX3. Called on
* success only. The given request code is that of the (NetBSD-type) original.
*/
static void
ioctl_convert_bpf_from_minix(vir_bytes addr, void * data,
unsigned long request)
{
struct minix_bpf_dltlist *mbfl;
struct bpf_dltlist *bfl;
switch (request) {
case BIOCGDLTLIST:
mbfl = (struct minix_bpf_dltlist *)addr;
bfl = (struct bpf_dltlist *)data;
memcpy(bfl, &mbfl->mbfl_dltlist, sizeof(*bfl));
if (bfl->bfl_list != NULL && bfl->bfl_len > 0)
memcpy(bfl->bfl_list, mbfl->mbfl_list,
bfl->bfl_len * sizeof(bfl->bfl_list[0]));
break;
default:
assert(0);
}
}
/*
* Library implementation of FIOCLEX and FIONCLEX.
*/
@@ -110,6 +307,7 @@ ioctl_to_fcntl(int fd, unsigned long request, void * data)
int ioctl(int fd, unsigned long request, ...)
{
minix_i2c_ioctl_exec_t i2c;
int r, request_save;
message m;
vir_bytes addr;
@@ -124,8 +322,6 @@ int ioctl(int fd, unsigned long request, ...)
* To support compatibility with interfaces on other systems, certain
* requests are re-written to flat structures (i.e. without pointers).
*/
minix_i2c_ioctl_exec_t i2c;
request_save = request;
switch (request) {
@@ -142,6 +338,19 @@
addr = (vir_bytes) &i2c;
request = MINIX_I2C_IOCTL_EXEC;
break;
case SIOCGIFMEDIA:
case SIOCIFGCLONERS:
if ((addr = ioctl_convert_if_to_minix(data, &request)) == 0)
return -1; /* errno has already been set */
break;
case BIOCSETF:
case BIOCGDLTLIST:
if ((addr = ioctl_convert_bpf_to_minix(data, &request)) == 0)
return -1; /* errno has already been set */
break;
default:
/* Keep original as-is */
addr = (vir_bytes)data;
@@ -155,11 +364,30 @@
r = _syscall(VFS_PROC_NR, VFS_IOCTL, &m);
/* Translate back to original form */
/*
* Translate back to original form. Do this on failure as well, as
* temporarily allocated resources may have to be freed up again.
*/
switch (request_save) {
case I2C_IOCTL_EXEC:
rewrite_i2c_minix_to_netbsd(data, &i2c);
break;
case SIOCGIFMEDIA:
case SIOCIFGCLONERS:
if (r == 0)
ioctl_convert_if_from_minix(addr, data, request_save);
free((void *)addr);
break;
case BIOCGDLTLIST:
if (r == 0)
ioctl_convert_bpf_from_minix(addr, data, request_save);
/* FALLTHROUGH */
case BIOCSETF:
free((void *)addr);
break;
default:
/* Nothing to do */
break;
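
Taken together, the conversions are invisible to callers: a
conventional media query such as the sketch below (hypothetical
helper) passes through ioctl_convert_if_to_minix() on the way in and
ioctl_convert_if_from_minix() on the way back out.

#include <sys/ioctl.h>
#include <string.h>
#include <net/if.h>
#include <net/if_media.h>

/* Query the media status of interface 'name' through socket 's'. */
static int
query_media(int s, const char * name, int * media, int count)
{
	struct ifmediareq ifm;

	memset(&ifm, 0, sizeof(ifm));
	strlcpy(ifm.ifm_name, name, sizeof(ifm.ifm_name));
	ifm.ifm_ulist = media;			/* result array */
	ifm.ifm_count = count;			/* its capacity, in entries */

	return ioctl(s, SIOCGIFMEDIA, &ifm);	/* rewritten by libc */
}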


@@ -1,6 +1,7 @@
.include <bsd.own.mk>
.if ${MKIMAGEONLY} == "no"
SUBDIR+= lwip
SUBDIR+= uds
.endif # ${MKIMAGEONLY} == "no"

minix/net/lwip/Makefile Normal file (34 lines added)

@@ -0,0 +1,34 @@
# Makefile for the lwIP TCP/IP socket driver service (LWIP)
.include <bsd.own.mk>
PROG= lwip
SRCS= lwip.c mempool.c pchain.c addr.c addrpol.c tcpisn.c mcast.c ipsock.c \
pktsock.c tcpsock.c udpsock.c rawsock.c ifdev.c ifaddr.c loopif.c \
ethif.c ndev.c rttree.c route.c rtsock.c lnksock.c lldata.c mibtree.c \
ifconf.c bpfdev.c bpf_filter.c util.c
FILES=${PROG}.conf
FILESNAME=${PROG}
FILESDIR= /etc/system.conf.d
CPPFLAGS+= -I${NETBSDSRCDIR}/minix/lib/liblwip/dist/src/include
CPPFLAGS+= -I${NETBSDSRCDIR}/minix/lib/liblwip/lib
# Disabling USE_INET6 only superficially hides IPv6 support in the service.
.if (${USE_INET6} != "no")
CPPFLAGS+= -DINET6
.endif
# Some warnings are the result of usage of lwIP macros. We must not generate
# errors for those, but even producing the warnings is not helpful, so we
# disable them altogether.
CPPFLAGS+= -Wno-address
DPADD+= ${LIBLWIP} ${LIBSOCKEVENT} ${LIBSOCKDRIVER} ${LIBCHARDRIVER} \
${LIBSYS} ${LIBTIMERS}
LDADD+= -llwip -lsockevent -lsockdriver -lchardriver -lsys -ltimers
WARNS?= 5
.include <minix.service.mk>

minix/net/lwip/addr.c Normal file (692 lines added)

@@ -0,0 +1,692 @@
/* LWIP service - addr.c - socket address verification and conversion */
#include "lwip.h"
/*
* Return TRUE if the given socket address is of type AF_UNSPEC, or FALSE
* otherwise.
*/
int
addr_is_unspec(const struct sockaddr * addr, socklen_t addr_len)
{
return (addr_len >= offsetof(struct sockaddr, sa_data) &&
addr->sa_family == AF_UNSPEC);
}
/*
* Check whether the given multicast address is generally valid. This check
* should not be moved into addr_get_inet(), as we do not want to forbid
* creating routes for such addresses, for example. We do however apply the
* restrictions here to all provided source and destination addresses. Return
* TRUE if the address is an acceptable multicast address, or FALSE otherwise.
*/
int
addr_is_valid_multicast(const ip_addr_t * ipaddr)
{
uint8_t scope;
assert(ip_addr_ismulticast(ipaddr));
/* We apply restrictions to IPv6 multicast addresses only. */
if (IP_IS_V6(ipaddr)) {
scope = ip6_addr_multicast_scope(ip_2_ip6(ipaddr));
if (scope == IP6_MULTICAST_SCOPE_RESERVED0 ||
scope == IP6_MULTICAST_SCOPE_RESERVEDF)
return FALSE;
/*
* We do not impose restrictions on the three defined embedded
* flags, even though we put no effort into supporting them,
* especially in terms of automatically creating routes for
* all cases. We do force the fourth flag to be zero.
* Unfortunately there is no lwIP macro to check for this flag.
*/
if (ip_2_ip6(ipaddr)->addr[0] & PP_HTONL(0x00800000UL))
return FALSE;
/* Prevent KAME-embedded zone IDs from entering the system. */
if (ip6_addr_has_scope(ip_2_ip6(ipaddr), IP6_UNKNOWN) &&
(ip_2_ip6(ipaddr)->addr[0] & PP_HTONL(0x0000ffffUL)))
return FALSE;
}
return TRUE;
}
/*
* Load a sockaddr structure, as copied from userland, as a lwIP-style IP
* address and (optionally) a port number. The expected type of IP address is
* given as 'type', which must be one of IPADDR_TYPE_{V4,ANY,V6}. If it is
* IPADDR_TYPE_V4, 'addr' is expected to point to a sockaddr_in structure. If
* it is IPADDR_TYPE_{ANY,V6}, 'addr' is expected to point to a sockaddr_in6
* structure. For the _ANY case, the result will be an _ANY address only if it
* is the unspecified (all-zeroes) address and a _V6 address in all other
* cases. For the _V6 case, the result will always be a _V6 address. The
* length of the structure pointed to by 'addr' is given as 'addr_len'. If the
* boolean 'kame' flag is set, addresses will be interpreted to be KAME style,
* meaning that for scoped IPv6 addresses, the zone is embedded in the address
* rather than given in sin6_scope_id. On success, store the resulting IP
* address in 'ipaddr'. If 'port' is not NULL, store the port number in it;
* otherwise, ignore the port number. On any parsing failure, return an
* appropriate negative error code.
*/
int
addr_get_inet(const struct sockaddr * addr, socklen_t addr_len, uint8_t type,
ip_addr_t * ipaddr, int kame, uint16_t * port)
{
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
ip6_addr_t *ip6addr;
uint32_t ifindex;
switch (type) {
case IPADDR_TYPE_V4:
if (addr_len != sizeof(sin))
return EINVAL;
/*
* Getting around strict aliasing problems. Oh, the irony of
* doing an extra memcpy so that the compiler can do a better
* job at optimizing..
*/
memcpy(&sin, addr, sizeof(sin));
if (sin.sin_family != AF_INET)
return EAFNOSUPPORT;
ip_addr_set_ip4_u32(ipaddr, sin.sin_addr.s_addr);
if (port != NULL)
*port = ntohs(sin.sin_port);
return OK;
case IPADDR_TYPE_ANY:
case IPADDR_TYPE_V6:
if (addr_len != sizeof(sin6))
return EINVAL;
/* Again, strict aliasing.. */
memcpy(&sin6, addr, sizeof(sin6));
if (sin6.sin6_family != AF_INET6)
return EAFNOSUPPORT;
memset(ipaddr, 0, sizeof(*ipaddr));
/*
* This is a bit ugly, but NetBSD does not expose s6_addr32 and
* s6_addr is a series of bytes, which is a mismatch for lwIP.
* The alternative would be another memcpy..
*/
ip6addr = ip_2_ip6(ipaddr);
assert(sizeof(ip6addr->addr) == sizeof(sin6.sin6_addr));
memcpy(ip6addr->addr, &sin6.sin6_addr, sizeof(ip6addr->addr));
/*
* If the address may have a scope, extract the zone ID.
* Where the zone ID is depends on the 'kame' parameter: KAME-
* style addresses have it embedded within the address, whereas
* non-KAME addresses use the (misnamed) sin6_scope_id field.
*/
if (ip6_addr_has_scope(ip6addr, IP6_UNKNOWN)) {
if (kame) {
ifindex =
ntohl(ip6addr->addr[0]) & 0x0000ffffUL;
ip6addr->addr[0] &= PP_HTONL(0xffff0000UL);
} else {
/*
* Reject KAME-style addresses for normal
* socket calls, to save ourselves the trouble
* of mixed address styles elsewhere.
*/
if (ip6addr->addr[0] & PP_HTONL(0x0000ffffUL))
return EINVAL;
ifindex = sin6.sin6_scope_id;
}
/*
* Reject invalid zone IDs. This also enforces that
* no zone IDs wider than eight bits enter the system.
* As a side effect, it is not possible to add routes
* for invalid zones, but that should be no problem.
*/
if (ifindex != 0 &&
ifdev_get_by_index(ifindex) == NULL)
return ENXIO;
ip6_addr_set_zone(ip6addr, ifindex);
} else
ip6_addr_clear_zone(ip6addr);
/*
* Set the type to ANY if it was ANY and the address itself is
* ANY as well. Otherwise, we are binding to a specific IPv6
* address, so IPV6_V6ONLY stops being relevant and we should
* leave the address set to V6. Destination addresses for ANY
* are set to V6 elsewhere.
*/
if (type == IPADDR_TYPE_ANY && ip6_addr_isany(ip6addr))
IP_SET_TYPE(ipaddr, type);
else
IP_SET_TYPE(ipaddr, IPADDR_TYPE_V6);
if (port != NULL)
*port = ntohs(sin6.sin6_port);
return OK;
default:
return EAFNOSUPPORT;
}
}
/*
* Store an lwIP-style IP address and port number as a sockaddr structure
* (sockaddr_in or sockaddr_in6, depending on the given IP address) to be
* copied to userland. The result is stored in the buffer pointed to by
* 'addr'. Before the call, 'addr_len' must be set to the size of this buffer.
* This is an internal check to prevent buffer overflows, and must not be used
* to validate input, since a mismatch will trigger a panic. After the call,
* 'addr_len' will be set to the size of the resulting structure. The lwIP-
* style address is given as 'ipaddr'. If the boolean 'kame' flag is set, the
* address will be stored KAME-style, meaning that for scoped IPv6 addresses,
* the address zone will be stored embedded in the address rather than in
* sin6_scope_id. If relevant, 'port' contains the port number in host-byte
order; otherwise it should be set to zero.
*/
void
addr_put_inet(struct sockaddr * addr, socklen_t * addr_len,
const ip_addr_t * ipaddr, int kame, uint16_t port)
{
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
const ip6_addr_t *ip6addr;
uint32_t zone;
switch (IP_GET_TYPE(ipaddr)) {
case IPADDR_TYPE_V4:
if (*addr_len < sizeof(sin))
panic("provided address buffer too small");
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_port = htons(port);
sin.sin_addr.s_addr = ip_addr_get_ip4_u32(ipaddr);
memcpy(addr, &sin, sizeof(sin));
*addr_len = sizeof(sin);
break;
case IPADDR_TYPE_ANY:
case IPADDR_TYPE_V6:
if (*addr_len < sizeof(sin6))
panic("provided address buffer too small");
ip6addr = ip_2_ip6(ipaddr);
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_len = sizeof(sin6);
sin6.sin6_family = AF_INET6;
sin6.sin6_port = htons(port);
memcpy(&sin6.sin6_addr, ip6addr->addr, sizeof(sin6.sin6_addr));
/*
* If the IPv6 address has a zone set, it must be scoped, and
* we put the zone in the result. It may occur that a scoped
* IPv6 address does not have a zone here though, for example
* if packet routing fails for sendto() with a zoneless address
* on an unbound socket, resulting in an RTM_MISS message. In
* such cases, simply leave the zone index blank in the result.
*/
if (ip6_addr_has_zone(ip6addr)) {
assert(ip6_addr_has_scope(ip6addr, IP6_UNKNOWN));
zone = ip6_addr_zone(ip6addr);
assert(zone <= UINT8_MAX);
if (kame)
sin6.sin6_addr.s6_addr[3] = zone;
else
sin6.sin6_scope_id = zone;
}
memcpy(addr, &sin6, sizeof(sin6));
*addr_len = sizeof(sin6);
break;
default:
panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr));
}
}
/*
* Load a link-layer sockaddr structure (sockaddr_dl), as copied from userland,
* and return the contained name and/or hardware address. The address is
* provided as 'addr', with length 'addr_len'. On success, return OK. If
* 'name' is not NULL, it must be of size 'name_max', and will be used to store
* the (null-terminated) interface name in the given structure if present, or
* the empty string if not. If 'hwaddr' is not NULL, it will be used to store
* the hardware address in the given structure, which must in that case be
* present and exactly 'hwaddr_len' bytes long. On any parsing failure, return
* an appropriate negative error code.
*/
int
addr_get_link(const struct sockaddr * addr, socklen_t addr_len, char * name,
size_t name_max, uint8_t * hwaddr, size_t hwaddr_len)
{
struct sockaddr_dlx sdlx;
size_t nlen, alen;
if (addr_len < offsetof(struct sockaddr_dlx, sdlx_data))
return EINVAL;
/*
We cannot prevent callers from passing in a massively oversized
* sockaddr_dl structure. However, we insist that all the actual data
* be contained within the size of our sockaddr_dlx version.
*/
if (addr_len > sizeof(sdlx))
addr_len = sizeof(sdlx);
memcpy(&sdlx, addr, addr_len);
if (sdlx.sdlx_family != AF_LINK)
return EAFNOSUPPORT;
/* Address selectors are not currently supported. */
if (sdlx.sdlx_slen != 0)
return EINVAL;
nlen = (size_t)sdlx.sdlx_nlen;
alen = (size_t)sdlx.sdlx_alen;
/* The nlen and alen fields are 8-bit, so no risks of overflow here. */
if (addr_len < offsetof(struct sockaddr_dlx, sdlx_data) + nlen + alen)
return EINVAL;
/*
* Copy out the name, truncating it if needed. The name in the
* sockaddr is not null terminated, so we have to do that. If the
* sockaddr has no name, copy out an empty name.
*/
if (name != NULL) {
assert(name_max > 0);
if (name_max > nlen + 1)
name_max = nlen + 1;
memcpy(name, sdlx.sdlx_data, name_max - 1);
name[name_max - 1] = '\0';
}
/*
* Copy over the hardware address. For simplicity, we require that the
* caller specify the exact hardware address length.
*/
if (hwaddr != NULL) {
if (alen != hwaddr_len)
return EINVAL;
memcpy(hwaddr, sdlx.sdlx_data + nlen, hwaddr_len);
}
return OK;
}
/*
* Store a link-layer sockaddr structure (sockaddr_dl), to be copied to
* userland. The result is stored in the buffer pointed to by 'addr'. Before
* the call, 'addr_len' must be set to the size of this buffer. This is an
* internal check to prevent buffer overflows, and must not be used to validate
* input, since a mismatch will trigger a panic. After the call, 'addr_len'
* will be set to the size of the resulting structure. The given interface
* index 'ifindex' and (IFT_) interface type 'type' will always be stored in
* the resulting structure. If 'name' is not NULL, it must be a null-
* terminated interface name string which will be included in the structure.
* If 'hwaddr' is not NULL, it must be a hardware address of length
* 'hwaddr_len', which will also be included in the structure.
*/
void
addr_put_link(struct sockaddr * addr, socklen_t * addr_len, uint32_t ifindex,
uint32_t type, const char * name, const uint8_t * hwaddr,
size_t hwaddr_len)
{
struct sockaddr_dlx sdlx;
size_t name_len;
socklen_t len;
name_len = (name != NULL) ? strlen(name) : 0;
if (hwaddr == NULL)
hwaddr_len = 0;
assert(name_len < IFNAMSIZ);
assert(hwaddr_len <= NETIF_MAX_HWADDR_LEN);
len = offsetof(struct sockaddr_dlx, sdlx_data) + name_len + hwaddr_len;
if (*addr_len < len)
panic("provided address buffer too small");
memset(&sdlx, 0, sizeof(sdlx));
sdlx.sdlx_len = len;
sdlx.sdlx_family = AF_LINK;
sdlx.sdlx_index = ifindex;
sdlx.sdlx_type = type;
sdlx.sdlx_nlen = name_len;
sdlx.sdlx_alen = hwaddr_len;
if (name_len > 0)
memcpy(sdlx.sdlx_data, name, name_len);
if (hwaddr_len > 0)
memcpy(sdlx.sdlx_data + name_len, hwaddr, hwaddr_len);
memcpy(addr, &sdlx, len);
*addr_len = len;
}
/*
* Convert an IPv4 or IPv6 netmask, given as sockaddr structure 'addr', to a
* prefix length. The length of the sockaddr structure is given as 'addr_len'.
* For consistency with addr_get_inet(), the expected address type is given as
* 'type', and must be either IPADDR_TYPE_V4 or IPADDR_TYPE_V6. On success,
* return OK with the number of set prefix bits returned in 'prefix', and
* optionally with a lwIP representation of the netmask stored in 'ipaddr' (if
* not NULL). On failure, return an appropriate negative error code. Note
* that this function does not support compressed IPv4 network masks; such
* addresses must be expanded before a call to this function.
*/
int
addr_get_netmask(const struct sockaddr * addr, socklen_t addr_len,
uint8_t type, unsigned int * prefix, ip_addr_t * ipaddr)
{
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
unsigned int byte, bit;
uint32_t val;
switch (type) {
case IPADDR_TYPE_V4:
if (addr_len != sizeof(sin))
return EINVAL;
memcpy(&sin, addr, sizeof(sin));
if (sin.sin_family != AF_INET)
return EAFNOSUPPORT;
val = ntohl(sin.sin_addr.s_addr);
/* Find the first zero bit. */
for (bit = 0; bit < IP4_BITS; bit++)
if (!(val & (1 << (IP4_BITS - bit - 1))))
break;
*prefix = bit;
/* All bits after the first zero bit must also be zero. */
if (bit < IP4_BITS &&
(val & ((1 << (IP4_BITS - bit - 1)) - 1)))
return EINVAL;
if (ipaddr != NULL)
ip_addr_set_ip4_u32(ipaddr, sin.sin_addr.s_addr);
return OK;
case IPADDR_TYPE_V6:
if (addr_len != sizeof(sin6))
return EINVAL;
memcpy(&sin6, addr, sizeof(sin6));
if (sin6.sin6_family != AF_INET6)
return EAFNOSUPPORT;
/* Find the first zero bit. */
for (byte = 0; byte < __arraycount(sin6.sin6_addr.s6_addr);
byte++)
if (sin6.sin6_addr.s6_addr[byte] != 0xff)
break;
/* If all bits are set, there is nothing more to do. */
if (byte == __arraycount(sin6.sin6_addr.s6_addr)) {
*prefix = __arraycount(sin6.sin6_addr.s6_addr) * NBBY;
return OK;
}
for (bit = 0; bit < NBBY; bit++)
if (!(sin6.sin6_addr.s6_addr[byte] &
(1 << (NBBY - bit - 1))))
break;
*prefix = byte * NBBY + bit;
/* All bits after the first zero bit must also be zero. */
if (bit < NBBY && (sin6.sin6_addr.s6_addr[byte] &
((1 << (NBBY - bit - 1)) - 1)))
return EINVAL;
for (byte++; byte < __arraycount(sin6.sin6_addr.s6_addr);
byte++)
if (sin6.sin6_addr.s6_addr[byte] != 0)
return EINVAL;
if (ipaddr != NULL) {
ip_addr_set_zero_ip6(ipaddr);
memcpy(ip_2_ip6(ipaddr)->addr, &sin6.sin6_addr,
sizeof(ip_2_ip6(ipaddr)->addr));
}
return OK;
default:
panic("unknown IP address type: %u", type);
}
}
/*
* Generate a raw network mask based on the given prefix length.
*/
void
addr_make_netmask(uint8_t * addr, socklen_t addr_len, unsigned int prefix)
{
unsigned int byte, bit;
byte = prefix / NBBY;
bit = prefix % NBBY;
assert(byte + !!bit <= addr_len);
if (byte > 0)
memset(addr, 0xff, byte);
if (bit != 0)
addr[byte++] = (uint8_t)(0xff << (NBBY - bit));
if (byte < addr_len)
memset(&addr[byte], 0, addr_len - byte);
}
/*
* Store a network mask as a sockaddr structure, in 'addr'. Before the call,
* 'addr_len' must be set to the memory size of 'addr'. The address type is
* given as 'type', and must be either IPADDR_TYPE_V4 or IPADDR_TYPE_V6. The
* prefix length from which to generate the network mask is given as 'prefix'.
* Upon return, 'addr_len' is set to the size of the resulting sockaddr
* structure.
*/
void
addr_put_netmask(struct sockaddr * addr, socklen_t * addr_len, uint8_t type,
unsigned int prefix)
{
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
switch (type) {
case IPADDR_TYPE_V4:
if (*addr_len < sizeof(sin))
panic("provided address buffer too small");
assert(prefix <= IP4_BITS);
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
addr_make_netmask((uint8_t *)&sin.sin_addr.s_addr,
sizeof(sin.sin_addr.s_addr), prefix);
memcpy(addr, &sin, sizeof(sin));
*addr_len = sizeof(sin);
break;
case IPADDR_TYPE_V6:
if (*addr_len < sizeof(sin6))
panic("provided address buffer too small");
assert(prefix <= IP6_BITS);
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_len = sizeof(sin6);
sin6.sin6_family = AF_INET6;
addr_make_netmask(sin6.sin6_addr.s6_addr,
sizeof(sin6.sin6_addr.s6_addr), prefix);
memcpy(addr, &sin6, sizeof(sin6));
*addr_len = sizeof(sin6);
break;
default:
panic("unknown IP address type: %u", type);
}
}
/*
* Normalize the given address in 'src' to the given number of prefix bits,
* setting all other bits to zero. Return the result in 'dst'.
*/
void
addr_normalize(ip_addr_t * dst, const ip_addr_t * src, unsigned int prefix)
{
unsigned int addr_len, byte, bit;
const uint8_t *srcaddr;
uint8_t type, *dstaddr;
type = IP_GET_TYPE(src);
memset(dst, 0, sizeof(*dst));
IP_SET_TYPE(dst, type);
switch (type) {
case IPADDR_TYPE_V4:
srcaddr = (const uint8_t *)&ip_2_ip4(src)->addr;
dstaddr = (uint8_t *)&ip_2_ip4(dst)->addr;
addr_len = sizeof(ip_2_ip4(src)->addr);
break;
case IPADDR_TYPE_V6:
ip6_addr_set_zone(ip_2_ip6(dst), ip6_addr_zone(ip_2_ip6(src)));
srcaddr = (const uint8_t *)&ip_2_ip6(src)->addr;
dstaddr = (uint8_t *)&ip_2_ip6(dst)->addr;
addr_len = sizeof(ip_2_ip6(src)->addr);
break;
default:
panic("unknown IP address type: %u", type);
}
byte = prefix / NBBY;
bit = prefix % NBBY;
assert(byte + !!bit <= addr_len);
if (byte > 0)
memcpy(dstaddr, srcaddr, byte);
if (bit != 0) {
dstaddr[byte] =
srcaddr[byte] & (uint8_t)(0xff << (NBBY - bit));
byte++;
}
}
/*
* Return the number of common bits between the given two addresses, up to the
* given maximum. Thus, return a value between 0 and 'max' inclusive.
*/
unsigned int
addr_get_common_bits(const ip_addr_t * ipaddr1, const ip_addr_t * ipaddr2,
unsigned int max)
{
unsigned int addr_len, prefix, bit;
const uint8_t *addr1, *addr2;
uint8_t byte;
switch (IP_GET_TYPE(ipaddr1)) {
case IPADDR_TYPE_V4:
assert(IP_IS_V4(ipaddr2));
addr1 = (const uint8_t *)&ip_2_ip4(ipaddr1)->addr;
addr2 = (const uint8_t *)&ip_2_ip4(ipaddr2)->addr;
addr_len = sizeof(ip_2_ip4(ipaddr1)->addr);
break;
case IPADDR_TYPE_V6:
assert(IP_IS_V6(ipaddr2));
addr1 = (const uint8_t *)&ip_2_ip6(ipaddr1)->addr;
addr2 = (const uint8_t *)&ip_2_ip6(ipaddr2)->addr;
addr_len = sizeof(ip_2_ip6(ipaddr1)->addr);
break;
default:
panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr1));
}
if (addr_len > max * NBBY)
addr_len = max * NBBY;
for (prefix = 0; addr_len > 0;
    addr_len--, addr1++, addr2++, prefix += NBBY) {
if ((byte = (*addr1 ^ *addr2)) != 0) {
/* TODO: see if we want a lookup table for this. */
for (bit = 0; bit < NBBY; bit++, prefix++)
if (byte & (1 << (NBBY - bit - 1)))
break;
break;
}
}
if (prefix > max)
prefix = max;
return prefix;
}
/*
* Convert the given IPv4 address to an IPv4-mapped IPv6 address.
*/
void
addr_make_v4mapped_v6(ip_addr_t * dst, const ip4_addr_t * src)
{
IP_ADDR6(dst, 0, 0, PP_HTONL(0x0000ffffUL), ip4_addr_get_u32(src));
}
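
As a quick worked example of addr_make_netmask(): a /22 IPv4 prefix
gives byte = 2 and bit = 6, so the first two bytes become 0xff, the
third becomes 0xff << 2 = 0xfc, and the remainder is zeroed:

	uint8_t mask[4];

	addr_make_netmask(mask, sizeof(mask), 22);
	/* mask now holds { 0xff, 0xff, 0xfc, 0x00 }, i.e., 255.255.252.0 */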

minix/net/lwip/addr.h Normal file (33 lines added)

@@ -0,0 +1,33 @@
#ifndef MINIX_NET_LWIP_ADDR_H
#define MINIX_NET_LWIP_ADDR_H
int addr_is_unspec(const struct sockaddr * addr, socklen_t addr_len);
int addr_is_valid_multicast(const ip_addr_t * ipaddr);
int addr_get_inet(const struct sockaddr * addr, socklen_t addr_len,
uint8_t type, ip_addr_t * ipaddr, int kame, uint16_t * port);
void addr_put_inet(struct sockaddr * addr, socklen_t * addr_len,
const ip_addr_t * ipaddr, int kame, uint16_t port);
int addr_get_link(const struct sockaddr * addr, socklen_t addr_len,
char * name, size_t name_max, uint8_t * hwaddr, size_t hwaddr_len);
void addr_put_link(struct sockaddr * addr, socklen_t * addr_len,
uint32_t ifindex, uint32_t type, const char * name,
const uint8_t * hwaddr, size_t hwaddr_len);
int addr_get_netmask(const struct sockaddr * addr, socklen_t addr_len,
uint8_t type, unsigned int * prefix, ip_addr_t * ipaddr);
void addr_make_netmask(uint8_t * addr, socklen_t addr_len,
unsigned int prefix);
void addr_put_netmask(struct sockaddr * addr, socklen_t * addr_len,
uint8_t type, unsigned int prefix);
void addr_normalize(ip_addr_t * dst, const ip_addr_t * src,
unsigned int prefix);
unsigned int addr_get_common_bits(const ip_addr_t * addr1,
const ip_addr_t * addr2, unsigned int max);
void addr_make_v4mapped_v6(ip_addr_t * dst, const ip4_addr_t * src);
#endif /* !MINIX_NET_LWIP_ADDR_H */

minix/net/lwip/addrpol.c Normal file (143 lines added)

@@ -0,0 +1,143 @@
/* LWIP service - addrpol.c - address policy table and values */
/*
* The main purpose of this module is to implement the address policy table
* described in RFC 6724. In general, the policy table is used for two
* purposes: source address selection, which is part of this service, and
* destination address selection, which is implemented in libc. NetBSD 7, the
* version that MINIX 3 is synced against at this moment, does not actually
* implement the libc part yet, though. That will change with NetBSD 8, where
* libc uses sysctl(7) to obtain the kernel's policy table, which itself can be
* changed with the new ip6addrctl(8) utility. Once we resync to NetBSD 8, we
* will also have to support this new functionality, and this module is where
* it would be implemented. Since NetBSD 7 is even lacking the necessary
* definitions, we cannot do that ahead of time, though. Thus, until then,
* this module is rather simple, as it only implements a static policy table
* used for source address selection. No changes beyond this module should be
* necessary, e.g. we are purposely not caching labels for local addresses.
*/
#include "lwip.h"
/*
* Address policy table. Currently hardcoded to the default of RFC 6724.
* Sorted by prefix length, so that the first match is always also the longest.
*/
static const struct {
ip_addr_t ipaddr;
unsigned int prefix;
int precedence;
int label;
} addrpol_table[] = {
{ IPADDR6_INIT_HOST(0, 0, 0, 1), 128, 50, 0 },
{ IPADDR6_INIT_HOST(0, 0, 0x0000ffffUL, 0), 96, 35, 4 },
{ IPADDR6_INIT_HOST(0, 0, 0, 0), 96, 1, 3 },
{ IPADDR6_INIT_HOST(0x20010000UL, 0, 0, 0), 32, 5, 5 },
{ IPADDR6_INIT_HOST(0x20020000UL, 0, 0, 0), 16, 30, 2 },
{ IPADDR6_INIT_HOST(0x3ffe0000UL, 0, 0, 0), 16, 1, 12 },
{ IPADDR6_INIT_HOST(0xfec00000UL, 0, 0, 0), 10, 1, 11 },
{ IPADDR6_INIT_HOST(0xfc000000UL, 0, 0, 0), 7, 3, 13 },
{ IPADDR6_INIT_HOST(0, 0, 0, 0), 0, 40, 1 }
};
/*
* Obtain the label value for the given IP address from the address policy
* table. Currently only IPv6 addresses may be given. This function is linear
* in number of address policy table entries, requiring a relatively expensive
* normalization operation for each entry, so it should not be called lightly.
* Its results should not be cached beyond local contexts either, because the
* policy table itself may be changed from userland (in the future).
*
* TODO: convert IPv4 addresses to IPv4-mapped IPv6 addresses.
* TODO: embed the interface index in link-local addresses.
*/
int
addrpol_get_label(const ip_addr_t * iporig)
{
ip_addr_t ipaddr;
unsigned int i;
assert(IP_IS_V6(iporig));
/*
* The policy table is sorted by prefix length such that the first
* match is also the one with the longest prefix, and as such the best.
*/
for (i = 0; i < __arraycount(addrpol_table); i++) {
addr_normalize(&ipaddr, iporig, addrpol_table[i].prefix);
if (ip_addr_cmp(&addrpol_table[i].ipaddr, &ipaddr))
return addrpol_table[i].label;
}
/*
* We cannot possibly get here with the default policy table, because
* the last entry will always match. It is not clear what we should
* return if there is no matching entry, though. For now, we return
* the default label value for the default (::/0) entry, which is 1.
*/
return 1;
}
/*
* Return an opaque positive value (possibly zero) that represents the scope of
* the given IP address. A larger value indicates a wider scope. The 'is_src'
* flag indicates whether the address is a source or a destination address,
* which affects the value returned for unknown addresses. A scope is a direct
* function of only the given address, so the result may be cached on a per-
* address basis without risking invalidation at any point in time.
*/
int
addrpol_get_scope(const ip_addr_t * ipaddr, int is_src)
{
const ip6_addr_t *ip6addr;
/*
* For now, all IPv4 addresses are considered global. This function is
* currently called only for IPv6 addresses anyway.
*/
if (IP_IS_V4(ipaddr))
return IP6_MULTICAST_SCOPE_GLOBAL;
assert(IP_IS_V6(ipaddr));
ip6addr = ip_2_ip6(ipaddr);
/*
* These are ordered not by ascending scope, but (roughly) by expected
* likeliness to match, for performance reasons.
*/
if (ip6_addr_isglobal(ip6addr))
return IP6_MULTICAST_SCOPE_GLOBAL;
if (ip6_addr_islinklocal(ip6addr) || ip6_addr_isloopback(ip6addr))
return IP6_MULTICAST_SCOPE_LINK_LOCAL;
/*
* We deliberately deviate from RFC 6724 Sec. 3.1 by considering
* Unique-Local Addresses (ULAs) to be of smaller scope than global
* addresses, to avoid that during source address selection, a
* preferred ULA is picked over a deprecated global address when given
* a global address as destination, as that would likely result in
* broken two-way communication.
*/
if (ip6_addr_isuniquelocal(ip6addr))
return IP6_MULTICAST_SCOPE_ORGANIZATION_LOCAL;
if (ip6_addr_ismulticast(ip6addr))
return ip6_addr_multicast_scope(ip6addr);
/* Site-local addresses are deprecated. */
if (ip6_addr_issitelocal(ip6addr))
return IP6_MULTICAST_SCOPE_SITE_LOCAL;
/*
* If the address is a source address, give it a scope beyond global to
* make sure that a "real" global address is picked first. If the
* address is a destination address, give it a global scope so as to
* pick "real" global addresses over unknown-scope source addresses.
*/
if (is_src)
return IP6_MULTICAST_SCOPE_RESERVEDF; /* greater than GLOBAL */
else
return IP6_MULTICAST_SCOPE_GLOBAL;
}
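
To make the table concrete with an illustrative lookup (a sketch,
assuming lwIP's IP_ADDR6_HOST initializer): a 6to4 address such as
2002:db8::1 normalizes to 2002:: under the 16-bit prefix of the
2002::/16 entry and yields label 2, whereas the IPv4-mapped
::ffff:192.0.2.1 matches the 96-bit ::ffff:0:0 entry and yields
label 4.

	ip_addr_t ip;

	IP_ADDR6_HOST(&ip, 0x20020db8UL, 0, 0, 1);
	assert(addrpol_get_label(&ip) == 2);	/* 2002::/16, 6to4 */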

minix/net/lwip/bpf_filter.c Normal file (561 lines added)

@@ -0,0 +1,561 @@
/* LWIP service - bpf_filter.c - Berkeley Packet Filter core implementation */
/*
* This is basically a drop-in replacement of NetBSD's bpf_filter.c, which
* itself can be compiled for either the NetBSD kernel or for userland. On
* MINIX 3, we would like to perform certain checks that NetBSD implements only
* for its kernel (e.g., memory store access validation) while replacing the
* NetBSD kernel specifics with our own (pbuf instead of mbuf, no BPF contexts
* for now, etc.). As a result, it is easier to reimplement the whole thing,
* because there is not all that much to it.
*
* Support for the standard BSD API allows us to run standard tests against
* this module from userland, where _MINIX_SYSTEM is not defined. MINIX 3
* specific extensions are enabled only if _MINIX_SYSTEM is defined.
*/
#include <string.h>
#include <limits.h>
#include <net/bpf.h>
#include <minix/bitmap.h>
#ifdef _MINIX_SYSTEM
#include "lwip.h"
/*
* Obtain an unsigned 32-bit value in network byte order from the pbuf chain
* 'pbuf' at offset 'k'. The given offset is guaranteed to be within bounds.
*/
static uint32_t
bpf_get32_ext(const struct pbuf * pbuf, uint32_t k)
{
uint32_t val;
unsigned int i;
/*
* Find the pbuf that contains the first byte. We expect that most
* filters will operate only on the headers of packets, so that we
* mostly avoid going through this O(n) loop. Since only the superuser
* can open BPF devices at all, we need not be worried about abuse in
this regard. If it turns out that this loop is particularly
CPU-intensive after all, we can probably improve it by caching the
* last visited pbuf, as read locality is likely high.
*/
while (k >= pbuf->len) {
k -= pbuf->len;
pbuf = pbuf->next;
assert(pbuf != NULL);
}
/*
* We assume that every pbuf has some data, but we make no assumptions
* about any minimum amount of data per pbuf. Therefore, we may have
* to take the bytes from anywhere between one and four pbufs.
* Hopefully the compiler will unroll this loop for us.
*/
val = (uint32_t)(((u_char *)pbuf->payload)[k]) << 24;
for (i = 0; i < 3; i++) {
if (k >= (uint32_t)pbuf->len - 1) {
k = 0;
pbuf = pbuf->next;
assert(pbuf != NULL);
} else
k++;
val = (val << 8) | (uint32_t)(((u_char *)pbuf->payload)[k]);
}
return val;
}
/*
* Obtain an unsigned 16-bit value in network byte order from the pbuf chain
* 'pbuf' at offset 'k'. The given offset is guaranteed to be within bounds.
*/
static uint32_t
bpf_get16_ext(const struct pbuf * pbuf, uint32_t k)
{
/* As above. */
while (k >= pbuf->len) {
k -= pbuf->len;
pbuf = pbuf->next;
assert(pbuf != NULL);
}
/*
* There are only two possible cases to cover here: either the two
* bytes are in the same pbuf, or they are in subsequent ones.
*/
if (k < (uint32_t)pbuf->len - 1) {
return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) |
(uint32_t)(((u_char *)pbuf->payload)[k + 1]);
} else {
assert(pbuf->next != NULL);
return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) |
(uint32_t)(((u_char *)pbuf->next->payload)[0]);
}
}
/*
* Obtain an unsigned 8-bit value from the pbuf chain 'pbuf' at offset 'k'.
* The given offset is guaranteed to be within bounds.
*/
static uint32_t
bpf_get8_ext(const struct pbuf * pbuf, uint32_t k)
{
/* As above. */
while (k >= pbuf->len) {
k -= pbuf->len;
pbuf = pbuf->next;
assert(pbuf != NULL);
}
return (uint32_t)(((u_char *)pbuf->payload)[k]);
}
#endif /* _MINIX_SYSTEM */
/*
* Execute a BPF filter program on (the first part of) a packet, and return the
* maximum size of the packet that should be delivered to the filter owner.
*
* The 'pc' parameter points to an array of BPF instructions that together form
* the filter program to be executed. If 'pc' is NULL, the packet is fully
* accepted. Otherwise, the given program MUST have passed a previous call to
* bpf_validate(). Not doing so will allow for arbitrary memory access.
*
* The 'packet' array contains up to the whole packet. The value of 'total'
* denotes the total length of the packet; 'len' contains the size of the array
* 'packet'. Chunked storage of the packet is not supported at this time.
*
* If executing the program succeeds, the return value is the maximum number of
* bytes from the packet to be delivered. The return value may exceed the full
* packet size. If the number of bytes returned is zero, the packet is to be
* ignored. If the program fails to execute properly and return a value, a
* value of zero is returned as well, thus also indicating that the packet
* should be ignored. This is intentional: it saves filter programs from
* having to perform explicit checks on the packet they are filtering.
*/
u_int
bpf_filter(const struct bpf_insn * pc, const u_char * packet, u_int total,
u_int len)
#ifdef _MINIX_SYSTEM
{
return bpf_filter_ext(pc, NULL /*pbuf*/, packet, total, len);
}
u_int
bpf_filter_ext(const struct bpf_insn * pc, const struct pbuf * pbuf,
const u_char * packet, u_int total, u_int len)
#endif /* _MINIX_SYSTEM */
{
uint32_t k, a, x, mem[BPF_MEMWORDS];
/* An empty program accepts all packets. */
if (pc == NULL)
return UINT_MAX;
/*
* We need not clear 'mem': the checker guarantees that each memory
* store word is always written before it is read.
*/
a = 0;
x = 0;
/* Execute the program. */
for (;; pc++) {
k = pc->k;
switch (pc->code) {
case BPF_LD+BPF_W+BPF_IND: /* A <- P[X+k:4] */
if (k + x < k)
return 0;
k += x;
/* FALLTHROUGH */
case BPF_LD+BPF_W+BPF_ABS: /* A <- P[k:4] */
/*
* 'k' may have any value, so check bounds in such a
* way that 'k' cannot possibly overflow and wrap.
*/
if (len >= 3 && k < len - 3)
a = ((uint32_t)packet[k] << 24) |
((uint32_t)packet[k + 1] << 16) |
((uint32_t)packet[k + 2] << 8) |
(uint32_t)packet[k + 3];
#ifdef _MINIX_SYSTEM
else if (total >= 3 && k < total - 3)
a = bpf_get32_ext(pbuf, k);
#endif /* _MINIX_SYSTEM */
else
return 0;
break;
case BPF_LD+BPF_H+BPF_IND: /* A <- P[X+k:2] */
if (k + x < k)
return 0;
k += x;
/* FALLTHROUGH */
case BPF_LD+BPF_H+BPF_ABS: /* A <- P[k:2] */
/* As above. */
if (len >= 1 && k < len - 1)
a = ((uint32_t)packet[k] << 8) |
(uint32_t)packet[k + 1];
#ifdef _MINIX_SYSTEM
else if (total >= 1 && k < total - 1)
a = bpf_get16_ext(pbuf, k);
#endif /* _MINIX_SYSTEM */
else
return 0;
break;
case BPF_LD+BPF_B+BPF_IND: /* A <- P[X+k:1] */
if (k + x < k)
return 0;
k += x;
/* FALLTHROUGH */
case BPF_LD+BPF_B+BPF_ABS: /* A <- P[k:1] */
if (k < len)
a = (uint32_t)packet[k];
#ifdef _MINIX_SYSTEM
else if (k < total)
a = bpf_get8_ext(pbuf, k);
#endif /* _MINIX_SYSTEM */
else
return 0;
break;
case BPF_LD+BPF_W+BPF_LEN: /* A <- len */
a = total;
break;
case BPF_LD+BPF_IMM: /* A <- k */
a = k;
break;
case BPF_LD+BPF_MEM: /* A <- M[k] */
a = mem[k];
break;
case BPF_LDX+BPF_IMM: /* X <- k */
x = k;
break;
case BPF_LDX+BPF_MEM: /* X <- M[k] */
x = mem[k];
break;
case BPF_LDX+BPF_LEN: /* X <- len */
x = total;
break;
case BPF_LDX+BPF_B+BPF_MSH: /* X <- 4*(P[k:1]&0xf) */
if (k < len)
x = ((uint32_t)packet[k] & 0xf) << 2;
#ifdef _MINIX_SYSTEM
else if (k < total)
x = (bpf_get8_ext(pbuf, k) & 0xf) << 2;
#endif /* _MINIX_SYSTEM */
else
return 0;
break;
case BPF_ST: /* M[k] <- A */
mem[k] = a;
break;
case BPF_STX: /* M[k] <- X */
mem[k] = x;
break;
case BPF_ALU+BPF_ADD+BPF_K: /* A <- A + k */
a += k;
break;
case BPF_ALU+BPF_SUB+BPF_K: /* A <- A - k */
a -= k;
break;
case BPF_ALU+BPF_MUL+BPF_K: /* A <- A * k */
a *= k;
break;
case BPF_ALU+BPF_DIV+BPF_K: /* A <- A / k */
a /= k;
break;
case BPF_ALU+BPF_MOD+BPF_K: /* A <- A % k */
a %= k;
break;
case BPF_ALU+BPF_AND+BPF_K: /* A <- A & k */
a &= k;
break;
case BPF_ALU+BPF_OR+BPF_K: /* A <- A | k */
a |= k;
break;
case BPF_ALU+BPF_XOR+BPF_K: /* A <- A ^ k */
a ^= k;
break;
case BPF_ALU+BPF_LSH+BPF_K: /* A <- A << k */
a <<= k;
break;
case BPF_ALU+BPF_RSH+BPF_K: /* A <- A >> k */
a >>= k;
break;
case BPF_ALU+BPF_ADD+BPF_X: /* A <- A + X */
a += x;
break;
case BPF_ALU+BPF_SUB+BPF_X: /* A <- A - X */
a -= x;
break;
case BPF_ALU+BPF_MUL+BPF_X: /* A <- A * X */
a *= x;
break;
case BPF_ALU+BPF_DIV+BPF_X: /* A <- A / X */
if (x == 0)
return 0;
a /= x;
break;
case BPF_ALU+BPF_MOD+BPF_X: /* A <- A % X */
if (x == 0)
return 0;
a %= x;
break;
case BPF_ALU+BPF_AND+BPF_X: /* A <- A & X */
a &= x;
break;
case BPF_ALU+BPF_OR+BPF_X: /* A <- A | X */
a |= x;
break;
case BPF_ALU+BPF_XOR+BPF_X: /* A <- A ^ X */
a ^= x;
break;
case BPF_ALU+BPF_LSH+BPF_X: /* A <- A << X */
if (x >= 32)
return 0;
a <<= x;
break;
case BPF_ALU+BPF_RSH+BPF_X: /* A <- A >> X */
if (x >= 32)
return 0;
a >>= x;
break;
case BPF_ALU+BPF_NEG: /* A <- -A */
a = -a;
break;
case BPF_JMP+BPF_JA: /* pc += k */
pc += k;
break;
case BPF_JMP+BPF_JGT+BPF_K: /* pc += (A > k) ? jt : jf */
pc += (a > k) ? pc->jt : pc->jf;
break;
case BPF_JMP+BPF_JGE+BPF_K: /* pc += (A >= k) ? jt : jf */
pc += (a >= k) ? pc->jt : pc->jf;
break;
case BPF_JMP+BPF_JEQ+BPF_K: /* pc += (A == k) ? jt : jf */
pc += (a == k) ? pc->jt : pc->jf;
break;
case BPF_JMP+BPF_JSET+BPF_K: /* pc += (A & k) ? jt : jf */
pc += (a & k) ? pc->jt : pc->jf;
break;
case BPF_JMP+BPF_JGT+BPF_X: /* pc += (A > X) ? jt : jf */
pc += (a > x) ? pc->jt : pc->jf;
break;
case BPF_JMP+BPF_JGE+BPF_X: /* pc += (A >= X) ? jt : jf */
pc += (a >= x) ? pc->jt : pc->jf;
break;
case BPF_JMP+BPF_JEQ+BPF_X: /* pc += (A == X) ? jt : jf */
pc += (a == x) ? pc->jt : pc->jf;
break;
case BPF_JMP+BPF_JSET+BPF_X: /* pc += (A & X) ? jt : jf */
pc += (a & x) ? pc->jt : pc->jf;
break;
case BPF_RET+BPF_A: /* accept A bytes */
return a;
case BPF_RET+BPF_K: /* accept K bytes */
return k;
case BPF_MISC+BPF_TAX: /* X <- A */
x = a;
break;
case BPF_MISC+BPF_TXA: /* A <- X */
a = x;
break;
default: /* unknown instruction */
return 0;
}
}
/* NOTREACHED */
}
/*
* In order to avoid having to perform explicit memory allocation, we store
* some validation state on the stack, using data types that are as small as
* possible for the current definitions. The data types, and in fact the whole
* assumption that we can store the state on the stack, may need to be revised
* if certain constants are increased in the future. As of writing, the
* validation routine uses a little over 1KB of stack memory.
*/
#if BPF_MEMWORDS <= 16 /* value as of writing: 16 */
typedef uint16_t meminv_t;
#else
#error "increased BPF_MEMWORDS may require code revision"
#endif
#if BPF_MAXINSNS > 2048 /* value as of writing: 512 */
#error "increased BPF_MAXINSNS may require code revision"
#endif
/*
* Verify that the given filter program is safe to execute, by performing as
* many static validity checks as possible. The program is given as 'insns',
* which must be an array of 'ninsns' BPF instructions. Unlike bpf_filter(),
* this function does not accept empty filter programs. The function returns 1
* if the program was successfully validated, or 0 if the program should not be
* accepted.
*/
int
bpf_validate(const struct bpf_insn * insns, int ninsns)
{
bitchunk_t reachable[BITMAP_CHUNKS(BPF_MAXINSNS)];
meminv_t invalid, meminv[BPF_MAXINSNS];
const struct bpf_insn *insn;
u_int pc, count, target;
int advance;
if (insns == NULL || ninsns <= 0 || ninsns > BPF_MAXINSNS)
return 0;
count = (u_int)ninsns;
memset(reachable, 0, sizeof(reachable[0]) * BITMAP_CHUNKS(count));
memset(meminv, 0, sizeof(meminv[0]) * count);
SET_BIT(reachable, 0);
meminv[0] = (meminv_t)~0;
for (pc = 0; pc < count; pc++) {
/* We completely ignore instructions that are not reachable. */
if (!GET_BIT(reachable, pc))
continue;
invalid = meminv[pc];
advance = 1;
insn = &insns[pc];
switch (insn->code) {
case BPF_LD+BPF_W+BPF_ABS:
case BPF_LD+BPF_H+BPF_ABS:
case BPF_LD+BPF_B+BPF_ABS:
case BPF_LD+BPF_W+BPF_IND:
case BPF_LD+BPF_H+BPF_IND:
case BPF_LD+BPF_B+BPF_IND:
case BPF_LD+BPF_LEN:
case BPF_LD+BPF_IMM:
case BPF_LDX+BPF_IMM:
case BPF_LDX+BPF_LEN:
case BPF_LDX+BPF_B+BPF_MSH:
case BPF_ALU+BPF_ADD+BPF_K:
case BPF_ALU+BPF_SUB+BPF_K:
case BPF_ALU+BPF_MUL+BPF_K:
case BPF_ALU+BPF_AND+BPF_K:
case BPF_ALU+BPF_OR+BPF_K:
case BPF_ALU+BPF_XOR+BPF_K:
case BPF_ALU+BPF_ADD+BPF_X:
case BPF_ALU+BPF_SUB+BPF_X:
case BPF_ALU+BPF_MUL+BPF_X:
case BPF_ALU+BPF_DIV+BPF_X:
case BPF_ALU+BPF_MOD+BPF_X:
case BPF_ALU+BPF_AND+BPF_X:
case BPF_ALU+BPF_OR+BPF_X:
case BPF_ALU+BPF_XOR+BPF_X:
case BPF_ALU+BPF_LSH+BPF_X:
case BPF_ALU+BPF_RSH+BPF_X:
case BPF_ALU+BPF_NEG:
case BPF_MISC+BPF_TAX:
case BPF_MISC+BPF_TXA:
/* Nothing we can check for these. */
break;
case BPF_ALU+BPF_DIV+BPF_K:
case BPF_ALU+BPF_MOD+BPF_K:
/* No division by zero. */
if (insn->k == 0)
return 0;
break;
case BPF_ALU+BPF_LSH+BPF_K:
case BPF_ALU+BPF_RSH+BPF_K:
/* Do not invoke undefined behavior. */
if (insn->k >= 32)
return 0;
break;
case BPF_LD+BPF_MEM:
case BPF_LDX+BPF_MEM:
/*
* Only allow loading words that have been stored in
* all execution paths leading up to this instruction.
*/
if (insn->k >= BPF_MEMWORDS ||
(invalid & (1 << insn->k)))
return 0;
break;
case BPF_ST:
case BPF_STX:
if (insn->k >= BPF_MEMWORDS)
return 0;
invalid &= ~(1 << insn->k);
break;
case BPF_JMP+BPF_JA:
/*
* Make sure that the target instruction of the jump is
* still part of the program, and mark it as reachable.
*/
if (insn->k >= count - pc - 1)
return 0;
target = pc + insn->k + 1;
SET_BIT(reachable, target);
meminv[target] |= invalid;
advance = 0;
break;
case BPF_JMP+BPF_JGT+BPF_K:
case BPF_JMP+BPF_JGE+BPF_K:
case BPF_JMP+BPF_JEQ+BPF_K:
case BPF_JMP+BPF_JSET+BPF_K:
case BPF_JMP+BPF_JGT+BPF_X:
case BPF_JMP+BPF_JGE+BPF_X:
case BPF_JMP+BPF_JEQ+BPF_X:
case BPF_JMP+BPF_JSET+BPF_X:
/*
* Make sure that both target instructions are still
* part of the program, and mark both as reachable.
* There is no chance that the additions will overflow.
*/
target = pc + insn->jt + 1;
if (target >= count)
return 0;
SET_BIT(reachable, target);
meminv[target] |= invalid;
target = pc + insn->jf + 1;
if (target >= count)
return 0;
SET_BIT(reachable, target);
meminv[target] |= invalid;
advance = 0;
break;
case BPF_RET+BPF_A:
case BPF_RET+BPF_K:
advance = 0;
break;
default:
return 0;
}
/*
* After most instructions, we simply advance to the next. For
* one thing, this means that there must be a next instruction
* at all.
*/
if (advance) {
if (pc + 1 == count)
return 0;
SET_BIT(reachable, pc + 1);
meminv[pc + 1] |= invalid;
}
}
/* The program has passed all our basic tests. */
return 1;
}
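/*
 * As an illustration of the checks above, the following sketch (not part
 * of this file) validates a small filter that accepts only IPv4 packets
 * on an ethernet link.  BPF_STMT and BPF_JUMP are the standard helper
 * macros from <net/bpf.h>, and offset 12 is the ethertype field.
 */
static int
example_validate(void)
{
	struct bpf_insn prog[] = {
		BPF_STMT(BPF_LD+BPF_H+BPF_ABS, 12),	/* load ethertype */
		BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, 0x0800, 0, 1),
		BPF_STMT(BPF_RET+BPF_K, (u_int)-1),	/* IPv4: accept */
		BPF_STMT(BPF_RET+BPF_K, 0),		/* other: reject */
	};

	return bpf_validate(prog, (int)(sizeof(prog) / sizeof(prog[0])));
}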

1365
minix/net/lwip/bpfdev.c Normal file

File diff suppressed because it is too large

18
minix/net/lwip/bpfdev.h Normal file
View File

@ -0,0 +1,18 @@
#ifndef MINIX_NET_LWIP_BPFDEV_H
#define MINIX_NET_LWIP_BPFDEV_H
/*
* BPF link structure, used to abstract away the details of the BPF structure
* from other modules.
*/
struct bpfdev_link {
TAILQ_ENTRY(bpfdev_link) bpfl_next;
};
void bpfdev_init(void);
void bpfdev_process(message * m_ptr, int ipc_status);
void bpfdev_detach(struct bpfdev_link * bpf);
void bpfdev_input(struct bpfdev_link * bpf, const struct pbuf * pbuf);
void bpfdev_output(struct bpfdev_link * bpf, const struct pbuf * pbuf);
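/*
 * Illustrative sketch (hypothetical names; ifdev.c is the real user of
 * this interface): each network interface keeps a tail queue of attached
 * BPF devices through the link structure above, and feeds every captured
 * packet to all of them.  Assumes <sys/queue.h> and lwIP's pbuf type.
 */
struct example_ifdev {
	TAILQ_HEAD(, bpfdev_link) eif_bpf;	/* attached BPF devices */
};

static void
example_capture(struct example_ifdev * eifdev, const struct pbuf * pbuf)
{
	struct bpfdev_link *bpfl;

	TAILQ_FOREACH(bpfl, &eifdev->eif_bpf, bpfl_next)
		bpfdev_input(bpfl, pbuf);
}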
#endif /* !MINIX_NET_LWIP_BPFDEV_H */

1718
minix/net/lwip/ethif.c Normal file

File diff suppressed because it is too large

24
minix/net/lwip/ethif.h Normal file
View File

@ -0,0 +1,24 @@
#ifndef MINIX_NET_LWIP_ETHIF_H
#define MINIX_NET_LWIP_ETHIF_H
#include "ndev.h"
struct ethif;
void ethif_init(void);
struct ethif *ethif_add(ndev_id_t id, const char * name, uint32_t caps);
int ethif_enable(struct ethif * ethif, const char * name,
const struct ndev_hwaddr * hwaddr, uint8_t hwaddr_len, uint32_t caps,
uint32_t link, uint32_t media);
void ethif_disable(struct ethif * ethif);
void ethif_remove(struct ethif * ethif);
void ethif_configured(struct ethif * ethif, int32_t result);
void ethif_sent(struct ethif * ethif, int32_t result);
void ethif_received(struct ethif * ethif, int32_t result);
void ethif_status(struct ethif * ethif, uint32_t link, uint32_t media,
uint32_t oerror, uint32_t coll, uint32_t ierror, uint32_t iqdrop);
#endif /* !MINIX_NET_LWIP_ETHIF_H */

2224
minix/net/lwip/ifaddr.c Normal file

File diff suppressed because it is too large

70
minix/net/lwip/ifaddr.h Normal file
View File

@ -0,0 +1,70 @@
#ifndef MINIX_NET_LWIP_IFADDR_H
#define MINIX_NET_LWIP_IFADDR_H
/* Possible values of ifdev_v6flags[] elements. */
#define IFADDR_V6F_AUTOCONF 0x01 /* autoconfigured address, no subnet */
#define IFADDR_V6F_TEMPORARY 0x02 /* temporary (privacy) address */
#define IFADDR_V6F_HWBASED 0x04 /* auto-derived from MAC address */
typedef int ifaddr_v4_num_t; /* interface IPv4 address number */
typedef int ifaddr_v6_num_t; /* interface IPv6 address number */
typedef int ifaddr_dl_num_t; /* interface link address number */
extern int ifaddr_auto_linklocal;
extern int ifaddr_accept_rtadv;
void ifaddr_init(struct ifdev * ifdev);
int ifaddr_v4_find(struct ifdev * ifdev, const struct sockaddr_in * addr,
ifaddr_v4_num_t * num);
int ifaddr_v4_enum(struct ifdev * ifdev, ifaddr_v4_num_t * num);
int ifaddr_v4_get(struct ifdev * ifdev, ifaddr_v4_num_t num,
struct sockaddr_in * addr, struct sockaddr_in * mask,
struct sockaddr_in * bcast, struct sockaddr_in * dest);
int ifaddr_v4_get_flags(struct ifdev * ifdev, ifaddr_v4_num_t num);
int ifaddr_v4_add(struct ifdev * ifdev, const struct sockaddr_in * addr,
const struct sockaddr_in * mask, const struct sockaddr_in * bcast,
const struct sockaddr_in * dest, int flags);
void ifaddr_v4_del(struct ifdev * ifdev, ifaddr_v4_num_t num);
void ifaddr_v4_clear(struct ifdev * ifdev);
struct ifdev *ifaddr_v4_map_by_addr(const ip4_addr_t * ip4addr);
int ifaddr_v6_find(struct ifdev * ifdev, const struct sockaddr_in6 * addr6,
ifaddr_v6_num_t * num);
int ifaddr_v6_enum(struct ifdev * ifdev, ifaddr_v6_num_t * num);
void ifaddr_v6_get(struct ifdev * ifdev, ifaddr_v6_num_t num,
struct sockaddr_in6 * addr6, struct sockaddr_in6 * mask6,
struct sockaddr_in6 * dest6);
int ifaddr_v6_get_flags(struct ifdev * ifdev, ifaddr_v6_num_t num);
void ifaddr_v6_get_lifetime(struct ifdev * ifdev, ifaddr_v6_num_t num,
struct in6_addrlifetime * lifetime);
int ifaddr_v6_add(struct ifdev * ifdev, const struct sockaddr_in6 * addr6,
const struct sockaddr_in6 * mask6, const struct sockaddr_in6 * dest6,
int flags, const struct in6_addrlifetime * lifetime);
void ifaddr_v6_del(struct ifdev * ifdev, ifaddr_v6_num_t num);
void ifaddr_v6_clear(struct ifdev * ifdev);
void ifaddr_v6_check(struct ifdev * ifdev);
void ifaddr_v6_set_up(struct ifdev * ifdev);
void ifaddr_v6_set_linklocal(struct ifdev * ifdev);
struct ifdev *ifaddr_v6_map_by_addr(const ip6_addr_t * ip6addr);
struct ifdev *ifaddr_map_by_addr(const ip_addr_t * ipaddr);
struct ifdev *ifaddr_map_by_subnet(const ip_addr_t * ipaddr);
const ip_addr_t *ifaddr_select(const ip_addr_t * dst_addr,
struct ifdev * ifdev, struct ifdev ** ifdevp);
int ifaddr_is_zone_mismatch(const ip6_addr_t * ipaddr, struct ifdev * ifdev);
int ifaddr_dl_find(struct ifdev * ifdev, const struct sockaddr_dlx * addr,
socklen_t addr_len, ifaddr_dl_num_t * num);
int ifaddr_dl_enum(struct ifdev * ifdev, ifaddr_dl_num_t * num);
void ifaddr_dl_get(struct ifdev * ifdev, ifaddr_dl_num_t num,
struct sockaddr_dlx * addr);
int ifaddr_dl_get_flags(struct ifdev * ifdev, ifaddr_dl_num_t num);
int ifaddr_dl_add(struct ifdev * ifdev, const struct sockaddr_dlx * addr,
socklen_t addr_len, int flags);
int ifaddr_dl_del(struct ifdev * ifdev, ifaddr_dl_num_t num);
void ifaddr_dl_clear(struct ifdev * ifdev);
void ifaddr_dl_update(struct ifdev * ifdev, const uint8_t * hwaddr,
int is_factory);
#endif /* !MINIX_NET_LWIP_IFADDR_H */

930
minix/net/lwip/ifconf.c Normal file
View File

@ -0,0 +1,930 @@
/* LWIP service - ifconf.c - interface configuration */
#include "lwip.h"
#include "ifaddr.h"
#include "lldata.h"
#include <net/if_media.h>
#include <minix/if.h>
#define LOOPBACK_IFNAME "lo0" /* name of the loopback interface */
/*
* Initialize the first loopback device, which is present by default.
*/
void
ifconf_init(void)
{
const struct sockaddr_in addr = {
.sin_family = AF_INET,
.sin_addr = { htonl(INADDR_LOOPBACK) }
};
struct sockaddr_in6 ll_addr6 = {
.sin6_family = AF_INET6,
};
const struct sockaddr_in6 lo_addr6 = {
.sin6_family = AF_INET6,
.sin6_addr = IN6ADDR_LOOPBACK_INIT
};
const struct in6_addrlifetime lifetime = {
.ia6t_vltime = ND6_INFINITE_LIFETIME,
.ia6t_pltime = ND6_INFINITE_LIFETIME
};
struct sockaddr_in6 mask6;
struct ifdev *ifdev;
socklen_t addr_len;
int r;
if ((r = ifdev_create(LOOPBACK_IFNAME)) != OK)
panic("unable to create loopback interface: %d", r);
if ((ifdev = ifdev_find_by_name(LOOPBACK_IFNAME)) == NULL)
panic("unable to find loopback interface");
if ((r = ifaddr_v4_add(ifdev, &addr, NULL, NULL, NULL, 0)) != OK)
panic("unable to set IPv4 address on loopback interface: %d",
r);
addr_len = sizeof(mask6);
addr_put_netmask((struct sockaddr *)&mask6, &addr_len, IPADDR_TYPE_V6,
64 /*prefix*/);
ll_addr6.sin6_addr.s6_addr[0] = 0xfe;
ll_addr6.sin6_addr.s6_addr[1] = 0x80;
ll_addr6.sin6_addr.s6_addr[15] = ifdev_get_index(ifdev);
if ((r = ifaddr_v6_add(ifdev, &ll_addr6, &mask6, NULL, 0,
&lifetime)) != OK)
panic("unable to set IPv6 address on loopback interface: %d",
r);
addr_len = sizeof(mask6);
addr_put_netmask((struct sockaddr *)&mask6, &addr_len, IPADDR_TYPE_V6,
128 /*prefix*/);
if ((r = ifaddr_v6_add(ifdev, &lo_addr6, &mask6, NULL, 0,
&lifetime)) != OK)
panic("unable to set IPv6 address on loopback interface: %d",
r);
if ((r = ifdev_set_ifflags(ifdev, IFF_UP)) != OK)
panic("unable to bring up loopback interface");
}
/*
* Process an address family independent IOCTL request with an "ifreq"
* structure.
*/
static int
ifconf_ioctl_ifreq(unsigned long request, const struct sockdriver_data * data)
{
struct ifdev *ifdev;
struct ifreq ifr;
int r;
if ((r = sockdriver_copyin(data, 0, &ifr, sizeof(ifr))) != OK)
return r;
if (request != SIOCIFCREATE) {
ifr.ifr_name[sizeof(ifr.ifr_name) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(ifr.ifr_name)) == NULL)
return ENXIO;
} else
ifdev = NULL;
switch (request) {
case SIOCGIFFLAGS:
ifr.ifr_flags = ifdev_get_ifflags(ifdev);
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
case SIOCSIFFLAGS:
/*
* Unfortunately, ifr_flags is a signed integer and the sign
* bit is in fact used as a flag, so without explicit casting
* we end up setting all upper bits of the (full) integer. If
* NetBSD ever extends the field, this assert should trigger..
*/
assert(sizeof(ifr.ifr_flags) == sizeof(short));
return ifdev_set_ifflags(ifdev, (unsigned short)ifr.ifr_flags);
case SIOCGIFMETRIC:
ifr.ifr_metric = ifdev_get_metric(ifdev);
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
case SIOCSIFMETRIC:
/* The metric is not used within the operating system. */
ifdev_set_metric(ifdev, ifr.ifr_metric);
return OK;
case SIOCSIFMEDIA:
return ifdev_set_ifmedia(ifdev, ifr.ifr_media);
case SIOCGIFMTU:
ifr.ifr_mtu = ifdev_get_mtu(ifdev);
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
case SIOCSIFMTU:
return ifdev_set_mtu(ifdev, ifr.ifr_mtu);
case SIOCIFCREATE:
if (memchr(ifr.ifr_name, '\0', sizeof(ifr.ifr_name)) == NULL)
return EINVAL;
return ifdev_create(ifr.ifr_name);
case SIOCIFDESTROY:
return ifdev_destroy(ifdev);
case SIOCGIFDLT:
ifr.ifr_dlt = ifdev_get_dlt(ifdev);
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
case SIOCGIFINDEX:
ifr.ifr_index = ifdev_get_index(ifdev);
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
default:
return ENOTTY;
}
}
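/*
 * For illustration, this is roughly how ifconfig(8)-style userland code
 * reaches the SIOCGIFFLAGS case above (sketch only; assumes
 * <sys/ioctl.h>, <net/if.h>, and <string.h>).  The cast mirrors the
 * sign-extension issue documented in that case.
 */
static int
example_get_ifflags(int sock, const char * name, unsigned short * flagsp)
{
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, name, sizeof(ifr.ifr_name));
	if (ioctl(sock, SIOCGIFFLAGS, &ifr) != 0)
		return -1;
	*flagsp = (unsigned short)ifr.ifr_flags;
	return 0;
}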
/*
* Process an address family independent IOCTL request with an "ifcapreq"
* structure.
*/
static int
ifconf_ioctl_ifcap(unsigned long request,
const struct sockdriver_data * data)
{
struct ifdev *ifdev;
struct ifcapreq ifcr;
int r;
if ((r = sockdriver_copyin(data, 0, &ifcr, sizeof(ifcr))) != OK)
return r;
ifcr.ifcr_name[sizeof(ifcr.ifcr_name) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(ifcr.ifcr_name)) == NULL)
return ENXIO;
switch (request) {
case SIOCSIFCAP:
return ifdev_set_ifcap(ifdev, ifcr.ifcr_capenable);
case SIOCGIFCAP:
ifdev_get_ifcap(ifdev, &ifcr.ifcr_capabilities,
&ifcr.ifcr_capenable);
return sockdriver_copyout(data, 0, &ifcr, sizeof(ifcr));
default:
return ENOTTY;
}
}
/*
* Process an address family independent IOCTL request with an "ifmediareq"
* structure.
*/
static int
ifconf_ioctl_ifmedia(unsigned long request,
const struct sockdriver_data * data)
{
struct ifdev *ifdev;
struct ifmediareq ifm;
int r;
if ((r = sockdriver_copyin(data, 0, &ifm, sizeof(ifm))) != OK)
return r;
ifm.ifm_name[sizeof(ifm.ifm_name) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(ifm.ifm_name)) == NULL)
return ENXIO;
switch (request) {
case MINIX_SIOCGIFMEDIA:
if ((r = ifdev_get_ifmedia(ifdev, &ifm.ifm_current,
&ifm.ifm_active)) != OK)
return r;
ifm.ifm_mask = 0;
switch (ifdev_get_link(ifdev)) {
case LINK_STATE_UP:
ifm.ifm_status = IFM_AVALID | IFM_ACTIVE;
break;
case LINK_STATE_DOWN:
ifm.ifm_status = IFM_AVALID;
break;
default:
ifm.ifm_status = 0;
break;
}
/*
* TODO: support for the list of supported media types. This
* one is not easy, because we cannot simply suspend the IOCTL
* and query the driver. For now, return only one entry (which is
* the minimum for ifconfig(8) not to complain), namely the
* currently selected one.
*/
if (ifm.ifm_ulist != NULL) {
if (ifm.ifm_count < 1)
return ENOMEM;
/*
* Copy out the 'list', which consists of one entry.
* If we were to produce multiple entries, we would
* have to check against the MINIX_IF_MAXMEDIA limit.
*/
if ((r = sockdriver_copyout(data,
offsetof(struct minix_ifmediareq, mifm_list),
&ifm.ifm_current, sizeof(ifm.ifm_current))) != OK)
return r;
}
ifm.ifm_count = 1;
return sockdriver_copyout(data, 0, &ifm, sizeof(ifm));
default:
return ENOTTY;
}
}
/*
* Process an address family independent IOCTL request with an "if_clonereq"
* structure.
*/
static int
ifconf_ioctl_ifclone(unsigned long request,
const struct sockdriver_data * data)
{
struct if_clonereq ifcr;
const char *ptr;
char name[IFNAMSIZ];
size_t off;
unsigned int num;
int r;
if ((r = sockdriver_copyin(data, 0, &ifcr, sizeof(ifcr))) != OK)
return r;
if (ifcr.ifcr_count < 0)
return EINVAL;
off = offsetof(struct minix_if_clonereq, mifcr_buffer);
for (num = 0; (ptr = ifdev_enum_vtypes(num)) != NULL; num++) {
/* Prevent overflow in case we ever have over 128 vtypes.. */
if (num == MINIX_IF_MAXCLONERS)
break;
if (ifcr.ifcr_buffer == NULL ||
num >= (unsigned int)ifcr.ifcr_count)
continue;
memset(name, 0, sizeof(name));
strlcpy(name, ptr, sizeof(name));
if ((r = sockdriver_copyout(data, off, name,
sizeof(name))) != OK)
return r;
off += sizeof(name);
}
ifcr.ifcr_total = num;
return sockdriver_copyout(data, 0, &ifcr, sizeof(ifcr));
}
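/*
 * Illustrative userland sketch of the two-phase protocol served above, as
 * ifconfig(8) -C uses it: the first call obtains the total count, the
 * second retrieves the names.  Assumes that SIOCIFGCLONERS maps onto the
 * MINIX_SIOCIFGCLONERS case handled by this service, and that
 * <sys/ioctl.h>, <net/if.h>, <stdio.h>, <stdlib.h>, and <string.h> are
 * available.
 */
static void
example_list_cloners(int sock)
{
	struct if_clonereq ifcr;
	char *buf;
	int i;

	memset(&ifcr, 0, sizeof(ifcr));
	if (ioctl(sock, SIOCIFGCLONERS, &ifcr) != 0)
		return;				/* first call: count only */
	ifcr.ifcr_count = ifcr.ifcr_total;
	if ((buf = calloc(ifcr.ifcr_count, IFNAMSIZ)) == NULL)
		return;
	ifcr.ifcr_buffer = buf;
	if (ioctl(sock, SIOCIFGCLONERS, &ifcr) == 0)	/* second: names */
		for (i = 0; i < ifcr.ifcr_count && i < ifcr.ifcr_total; i++)
			printf("%s\n", buf + i * IFNAMSIZ);
	free(buf);
}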
/*
* Process an address family independent IOCTL request with an "if_addrprefreq"
* structure.
*/
static int
ifconf_ioctl_ifaddrpref(unsigned long request,
const struct sockdriver_data * data)
{
struct ifdev *ifdev;
struct if_addrprefreq ifap;
int r;
if ((r = sockdriver_copyin(data, 0, &ifap, sizeof(ifap))) != OK)
return r;
ifap.ifap_name[sizeof(ifap.ifap_name) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(ifap.ifap_name)) == NULL)
return ENXIO;
/*
* For now, we simply support only a preference of 0. We do not try to
* look up the given address, nor do we return the looked up address.
*/
switch (request) {
case SIOCSIFADDRPREF:
if (ifap.ifap_preference != 0)
return EINVAL;
return OK;
case SIOCGIFADDRPREF:
ifap.ifap_preference = 0;
return sockdriver_copyout(data, 0, &ifap, sizeof(ifap));
default:
return ENOTTY;
}
}
/*
* Process an IOCTL request for AF_INET with an "ifreq" structure.
*/
static int
ifconf_ioctl_v4_ifreq(unsigned long request,
const struct sockdriver_data * data)
{
struct sockaddr_in addr, mask, bcast, dest, *sin = NULL /*gcc*/;
struct ifdev *ifdev;
struct ifreq ifr;
ifaddr_v4_num_t num;
int r, flags;
if ((r = sockdriver_copyin(data, 0, &ifr, sizeof(ifr))) != OK)
return r;
ifr.ifr_name[sizeof(ifr.ifr_name) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(ifr.ifr_name)) == NULL)
return ENXIO;
switch (request) {
case SIOCGIFADDR:
case SIOCGIFNETMASK:
case SIOCGIFBRDADDR:
case SIOCGIFDSTADDR:
/* Retrieve all addresses, then copy out the desired one. */
switch (request) {
case SIOCGIFADDR: sin = &addr; break;
case SIOCGIFNETMASK: sin = &mask; break;
case SIOCGIFBRDADDR: sin = &bcast; break;
case SIOCGIFDSTADDR: sin = &dest; break;
}
sin->sin_len = 0;
if ((r = ifaddr_v4_get(ifdev, (ifaddr_v4_num_t)0, &addr, &mask,
&bcast, &dest)) != OK)
return r;
if (sin->sin_len == 0) /* not filled in */
return EADDRNOTAVAIL;
memcpy(&ifr.ifr_addr, sin, sizeof(*sin));
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
case SIOCGIFAFLAG_IN:
if ((r = ifaddr_v4_find(ifdev,
(struct sockaddr_in *)&ifr.ifr_addr, &num)) != OK)
return r;
ifr.ifr_addrflags = ifaddr_v4_get_flags(ifdev, num);
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
case SIOCSIFADDR:
/*
* This one is slightly different from the rest, in that we
* either set or update the primary address: if we set it, we
* must let _add() generate a matching netmask automatically,
* while if we update it, _add() would fail unless we first
* delete the old entry.
*/
sin = (struct sockaddr_in *)&ifr.ifr_addr;
if ((r = ifaddr_v4_get(ifdev, (ifaddr_v4_num_t)0, &addr, &mask,
&bcast, &dest)) == OK) {
flags = ifaddr_v4_get_flags(ifdev, (ifaddr_v4_num_t)0);
ifaddr_v4_del(ifdev, (ifaddr_v4_num_t)0);
/*
* If setting the new address fails, reinstating the
* old address should always work. This is really ugly
* as it generates routing socket noise, but this call
* is deprecated anyway.
*/
if ((r = ifaddr_v4_add(ifdev, sin, &mask, &bcast,
&dest, 0 /*flags*/)) != OK)
(void)ifaddr_v4_add(ifdev, &addr, &mask,
&bcast, &dest, flags);
return r;
} else
return ifaddr_v4_add(ifdev, sin, NULL /*mask*/,
NULL /*bcast*/, NULL /*dest*/, 0 /*flags*/);
case SIOCSIFNETMASK:
case SIOCSIFBRDADDR:
case SIOCSIFDSTADDR:
/* These calls only update the existing primary address. */
if ((r = ifaddr_v4_get(ifdev, (ifaddr_v4_num_t)0, &addr, &mask,
&bcast, &dest)) != OK)
return r;
sin = (struct sockaddr_in *)&ifr.ifr_addr;
switch (request) {
case SIOCSIFNETMASK: memcpy(&mask, sin, sizeof(mask)); break;
case SIOCSIFBRDADDR: memcpy(&bcast, sin, sizeof(bcast)); break;
case SIOCSIFDSTADDR: memcpy(&dest, sin, sizeof(dest)); break;
}
return ifaddr_v4_add(ifdev, &addr, &mask, &bcast, &dest,
ifaddr_v4_get_flags(ifdev, (ifaddr_v4_num_t)0));
case SIOCDIFADDR:
if ((r = ifaddr_v4_find(ifdev,
(struct sockaddr_in *)&ifr.ifr_addr, &num)) != OK)
return r;
ifaddr_v4_del(ifdev, num);
return OK;
default:
return ENOTTY;
}
}
/*
* Process an IOCTL request for AF_INET with an "ifaliasreq" structure.
*/
static int
ifconf_ioctl_v4_ifalias(unsigned long request,
const struct sockdriver_data * data)
{
struct ifdev *ifdev;
struct ifaliasreq ifra;
struct sockaddr_in dest;
ifaddr_v4_num_t num;
int r;
if ((r = sockdriver_copyin(data, 0, &ifra, sizeof(ifra))) != OK)
return r;
ifra.ifra_name[sizeof(ifra.ifra_name) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(ifra.ifra_name)) == NULL)
return ENXIO;
switch (request) {
case SIOCAIFADDR:
return ifaddr_v4_add(ifdev,
(struct sockaddr_in *)&ifra.ifra_addr,
(struct sockaddr_in *)&ifra.ifra_mask,
(struct sockaddr_in *)&ifra.ifra_broadaddr,
(struct sockaddr_in *)&ifra.ifra_dstaddr, 0 /*flags*/);
case SIOCGIFALIAS:
if ((r = ifaddr_v4_find(ifdev,
(struct sockaddr_in *)&ifra.ifra_addr, &num)) != OK)
return r;
/*
* The broadcast and destination address are stored in the same
* ifaliasreq field. We cannot pass a pointer to the same
* field to ifaddr_v4_get(). So, use a temporary variable.
*/
(void)ifaddr_v4_get(ifdev, num,
(struct sockaddr_in *)&ifra.ifra_addr,
(struct sockaddr_in *)&ifra.ifra_mask,
(struct sockaddr_in *)&ifra.ifra_broadaddr, &dest);
if (ifra.ifra_broadaddr.sa_len == 0)
memcpy(&ifra.ifra_dstaddr, &dest, sizeof(dest));
return sockdriver_copyout(data, 0, &ifra, sizeof(ifra));
default:
return ENOTTY;
}
}
/*
* Process an IOCTL request for AF_INET.
*/
static int
ifconf_ioctl_v4(unsigned long request, const struct sockdriver_data * data,
endpoint_t user_endpt)
{
switch (request) {
case SIOCSIFADDR:
case SIOCSIFDSTADDR:
case SIOCSIFBRDADDR:
case SIOCSIFNETMASK:
case SIOCDIFADDR:
if (!util_is_root(user_endpt))
return EPERM;
/* FALLTHROUGH */
case SIOCGIFADDR:
case SIOCGIFDSTADDR:
case SIOCGIFBRDADDR:
case SIOCGIFNETMASK:
case SIOCGIFAFLAG_IN:
return ifconf_ioctl_v4_ifreq(request, data);
case SIOCAIFADDR:
if (!util_is_root(user_endpt))
return EPERM;
/* FALLTHROUGH */
case SIOCGIFALIAS:
return ifconf_ioctl_v4_ifalias(request, data);
default:
return ENOTTY;
}
}
#ifdef INET6
/*
* Process an IOCTL request for AF_INET6 with an "in6_ifreq" structure.
*/
static int
ifconf_ioctl_v6_ifreq(unsigned long request,
const struct sockdriver_data * data)
{
struct ifdev *ifdev;
struct in6_ifreq ifr;
ifaddr_v6_num_t num;
int r;
if ((r = sockdriver_copyin(data, 0, &ifr, sizeof(ifr))) != OK)
return r;
ifr.ifr_name[sizeof(ifr.ifr_name) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(ifr.ifr_name)) == NULL)
return ENXIO;
if ((r = ifaddr_v6_find(ifdev, &ifr.ifr_addr, &num)) != OK)
return r;
switch (request) {
case SIOCGIFADDR_IN6:
/* This IOCTL basically checks if the given address exists. */
ifaddr_v6_get(ifdev, num, &ifr.ifr_addr, NULL, NULL);
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
case SIOCDIFADDR_IN6:
ifaddr_v6_del(ifdev, num);
return OK;
case SIOCGIFNETMASK_IN6:
ifaddr_v6_get(ifdev, num, NULL, &ifr.ifr_addr, NULL);
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
case SIOCGIFAFLAG_IN6:
ifr.ifr_ifru.ifru_flags6 = ifaddr_v6_get_flags(ifdev, num);
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
case SIOCGIFALIFETIME_IN6:
ifaddr_v6_get_lifetime(ifdev, num,
&ifr.ifr_ifru.ifru_lifetime);
return sockdriver_copyout(data, 0, &ifr, sizeof(ifr));
default:
return ENOTTY;
}
}
/*
* Process an IOCTL request for AF_INET6 with an "in6_aliasreq" structure.
*/
static int
ifconf_ioctl_v6_ifalias(unsigned long request,
const struct sockdriver_data * data)
{
struct ifdev *ifdev;
struct in6_aliasreq ifra;
int r;
if ((r = sockdriver_copyin(data, 0, &ifra, sizeof(ifra))) != OK)
return r;
ifra.ifra_name[sizeof(ifra.ifra_name) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(ifra.ifra_name)) == NULL)
return ENXIO;
switch (request) {
case SIOCAIFADDR_IN6:
return ifaddr_v6_add(ifdev, &ifra.ifra_addr,
&ifra.ifra_prefixmask, &ifra.ifra_dstaddr,
ifra.ifra_flags, &ifra.ifra_lifetime);
default:
return ENOTTY;
}
}
/*
* Process an IOCTL request for AF_INET6 with an "in6_ndireq" structure.
*/
static int
ifconf_ioctl_v6_ndireq(unsigned long request,
const struct sockdriver_data * data)
{
struct ifdev *ifdev;
struct in6_ndireq ndi;
int r;
if ((r = sockdriver_copyin(data, 0, &ndi, sizeof(ndi))) != OK)
return r;
ndi.ifname[sizeof(ndi.ifname) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(ndi.ifname)) == NULL)
return ENXIO;
switch (request) {
case SIOCGIFINFO_IN6:
memset(&ndi.ndi, 0, sizeof(ndi.ndi));
ndi.ndi.linkmtu = ifdev_get_mtu(ifdev);
ndi.ndi.flags = ifdev_get_nd6flags(ifdev);
ndi.ndi.initialized = 1;
/* TODO: all the other fields.. */
return sockdriver_copyout(data, 0, &ndi, sizeof(ndi));
case SIOCSIFINFO_IN6:
/* TODO: all the other fields.. */
/* FALLTHROUGH */
case SIOCSIFINFO_FLAGS:
return ifdev_set_nd6flags(ifdev, ndi.ndi.flags);
default:
return ENOTTY;
}
}
/*
* Process an IOCTL request for AF_INET6 with an "in6_nbrinfo" structure.
*/
static int
ifconf_ioctl_v6_nbrinfo(unsigned long request,
const struct sockdriver_data * data)
{
struct ifdev *ifdev;
struct sockaddr_in6 addr;
struct in6_nbrinfo nbri;
lldata_ndp_num_t num;
int r;
if ((r = sockdriver_copyin(data, 0, &nbri, sizeof(nbri))) != OK)
return r;
nbri.ifname[sizeof(nbri.ifname) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(nbri.ifname)) == NULL)
return ENXIO;
switch (request) {
case SIOCGNBRINFO_IN6:
/*
* Convert the given in6_addr to a full sockaddr_in6, mainly
* for internal consistency. It would have been nice if the
* KAME management API had had any sort of consistency itself.
*/
memset(&addr, 0, sizeof(addr));
addr.sin6_family = AF_INET6;
memcpy(&addr.sin6_addr.s6_addr, &nbri.addr,
sizeof(addr.sin6_addr.s6_addr));
if ((r = lldata_ndp_find(ifdev, &addr, &num)) != OK)
return r;
lldata_ndp_get_info(num, &nbri.asked, &nbri.isrouter,
&nbri.state, &nbri.expire);
return sockdriver_copyout(data, 0, &nbri, sizeof(nbri));
default:
return ENOTTY;
}
}
/*
* Process an IOCTL request for AF_INET6.
*/
static int
ifconf_ioctl_v6(unsigned long request, const struct sockdriver_data * data,
endpoint_t user_endpt)
{
switch (request) {
case SIOCDIFADDR_IN6:
if (!util_is_root(user_endpt))
return EPERM;
/* FALLTHROUGH */
case SIOCGIFADDR_IN6:
case SIOCGIFNETMASK_IN6:
case SIOCGIFAFLAG_IN6:
case SIOCGIFALIFETIME_IN6:
return ifconf_ioctl_v6_ifreq(request, data);
case SIOCAIFADDR_IN6:
if (!util_is_root(user_endpt))
return EPERM;
return ifconf_ioctl_v6_ifalias(request, data);
case SIOCSIFINFO_IN6:
case SIOCSIFINFO_FLAGS:
if (!util_is_root(user_endpt))
return EPERM;
/* FALLTHROUGH */
case SIOCGIFINFO_IN6:
return ifconf_ioctl_v6_ndireq(request, data);
case SIOCGNBRINFO_IN6:
return ifconf_ioctl_v6_nbrinfo(request, data);
default:
return ENOTTY;
}
}
#endif /* INET6 */
/*
* Process an IOCTL request for AF_LINK with an "if_laddrreq" structure.
*/
static int
ifconf_ioctl_dl_lifaddr(unsigned long request,
const struct sockdriver_data * data)
{
struct ifdev *ifdev;
struct if_laddrreq iflr;
ifaddr_dl_num_t num;
int r;
if ((r = sockdriver_copyin(data, 0, &iflr, sizeof(iflr))) != OK)
return r;
iflr.iflr_name[sizeof(iflr.iflr_name) - 1] = '\0';
if ((ifdev = ifdev_find_by_name(iflr.iflr_name)) == NULL)
return ENXIO;
switch (request) {
case SIOCGLIFADDR:
if (iflr.flags & IFLR_PREFIX) {
/* We ignore the prefix length, like NetBSD does. */
if ((r = ifaddr_dl_find(ifdev,
(struct sockaddr_dlx *)&iflr.addr,
sizeof(iflr.addr), &num)) != OK)
return r;
} else
num = (ifaddr_dl_num_t)0; /* this always works */
ifaddr_dl_get(ifdev, num, (struct sockaddr_dlx *)&iflr.addr);
iflr.flags = ifaddr_dl_get_flags(ifdev, num);
memset(&iflr.dstaddr, 0, sizeof(iflr.dstaddr));
return sockdriver_copyout(data, 0, &iflr, sizeof(iflr));
case SIOCALIFADDR:
return ifaddr_dl_add(ifdev, (struct sockaddr_dlx *)&iflr.addr,
sizeof(iflr.addr), iflr.flags);
case SIOCDLIFADDR:
if ((r = ifaddr_dl_find(ifdev,
(struct sockaddr_dlx *)&iflr.addr, sizeof(iflr.addr),
&num)) != OK)
return r;
return ifaddr_dl_del(ifdev, num);
default:
return ENOTTY;
}
}
/*
* Process an IOCTL request for AF_LINK.
*/
static int
ifconf_ioctl_dl(unsigned long request, const struct sockdriver_data * data,
endpoint_t user_endpt)
{
switch (request) {
case SIOCALIFADDR:
case SIOCDLIFADDR:
if (!util_is_root(user_endpt))
return EPERM;
/* FALLTHROUGH */
case SIOCGLIFADDR:
return ifconf_ioctl_dl_lifaddr(request, data);
default:
return ENOTTY;
}
}
/*
* Process an IOCTL request. This routine is shared between TCP, UDP, RAW, and
* link sockets. The given socket may be used to obtain the target domain:
* AF_INET, AF_INET6, or AF_LINK.
*/
int
ifconf_ioctl(struct sock * sock, unsigned long request,
const struct sockdriver_data * data, endpoint_t user_endpt)
{
int domain;
domain = sockevent_get_domain(sock);
switch (request) {
case SIOCSIFFLAGS:
case SIOCSIFMETRIC:
case SIOCSIFMEDIA:
case SIOCSIFMTU:
case SIOCIFCREATE:
case SIOCIFDESTROY:
if (!util_is_root(user_endpt))
return EPERM;
/* FALLTHROUGH */
case SIOCGIFFLAGS:
case SIOCGIFMETRIC:
case SIOCGIFMTU:
case SIOCGIFDLT:
case SIOCGIFINDEX:
return ifconf_ioctl_ifreq(request, data);
case SIOCSIFCAP:
if (!util_is_root(user_endpt))
return EPERM;
/* FALLTHROUGH */
case SIOCGIFCAP:
return ifconf_ioctl_ifcap(request, data);
case MINIX_SIOCGIFMEDIA:
return ifconf_ioctl_ifmedia(request, data);
case MINIX_SIOCIFGCLONERS:
return ifconf_ioctl_ifclone(request, data);
case SIOCSIFADDRPREF:
if (!util_is_root(user_endpt))
return EPERM;
/* FALLTHROUGH */
case SIOCGIFADDRPREF:
return ifconf_ioctl_ifaddrpref(request, data);
default:
switch (domain) {
case AF_INET:
return ifconf_ioctl_v4(request, data, user_endpt);
#ifdef INET6
case AF_INET6:
return ifconf_ioctl_v6(request, data, user_endpt);
#endif /* INET6 */
case AF_LINK:
return ifconf_ioctl_dl(request, data, user_endpt);
default:
return ENOTTY;
}
}
}
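/*
 * As an end-to-end illustration of the dispatcher above: creating the one
 * extra loopback interface that the service supports would look roughly
 * like this from userland (sketch; assumes <sys/ioctl.h>, <net/if.h>, and
 * <string.h>).
 */
static int
example_create_lo1(int sock)
{
	struct ifreq ifr;

	memset(&ifr, 0, sizeof(ifr));
	strlcpy(ifr.ifr_name, "lo1", sizeof(ifr.ifr_name));
	return ioctl(sock, SIOCIFCREATE, &ifr);	/* superuser only */
}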

1064
minix/net/lwip/ifdev.c Normal file

File diff suppressed because it is too large

155
minix/net/lwip/ifdev.h Normal file
View File

@ -0,0 +1,155 @@
#ifndef MINIX_NET_LWIP_IFDEV_H
#define MINIX_NET_LWIP_IFDEV_H
#include <net/if.h>
#include <net/if_types.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
/*
* NetBSD makes setting a hardware address through ifconfig(8) a whole lot
* harder than it needs to be, namely by keeping a list of possible hardware
* addresses and marking one of them as active. For us, that level of extra
* flexibility is completely useless. In order to shield individual interface
* modules from having to deal with the rather extended interface for the list
* management, we maintain the list in ifdev and simply use an iop_set_hwaddr()
* call to the modules when the active address changes. This setting is the
* maximum number of hardware addresses in the list maintained by ifdev. It
* should be at least 2, or changing hardware addresses will not be possible.
*/
#define IFDEV_NUM_HWADDRS 3
struct ifdev;
struct bpfdev_link;
struct sockaddr_dlx;
/* Interface operations table. */
struct ifdev_ops {
err_t (* iop_init)(struct ifdev * ifdev, struct netif * netif);
err_t (* iop_input)(struct pbuf * pbuf, struct netif * netif);
err_t (* iop_output)(struct ifdev * ifdev, struct pbuf * pbuf,
struct netif * netif);
err_t (* iop_output_v4)(struct netif * netif, struct pbuf * pbuf,
const ip4_addr_t * ipaddr);
err_t (* iop_output_v6)(struct netif * netif, struct pbuf * pbuf,
const ip6_addr_t * ipaddr);
void (* iop_hdrcmplt)(struct ifdev * ifdev, struct pbuf * pbuf);
void (* iop_poll)(struct ifdev * ifdev);
int (* iop_set_ifflags)(struct ifdev * ifdev, unsigned int ifflags);
void (* iop_get_ifcap)(struct ifdev * ifdev, uint64_t * ifcap,
uint64_t * ifena);
int (* iop_set_ifcap)(struct ifdev * ifdev, uint64_t ifcap);
void (* iop_get_ifmedia)(struct ifdev * ifdev, int * ifcurrent,
int * ifactive);
int (* iop_set_ifmedia)(struct ifdev * ifdev, int ifmedia);
void (* iop_set_promisc)(struct ifdev * ifdev, int promisc);
int (* iop_set_hwaddr)(struct ifdev * ifdev, const uint8_t * hwaddr);
int (* iop_set_mtu)(struct ifdev * ifdev, unsigned int mtu);
int (* iop_destroy)(struct ifdev * ifdev);
};
/* Hardware address list entry. The first entry, if any, is the active one. */
struct ifdev_hwaddr {
uint8_t ifhwa_addr[NETIF_MAX_HWADDR_LEN];
uint8_t ifhwa_flags;
};
#define IFHWAF_VALID 0x01 /* entry contains an address */
#define IFHWAF_FACTORY 0x02 /* factory (device-given) address */
/* Interface structure. */
struct ifdev {
TAILQ_ENTRY(ifdev) ifdev_next; /* list of active interfaces */
char ifdev_name[IFNAMSIZ]; /* interface name, null terminated */
unsigned int ifdev_ifflags; /* NetBSD-style interface flags */
unsigned int ifdev_dlt; /* data link type (DLT_) */
unsigned int ifdev_promisc; /* number of promiscuity requestors */
struct netif ifdev_netif; /* lwIP interface structure */
struct if_data ifdev_data; /* NetBSD-style interface data */
char ifdev_v4set; /* interface has an IPv4 address? */
uint8_t ifdev_v6prefix[LWIP_IPV6_NUM_ADDRESSES]; /* IPv6 prefixes */
uint8_t ifdev_v6flags[LWIP_IPV6_NUM_ADDRESSES]; /* v6 address flags */
uint8_t ifdev_v6state[LWIP_IPV6_NUM_ADDRESSES]; /* v6 shadow states */
uint8_t ifdev_v6scope[LWIP_IPV6_NUM_ADDRESSES]; /* cached v6 scopes */
struct ifdev_hwaddr ifdev_hwlist[IFDEV_NUM_HWADDRS]; /* HW addr's */
uint32_t ifdev_nd6flags; /* ND6-related flags (ND6_IFF_) */
const struct ifdev_ops *ifdev_ops; /* interface operations table */
TAILQ_HEAD(, bpfdev_link) ifdev_bpf; /* list of attached BPF devices */
};
#define ifdev_get_name(ifdev) ((ifdev)->ifdev_name)
#define ifdev_get_ifflags(ifdev) ((ifdev)->ifdev_ifflags)
#define ifdev_get_dlt(ifdev) ((ifdev)->ifdev_dlt)
#define ifdev_is_promisc(ifdev) ((ifdev)->ifdev_promisc != 0)
#define ifdev_get_netif(ifdev) (&(ifdev)->ifdev_netif)
#define ifdev_get_nd6flags(ifdev) ((ifdev)->ifdev_nd6flags)
#define ifdev_get_iftype(ifdev) ((ifdev)->ifdev_data.ifi_type)
#define ifdev_get_hwlen(ifdev) ((ifdev)->ifdev_data.ifi_addrlen)
#define ifdev_get_hdrlen(ifdev) ((ifdev)->ifdev_data.ifi_hdrlen)
#define ifdev_get_link(ifdev) ((ifdev)->ifdev_data.ifi_link_state)
#define ifdev_get_mtu(ifdev) ((ifdev)->ifdev_data.ifi_mtu)
#define ifdev_get_metric(ifdev) ((ifdev)->ifdev_data.ifi_metric)
#define ifdev_get_ifdata(ifdev) (&(ifdev)->ifdev_data)
#define ifdev_is_loopback(ifdev) ((ifdev)->ifdev_ifflags & IFF_LOOPBACK)
#define ifdev_is_up(ifdev) ((ifdev)->ifdev_ifflags & IFF_UP)
#define ifdev_is_link_up(ifdev) (netif_is_link_up(&(ifdev)->ifdev_netif))
#define ifdev_set_metric(ifdev, metric) \
((void)((ifdev)->ifdev_data.ifi_metric = (metric)))
#define ifdev_get_index(ifdev) \
((uint32_t)(netif_get_index(ifdev_get_netif(ifdev))))
#define ifdev_output_drop(ifdev) ((ifdev)->ifdev_data.ifi_oerrors++)
#define netif_get_ifdev(netif) ((struct ifdev *)(netif)->state)
void ifdev_init(void);
void ifdev_poll(void);
void ifdev_register(const char * name, int (* create)(const char *));
void ifdev_input(struct ifdev * ifdev, struct pbuf * pbuf,
struct netif * netif, int to_bpf);
err_t ifdev_output(struct ifdev * ifdev, struct pbuf * pbuf,
struct netif * netif, int to_bpf, int hdrcmplt);
void ifdev_attach_bpf(struct ifdev * ifdev, struct bpfdev_link * bpfl);
void ifdev_detach_bpf(struct ifdev * ifdev, struct bpfdev_link * bpfl);
struct ifdev *ifdev_get_by_index(uint32_t ifindex);
struct ifdev *ifdev_find_by_name(const char * name);
struct ifdev *ifdev_enum(struct ifdev * last);
int ifdev_check_name(const char * name, unsigned int * vtype_slot);
int ifdev_set_promisc(struct ifdev * ifdev);
void ifdev_clear_promisc(struct ifdev * ifdev);
int ifdev_set_ifflags(struct ifdev * ifdev, unsigned int ifflags);
void ifdev_update_ifflags(struct ifdev * ifdev, unsigned int ifflags);
void ifdev_get_ifcap(struct ifdev * ifdev, uint64_t * ifcap,
uint64_t * ifena);
int ifdev_set_ifcap(struct ifdev * ifdev, uint64_t ifena);
int ifdev_get_ifmedia(struct ifdev * ifdev, int * ifcurrent, int * ifactive);
int ifdev_set_ifmedia(struct ifdev * ifdev, int ifmedia);
int ifdev_set_mtu(struct ifdev * ifdev, unsigned int mtu);
int ifdev_set_nd6flags(struct ifdev * ifdev, uint32_t nd6flags);
void ifdev_add(struct ifdev * ifdev, const char * name, unsigned int ifflags,
unsigned int iftype, size_t hdrlen, size_t addrlen, unsigned int dlt,
unsigned int mtu, uint32_t nd6flags, const struct ifdev_ops * iop);
int ifdev_remove(struct ifdev * ifdev);
struct ifdev *ifdev_get_loopback(void);
void ifdev_update_link(struct ifdev * ifdev, int link);
void ifdev_update_hwaddr(struct ifdev * ifdev, const uint8_t * hwaddr,
int is_factory);
int ifdev_create(const char * name);
int ifdev_destroy(struct ifdev * ifdev);
const char *ifdev_enum_vtypes(unsigned int num);
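/*
 * Illustrative sketch of the module glue behind this API (all "nullif"
 * names are hypothetical; loopif.c and ethif.c are the real users, and
 * they fill in many more operations).  DLT_NULL assumes <net/dlt.h>; a
 * real module would also allocate one ifdev per created instance.
 */
static err_t
nullif_init(struct ifdev * ifdev, struct netif * netif)
{
	return ERR_OK;
}

static err_t
nullif_output(struct ifdev * ifdev, struct pbuf * pbuf,
	struct netif * netif)
{
	ifdev_output_drop(ifdev);	/* a null interface drops everything */
	return ERR_OK;
}

static const struct ifdev_ops nullif_ops = {
	.iop_init	= nullif_init,
	.iop_output	= nullif_output,
};

static int
nullif_create(const char * name)
{
	static struct ifdev nullif_dev;

	ifdev_add(&nullif_dev, name, 0 /*ifflags*/, IFT_OTHER, 0 /*hdrlen*/,
	    0 /*addrlen*/, DLT_NULL, 1500 /*mtu*/, 0 /*nd6flags*/,
	    &nullif_ops);
	return OK;
}

static void
nullif_register(void)
{
	ifdev_register("null", nullif_create);
}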
#endif /* !MINIX_NET_LWIP_IFDEV_H */

761
minix/net/lwip/ipsock.c Normal file
View File

@ -0,0 +1,761 @@
/* LWIP service - ipsock.c - shared IP-level socket code */
#include "lwip.h"
#include "ifaddr.h"
#define ip6_hdr __netbsd_ip6_hdr /* conflicting definitions */
#include <net/route.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet6/in6_pcb.h>
#undef ip6_hdr
/* The following are sysctl(7) settings. */
int lwip_ip4_forward = 0; /* We patch lwIP to check these.. */
int lwip_ip6_forward = 0; /* ..two settings at run time. */
static int ipsock_v6only = 1;
/* The CTL_NET PF_INET IPPROTO_IP subtree. */
static struct rmib_node net_inet_ip_table[] = {
/* 1*/ [IPCTL_FORWARDING] = RMIB_INTPTR(RMIB_RW, &lwip_ip4_forward,
"forwarding",
"Enable forwarding of INET diagrams"),
/* 3*/ [IPCTL_DEFTTL] = RMIB_INT(RMIB_RO, IP_DEFAULT_TTL, "ttl",
"Default TTL for an INET diagram"),
/*23*/ [IPCTL_LOOPBACKCKSUM] = RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
loopif_cksum, "do_loopback_cksum",
"Perform IP checksum on loopback"),
};
static struct rmib_node net_inet_ip_node =
RMIB_NODE(RMIB_RO, net_inet_ip_table, "ip", "IPv4 related settings");
/* The CTL_NET PF_INET6 IPPROTO_IPV6 subtree. */
static struct rmib_node net_inet6_ip6_table[] = {
/* 1*/ [IPV6CTL_FORWARDING] = RMIB_INTPTR(RMIB_RW, &lwip_ip6_forward,
"forwarding",
"Enable forwarding of INET6 diagrams"),
/*
* The following functionality is not
* implemented in lwIP at this time.
*/
/* 2*/ [IPV6CTL_SENDREDIRECTS] = RMIB_INT(RMIB_RO, 0, "redirect", "Enable "
"sending of ICMPv6 redirect messages"),
/* 3*/ [IPV6CTL_DEFHLIM] = RMIB_INT(RMIB_RO, IP_DEFAULT_TTL, "hlim",
"Hop limit for an INET6 datagram"),
/*12*/ [IPV6CTL_ACCEPT_RTADV] = RMIB_INTPTR(RMIB_RW, &ifaddr_accept_rtadv,
"accept_rtadv",
"Accept router advertisements"),
/*16*/ [IPV6CTL_DAD_COUNT] = RMIB_INT(RMIB_RO,
LWIP_IPV6_DUP_DETECT_ATTEMPTS, "dad_count",
"Number of Duplicate Address Detection "
"probes to send"),
/*24*/ [IPV6CTL_V6ONLY] = RMIB_INTPTR(RMIB_RW, &ipsock_v6only,
"v6only", "Disallow PF_INET6 sockets from "
"connecting to PF_INET sockets"),
/*
* The following setting is significantly
* different from NetBSD, and therefore it has
* a somewhat different description as well.
*/
/*35*/ [IPV6CTL_AUTO_LINKLOCAL]= RMIB_INTPTR(RMIB_RW, &ifaddr_auto_linklocal,
"auto_linklocal", "Enable global support "
"for adding IPv6link-local addresses to "
"interfaces"),
/*
* Temporary addresses are managed entirely by
* userland. We only maintain the settings.
*/
/*+0*/ [IPV6CTL_MAXID] = RMIB_INT(RMIB_RW, 0, "use_tempaddr",
"Use temporary address"),
/*+1*/ [IPV6CTL_MAXID + 1] = RMIB_INT(RMIB_RW, 86400, "temppltime",
"Preferred lifetime of a temporary "
"address"),
/*+2*/ [IPV6CTL_MAXID + 2] = RMIB_INT(RMIB_RW, 604800, "tempvltime",
"Valid lifetime of a temporary address"),
};
static struct rmib_node net_inet6_ip6_node =
RMIB_NODE(RMIB_RO, net_inet6_ip6_table, "ip6", "IPv6 related settings");
/*
* Initialize the IP sockets module.
*/
void
ipsock_init(void)
{
/*
* Register the net.inet.ip and net.inet6.ip6 subtrees. Unlike for the
* specific protocols (TCP/UDP/RAW), here the IPv4 and IPv6 subtrees
* are and must be separate, even though many settings are shared
* between the two at the lwIP level. Ultimately we may have to split
* the subtrees for the specific protocols, too, though..
*/
mibtree_register_inet(AF_INET, IPPROTO_IP, &net_inet_ip_node);
mibtree_register_inet(AF_INET6, IPPROTO_IPV6, &net_inet6_ip6_node);
}
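/*
 * Illustrative userland sketch: once registered above, the settings are
 * reachable through the regular sysctl(3) interface (assumes
 * <sys/sysctl.h> and <stdio.h>).
 */
static void
example_show_forwarding(void)
{
	int val;
	size_t len = sizeof(val);

	if (sysctlbyname("net.inet.ip.forwarding", &val, &len, NULL, 0) == 0)
		printf("IPv4 forwarding: %d\n", val);
}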
/*
* Return the lwIP IP address type (IPADDR_TYPE_) for the given IP socket.
*/
static int
ipsock_get_type(struct ipsock * ip)
{
if (!(ip->ip_flags & IPF_IPV6))
return IPADDR_TYPE_V4;
else if (ip->ip_flags & IPF_V6ONLY)
return IPADDR_TYPE_V6;
else
return IPADDR_TYPE_ANY;
}
/*
* Create an IP socket, for the given (PF_/AF_) domain and initial send and
* receive buffer sizes. Return the lwIP IP address type that should be used
* to create the corresponding PCB. Return a pointer to the libsockevent
* socket in 'sockp'. This function must not allocate any resources in any
* form, as socket creation may still fail later, in which case no destruction
* function is called.
*/
int
ipsock_socket(struct ipsock * ip, int domain, size_t sndbuf, size_t rcvbuf,
struct sock ** sockp)
{
ip->ip_flags = (domain == AF_INET6) ? IPF_IPV6 : 0;
if (domain == AF_INET6 && ipsock_v6only)
ip->ip_flags |= IPF_V6ONLY;
ip->ip_sndbuf = sndbuf;
ip->ip_rcvbuf = rcvbuf;
/* Important: when adding settings here, also change ipsock_clone(). */
*sockp = &ip->ip_sock;
return ipsock_get_type(ip);
}
/*
* Clone the given socket 'ip' into the new socket 'newip', using the socket
* identifier 'newid'. In particular, tell libsockevent about the clone and
* copy over any settings from 'ip' to 'newip' that can be inherited on a
* socket. Cloning is used for new TCP connections arriving on listening TCP
* sockets. This function must not fail.
*/
void
ipsock_clone(struct ipsock * ip, struct ipsock * newip, sockid_t newid)
{
sockevent_clone(&ip->ip_sock, &newip->ip_sock, newid);
/* Inherit all settings from the original socket. */
newip->ip_flags = ip->ip_flags;
newip->ip_sndbuf = ip->ip_sndbuf;
newip->ip_rcvbuf = ip->ip_rcvbuf;
}
/*
* Create an <any> address for the given socket, taking into account whether
* the socket is IPv4, IPv6, or mixed. The generated address, stored in
* 'ipaddr', will have the same type as returned from the ipsock_socket() call.
*/
void
ipsock_get_any_addr(struct ipsock * ip, ip_addr_t * ipaddr)
{
ip_addr_set_any(ipsock_is_ipv6(ip), ipaddr);
if (ipsock_is_ipv6(ip) && !ipsock_is_v6only(ip))
IP_SET_TYPE(ipaddr, IPADDR_TYPE_ANY);
}
/*
* Verify whether the given (properly scoped) IP address is a valid source
* address for the given IP socket. The 'allow_mcast' flag indicates whether
* the source address is allowed to be a multicast address. Return OK on
* success. If 'ifdevp' is not NULL, it is filled with either the interface
* that owns the address, or NULL if the address is (while valid) not
* associated with a particular interface. On failure, return a negative error
* code. This function must be called, in one way or another, for every source
* address used for binding or sending on an IP-layer socket.
*/
int
ipsock_check_src_addr(struct ipsock * ip, ip_addr_t * ipaddr, int allow_mcast,
struct ifdev ** ifdevp)
{
ip6_addr_t *ip6addr;
struct ifdev *ifdev;
uint32_t inaddr, zone;
int is_mcast;
/*
* TODO: for now, forbid binding to multicast addresses. Callers that
* never allow multicast addresses anyway (e.g., IPV6_PKTINFO) should
* do their own check for this; the one here may eventually be removed.
*/
is_mcast = ip_addr_ismulticast(ipaddr);
if (is_mcast && !allow_mcast)
return EADDRNOTAVAIL;
if (IP_IS_V6(ipaddr)) {
/*
* The given address must not have a KAME-style embedded zone.
* This check is already performed in addr_get_inet(), but we
* have to replicate it here because not all source addresses
* go through addr_get_inet().
*/
ip6addr = ip_2_ip6(ipaddr);
if (ip6_addr_has_scope(ip6addr, IP6_UNKNOWN) &&
(ip6addr->addr[0] & PP_HTONL(0x0000ffffUL)))
return EINVAL;
/*
* lwIP does not support IPv4-mapped IPv6 addresses, so these
* must be converted to plain IPv4 addresses instead. The IPv4
* 'any' address is not supported in this form. In V6ONLY
* mode, refuse connecting or sending to IPv4-mapped addresses
* at all.
*/
if (ip6_addr_isipv4mappedipv6(ip6addr)) {
if (ipsock_is_v6only(ip))
return EINVAL;
inaddr = ip6addr->addr[3];
if (inaddr == PP_HTONL(INADDR_ANY))
return EADDRNOTAVAIL;
ip_addr_set_ip4_u32(ipaddr, inaddr);
}
}
ifdev = NULL;
if (!ip_addr_isany(ipaddr)) {
if (IP_IS_V6(ipaddr) &&
ip6_addr_lacks_zone(ip_2_ip6(ipaddr), IP6_UNKNOWN))
return EADDRNOTAVAIL;
/*
* If the address is a unicast address, it must be assigned to
* an interface. Otherwise, if it is a zoned multicast
* address, the zone denotes the interface. For global
* multicast addresses, we cannot determine an interface.
*/
if (!is_mcast) {
if ((ifdev = ifaddr_map_by_addr(ipaddr)) == NULL)
return EADDRNOTAVAIL;
} else {
/* Some multicast addresses are not acceptable. */
if (!addr_is_valid_multicast(ipaddr))
return EINVAL;
if (IP_IS_V6(ipaddr) &&
ip6_addr_has_zone(ip_2_ip6(ipaddr))) {
zone = ip6_addr_zone(ip_2_ip6(ipaddr));
if ((ifdev = ifdev_get_by_index(zone)) == NULL)
return ENXIO;
}
}
}
if (ifdevp != NULL)
*ifdevp = ifdev;
return OK;
}
/*
* Retrieve and validate a source address for use in a socket bind call on
* socket 'ip'. The user-provided address is given as 'addr', with length
* 'addr_len'. The socket's current local IP address and port are given as
* 'local_ip' and 'local_port', respectively; for raw sockets, the given local
* port number is always zero. The caller's endpoint is given as 'user_endpt',
* used to make sure only root can bind to local port numbers. The boolean
* 'allow_mcast' flag indicates whether the source address is allowed to be a
* multicast address. On success, return OK with the source IP address stored
* in 'src_addr' and, if 'src_port' is not NULL, the port number to bind to
* stored in 'src_port'. Otherwise, return a negative error code. This function
* performs all the tasks necessary before the socket can be bound using a lwIP
* call.
*/
int
ipsock_get_src_addr(struct ipsock * ip, const struct sockaddr * addr,
socklen_t addr_len, endpoint_t user_endpt, ip_addr_t * local_ip,
uint16_t local_port, int allow_mcast, ip_addr_t * src_addr,
uint16_t * src_port)
{
uint16_t port;
int r;
/*
* If the socket has been bound already, it cannot be bound again.
* We check this by checking whether the current local port is non-
* zero. This rule does not apply to raw sockets, but raw sockets have
* no port numbers anyway, so this conveniently works out. However,
* raw sockets may not be rebound after being connected, but that is
* checked before we even get here.
*/
if (local_port != 0)
return EINVAL;
/* Parse the user-provided address. */
if ((r = addr_get_inet(addr, addr_len, ipsock_get_type(ip), src_addr,
FALSE /*kame*/, &port)) != OK)
return r;
/* Validate the user-provided address. */
if ((r = ipsock_check_src_addr(ip, src_addr, allow_mcast,
NULL /*ifdevp*/)) != OK)
return r;
/*
* If we are interested in port numbers at all (for non-raw sockets,
* meaning portp is not NULL), make sure that only the superuser can
* bind to privileged port numbers. For raw sockets, only the
* superuser can open a socket anyway, so we need no check here.
*/
if (src_port != NULL) {
if (port != 0 && port < IPPORT_RESERVED &&
!util_is_root(user_endpt))
return EACCES;
*src_port = port;
}
return OK;
}
/*
* Retrieve and validate a destination address for use in a socket connect or
* sendto call. The user-provided address is given as 'addr', with length
* 'addr_len'. The socket's current local IP address is given as 'local_addr'.
* On success, return OK with the destination IP address stored in 'dst_addr'
* and, if 'dst_port' is not NULL, the destination port number stored in
* 'dst_port'. Otherwise, return a negative error code. This function must be
* called, in one way or another, for every destination address used for
* connecting or sending on an IP-layer socket.
*/
int
ipsock_get_dst_addr(struct ipsock * ip, const struct sockaddr * addr,
socklen_t addr_len, const ip_addr_t * local_addr, ip_addr_t * dst_addr,
uint16_t * dst_port)
{
uint16_t port;
int r;
/* Parse the user-provided address. */
if ((r = addr_get_inet(addr, addr_len, ipsock_get_type(ip), dst_addr,
FALSE /*kame*/, &port)) != OK)
return r;
/* Destination addresses are always specific. */
if (IP_GET_TYPE(dst_addr) == IPADDR_TYPE_ANY)
IP_SET_TYPE(dst_addr, IPADDR_TYPE_V6);
/*
* lwIP does not support IPv4-mapped IPv6 addresses, so these must be
* converted to plain IPv4 addresses instead. In V6ONLY mode, refuse
* connecting or sending to IPv4-mapped addresses at all.
*/
if (IP_IS_V6(dst_addr) &&
ip6_addr_isipv4mappedipv6(ip_2_ip6(dst_addr))) {
if (ipsock_is_v6only(ip))
return EINVAL;
ip_addr_set_ip4_u32(dst_addr, ip_2_ip6(dst_addr)->addr[3]);
}
/*
* Now make sure that the local and remote addresses are of the same
* family. The local address may be of type IPADDR_TYPE_ANY, which is
* allowed for both IPv4 and IPv6. Even for connectionless socket
* types we must perform this check as part of connect calls (as well
* as sendto calls!) because otherwise we will create problems for
* sysctl based socket enumeration (i.e., netstat), which uses the
* local IP address type to determine the socket family.
*/
if (IP_GET_TYPE(local_addr) != IPADDR_TYPE_ANY &&
IP_IS_V6(local_addr) != IP_IS_V6(dst_addr))
return EINVAL;
/*
* TODO: on NetBSD, an 'any' destination address is replaced with a
* local interface address.
*/
if (ip_addr_isany(dst_addr))
return EHOSTUNREACH;
/*
* If the address is a multicast address, the multicast address itself
* must be valid.
*/
if (ip_addr_ismulticast(dst_addr) &&
!addr_is_valid_multicast(dst_addr))
return EINVAL;
/*
* TODO: decide whether to add a zone to a scoped IPv6 address that
* lacks a zone. For now, we let lwIP handle this, as lwIP itself
* will always add the zone at some point. If anything changes there,
* this would be the place to set the zone (using a route lookup).
*/
/*
* For now, we do not forbid or alter any other particular destination
* addresses.
*/
if (dst_port != NULL) {
/*
* Disallow connecting/sending to port zero. There is no error
* code that applies well to this case, so we copy NetBSD's.
*/
if (port == 0)
return EADDRNOTAVAIL;
*dst_port = port;
}
return OK;
}
/*
* Store the address 'ipaddr' associated with the socket 'ip' (for example, it
* may be the local or remote IP address of the socket) as a sockaddr structure
* in 'addr'. A port number is provided as 'port' (in host-byte order) if
* relevant, and zero is passed in otherwise. This function MUST only be
* called from contexts where 'addr' is a buffer provided by libsockevent or
* libsockdriver, meaning that it is of size SOCKADDR_MAX. The value pointed
* to by 'addr_len' is not expected to be initialized in calls to this function
* (and will typically be zero). On return, 'addr_len' is filled with the length
* of the address generated in 'addr'. This function never fails.
*/
void
ipsock_put_addr(struct ipsock * ip, struct sockaddr * addr,
socklen_t * addr_len, ip_addr_t * ipaddr, uint16_t port)
{
ip_addr_t mappedaddr;
/*
* If the socket is an AF_INET6-type socket, and the given address is
* an IPv4-type address, store it as an IPv4-mapped IPv6 address.
*/
if (ipsock_is_ipv6(ip) && IP_IS_V4(ipaddr)) {
addr_make_v4mapped_v6(&mappedaddr, ip_2_ip4(ipaddr));
ipaddr = &mappedaddr;
}
/*
* We have good reasons to keep the sockdriver and sockevent APIs as
* they are, namely, defaulting 'addr_len' to zero such that the caller
* must provide a non-zero length (only) when returning a valid
* address. The consequence here is that we have to know the size of
* the provided buffer. For libsockevent callbacks, we are always
* guaranteed to get a buffer of at least this size.
*/
*addr_len = SOCKADDR_MAX;
addr_put_inet(addr, addr_len, ipaddr, FALSE /*kame*/, port);
}
/*
* Set socket options on an IP socket.
*/
int
ipsock_setsockopt(struct ipsock * ip, int level, int name,
const struct sockdriver_data * data, socklen_t len,
struct ipopts * ipopts)
{
int r, val, allow;
uint8_t type;
switch (level) {
case SOL_SOCKET:
switch (name) {
case SO_SNDBUF:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val <= 0 || (size_t)val < ipopts->sndmin ||
(size_t)val > ipopts->sndmax)
return EINVAL;
ip->ip_sndbuf = val;
return OK;
case SO_RCVBUF:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val <= 0 || (size_t)val < ipopts->rcvmin ||
(size_t)val > ipopts->rcvmax)
return EINVAL;
ip->ip_rcvbuf = val;
return OK;
}
break;
case IPPROTO_IP:
if (ipsock_is_ipv6(ip))
break;
switch (name) {
case IP_TOS:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val < 0 || val > UINT8_MAX)
return EINVAL;
*ipopts->tos = (uint8_t)val;
return OK;
case IP_TTL:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val < 0 || val > UINT8_MAX)
return EINVAL;
*ipopts->ttl = (uint8_t)val;
return OK;
}
break;
case IPPROTO_IPV6:
if (!ipsock_is_ipv6(ip))
break;
switch (name) {
case IPV6_UNICAST_HOPS:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val < -1 || val > UINT8_MAX)
return EINVAL;
if (val == -1)
val = IP_DEFAULT_TTL;
*ipopts->ttl = val;
return OK;
case IPV6_TCLASS:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val < -1 || val > UINT8_MAX)
return EINVAL;
if (val == -1)
val = 0;
*ipopts->tos = val;
return OK;
case IPV6_V6ONLY:
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
/*
* If the socket has been bound to an actual address,
* we still allow the option to be changed, but it no
* longer has any effect.
*/
type = IP_GET_TYPE(ipopts->local_ip);
allow = (type == IPADDR_TYPE_ANY ||
(type == IPADDR_TYPE_V6 &&
ip_addr_isany(ipopts->local_ip)));
if (val) {
ip->ip_flags |= IPF_V6ONLY;
type = IPADDR_TYPE_V6;
} else {
ip->ip_flags &= ~IPF_V6ONLY;
type = IPADDR_TYPE_ANY;
}
if (allow)
IP_SET_TYPE(ipopts->local_ip, type);
return OK;
}
break;
}
return ENOPROTOOPT;
}
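/*
 * Illustrative userland sketch of the IPV6_V6ONLY case above: clearing
 * the option on a not-yet-bound AF_INET6 socket lets it handle
 * IPv4-mapped traffic as well (assumes <sys/socket.h>, <netinet/in.h>).
 */
static int
example_enable_dual_stack(int fd)
{
	int off = 0;

	return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &off, sizeof(off));
}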
/*
* Retrieve socket options on an IP socket.
*/
int
ipsock_getsockopt(struct ipsock * ip, int level, int name,
const struct sockdriver_data * data, socklen_t * len,
struct ipopts * ipopts)
{
int val;
switch (level) {
case SOL_SOCKET:
switch (name) {
case SO_SNDBUF:
val = ip->ip_sndbuf;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case SO_RCVBUF:
val = ip->ip_rcvbuf;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
}
break;
case IPPROTO_IP:
if (ipsock_is_ipv6(ip))
break;
switch (name) {
case IP_TOS:
val = (int)*ipopts->tos;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case IP_TTL:
val = (int)*ipopts->ttl;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
}
break;
case IPPROTO_IPV6:
if (!ipsock_is_ipv6(ip))
break;
switch (name) {
case IPV6_UNICAST_HOPS:
val = *ipopts->ttl;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case IPV6_TCLASS:
val = *ipopts->tos;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case IPV6_V6ONLY:
val = !!(ip->ip_flags & IPF_V6ONLY);
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
}
break;
}
return ENOPROTOOPT;
}
/*
* Fill the given kinfo_pcb sysctl(7) structure with IP-level information.
*/
void
ipsock_get_info(struct kinfo_pcb * ki, const ip_addr_t * local_ip,
uint16_t local_port, const ip_addr_t * remote_ip, uint16_t remote_port)
{
ip_addr_t ipaddr;
socklen_t len;
uint8_t type;
len = sizeof(ki->ki_spad); /* use this for the full size, not ki_src */
addr_put_inet(&ki->ki_src, &len, local_ip, TRUE /*kame*/, local_port);
/*
* At this point, the local IP address type has already been used to
* determine whether this is an IPv4 or IPv6 socket. While not ideal,
* that is the best we can do: we cannot use IPv4-mapped IPv6 addresses
* in lwIP PCBs, we cannot store the original type in those PCBs, and
* we also cannot rely on the PCB having an associated ipsock object
* anymore. We also cannot use the ipsock only when present: it could
* make a TCP PCB "jump" from IPv6 to IPv4 in the netstat listing when
* it goes into TIME_WAIT state, for example.
*
* So, use *only* the type of the local IP address to determine whether
* this is an IPv4 or an IPv6 socket. At the same time, do *not* rely
* on the remote IP address being IPv4 for a local IPv4 address; it may
* be of type IPADDR_TYPE_V6 for an unconnected socket bound to an
* IPv4-mapped IPv6 address. Pretty messy, but we're limited by what
* lwIP offers here. Since it's just netstat, it need not be perfect.
*/
if ((type = IP_GET_TYPE(local_ip)) == IPADDR_TYPE_V4) {
if (!ip_addr_isany(local_ip) || local_port != 0)
ki->ki_prstate = INP_BOUND;
/*
* Make sure the returned socket address types are consistent.
* The only case where the remote IP address is not IPv4 here
* is when it is not set yet, so there is no need to check
* whether it is the 'any' address: it always is.
*/
if (IP_GET_TYPE(remote_ip) != IPADDR_TYPE_V4) {
ip_addr_set_zero_ip4(&ipaddr);
remote_ip = &ipaddr;
}
} else {
if (!ip_addr_isany(local_ip) || local_port != 0)
ki->ki_prstate = IN6P_BOUND;
if (type != IPADDR_TYPE_ANY)
ki->ki_pflags |= IN6P_IPV6_V6ONLY;
}
len = sizeof(ki->ki_dpad); /* use this for the full size, not ki_dst */
addr_put_inet(&ki->ki_dst, &len, remote_ip, TRUE /*kame*/,
remote_port);
/* Check the type of the *local* IP address here. See above. */
if (!ip_addr_isany(remote_ip) || remote_port != 0) {
if (type == IPADDR_TYPE_V4)
ki->ki_prstate = INP_CONNECTED;
else
ki->ki_prstate = IN6P_CONNECTED;
}
}

95
minix/net/lwip/ipsock.h Normal file
View File

@ -0,0 +1,95 @@
#ifndef MINIX_NET_LWIP_IPSOCK_H
#define MINIX_NET_LWIP_IPSOCK_H
/* IP-level socket, shared by TCP, UDP, and RAW. */
struct ipsock {
struct sock ip_sock; /* socket object, MUST be first */
unsigned int ip_flags; /* all socket flags */
size_t ip_sndbuf; /* send buffer size */
size_t ip_rcvbuf; /* receive buffer size */
};
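/*
 * Illustrative sketch (hypothetical names): a protocol module embeds this
 * structure as the first member of its own socket structure, so that
 * pointers to the protocol socket, the ipsock, and the libsockevent sock
 * are all interchangeable through casts.
 */
struct example_sock {
	struct ipsock es_ipsock;	/* ipsock object, MUST be first */
	void *es_pcb;			/* protocol-specific lwIP PCB */
};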
/*
* Socket flags. In order to reduce memory consumption, all these flags are
* stored in the same field (ipsock.ip_flags), and thus flags used by the same
* socket type must not overlap; that is why they are all defined here. For
* example, UDPF/PKTF/IPF should all be unique, and TCPF/IPF should be unique,
* but UDPF/PKTF may overlap with TCPF and UDPF may overlap with RAWF. In
* practice, we have no UDPF or RAWF flags and plenty of space to make all
* flags unique anyway.
*/
#define IPF_IPV6 0x0000001 /* socket is IPv6 */
#define IPF_V6ONLY 0x0000002 /* socket is IPv6 only */
#define PKTF_RECVINFO 0x0000010 /* receive ancillary PKTINFO */
#define PKTF_RECVTTL 0x0000020 /* receive ancillary TTL */
#define PKTF_RECVTOS 0x0000040 /* receive ancillary TOS */
#define PKTF_MCAWARE 0x0000080 /* owner is multicast aware */
#define TCPF_CONNECTING 0x0001000 /* attempting to connect */
#define TCPF_SENT_FIN 0x0002000 /* send FIN when possible */
#define TCPF_RCVD_FIN 0x0004000 /* received FIN from peer */
#define TCPF_FULL 0x0008000 /* PCB send buffer is full */
#define TCPF_OOM 0x0010000 /* memory allocation failed */
#define ipsock_get_sock(ip) (&(ip)->ip_sock)
#define ipsock_is_ipv6(ip) ((ip)->ip_flags & IPF_IPV6)
#define ipsock_is_v6only(ip) ((ip)->ip_flags & IPF_V6ONLY)
#define ipsock_get_flags(ip) ((ip)->ip_flags)
#define ipsock_get_flag(ip,fl) ((ip)->ip_flags & (fl))
#define ipsock_set_flag(ip,fl) ((ip)->ip_flags |= (fl))
#define ipsock_clear_flag(ip,fl) ((ip)->ip_flags &= ~(fl))
#define ipsock_get_sndbuf(ip) ((ip)->ip_sndbuf)
#define ipsock_get_rcvbuf(ip) ((ip)->ip_rcvbuf)
/*
* IP-level option pointers. This is necessary because even though lwIP's
* TCP, UDP, and RAW PCBs share the same initial fields, the C standard does
* not permit generic access to such initial fields (due to both possible
* padding differences and strict-aliasing rules). The fields in this
* structure are therefore pointers to the initial fields of each of the PCB
* structures. If lwIP ever groups its IP PCB fields into a single structure
* and uses that structure as first field of each of the other PCBs, then we
* will be able to replace this structure with a pointer to the IP PCB instead.
* For convenience we also carry the send and receive buffer limits here.
*/
struct ipopts {
ip_addr_t *local_ip;
ip_addr_t *remote_ip;
uint8_t *tos;
uint8_t *ttl;
size_t sndmin;
size_t sndmax;
size_t rcvmin;
size_t rcvmax;
};
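/*
 * Illustrative sketch of filling this structure, modeled on what a UDP-
 * style protocol module would do.  The PCB field names are lwIP's (from
 * "lwip/udp.h"); the buffer limits are made-up example values.
 */
static void
example_get_ipopts(struct udp_pcb * pcb, struct ipopts * ipopts)
{
	ipopts->local_ip = &pcb->local_ip;
	ipopts->remote_ip = &pcb->remote_ip;
	ipopts->tos = &pcb->tos;
	ipopts->ttl = &pcb->ttl;
	ipopts->sndmin = 8192;		/* minimum send buffer size */
	ipopts->sndmax = 65536;		/* maximum send buffer size */
	ipopts->rcvmin = 8192;		/* minimum receive buffer size */
	ipopts->rcvmax = 65536;		/* maximum receive buffer size */
}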
struct ifdev;
void ipsock_init(void);
int ipsock_socket(struct ipsock * ip, int domain, size_t sndbuf, size_t rcvbuf,
struct sock ** sockp);
void ipsock_clone(struct ipsock * ip, struct ipsock * newip, sockid_t newid);
void ipsock_get_any_addr(struct ipsock * ip, ip_addr_t * ipaddr);
int ipsock_check_src_addr(struct ipsock * ip, ip_addr_t * ipaddr,
int allow_mcast, struct ifdev ** ifdevp);
int ipsock_get_src_addr(struct ipsock * ip, const struct sockaddr * addr,
socklen_t addr_len, endpoint_t user_endpt, ip_addr_t * local_ip,
uint16_t local_port, int allow_mcast, ip_addr_t * ipaddr,
uint16_t * portp);
int ipsock_get_dst_addr(struct ipsock * ip, const struct sockaddr * addr,
socklen_t addr_len, const ip_addr_t * local_addr, ip_addr_t * dst_addr,
uint16_t * dst_port);
void ipsock_put_addr(struct ipsock * ip, struct sockaddr * addr,
socklen_t * addr_len, ip_addr_t * ipaddr, uint16_t port);
int ipsock_setsockopt(struct ipsock * ip, int level, int name,
const struct sockdriver_data * data, socklen_t len,
struct ipopts * ipopts);
int ipsock_getsockopt(struct ipsock * ip, int level, int name,
const struct sockdriver_data * data, socklen_t * len,
struct ipopts * ipopts);
void ipsock_get_info(struct kinfo_pcb * ki, const ip_addr_t * local_ip,
uint16_t local_port, const ip_addr_t * remote_ip,
uint16_t remote_port);
#endif /* !MINIX_NET_LWIP_IPSOCK_H */
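
As the ipopts comment above describes, each protocol module points this
structure at the IP-level fields of its own lwIP PCB. A minimal sketch of how
a UDP-based module might fill it in; the helper name and the buffer limit
values are hypothetical, not part of this commit:

	static void
	udpsock_fill_ipopts(struct udp_pcb * pcb, struct ipopts * ipopts)
	{
		/* lwIP's udp_pcb, tcp_pcb, and raw_pcb all start with these
		 * same IP-level fields, via lwIP's IP_PCB field macro. */
		ipopts->local_ip = &pcb->local_ip;
		ipopts->remote_ip = &pcb->remote_ip;
		ipopts->tos = &pcb->tos;
		ipopts->ttl = &pcb->ttl;
		/* Send/receive buffer limits; values illustrative only. */
		ipopts->sndmin = 512;
		ipopts->sndmax = 65536;
		ipopts->rcvmin = 512;
		ipopts->rcvmax = 65536;
	}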

584
minix/net/lwip/lldata.c Normal file
View File

@ -0,0 +1,584 @@
/* LWIP service - lldata.c - link-layer (ARP, NDP) data related routines */
/*
* This module is largely isolated from the regular routing code. There are
* two reasons for that. First, mixing link-layer routes with regular routes
* would not work well due to the fact that lwIP keeps these data structures
* entirely separate. Second, as of version 8, NetBSD keeps the IP-layer and
* link-layer routing separate as well.
*
* Unfortunately, lwIP does not provide much in the way of implementing the
* functionality that would be expected for this module. As such, the current
* implementation is very restricted and simple.
*
* For ARP table entries, lwIP only allows for adding and deleting static
* entries. Non-static entries cannot be deleted. Incomplete (pending)
* entries cannot even be enumerated, nor can (e.g.) expiry information be
* obtained. The lwIP ARP data structures are completely hidden, so there is no
* way to overcome these limitations without changing lwIP itself. As a
* result, not all functionality of the arp(8) userland utility is supported.
*
* For NDP table entries, lwIP offers no API at all. However, since the data
* structures are exposed directly, we can use those to implement full support
* for exposing information in a read-only way. Manipulating those data
* structures directly from here would be too risky, however, and lwIP also
* does not currently support the concept of static NDP table entries.
* Therefore, adding, changing, and deleting NDP entries is currently not
* supported; implementing that will first require changes to lwIP itself.
*
* The ndp(8) userland utility is also able to show and manipulate various
* other neighbor discovery related tables and settings. We support only a
* small subset of them. The main reason for this is that the other tables,
* in particular the prefix and default router lists, are not relevant: on
* MINIX 3, these are always managed fully in userland (usually dhcpcd(8)), and
* we even hardcode lwIP not to parse Router Advertisement messages at all, so
* even though those tables are still part of lwIP, they are always empty.
* Other ndp(8) functionality is unsupported for similar reasons.
*/
#include "lwip.h"
#include "lldata.h"
#include "route.h"
#include "rtsock.h"
#include "lwip/etharp.h"
#include "lwip/nd6.h"
#include "lwip/priv/nd6_priv.h" /* for neighbor_cache */
/*
* Process a routing command specifically for an ARP table entry. Return OK if
* the routing command has been processed successfully and a routing socket
* reply message has already been generated. Return a negative error code on
* failure, in which case the caller will generate a reply message instead.
*/
static int
lldata_arp_process(unsigned int type, const ip_addr_t * dst_addr,
const struct eth_addr * gw_addr, struct ifdev * ifdev,
unsigned int flags, const struct rtsock_request * rtr)
{
const ip4_addr_t *ip4addr;
struct eth_addr ethaddr, *ethptr;
struct netif *netif;
lldata_arp_num_t num;
err_t err;
netif = (ifdev != NULL) ? ifdev_get_netif(ifdev) : NULL;
num = etharp_find_addr(netif, ip_2_ip4(dst_addr), &ethptr, &ip4addr);
if (type != RTM_ADD && num < 0)
return ESRCH;
else if (type == RTM_ADD && num >= 0)
return EEXIST;
switch (type) {
case RTM_CHANGE:
/*
* This request is not used by arp(8), so keep things simple.
* For RTM_ADD we support only static entries, so we support only
* those here too, which lets us implement the change as a
* delete-and-re-add. If the ethernet address is not being
* changed, re-add the entry with its previous ethernet address.
*/
if (gw_addr == NULL)
gw_addr = ethptr;
if (etharp_remove_static_entry(ip_2_ip4(dst_addr)) != ERR_OK)
return EPERM;
/* FALLTHROUGH */
case RTM_ADD:
assert(gw_addr != NULL);
memcpy(&ethaddr, gw_addr, sizeof(ethaddr));
/*
* Adding static, permanent, unpublished, non-proxy entries is
* all that lwIP supports right now. We also do not get to
* specify the interface, and the way lwIP picks the interface
* may in fact result in a different one.
*/
if ((err = etharp_add_static_entry(ip_2_ip4(dst_addr),
&ethaddr)) != ERR_OK)
return util_convert_err(err);
if ((num = etharp_find_addr(NULL /*netif*/, ip_2_ip4(dst_addr),
&ethptr, &ip4addr)) < 0)
panic("unable to find just-added static ARP entry");
/* FALLTHROUGH */
case RTM_LOCK:
case RTM_GET:
rtsock_msg_arp(num, type, rtr);
return OK;
case RTM_DELETE:
memcpy(&ethaddr, ethptr, sizeof(ethaddr));
if (etharp_remove_static_entry(ip_2_ip4(dst_addr)) != ERR_OK)
return EPERM;
/*
* FIXME: the following block is a hack, because we cannot
* predict whether the above removal will succeed, while at the
* same time we need the entry to be present in order to report
* the deleted address to the routing socket. We temporarily
* re-add and then remove the entry just for the purpose of
* generating the routing socket reply. There are other ways
* to resolve this, but only a better lwIP etharp API would
* allow us to resolve this problem cleanly.
*/
(void)etharp_add_static_entry(ip_2_ip4(dst_addr), &ethaddr);
num = etharp_find_addr(NULL /*netif*/, ip_2_ip4(dst_addr),
&ethptr, &ip4addr);
assert(num >= 0);
rtsock_msg_arp(num, type, rtr);
(void)etharp_remove_static_entry(ip_2_ip4(dst_addr));
return OK;
default:
return EINVAL;
}
}
/*
* Enumerate ARP table entries. Return TRUE if there is at least one more ARP
* table entry, of which the number is stored in 'num'. The caller should set
* 'num' to 0 initially, and increase it by one between a successful call and
* the next call. Return FALSE if there are no more ARP table entries.
*/
int
lldata_arp_enum(lldata_arp_num_t * num)
{
ip4_addr_t *ip4addr;
struct netif *netif;
struct eth_addr *ethaddr;
for (; *num < ARP_TABLE_SIZE; ++*num) {
if (etharp_get_entry(*num, &ip4addr, &netif, &ethaddr))
return TRUE;
}
return FALSE;
}
/*
* Obtain information about the ARP table entry identified by 'num'. The IPv4
* address of the entry is stored in 'addr'. Its ethernet address is stored in
* 'gateway'. The associated interface is stored in 'ifdevp', and the entry's
* routing flags (RTF_) are stored in 'flagsp'.
*/
void
lldata_arp_get(lldata_arp_num_t num, struct sockaddr_in * addr,
struct sockaddr_dlx * gateway, struct ifdev ** ifdevp,
unsigned int * flagsp)
{
ip_addr_t ipaddr;
ip4_addr_t *ip4addr;
struct netif *netif;
struct ifdev *ifdev;
struct eth_addr *ethaddr;
socklen_t addr_len;
if (!etharp_get_entry(num, &ip4addr, &netif, &ethaddr))
panic("request for invalid ARP entry");
ip_addr_copy_from_ip4(ipaddr, *ip4addr);
assert(netif != NULL);
ifdev = netif_get_ifdev(netif);
addr_len = sizeof(*addr);
addr_put_inet((struct sockaddr *)addr, &addr_len, &ipaddr,
TRUE /*kame*/, 0 /*port*/);
addr_len = sizeof(*gateway);
addr_put_link((struct sockaddr *)gateway, &addr_len,
ifdev_get_index(ifdev), ifdev_get_iftype(ifdev), NULL /*name*/,
ethaddr->addr, sizeof(ethaddr->addr));
*ifdevp = ifdev;
/*
* TODO: this is not necessarily accurate, but lwIP does not provide us
* with information as to whether this is a static entry or not..
*/
*flagsp = RTF_HOST | RTF_LLINFO | RTF_LLDATA | RTF_STATIC | RTF_CLONED;
}
/*
* Obtain information about the ND6 neighbor cache entry 'i', which must be a
* number between 0 (inclusive) and LWIP_ND6_NUM_NEIGHBORS (exclusive). If an
* entry with this number exists, return a pointer to its IPv6 address, and
* additional information in each of the given pointers if not NULL. The
* associated interface is stored in 'netif'. If the entry has an associated
* link-layer address, a pointer to it is stored in 'lladdr'. The entry's
* state (ND6_{INCOMPLETE,REACHABLE,STALE,DELAY,PROBE}) is stored in 'state'.
* The 'isrouter' parameter is filled with a boolean value indicating whether
* the entry is for a router. For ND6_INCOMPLETE and ND6_PROBE, the number of
* probes sent so far is stored in 'probes_sent'; for other states, the value
* is set to zero. For ND6_REACHABLE and ND6_DELAY, the time until expiration
* in ND6_TMR_INTERVAL-millisecond units is stored in 'expire_time'; for other
* states, the value is set to zero. If an entry with number 'i' does not
* exist, NULL is returned.
*
* TODO: upstream this function to lwIP.
*/
static const ip6_addr_t *
nd6_get_neighbor_cache_entry(int8_t i, struct netif ** netif,
const uint8_t ** lladdr, uint8_t * state, uint8_t * isrouter,
uint32_t * probes_sent, uint32_t * expire_time)
{
if (i < 0 || i >= LWIP_ND6_NUM_NEIGHBORS ||
neighbor_cache[i].state == ND6_NO_ENTRY)
return NULL;
if (netif != NULL)
*netif = neighbor_cache[i].netif;
if (lladdr != NULL) {
if (neighbor_cache[i].state != ND6_INCOMPLETE)
*lladdr = neighbor_cache[i].lladdr;
else
*lladdr = NULL;
}
if (state != NULL)
*state = neighbor_cache[i].state;
if (isrouter != NULL)
*isrouter = neighbor_cache[i].isrouter;
if (probes_sent != NULL) {
if (neighbor_cache[i].state == ND6_INCOMPLETE ||
neighbor_cache[i].state == ND6_PROBE)
*probes_sent = neighbor_cache[i].counter.probes_sent;
else
*probes_sent = 0;
}
if (expire_time != NULL) {
switch (neighbor_cache[i].state) {
case ND6_REACHABLE:
*expire_time =
neighbor_cache[i].counter.reachable_time /
ND6_TMR_INTERVAL;
break;
case ND6_DELAY:
*expire_time = neighbor_cache[i].counter.delay_time;
break;
case ND6_INCOMPLETE:
case ND6_PROBE:
/* Probes are sent once per timer tick. */
*expire_time = (LWIP_ND6_MAX_MULTICAST_SOLICIT + 1 -
neighbor_cache[i].counter.probes_sent) *
(ND6_TMR_INTERVAL / 1000);
break;
default:
/* Stale entries do not expire; they get replaced. */
*expire_time = 0;
break;
}
}
return &neighbor_cache[i].next_hop_address;
}
/*
* Find a neighbor cache entry by IPv6 address. Return its index number if
* found, or -1 if not. This is a reimplementation of the exact same function
* internal to lwIP.
*
* TODO: make this function public in lwIP.
*/
static int8_t
nd6_find_neighbor_cache_entry(const ip6_addr_t * addr)
{
int8_t i;
for (i = 0; i < LWIP_ND6_NUM_NEIGHBORS; i++) {
if (ip6_addr_cmp(addr, &neighbor_cache[i].next_hop_address))
return i;
}
return -1;
}
/*
* Find an NDP table entry based on the given interface and IPv6 address. On
* success, return OK, with the entry's index number stored in 'nump'. On
* failure, return an appropriate error code.
*/
int
lldata_ndp_find(struct ifdev * ifdev, const struct sockaddr_in6 * addr,
lldata_ndp_num_t * nump)
{
ip_addr_t ipaddr;
int8_t i;
int r;
if ((r = addr_get_inet((const struct sockaddr *)addr, sizeof(*addr),
IPADDR_TYPE_V6, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK)
return r;
/*
* A given link-local address may come without any zone set in the
* address at all. In such cases, add the zone ourselves, using the
* given interface.
*/
if (ip6_addr_lacks_zone(ip_2_ip6(&ipaddr), IP6_UNKNOWN))
ip6_addr_assign_zone(ip_2_ip6(&ipaddr), IP6_UNKNOWN,
ifdev_get_netif(ifdev));
i = nd6_find_neighbor_cache_entry(ip_2_ip6(&ipaddr));
if (i < 0)
return ESRCH;
/*
* We should compare the neighbor cache entry's associated netif to
* the given ifdev, but since the lwIP neighbor cache is currently not
* keyed by netif anyway (i.e. the internal lookups are purely by IPv6
* address as well), doing so makes little sense in practice.
*/
*nump = (lldata_ndp_num_t)i;
return OK;
}
/*
* Process a routing command specifically for an NDP table entry. Return OK if
* the routing command has been processed successfully and a routing socket
* reply message has already been generated. Return a negative error code on
* failure, in which case the caller will generate a reply message instead.
*/
static int
lldata_ndp_process(unsigned int type, const ip_addr_t * dst_addr,
const struct eth_addr * gw_addr,
struct ifdev * ifdev, unsigned int flags,
const struct rtsock_request * rtr)
{
lldata_ndp_num_t num;
num = (lldata_ndp_num_t)
nd6_find_neighbor_cache_entry(ip_2_ip6(dst_addr));
if (type != RTM_ADD && num < 0)
return ESRCH;
else if (type == RTM_ADD && num >= 0)
return EEXIST;
switch (type) {
case RTM_LOCK:
case RTM_GET:
rtsock_msg_arp(num, type, rtr);
return OK;
case RTM_ADD:
case RTM_CHANGE:
case RTM_DELETE:
/* TODO: add lwIP support to implement these commands. */
return ENOSYS;
default:
return EINVAL;
}
}
/*
* Enumerate NDP table entries. Return TRUE if there is at least one more NDP
* table entry, of which the number is stored in 'num'. The caller should set
* 'num' to 0 initially, and increase it by one between a successful call and
* the next call. Return FALSE if there are no more NDP table entries.
*/
int
lldata_ndp_enum(lldata_ndp_num_t * num)
{
for (; *num < LWIP_ND6_NUM_NEIGHBORS; ++*num) {
if (nd6_get_neighbor_cache_entry(*num, NULL /*netif*/,
NULL /*lladdr*/, NULL /*state*/, NULL /*isrouter*/,
NULL /*probes_sent*/, NULL /*expire_time*/) != NULL)
return TRUE;
}
return FALSE;
}
/*
* Obtain information about the NDP table entry identified by 'num'. The IPv6
* address of the entry is stored in 'addr'. Its ethernet address is stored in
* 'gateway'. The associated interface is stored in 'ifdevp', and the entry's
* routing flags (RTF_) are stored in 'flagsp'.
*/
void
lldata_ndp_get(lldata_ndp_num_t num, struct sockaddr_in6 * addr,
struct sockaddr_dlx * gateway, struct ifdev ** ifdevp,
unsigned int * flagsp)
{
const ip6_addr_t *ip6addr;
ip_addr_t ipaddr;
struct netif *netif;
struct ifdev *ifdev;
const uint8_t *lladdr;
socklen_t addr_len;
ip6addr = nd6_get_neighbor_cache_entry(num, &netif, &lladdr,
NULL /*state*/, NULL /*isrouter*/, NULL /*probes_sent*/,
NULL /*expire_time*/);
assert(ip6addr != NULL);
ip_addr_copy_from_ip6(ipaddr, *ip6addr);
ifdev = netif_get_ifdev(netif);
assert(ifdev != NULL);
addr_len = sizeof(*addr);
addr_put_inet((struct sockaddr *)addr, &addr_len, &ipaddr,
TRUE /*kame*/, 0 /*port*/);
addr_len = sizeof(*gateway);
addr_put_link((struct sockaddr *)gateway, &addr_len,
ifdev_get_index(ifdev), ifdev_get_iftype(ifdev), NULL /*name*/,
lladdr, ifdev_get_hwlen(ifdev));
*ifdevp = ifdev;
*flagsp = RTF_HOST | RTF_LLINFO | RTF_LLDATA | RTF_CLONED;
}
/*
* Obtain information about the NDP table entry with the number 'num', which
* must be obtained through a previous call to lldata_ndp_find(). On return,
* 'asked' is filled with the number of probes sent so far (0 if inapplicable),
* 'isrouter' is set to 1 or 0 depending on whether the entry is for a router,
* 'state' is set to the entry's state (ND6_LLINFO_), and 'expire' is set to
* the UNIX timestamp of expiry for the entry, or 0 for permanent entries.
* None of the given pointers may be NULL. This function always succeeds.
*/
void
lldata_ndp_get_info(lldata_ndp_num_t num, long * asked, int * isrouter,
int * state, int * expire)
{
uint32_t nd6_probes_sent = 0 /*gcc*/, nd6_expire_time = 0 /*gcc*/;
uint8_t nd6_state = 0 /*gcc*/, nd6_isrouter = 0 /*gcc*/;
(void)nd6_get_neighbor_cache_entry(num, NULL /*netif*/,
NULL /*lladdr*/, &nd6_state, &nd6_isrouter, &nd6_probes_sent,
&nd6_expire_time);
*asked = (long)nd6_probes_sent;
*isrouter = !!nd6_isrouter;
switch (nd6_state) {
case ND6_INCOMPLETE: *state = ND6_LLINFO_INCOMPLETE; break;
case ND6_REACHABLE: *state = ND6_LLINFO_REACHABLE; break;
case ND6_STALE: *state = ND6_LLINFO_STALE; break;
case ND6_DELAY: *state = ND6_LLINFO_DELAY; break;
case ND6_PROBE: *state = ND6_LLINFO_PROBE; break;
default: panic("unknown ND6 state %u", nd6_state);
}
if (nd6_expire_time != 0)
*expire = clock_time(NULL) +
(int)nd6_expire_time * (ND6_TMR_INTERVAL / 1000);
else
*expire = 0;
}
/*
* Process a routing command specifically for a link-layer route, as one of the
* specific continuations of processing started by route_process(). The RTM_
* routing command is given as 'type'. The route destination is given as
* 'dst_addr'; its address type determines whether the operation is for ARP or
* NDP. The sockaddr structure for 'gateway' is passed on as is and may have
* to be parsed here if not NULL. 'ifdev' is the interface to be associated
* with the route; it is non-NULL only if an interface name (IFP) or address
* (IFA) was given. The RTF_ flags field has been checked against the globally
* supported flags, but may have to be checked for flags that do not apply to
* ARP/NDP routes. Return OK or a negative error code, following the same
* semantics as route_process().
*/
int
lldata_process(unsigned int type, const ip_addr_t * dst_addr,
const struct sockaddr * gateway, struct ifdev * ifdev,
unsigned int flags, const struct rtsock_request * rtr)
{
const struct route_entry *route;
struct eth_addr ethaddr, *gw_addr;
int r;
assert(flags & RTF_LLDATA);
/*
* It seems that RTF_UP does not apply to link-layer routing entries.
* We basically accept any flags that we can return, but we do not
* actually check most of them anywhere.
*/
if ((flags & ~(RTF_HOST | RTF_LLINFO | RTF_LLDATA | RTF_STATIC |
RTF_CLONED | RTF_ANNOUNCE)) != 0)
return EINVAL;
gw_addr = NULL;
if (type == RTM_ADD || type == RTM_CHANGE) {
/*
* Link-layer entries are always host entries. Not all
* requests pass in this flag though, so check only when the
* flags are supposed to be set.
*/
if ((type == RTM_ADD || type == RTM_CHANGE) &&
!(flags & RTF_HOST))
return EINVAL;
/* lwIP does not support publishing custom entries. */
if (flags & RTF_ANNOUNCE)
return ENOSYS;
/* RTF_GATEWAY is always cleared for link-layer entries. */
if (gateway != NULL) {
if ((r = addr_get_link(gateway, gateway->sa_len,
NULL /*name*/, 0 /*name_max*/, ethaddr.addr,
sizeof(ethaddr.addr))) != OK)
return r;
gw_addr = &ethaddr;
}
if (type == RTM_ADD) {
if (gateway == NULL)
return EINVAL;
/*
* If no interface has been specified, see if the
* destination address is on a locally connected
* network. If so, use that network's interface.
* Otherwise reject the request altogether: we must
* have an interface to which to associate the entry.
*/
if (ifdev == NULL) {
if ((route = route_lookup(dst_addr)) != NULL &&
!(route_get_flags(route) & RTF_GATEWAY))
ifdev = route_get_ifdev(route);
else
return ENETUNREACH;
}
}
}
if (IP_IS_V4(dst_addr))
return lldata_arp_process(type, dst_addr, gw_addr, ifdev,
flags, rtr);
else
return lldata_ndp_process(type, dst_addr, gw_addr, ifdev,
flags, rtr);
}

27
minix/net/lwip/lldata.h Normal file
View File

@ -0,0 +1,27 @@
#ifndef MINIX_NET_LWIP_LLDATA_H
#define MINIX_NET_LWIP_LLDATA_H
struct rtsock_request;
typedef int lldata_arp_num_t; /* ARP table entry number */
typedef int lldata_ndp_num_t; /* NDP table entry number */
int lldata_arp_enum(lldata_arp_num_t * num);
void lldata_arp_get(lldata_arp_num_t num, struct sockaddr_in * addr,
struct sockaddr_dlx * gateway, struct ifdev ** ifdevp,
unsigned int * flagsp);
int lldata_ndp_find(struct ifdev * ifdev,
const struct sockaddr_in6 * addr, lldata_ndp_num_t * nump);
int lldata_ndp_enum(lldata_ndp_num_t * num);
void lldata_ndp_get(lldata_ndp_num_t num, struct sockaddr_in6 * addr,
struct sockaddr_dlx * gateway, struct ifdev ** ifdevp,
unsigned int * flagsp);
void lldata_ndp_get_info(lldata_ndp_num_t num, long * asked, int * isrouter,
int * state, int * expire);
int lldata_process(unsigned int type, const ip_addr_t * dst_addr,
const struct sockaddr * gateway, struct ifdev * ifdev,
unsigned int flags, const struct rtsock_request * rtr);
#endif /* !MINIX_NET_LWIP_LLDATA_H */
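
The enumeration functions declared above follow the caller-driven pattern
documented in lldata.c: start at number 0, and increase the number by one
after each successful call. A minimal usage sketch from a hypothetical
caller, not part of this commit:

	static void
	dump_arp_table(void)
	{
		lldata_arp_num_t num;
		struct sockaddr_in addr;
		struct sockaddr_dlx gateway;
		struct ifdev *ifdev;
		unsigned int flags;

		for (num = 0; lldata_arp_enum(&num); num++) {
			/* 'num' identifies a valid entry; get its details. */
			lldata_arp_get(num, &addr, &gateway, &ifdev, &flags);
		}
	}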

77
minix/net/lwip/lnksock.c Normal file
View File

@ -0,0 +1,77 @@
/* LWIP service - lnksock.c - link sockets */
/*
* This module contains absolutely minimal support for AF_LINK type sockets,
* because for now we need them only to support a specific set of IOCTLs, as
* required by for example ifconfig(8).
*/
#include "lwip.h"
/* The number of link sockets. */
#define NR_LNKSOCK 4
static struct lnksock {
struct sock lnk_sock; /* socket object, MUST be first */
SIMPLEQ_ENTRY(lnksock) lnk_next; /* next in free list */
} lnk_array[NR_LNKSOCK];
static SIMPLEQ_HEAD(, lnksock) lnk_freelist; /* list of free link sockets */
static const struct sockevent_ops lnksock_ops;
/*
* Initialize the link sockets module.
*/
void
lnksock_init(void)
{
unsigned int slot;
/* Initialize the list of free link sockets. */
SIMPLEQ_INIT(&lnk_freelist);
for (slot = 0; slot < __arraycount(lnk_array); slot++)
SIMPLEQ_INSERT_TAIL(&lnk_freelist, &lnk_array[slot], lnk_next);
}
/*
* Create a link socket.
*/
sockid_t
lnksock_socket(int type, int protocol, struct sock ** sockp,
const struct sockevent_ops ** ops)
{
struct lnksock *lnk;
if (type != SOCK_DGRAM)
return EPROTOTYPE;
if (protocol != 0)
return EPROTONOSUPPORT;
if (SIMPLEQ_EMPTY(&lnk_freelist))
return ENOBUFS;
lnk = SIMPLEQ_FIRST(&lnk_freelist);
SIMPLEQ_REMOVE_HEAD(&lnk_freelist, lnk_next);
*sockp = &lnk->lnk_sock;
*ops = &lnksock_ops;
return SOCKID_LNK | (sockid_t)(lnk - lnk_array);
}
/*
* Free up a closed link socket.
*/
static void
lnksock_free(struct sock * sock)
{
struct lnksock *lnk = (struct lnksock *)sock;
SIMPLEQ_INSERT_HEAD(&lnk_freelist, lnk, lnk_next);
}
static const struct sockevent_ops lnksock_ops = {
.sop_ioctl = ifconf_ioctl,
.sop_free = lnksock_free
};

420
minix/net/lwip/loopif.c Normal file
View File

@ -0,0 +1,420 @@
/* LWIP service - loopif.c - loopback interfaces */
/*
* There is always at least one loopback device. This device is also used to
* loop back packets that are sent on other interfaces to those interfaces'
* own local addresses. Therefore, not all packets on the loopback device have
* a source or destination address corresponding to the loopback device.
*/
#include "lwip.h"
/*
* As a safety measure, if lwIP somehow gets stuck in a loop replying to its
* own packets on a loopback interface, stop immediately feeding packets back
* into lwIP after this many packets. The remaining packets will still be
* delivered, but not before the main message loop has had a chance to run.
*/
#define LOOPIF_LIMIT 65536
/*
* The MTU is restricted to 65531 bytes, because we need space for a 4-byte
* header to identify the original interface of the packet.
*/
#define LOOPIF_MAX_MTU (UINT16_MAX - sizeof(uint32_t)) /* maximum MTU */
#define LOOPIF_DEF_MTU LOOPIF_MAX_MTU /* default MTU */
#define NR_LOOPIF 2 /* number of loopback devices */
struct loopif {
struct ifdev loopif_ifdev; /* interface device, MUST be first */
struct pbuf *loopif_head; /* head of pending loopback packets */
struct pbuf **loopif_tailp; /* tail ptr-ptr of pending packets */
TAILQ_ENTRY(loopif) loopif_next; /* next in free list */
} loopif_array[NR_LOOPIF];
static TAILQ_HEAD(, loopif) loopif_freelist; /* free loop interfaces list */
static TAILQ_HEAD(, loopif) loopif_activelist; /* active loop interfaces */
#define loopif_get_netif(loopif) (ifdev_get_netif(&(loopif)->loopif_ifdev))
static unsigned int loopif_cksum_flags;
static int loopif_create(const char *name);
static const struct ifdev_ops loopif_ops;
/*
* Initialize the loopback interface module.
*/
void
loopif_init(void)
{
unsigned int slot;
/* Initialize the lists of loopback interfaces. */
TAILQ_INIT(&loopif_freelist);
TAILQ_INIT(&loopif_activelist);
for (slot = 0; slot < __arraycount(loopif_array); slot++)
TAILQ_INSERT_TAIL(&loopif_freelist, &loopif_array[slot],
loopif_next);
/*
* The default is to perform no checksumming on loopback interfaces,
* except for ICMP messages because otherwise we would need additional
* changes in the code receiving those. In fact, for future
* compatibility, disable only those flags that we manage ourselves.
*/
loopif_cksum_flags = NETIF_CHECKSUM_ENABLE_ALL &
~(NETIF_CHECKSUM_GEN_IP | NETIF_CHECKSUM_CHECK_IP |
NETIF_CHECKSUM_GEN_UDP | NETIF_CHECKSUM_CHECK_UDP |
NETIF_CHECKSUM_GEN_TCP | NETIF_CHECKSUM_CHECK_TCP);
/* Tell the ifdev module that users may create more loopif devices. */
ifdev_register("lo", loopif_create);
}
/*
* Polling function, invoked after each message loop iteration. Forward any
* packets received on the output side of the loopback device during this
* loop iteration, to the input side of the device.
*/
static void
loopif_poll(struct ifdev * ifdev)
{
struct loopif *loopif = (struct loopif *)ifdev;
struct pbuf *pbuf, **pnext;
struct ifdev *oifdev;
struct netif *netif;
uint32_t oifindex;
unsigned int count;
static int warned = FALSE;
count = 0;
while ((pbuf = loopif->loopif_head) != NULL) {
/*
* Prevent endless loops. Keep in mind that packets may be
* added to the queue as part of processing packets from the
* queue here, so the queue itself will never reach this
* length. As such the limit can (and must) be fairly high.
*
* In any case, if this warning is shown, that basically means
* that a bug in lwIP has been triggered. There should be no
* such bugs, so if there are, they should be fixed in lwIP.
*/
if (count++ == LOOPIF_LIMIT) {
if (!warned) {
printf("LWIP: excess loopback traffic, "
"throttling output\n");
warned = TRUE;
}
break;
}
pnext = pchain_end(pbuf);
if ((loopif->loopif_head = *pnext) == NULL)
loopif->loopif_tailp = &loopif->loopif_head;
*pnext = NULL;
/*
* Get the original interface index for the packet. If it is
* non-zero, the packet must also be passed back to the interface
* that it identifies. The interface should still exist in all
* cases, but better safe than sorry.
*/
memcpy(&oifindex, pbuf->payload, sizeof(oifindex));
util_pbuf_header(pbuf, -(int)sizeof(oifindex));
if (oifindex != 0 &&
(oifdev = ifdev_get_by_index(oifindex)) != NULL)
netif = ifdev_get_netif(oifdev);
else
netif = NULL;
/*
* Loopback devices hand packets to BPF on output only. Doing
* so on input as well would duplicate all captured packets.
*/
ifdev_input(ifdev, pbuf, netif, FALSE /*to_bpf*/);
}
}
/*
* Process a packet as output on a loopback interface. Packets cannot be
* passed back into lwIP right away, nor can the original packets be passed
* back into lwIP. Therefore, make a copy of the packet, and pass it back to
* lwIP at the end of the current message loop iteration.
*/
static err_t
loopif_output(struct ifdev * ifdev, struct pbuf * pbuf, struct netif * netif)
{
struct loopif *loopif = (struct loopif *)ifdev;
struct ifdev *oifdev;
struct pbuf *pcopy;
uint32_t oifindex;
/* Reject oversized packets immediately. This should not happen. */
if (pbuf->tot_len > UINT16_MAX - sizeof(oifindex)) {
printf("LWIP: attempt to send oversized loopback packet\n");
return ERR_MEM;
}
/*
* If the service is low on memory, this is a likely place where
* allocation failures will occur. Thus, do not print anything here.
* The user can diagnose such problems with interface statistics.
*/
pcopy = pchain_alloc(PBUF_RAW, sizeof(oifindex) + pbuf->tot_len);
if (pcopy == NULL) {
ifdev_output_drop(ifdev);
return ERR_MEM;
}
/*
* If the packet was purposely diverted from a non-loopback interface
* to this interface, we have to remember the original interface, so
* that we can pass back the packet to that interface as well. If we
* don't, packets to link-local addresses assigned to non-loopback
* interfaces will not be processed correctly.
*/
if (netif != NULL) {
oifdev = netif_get_ifdev(netif);
oifindex = ifdev_get_index(oifdev);
} else
oifindex = 0;
assert(pcopy->len >= sizeof(oifindex));
memcpy(pcopy->payload, &oifindex, sizeof(oifindex));
util_pbuf_header(pcopy, -(int)sizeof(oifindex));
if (pbuf_copy(pcopy, pbuf) != ERR_OK)
panic("unexpected pbuf copy failure");
pcopy->flags |= pbuf->flags & (PBUF_FLAG_LLMCAST | PBUF_FLAG_LLBCAST);
util_pbuf_header(pcopy, sizeof(oifindex));
*loopif->loopif_tailp = pcopy;
loopif->loopif_tailp = pchain_end(pcopy);
return ERR_OK;
}
/*
* Initialization function for a loopback-type netif interface, called from
* lwIP at interface creation time.
*/
static err_t
loopif_init_netif(struct ifdev * ifdev, struct netif * netif)
{
netif->name[0] = 'l';
netif->name[1] = 'o';
/*
* FIXME: unfortunately, lwIP does not allow one to enable multicast on
* an interface without also enabling multicast management traffic
* (that is, IGMP and MLD). Thus, for now, joining multicast groups
* and assigning local IPv6 addresses will incur such traffic even on
* loopback interfaces. For now this is preferable over not supporting
* multicast on loopback interfaces at all.
*/
netif->flags |= NETIF_FLAG_IGMP | NETIF_FLAG_MLD6;
NETIF_SET_CHECKSUM_CTRL(netif, loopif_cksum_flags);
return ERR_OK;
}
/*
* Create a new loopback device.
*/
static int
loopif_create(const char * name)
{
struct loopif *loopif;
/* Find a free loopback interface slot, if available. */
if (TAILQ_EMPTY(&loopif_freelist))
return ENOBUFS;
loopif = TAILQ_FIRST(&loopif_freelist);
TAILQ_REMOVE(&loopif_freelist, loopif, loopif_next);
/* Initialize the loopif structure. */
TAILQ_INSERT_HEAD(&loopif_activelist, loopif, loopif_next);
loopif->loopif_head = NULL;
loopif->loopif_tailp = &loopif->loopif_head;
/*
* For simplicity and efficiency, we do not prepend the address family
* (IPv4/IPv6) to the packet for BPF, which means our loopback devices
* are of type DLT_RAW rather than (NetBSD's) DLT_NULL.
*/
ifdev_add(&loopif->loopif_ifdev, name, IFF_LOOPBACK | IFF_MULTICAST,
IFT_LOOP, 0 /*hdrlen*/, 0 /*addrlen*/, DLT_RAW, LOOPIF_MAX_MTU,
0 /*nd6flags*/, &loopif_ops);
ifdev_update_link(&loopif->loopif_ifdev, LINK_STATE_UP);
return OK;
}
/*
* Destroy an existing loopback device.
*/
static int
loopif_destroy(struct ifdev * ifdev)
{
struct loopif *loopif = (struct loopif *)ifdev;
struct pbuf *pbuf, **pnext;
int r;
/*
* The ifdev module may refuse to remove this interface if it is the
* loopback interface used to loop back packets for other interfaces.
*/
if ((r = ifdev_remove(&loopif->loopif_ifdev)) != OK)
return r;
/*
* Clean up. The loopback queue can be non-empty only if we have been
* throttling in case of a feedback loop.
*/
while ((pbuf = loopif->loopif_head) != NULL) {
pnext = pchain_end(pbuf);
if ((loopif->loopif_head = *pnext) == NULL)
loopif->loopif_tailp = &loopif->loopif_head;
*pnext = NULL;
pbuf_free(pbuf);
}
TAILQ_REMOVE(&loopif_activelist, loopif, loopif_next);
TAILQ_INSERT_HEAD(&loopif_freelist, loopif, loopif_next);
return OK;
}
/*
* Set NetBSD-style interface flags (IFF_) for a loopback interface.
*/
static int
loopif_set_ifflags(struct ifdev * ifdev, unsigned int ifflags)
{
struct loopif *loopif = (struct loopif *)ifdev;
/*
* Only the IFF_UP flag may be set and cleared. We adjust the
* IFF_RUNNING flag immediately based on this flag. This is a bit
* dangerous, but the caller takes this possibility into account.
*/
if ((ifflags & ~IFF_UP) != 0)
return EINVAL;
if (ifflags & IFF_UP)
ifdev_update_ifflags(&loopif->loopif_ifdev,
ifdev_get_ifflags(&loopif->loopif_ifdev) | IFF_RUNNING);
else
ifdev_update_ifflags(&loopif->loopif_ifdev,
ifdev_get_ifflags(&loopif->loopif_ifdev) & ~IFF_RUNNING);
return OK;
}
/*
* Set the Maximum Transmission Unit for this interface. Return TRUE if the
* new value is acceptable, in which case the caller will do the rest. Return
* FALSE otherwise.
*/
static int
loopif_set_mtu(struct ifdev * ifdev __unused, unsigned int mtu)
{
return (mtu <= LOOPIF_MAX_MTU);
}
static const struct ifdev_ops loopif_ops = {
.iop_init = loopif_init_netif,
.iop_input = ip_input,
.iop_output = loopif_output,
.iop_poll = loopif_poll,
.iop_set_ifflags = loopif_set_ifflags,
.iop_set_mtu = loopif_set_mtu,
.iop_destroy = loopif_destroy,
};
/*
* Set and/or retrieve a per-protocol loopback checksumming option through
* sysctl(7).
*/
ssize_t
loopif_cksum(struct rmib_call * call, struct rmib_node * node __unused,
struct rmib_oldp * oldp, struct rmib_newp * newp)
{
struct loopif *loopif;
unsigned int flags;
int r, val;
/*
* The third name field is the protocol. We ignore the domain (the
* second field), thus sharing settings between PF_INET and PF_INET6.
* This is necessary because lwIP does not support TCP/UDP checksumming
* flags on a per-domain basis.
*/
switch (call->call_oname[2]) {
case IPPROTO_IP:
flags = NETIF_CHECKSUM_GEN_IP | NETIF_CHECKSUM_CHECK_IP;
break;
case IPPROTO_UDP:
flags = NETIF_CHECKSUM_GEN_UDP | NETIF_CHECKSUM_CHECK_UDP;
break;
case IPPROTO_TCP:
flags = NETIF_CHECKSUM_GEN_TCP | NETIF_CHECKSUM_CHECK_TCP;
break;
default:
return EINVAL;
}
/* Copy out the old (current) checksumming option. */
if (oldp != NULL) {
val = !!(loopif_cksum_flags & flags);
if ((r = rmib_copyout(oldp, 0, &val, sizeof(val))) < 0)
return r;
}
if (newp != NULL) {
if ((r = rmib_copyin(newp, &val, sizeof(val))) != OK)
return r;
if (val)
loopif_cksum_flags |= flags;
else
loopif_cksum_flags &= ~flags;
/*
* Apply the new checksum flags to all loopback interfaces.
* Technically, this may result in dropped packets when
* enabling checksumming on a throttled loopif, but that is a
* case so rare and unimportant that we ignore it.
*/
TAILQ_FOREACH(loopif, &loopif_activelist, loopif_next) {
NETIF_SET_CHECKSUM_CTRL(loopif_get_netif(loopif),
loopif_cksum_flags);
}
}
/* Return the length of the node. */
return sizeof(val);
}
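
The output and poll routines above communicate the originating interface
through a four-byte index prepended to each queued packet copy, which is also
why LOOPIF_MAX_MTU is UINT16_MAX minus that header. A standalone sketch of
the same framing, using a plain buffer instead of a pbuf purely for
illustration:

	#include <assert.h>
	#include <stdint.h>
	#include <string.h>

	int
	main(void)
	{
		uint8_t frame[sizeof(uint32_t) + 1500]; /* header + packet */
		uint32_t oifindex = 7, index;

		/* Encode: store the originating interface index up front. */
		memcpy(frame, &oifindex, sizeof(oifindex));
		/* ...packet data follows at frame + sizeof(oifindex)... */

		/* Decode: read the index back, then skip past the header. */
		memcpy(&index, frame, sizeof(index));
		assert(index == 7);
		return 0;
	}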

382
minix/net/lwip/lwip.c Normal file
View File

@ -0,0 +1,382 @@
/* LWIP service - lwip.c - main program and dispatch code */
#include "lwip.h"
#include "tcpisn.h"
#include "mcast.h"
#include "ethif.h"
#include "rtsock.h"
#include "route.h"
#include "bpfdev.h"
#include "lwip/init.h"
#include "lwip/sys.h"
#include "lwip/timeouts.h"
#include "arch/cc.h"
static int running, recheck_timer;
static minix_timer_t lwip_timer;
static void expire_lwip_timer(int);
/*
* Return the system uptime in milliseconds. Also remember that lwIP retrieved
* the system uptime during this call, so that we know to check for timer
* updates at the end of the current iteration of the message loop.
*/
uint32_t
sys_now(void)
{
recheck_timer = TRUE;
/* TODO: avoid 64-bit arithmetic if possible. */
return (uint32_t)(((uint64_t)getticks() * 1000) / sys_hz());
}
/*
* Check if and when lwIP has its next timeout, and set or cancel our timer
* accordingly.
*/
static void
set_lwip_timer(void)
{
uint32_t next_timeout;
clock_t ticks;
/* Ask lwIP when the next alarm is supposed to go off, if any. */
next_timeout = sys_timeouts_sleeptime();
/*
* Set or update the lwIP timer. We rely on set_timer() asking the
* kernel for an alarm only if the timeout is different from the one we
* gave it last time (if at all). However, due to conversions between
* absolute and relative times, and the fact that we cannot guarantee
* that the uptime itself does not change while executing these
* routines, set_timer() will sometimes be issuing a kernel call even
* if the alarm has not changed. Not a huge deal, but fixing this will
* require a different interface to lwIP and/or the timers library.
*/
if (next_timeout != (uint32_t)-1) {
/*
* Round up the next timeout (which is in milliseconds) to the
* number of clock ticks to add to the current time. Avoid any
* potential for overflows, no matter how unrealistic..
*/
if (next_timeout > TMRDIFF_MAX / sys_hz())
ticks = TMRDIFF_MAX;
else
ticks = (next_timeout * sys_hz() + 999) / 1000;
set_timer(&lwip_timer, ticks, expire_lwip_timer, 0 /*unused*/);
} else
cancel_timer(&lwip_timer); /* not really needed.. */
}
/*
* The timer for lwIP timeouts has gone off. Check timeouts, and possibly set
* a new timer.
*/
static void
expire_lwip_timer(int arg __unused)
{
/* Let lwIP do its work. */
sys_check_timeouts();
/*
* See if we have to update our timer for the next lwIP timer. Doing
* this here, rather than from the main loop, avoids one kernel call.
*/
set_lwip_timer();
recheck_timer = FALSE;
}
/*
* Check whether we should adjust our local timer based on a change in the next
* lwIP timeout.
*/
static void
check_lwip_timer(void)
{
/*
* We make the assumption that whenever lwIP starts a timer, it will
* need to retrieve the current time. Thus, whenever sys_now() is
* called, we set the 'recheck_timer' flag. Here, we check whether to
* (re)set our lwIP timer only if the flag is set. As a result, we do
* not have to mess with timers for literally every incoming message.
*
* When lwIP stops a timer, it does not call sys_now(), and thus, we
* may miss such updates. However, timers being stopped should be rare
* and getting too many alarm messages is not a big deal.
*/
if (!recheck_timer)
return;
set_lwip_timer();
/* Reset the flag for the next message loop iteration. */
recheck_timer = FALSE;
}
/*
* Return a random number, for use by lwIP.
*/
uint32_t
lwip_hook_rand(void)
{
/*
* The current known uses of this hook are for selection of initial
* TCP/UDP port numbers and for multicast-related timer randomness.
* The former case exists only to avoid picking the same starting port
* numbers after a reboot. After that, simple sequential iteration of
* the port numbers is used. The latter case varies the response time
* for sending multicast messages. Thus, none of the current uses of
* this function require proper randomness, and so we use the simplest
* approach, with time-based initialization to cover the reboot case.
* The sequential port number selection could be improved upon, but
* such an extension would probably bypass this hook anyway.
*/
return lrand48();
}
/*
* Create a new socket, with the given domain, type, and protocol, for the user
* process identified by 'user_endpt'. On success, return the new socket's
* identifier, with the libsockevent socket stored in 'sock' and an operations
* table stored in 'ops'. On failure, return a negative error code.
*/
static sockid_t
alloc_socket(int domain, int type, int protocol, endpoint_t user_endpt,
struct sock ** sock, const struct sockevent_ops **ops)
{
switch (domain) {
case PF_INET:
#ifdef INET6
case PF_INET6:
#endif /* INET6 */
switch (type) {
case SOCK_STREAM:
return tcpsock_socket(domain, protocol, sock, ops);
case SOCK_DGRAM:
return udpsock_socket(domain, protocol, sock, ops);
case SOCK_RAW:
if (!util_is_root(user_endpt))
return EACCES;
return rawsock_socket(domain, protocol, sock, ops);
default:
return EPROTOTYPE;
}
case PF_ROUTE:
return rtsock_socket(type, protocol, sock, ops);
case PF_LINK:
return lnksock_socket(type, protocol, sock, ops);
default:
/* This means that the service has been misconfigured. */
printf("socket() with unsupported domain %d\n", domain);
return EAFNOSUPPORT;
}
}
/*
* Initialize the service.
*/
static int
init(int type __unused, sef_init_info_t * init __unused)
{
/*
* Initialize the random number seed. See the lwip_hook_rand() comment
* on why this weak random number source is currently sufficient.
*/
srand48(clock_time(NULL));
/* Initialize the lwIP library. */
lwip_init();
/* Initialize the socket events library. */
sockevent_init(alloc_socket);
/* Initialize various helper modules. */
mempool_init();
tcpisn_init();
mcast_init();
/* Initialize the high-level socket modules. */
ipsock_init();
tcpsock_init();
udpsock_init();
rawsock_init();
/* Initialize the various network interface modules. */
ifdev_init();
loopif_init();
ethif_init();
/* Initialize the network device driver module. */
ndev_init();
/* Initialize the low-level socket modules. */
rtsock_init();
lnksock_init();
/* Initialize the routing module. */
route_init();
/* Initialize other device modules. */
bpfdev_init();
/*
* Initialize the MIB module, after all other modules have registered
* their subtrees with this module.
*/
mibtree_init();
/*
* After everything else has been initialized, set up the default
* configuration - in particular, a loopback interface.
*/
ifconf_init();
/*
* Initialize the master timer for all the lwIP timers. Just in case
* lwIP starts a timer right away, perform a first check upon entry of
* the message loop.
*/
init_timer(&lwip_timer);
recheck_timer = TRUE;
running = TRUE;
return OK;
}
/*
* Perform initialization using the System Event Framework (SEF).
*/
static void
startup(void)
{
sef_setcb_init_fresh(init);
/*
* This service requires stateless restarts, in that several parts of
* the system (including VFS and drivers) expect that if restarted,
* this service comes back up with a new endpoint. Therefore, do not
* set a _restart callback here.
*
* TODO: support for live update.
*
* TODO: support for immediate shutdown if no sockets are in use, as
* also done by UDS. For now, we never shut down immediately, giving
* other processes the opportunity to close sockets on system shutdown.
*/
sef_startup();
}
/*
* The lwIP-based TCP/IP sockets driver.
*/
int
main(void)
{
message m;
int r, ipc_status;
startup();
while (running) {
/*
* For various reasons, the loopback interface does not pass
* packets back into the stack right away. Instead, it queues
* them up for later processing. We do that processing here.
*/
ifdev_poll();
/*
* Unfortunately, lwIP does not tell us when it starts or stops
* timers. This means that we have to check ourselves every
* time we have called into lwIP. For simplicity, we perform
* the check here.
*/
check_lwip_timer();
if ((r = sef_receive_status(ANY, &m, &ipc_status)) != OK) {
if (r == EINTR)
continue; /* sef_cancel() was called */
panic("sef_receive_status failed: %d", r);
}
/* Process the received message. */
if (is_ipc_notify(ipc_status)) {
switch (m.m_source) {
case CLOCK:
expire_timers(m.m_notify.timestamp);
break;
case DS_PROC_NR:
/* Network drivers went up and/or down. */
ndev_check();
break;
default:
printf("unexpected notify from %d\n",
m.m_source);
}
continue;
}
switch (m.m_source) {
case MIB_PROC_NR:
rmib_process(&m, ipc_status);
break;
case VFS_PROC_NR:
/* Is this a socket device request? */
if (IS_SDEV_RQ(m.m_type)) {
sockevent_process(&m, ipc_status);
break;
}
/* Is this a character (or block) device request? */
if (IS_CDEV_RQ(m.m_type) || IS_BDEV_RQ(m.m_type)) {
bpfdev_process(&m, ipc_status);
break;
}
/* FALLTHROUGH */
default:
/* Is this a network device driver response? */
if (IS_NDEV_RS(m.m_type)) {
ndev_process(&m, ipc_status);
break;
}
printf("unexpected message %d from %d\n",
m.m_type, m.m_source);
}
}
return 0;
}
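
The millisecond-to-tick conversion in set_lwip_timer() above deliberately
rounds up, so that the alarm never goes off before the lwIP timeout is
actually due. A small standalone illustration of that arithmetic, with an
assumed clock frequency (the real code uses sys_hz() and caps at TMRDIFF_MAX):

	#include <assert.h>
	#include <stdint.h>

	int
	main(void)
	{
		uint32_t hz = 50;		/* assumed clock frequency */
		uint32_t next_timeout = 30;	/* ms until next lwIP timeout */
		uint32_t ticks;

		/* 30 ms at 50 Hz is 1.5 ticks; round up to 2 ticks so that
		 * the alarm cannot fire early. */
		ticks = (next_timeout * hz + 999) / 1000;
		assert(ticks == 2);
		return 0;
	}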

10
minix/net/lwip/lwip.conf Normal file
View File

@ -0,0 +1,10 @@
service lwip
{
domain
INET INET6 ROUTE LINK
;
system KILL; # for SIGPIPE
ipc
SYSTEM vfs rs vm mib
;
};

130
minix/net/lwip/lwip.h Normal file
View File

@ -0,0 +1,130 @@
#ifndef MINIX_NET_LWIP_LWIP_H
#define MINIX_NET_LWIP_LWIP_H
#include <minix/drivers.h>
#include <minix/sockevent.h>
#include <minix/rmib.h>
#include <netinet/in.h>
#include <sys/ioctl.h>
#include <net/bpf.h>
#include "lwip/ip.h"
#include "lwiphooks.h"
#include "addr.h"
#include "ipsock.h"
#include "ifdev.h"
#include "util.h"
/*
* The standard sockaddr_dl is an absolute pain, because the actual structure
* is dynamically sized, while the standard definition is neither the minimum
* nor the maximum size. We use our own version, which uses the maximum size
* that we will ever produce and accept. This greatly simplifies dealing with
* this structure while also limiting stack usage a bit.
*/
struct sockaddr_dlx {
uint8_t sdlx_len; /* actual length of this structure */
sa_family_t sdlx_family; /* address family, always AF_LINK */
uint16_t sdlx_index; /* interface index */
uint8_t sdlx_type; /* interface type (IFT_) */
uint8_t sdlx_nlen; /* interface name length, w/o nul */
uint8_t sdlx_alen; /* link-layer address length */
uint8_t sdlx_slen; /* selector length, always 0 */
uint8_t sdlx_data[IFNAMSIZ + NETIF_MAX_HWADDR_LEN];
};
STATIC_SOCKADDR_MAX_ASSERT(sockaddr_in);
STATIC_SOCKADDR_MAX_ASSERT(sockaddr_in6);
STATIC_SOCKADDR_MAX_ASSERT(sockaddr_dlx);
/* This is our own, much smaller internal version of sockaddr_storage. */
union sockaddr_any {
struct sockaddr sa;
struct sockaddr_in sin;
struct sockaddr_in6 sin6;
struct sockaddr_dlx sdlx;
};
/* Number of bits in each of the types of IP addresses. */
#define IP4_BITS 32 /* number of bits in an IPv4 address */
#define IP6_BITS 128 /* number of bits in an IPv6 address */
/*
* Each socket module maintains its own set of sockets, but all sockets must be
* given globally unique identifiers. Therefore, we use these modifier masks,
* which are bitwise OR'ed with the per-module socket identifiers.
*/
#define SOCKID_TCP 0x00000000
#define SOCKID_UDP 0x00100000
#define SOCKID_RAW 0x00200000
#define SOCKID_RT 0x00400000
#define SOCKID_LNK 0x00800000
/*
* Static remote MIB node identifiers for nodes that are dynamically numbered
* on NetBSD, because they do not have a corresponding protocol family number.
*/
#define NET_INTERFACES (PF_MAX) /* net.interfaces (TODO) */
#define NET_BPF (PF_MAX + 1) /* net.bpf */
#define ROOT_EUID 0 /* effective user ID of superuser */
/*
* Function declarations. Modules with more extended interfaces have their own
* header files.
*/
/* mempool.c */
void mempool_init(void);
unsigned int mempool_cur_buffers(void);
unsigned int mempool_max_buffers(void);
/* pchain.c */
struct pbuf **pchain_end(struct pbuf * pbuf);
size_t pchain_size(struct pbuf * pbuf);
/* addrpol.c */
int addrpol_get_label(const ip_addr_t * ipaddr);
int addrpol_get_scope(const ip_addr_t * ipaddr, int is_src);
/* tcpsock.c */
void tcpsock_init(void);
sockid_t tcpsock_socket(int domain, int protocol, struct sock ** sock,
const struct sockevent_ops ** ops);
/* udpsock.c */
void udpsock_init(void);
sockid_t udpsock_socket(int domain, int protocol, struct sock ** sock,
const struct sockevent_ops ** ops);
/* rawsock.c */
void rawsock_init(void);
sockid_t rawsock_socket(int domain, int protocol, struct sock ** sock,
const struct sockevent_ops ** ops);
/* loopif.c */
void loopif_init(void);
ssize_t loopif_cksum(struct rmib_call * call, struct rmib_node * node,
struct rmib_oldp * oldp, struct rmib_newp * newp);
/* lnksock.c */
void lnksock_init(void);
sockid_t lnksock_socket(int type, int protocol, struct sock ** sock,
const struct sockevent_ops ** ops);
/* mibtree.c */
void mibtree_init(void);
void mibtree_register_inet(int domain, int protocol, struct rmib_node * node);
void mibtree_register_lwip(struct rmib_node * node);
/* ifconf.c */
void ifconf_init(void);
int ifconf_ioctl(struct sock * sock, unsigned long request,
const struct sockdriver_data * data, endpoint_t user_endpt);
/* bpf_filter.c */
u_int bpf_filter_ext(const struct bpf_insn * pc, const struct pbuf * pbuf,
const u_char * packet, u_int total, u_int len);
#endif /* !MINIX_NET_LWIP_LWIP_H */
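
Since the SOCKID_ modifiers above occupy bits well above any per-module socket
number, a module composes a globally unique identifier by OR'ing its slot
number with its modifier, as lnksock_socket() does. A hypothetical sketch of
composing and decomposing such an identifier; the 20-bit slot mask is an
assumption based on the modifier values, not a constant from this commit:

	#define SOCKID_SLOT_MASK 0x000fffff	/* assumed: bits below modifiers */

	static void
	sockid_example(void)
	{
		sockid_t id;
		unsigned int slot = 3;

		id = SOCKID_UDP | (sockid_t)slot;	/* tag slot 3 as UDP */
		assert((id & ~SOCKID_SLOT_MASK) == SOCKID_UDP); /* module */
		assert((unsigned int)(id & SOCKID_SLOT_MASK) == slot); /* slot */
	}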

283
minix/net/lwip/mcast.c Normal file
View File

@ -0,0 +1,283 @@
/* LWIP service - mcast.c - per-socket multicast membership tracking */
/*
* Each socket has a linked list of multicast groups of which it is a member.
* The linked list consists of 'mcast_member' elements. There is both a global
* limit (the number of elements in 'mcast_array') and a per-socket limit on
* group membership. Since multiple sockets may join the same multicast
* groups, there is not a one-to-one relationship between our membership
* structures and the lwIP IGMP/MLD membership structures. Moreover, linking
* to the latter structures directly is not intended by lwIP, so we have to
* keep our own tracking independent, which in particular means that we have to
* make a copy of the multicast group address.
*
* We currently put no effort into saving memory on storing that group address.
* Optimization is complicated by the fact that we have to be able to remove
* membership structures when their corresponding interface disappears, which
* currently involves removal without knowing the corresponding socket, and
* thus without knowing the socket's address family. All of this can be
* changed.
*
* There is no function to test whether a particular socket is a member of a
* multicast group. The pktsock module currently makes the assumption that if
* a socket has been joined to any multicast groups, or set any multicast
* options, the application is multicast aware and therefore able to figure out
* whether it is interested in particular packets, and so we do not filter
* incoming packets against the receiving socket's multicast list. This should
* be more or less in line with what W. Richard Stevens says the BSDs do.
*/
#include "lwip.h"
#include "mcast.h"
#include "lwip/igmp.h"
#include "lwip/mld6.h"
/*
* The per-socket limit on group membership. In theory, the limit should be
* high enough that a single socket can join a particular multicast group on
* all interfaces that support multicast. In practice, we set it a bit lower
* to prevent one socket from using up half of the entries per address family.
* Setting it to IP_MAX_MEMBERSHIPS is definitely excessive right now..
*/
#define MAX_GROUPS_PER_SOCKET 8
static struct mcast_member {
LIST_ENTRY(mcast_member) mm_next; /* next in socket, free list */
struct ifdev * mm_ifdev; /* interface (NULL: free) */
ip_addr_t mm_group; /* group address */
} mcast_array[NR_IPV4_MCAST_GROUP + NR_IPV6_MCAST_GROUP];
static LIST_HEAD(, mcast_member) mcast_freelist;
/*
* Initialize the per-socket multicast membership module.
*/
void
mcast_init(void)
{
unsigned int slot;
/* Initialize the list of free multicast membership entries. */
LIST_INIT(&mcast_freelist);
for (slot = 0; slot < __arraycount(mcast_array); slot++) {
mcast_array[slot].mm_ifdev = NULL;
LIST_INSERT_HEAD(&mcast_freelist, &mcast_array[slot], mm_next);
}
}
/*
* Reset the multicast head for a socket. The socket must not have any
* previous multicast group memberships.
*/
void
mcast_reset(struct mcast_head * mcast_head)
{
LIST_INIT(&mcast_head->mh_list);
}
/*
* Attempt to add a per-socket multicast membership association. The given
* 'mcast_head' pointer is part of a socket. The 'group' parameter is the
* multicast group to join. It is a properly zoned address, but has not been
* checked in any other way. If 'ifdev' is not NULL, it is the interface for
* the membership; if it is NULL, an interface will be selected using routing.
* Return OK if the membership has been successfully added, or a negative
* error code otherwise.
*/
int
mcast_join(struct mcast_head * mcast_head, const ip_addr_t * group,
struct ifdev * ifdev)
{
struct mcast_member *mm;
struct netif *netif;
unsigned int count;
err_t err;
/*
* The callers of this function perform only checks that depend on the
* address family. We check everything else here.
*/
if (!ip_addr_ismulticast(group))
return EADDRNOTAVAIL;
if (!addr_is_valid_multicast(group))
return EINVAL;
/*
* If no interface was specified, pick one with a routing query. Note
* that scoped IPv6 addresses do require an interface to be specified.
*/
if (ifdev == NULL) {
netif = ip_route(IP46_ADDR_ANY(IP_GET_TYPE(group)), group);
if (netif == NULL)
return EHOSTUNREACH;
ifdev = netif_get_ifdev(netif);
}
assert(ifdev != NULL);
assert(!IP_IS_V6(group) ||
!ip6_addr_lacks_zone(ip_2_ip6(group), IP6_MULTICAST));
/* The interface must support multicast. */
if (!(ifdev_get_ifflags(ifdev) & IFF_MULTICAST))
return EADDRNOTAVAIL;
/*
* First see if this socket is already joined to the given group, which
* is an error. While looking, also count the number of groups the
* socket has joined already, to enforce the per-socket limit.
*/
count = 0;
LIST_FOREACH(mm, &mcast_head->mh_list, mm_next) {
if (mm->mm_ifdev == ifdev && ip_addr_cmp(&mm->mm_group, group))
return EEXIST;
count++;
}
if (count >= MAX_GROUPS_PER_SOCKET)
return ENOBUFS;
/* Do we have a free membership structure available? */
if (LIST_EMPTY(&mcast_freelist))
return ENOBUFS;
/*
* Nothing can go wrong as far as we are concerned. Ask lwIP to join
* the multicast group. This may result in a multicast list update at
* the driver end.
*/
netif = ifdev_get_netif(ifdev);
if (IP_IS_V6(group))
err = mld6_joingroup_netif(netif, ip_2_ip6(group));
else
err = igmp_joingroup_netif(netif, ip_2_ip4(group));
if (err != ERR_OK)
return util_convert_err(err);
/*
* Success. Allocate, initialize, and attach a membership structure to
* the socket.
*/
mm = LIST_FIRST(&mcast_freelist);
LIST_REMOVE(mm, mm_next);
mm->mm_ifdev = ifdev;
mm->mm_group = *group;
LIST_INSERT_HEAD(&mcast_head->mh_list, mm, mm_next);
return OK;
}
/*
* Free the given per-socket multicast membership structure, which must
* previously have been associated with a socket. If 'leave_group' is set,
* also tell lwIP to leave the corresponding multicast group.
*/
static void
mcast_free(struct mcast_member * mm, int leave_group)
{
struct netif *netif;
err_t err;
assert(mm->mm_ifdev != NULL);
if (leave_group) {
netif = ifdev_get_netif(mm->mm_ifdev);
if (IP_IS_V6(&mm->mm_group))
err = mld6_leavegroup_netif(netif,
ip_2_ip6(&mm->mm_group));
else
err = igmp_leavegroup_netif(netif,
ip_2_ip4(&mm->mm_group));
if (err != ERR_OK)
panic("lwIP multicast membership desynchronization");
}
LIST_REMOVE(mm, mm_next);
mm->mm_ifdev = NULL;
LIST_INSERT_HEAD(&mcast_freelist, mm, mm_next);
}
/*
* Attempt to remove a per-socket multicast membership association. The given
* 'mcast_head' pointer is part of a socket. The 'group' parameter is the
* multicast group to leave. It is a properly zoned address, but has not been
* checked in any other way. If 'ifdev' is not NULL, it is the interface of
* the membership; if it is NULL, a membership matching the address on any
* interface will suffice. As such, the parameter requirements mirror those of
* mcast_join(). Return OK if the membership has been successfully removed, or
* a negative error code otherwise.
*/
int
mcast_leave(struct mcast_head * mcast_head, const ip_addr_t * group,
struct ifdev * ifdev)
{
struct mcast_member *mm;
/*
* Look up a matching entry. The fact that we must find a match for
* the given address and interface keeps us from having to perform
* various other checks, such as whether the given address is a
* multicast address at all. The exact error codes are not specified.
*/
LIST_FOREACH(mm, &mcast_head->mh_list, mm_next) {
if ((ifdev == NULL || mm->mm_ifdev == ifdev) &&
ip_addr_cmp(&mm->mm_group, group))
break;
}
if (mm == NULL)
return ESRCH;
mcast_free(mm, TRUE /*leave_group*/);
return OK;
}
/*
* Remove all per-socket multicast membership associations of the given socket.
* This function is called when the socket is closed.
*/
void
mcast_leave_all(struct mcast_head * mcast_head)
{
struct mcast_member *mm;
while (!LIST_EMPTY(&mcast_head->mh_list)) {
mm = LIST_FIRST(&mcast_head->mh_list);
mcast_free(mm, TRUE /*leave_group*/);
}
}
/*
* The given interface is about to disappear. Remove and free any per-socket
* multicast membership structures associated with the interface, without
* leaving the multicast group itself (as that will happen a bit later anyway).
*/
void
mcast_clear(struct ifdev * ifdev)
{
unsigned int slot;
for (slot = 0; slot < __arraycount(mcast_array); slot++) {
if (mcast_array[slot].mm_ifdev != ifdev)
continue;
mcast_free(&mcast_array[slot], FALSE /*leave_group*/);
}
}

21
minix/net/lwip/mcast.h Normal file
View File

@ -0,0 +1,21 @@
#ifndef MINIX_NET_LWIP_MCAST_H
#define MINIX_NET_LWIP_MCAST_H
struct mcast_member;
struct mcast_head {
LIST_HEAD(, mcast_member) mh_list;
};
#define mcast_isempty(mcast_head) (LIST_EMPTY(&(mcast_head)->mh_list))
void mcast_init(void);
void mcast_reset(struct mcast_head * mcast_head);
int mcast_join(struct mcast_head * mcast_head, const ip_addr_t * group,
struct ifdev * ifdev);
int mcast_leave(struct mcast_head * mcast_head, const ip_addr_t * group,
struct ifdev * ifdev);
void mcast_leave_all(struct mcast_head * mcast_head);
void mcast_clear(struct ifdev * ifdev);
#endif /* !MINIX_NET_LWIP_MCAST_H */
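
The mcast.c comments describe the life cycle of a socket's membership list:
reset at socket creation, explicit joins and leaves while the socket is in
use, and a final leave-all on close. A minimal sketch of that sequence from a
hypothetical caller; the group address is just an example:

	static void
	mcast_lifecycle_example(void)
	{
		struct mcast_head mh;
		ip_addr_t group;

		mcast_reset(&mh);	/* fresh socket: empty list */

		/* Join a group; with a NULL ifdev, the interface is picked
		 * by a routing query inside mcast_join(). */
		if (ipaddr_aton("224.0.0.251", &group))
			(void)mcast_join(&mh, &group, NULL /*ifdev*/);

		/* ...socket is used... */

		mcast_leave_all(&mh);	/* socket close: drop memberships */
	}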

821
minix/net/lwip/mempool.c Normal file
View File

@ -0,0 +1,821 @@
/* LWIP service - mempool.c - memory pool management and slab allocation */
/*
* This module should be considered a replacement for lwIP's PBUF_POOL and
* custom-pools functionality. lwIP's PBUF_POOL system allows a PBUF_POOL type
* allocation for a moderately large amount of memory, for example for a full-
* sized packet, to be turned into a chain of "pbuf" buffers, each of a static
* size. Most of lwIP can deal with such pbuf chains, because many other types
* of allocations also end up consisting of pbuf chains. However, lwIP will
* never use PBUF_POOL for its own memory allocations, and use PBUF_RAM
* allocations instead. Such PBUF_RAM allocations always return one single
* pbuf with a contiguous memory area. lwIP's custom pools support allows such
* PBUF_RAM allocations to draw from user-defined pools of statically allocated
* memory, as an alternative to turning such allocations into malloc() calls.
*
* However, lwIP itself does not offer a way to combine these two pool systems:
* the PBUF_POOL buffer pool and the custom pools are completely separate. We
* want to be able to draw both kinds of memory from the same pool. This is
* the first reason that we are using our own memory pools. The second is
* something that lwIP could never offer anyway: we would like to provide a
* certain amount of static/preallocated memory for those types of allocations,
* but optionally also add a much larger amount of dynamic memory when needed.
*
* In order to make this module work, we do not use PBUF_POOL anywhere.
* Instead, we use chained static-sized PBUF_RAM allocations for all types of
* allocations that we manage ourselves--see pchain_alloc(). We tell lwIP to
* use the functions in this module to do the malloc-type allocations for those
* PBUF_RAM buffers. As such, this module manages all PBUF_RAM allocations,
* both from our own code and from lwIP. Note that we do still use lwIP's own
* pools for various lwIP structures. We do want to keep the isolation
* provided by the use of such pools, even though that means that we have to
* provision some of those pools for the worst case, resulting in some memory
* overhead that is unnecessary for the common case.
*
 * With the PBUF_RAM allocation redirection system in place, this module has
 * to manage the memory for those allocations.  It does this based on the
 * assumption that there are three main classes of PBUF_RAM allocation sizes:
*
* - "large" allocations: these are allocations for up to MEMPOOL_BUFSIZE bytes
* of PBUF_RAM data, where MEMPOOL_BUFSIZE is the allocation granularity that
* we have picked for the individual buffers in larger chains. It is set to
* 512 bytes right now, mainly to keep pbuf chains for full-sized ethernet
* packets short, which has many performance advantages. Since the pbuf
* header itself also takes some space (16 bytes, right now), this results in
* allocations seen by mempool_malloc() of up to just over 512 bytes.
* - "small" allocations: these are allocations mostly for packet headers, as
* needed by lwIP to prepend to (mainly TCP) packet data that we give to it.
* The size of these allocations varies, but most are 76 bytes (80 bytes if
* we ever add VLAN support), plus once again the pbuf header.
* - "excessive" allocations: these are allocations larger than the maximum
* we have configured, effectively requesting contiguous memory of (possibly
* far) more than 512 bytes. We do not make such allocations ourselves, as
* we only ever create pbuf chains. Thus, any such allocations come from
* lwIP. There are a few locations in lwIP that attempt to make those kinds
* of allocations, but we replace one important case in the lwIP code with
* a chained allocation, (currently) leaving only one case: allocation of
* ICMP ping reply packets. In this module, we outright *deny* any excessive
* allocations. Practically, that means that no replies are generated for
* requests exceeding around 460 bytes, which is in fact not bad, especially
* since we have multicast ICMP ping replying enabled. If any new cases of
* excessive allocations are added to lwIP in the future, we will have to
* deal with those on a case-by-case basis, but for now this should be all.
*
* This module caters to the first two types of allocations. For large buffer
* allocations, it provides a standard slab allocator, with a hardcoded slab
* size of MEMPOOL_LARGE_COUNT buffers with a 512-byte data area each. One
* slab is allocated at service start-up; additional slabs up to a configured
 * maximum are allocated on demand.  Once fallen out of use, all but one slab
* will be freed after a while, using a timer. The current per-slab count of
* 512 large buffers, combined with the buffer size of 512 plus the pbuf header
* plus a bit of extra overhead, results in about 266 KB per slab.
*
* For small buffer allocations, there are two facilities. First, there is a
* static pool of small buffers. This pool currently provides 256 small-sized
* buffers, mainly in order to allow packet headers to be produced even in low-
* memory conditions. In addition, small buffers may be formed by allocating
* and then splitting up one large buffer. The module is currently configured
* to split one large buffer into four small buffers, which yields a small
* buffer size of just over 100 bytes--enough for the packet headers while
* leaving little slack on either side.
*
* It is important to note that large and small buffer allocations are freed up
* through the same function, with no information on the original allocation
* size. As a result, we have to distinguish between large and small buffers
* using a unified system. In particular, this module prepends each of its
* allocations by a single pointer, which points to a header structure that is
* at the very beginning of the slab that contains the allocated buffer. That
* header structure contains information about the type of slab (large or
* small) as well as some accounting information used by both types.
*
 * For large-buffer slabs, this header is part of a larger structure that
 * also contains, for example, the slab's list of free buffers.  This larger
 * structure is then followed by the actual buffers in the slab.
*
* For small-buffer slabs, the header is followed directly by the actual small
* buffers. Thus, when a large buffer is split up into four small buffers, the
* data area of that large buffer consists of a small-type slab header and four
* small buffers. The large buffer itself is simply considered in use, as
* though it was allocated for regular data. This nesting approach saves a lot
* of memory for small allocations, at the cost of a bit more computation.
*
* It should be noted that all allocations should be (and are) pointer-aligned.
* Normally lwIP would check for this, but we cannot tell lwIP the platform
* pointer size without hardcoding that size. This module performs proper
* alignment of all buffers itself though, regardless of the pointer size.
*/
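/*
 * Back-of-the-envelope check of the numbers above, assuming a 32-bit
 * platform with 4-byte pointers and a 16-byte struct pbuf (both assumptions,
 * not guarantees):
 *
 *	MEMPOOL_LARGE_SIZE = align_up(sizeof(struct pbuf)) + MEMPOOL_BUFSIZE
 *	                   = 16 + 512 = 528 bytes of data area per buffer
 *
 *	per large buffer   = header pointer + data area = 4 + 528 = 532 bytes
 *
 *	per slab           = MEMPOOL_LARGE_COUNT * 532 = 512 * 532
 *	                   = 272384 bytes, i.e. roughly the 266 KB cited above
 *
 * The exact figures depend on the pointer size and on lwIP's struct pbuf
 * layout, so treat this only as an approximation.
 */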
#include "lwip.h"
#include <sys/mman.h>
/* Alignment to pointer sizes. */
#define MEMPOOL_ALIGN_DOWN(s) ((s) & ~(sizeof(void *) - 1))
#define MEMPOOL_ALIGN_UP(s) MEMPOOL_ALIGN_DOWN((s) + sizeof(void *) - 1)
/* Large buffers: per-slab count and data area size. */
#define MEMPOOL_LARGE_COUNT 512
#define MEMPOOL_LARGE_SIZE \
(MEMPOOL_ALIGN_UP(sizeof(struct pbuf)) + MEMPOOL_BUFSIZE)
/* Small buffers: per-slab count and data area size. */
#define MEMPOOL_SMALL_COUNT 4
#define MEMPOOL_SMALL_SIZE \
(MEMPOOL_ALIGN_DOWN(MEMPOOL_LARGE_SIZE / MEMPOOL_SMALL_COUNT) - \
sizeof(struct mempool_header))
/* Memory pool slab header, part of both small and large slabs. */
struct mempool_header {
union {
struct {
uint8_t mhui_flags;
uint32_t mhui_inuse;
} mhu_info;
void *mhu_align; /* force pointer alignment */
} mh_u;
};
#define mh_flags mh_u.mhu_info.mhui_flags
#define mh_inuse mh_u.mhu_info.mhui_inuse
/* Header flags. */
#define MHF_SMALL 0x01 /* slab is for small buffers, not large ones */
#define MHF_STATIC 0x02 /* small slab is statically allocated */
#define MHF_MARKED 0x04 /* large empty slab is up for deallocation */
/*
* Large buffer. When allocated, mlb_header points to the (header of) the
* containing large slab, and mlb_data is returned for arbitrary use by the
* user of the buffer. When free, mlb_header is NULL and instead mlb_header2
* points to the containing slab (allowing for double-free detection), and the
* buffer is on the slab's free list by using mlb_next.
*/
struct mempool_large_buf {
struct mempool_header *mlb_header;
union {
struct {
struct mempool_header *mlbuf_header2;
LIST_ENTRY(mempool_large_buf) mlbuf_next;
} mlbu_free;
char mlbu_data[MEMPOOL_LARGE_SIZE];
} mlb_u;
};
#define mlb_header2 mlb_u.mlbu_free.mlbuf_header2
#define mlb_next mlb_u.mlbu_free.mlbuf_next
#define mlb_data mlb_u.mlbu_data
/* Small buffer. Same idea, different size. */
struct mempool_small_buf {
struct mempool_header *msb_header;
union {
struct {
struct mempool_header *msbuf_header2;
TAILQ_ENTRY(mempool_small_buf) msbuf_next;
} msbu_free;
char msbu_data[MEMPOOL_SMALL_SIZE];
} msb_u;
};
#define msb_header2 msb_u.msbu_free.msbuf_header2
#define msb_next msb_u.msbu_free.msbuf_next
#define msb_data msb_u.msbu_data
/*
* A large slab, including header, other per-slab fields, and large buffers.
* Each of these structures is on exactly one of three slab lists, depending
* on whether all its buffers are free (empty), some but not all of its buffers
* are in use (partial), or all of its buffers are in use (full). The mls_next
* field is used for that list. The mls_free field is the per-slab list of
* free buffers.
*/
struct mempool_large_slab {
struct mempool_header mls_header; /* MUST be first */
LIST_ENTRY(mempool_large_slab) mls_next;
LIST_HEAD(, mempool_large_buf) mls_free;
struct mempool_large_buf mls_buf[MEMPOOL_LARGE_COUNT];
};
/* The three slab lists for large slabs, as described above. */
static LIST_HEAD(, mempool_large_slab) mempool_empty_slabs;
static LIST_HEAD(, mempool_large_slab) mempool_partial_slabs;
static LIST_HEAD(, mempool_large_slab) mempool_full_slabs;
/*
* A small slab, including header and small buffers. We use unified free lists
* for small buffers, and these small slabs are not part of any lists
* themselves, so we need neither of the two fields from large slabs for that.
*/
struct mempool_small_slab {
struct mempool_header mss_header; /* MUST be first */
struct mempool_small_buf mss_buf[MEMPOOL_SMALL_COUNT];
};
/*
* The free lists for static small buffers (from the static pool, see below)
* and dynamic small buffers (as obtained by splitting large buffers).
*/
static TAILQ_HEAD(, mempool_small_buf) mempool_small_static_freelist;
static TAILQ_HEAD(, mempool_small_buf) mempool_small_dynamic_freelist;
/*
* A static pool of small buffers. Small buffers are somewhat more important
* than large buffers, because they are used for packet headers. The purpose
* of this static pool is to be able to make progress even if all large buffers
* are allocated for data, typically in the case that the system is low on
* memory. Note that the number of static small buffers is the given number of
* small slabs multiplied by MEMPOOL_SMALL_COUNT, hence the division.
*/
#define MEMPOOL_SMALL_SLABS (256 / MEMPOOL_SMALL_COUNT)
static struct mempool_small_slab mempool_small_pool[MEMPOOL_SMALL_SLABS];
/*
* The following setting (mempool_max_slabs) can be changed through sysctl(7).
* As such it may be set by userland to a completely arbitrary value and must
* be sanity-checked before any actual use. The default is picked such that
* all TCP sockets can fill up their send and receive queues: (TCP_SNDBUF_DEF +
* TCP_RCVBUF_DEF) * NR_TCPSOCK / (MEMPOOL_BUFSIZE * MEMPOOL_LARGE_COUNT) =
* (32768 + 32768) * 256 / (512 * 512) = 64. We put in the resulting number
* rather than the formula because not all those definitions are public.
*/
#define MEMPOOL_DEFAULT_MAX_SLABS 64 /* about 17 MB of memory */
static int mempool_max_slabs; /* maximum number of large slabs */
static int mempool_nr_slabs; /* current number of large slabs */
static int mempool_nr_large; /* current number of large buffers */
static int mempool_used_large; /* large buffers currently in use */
static int mempool_used_small; /* small buffers currently in use */
/*
* Number of clock ticks between timer invocations. The timer is used to
* deallocate unused slabs.
*/
#define MEMPOOL_TIMER_TICKS (10 * sys_hz())
static minix_timer_t mempool_timer;
static int mempool_defer_alloc; /* allocation failed, defer next try */
/* The CTL_MINIX MINIX_LWIP "mempool" subtree. Dynamically numbered. */
static struct rmib_node minix_lwip_mempool_table[] = {
RMIB_INTPTR(RMIB_RW, &mempool_max_slabs, "slab_max",
"Maximum number of memory slabs (configurable)"),
RMIB_INTPTR(RMIB_RO, &mempool_nr_slabs, "slab_num",
"Current number of memory slabs"),
RMIB_INT(RMIB_RO, sizeof(struct mempool_large_slab), "slab_size",
"Byte size of a single memory slab"),
RMIB_INT(RMIB_RO, MEMPOOL_LARGE_COUNT, "slab_bufs",
"Number of large buffers per memory slab"),
RMIB_INTPTR(RMIB_RO, &mempool_nr_large, "large_num",
"Current total number of large buffers"),
RMIB_INTPTR(RMIB_RO, &mempool_used_large, "large_used",
"Current number of used large buffers"),
RMIB_INT(RMIB_RO, MEMPOOL_LARGE_SIZE, "large_size",
"Byte size of a single large buffer"),
RMIB_INTPTR(RMIB_RO, &mempool_used_small, "small_used",
"Current number of used small buffers"),
RMIB_INT(RMIB_RO, MEMPOOL_SMALL_SIZE, "small_size",
"Byte size of a single small buffer"),
};
static struct rmib_node minix_lwip_mempool_node =
RMIB_NODE(RMIB_RO, minix_lwip_mempool_table, "mempool",
"Memory pool settings");
/*
* Initialize the given "slab" of small buffers. The slab may either come from
* the statically allocated pool ('is_static' is TRUE) or a single large buffer
* that we aim to chop up into small buffers.
*/
static void
mempool_prepare_small(struct mempool_small_slab * mss, int is_static)
{
struct mempool_small_buf *msb;
unsigned int count;
mss->mss_header.mh_flags = MHF_SMALL | ((is_static) ? MHF_STATIC : 0);
mss->mss_header.mh_inuse = 0;
msb = mss->mss_buf;
for (count = 0; count < MEMPOOL_SMALL_COUNT; count++, msb++) {
msb->msb_header = NULL;
msb->msb_header2 = &mss->mss_header;
if (is_static)
TAILQ_INSERT_HEAD(&mempool_small_static_freelist, msb,
msb_next);
else
TAILQ_INSERT_HEAD(&mempool_small_dynamic_freelist, msb,
msb_next);
}
}
/*
* Allocate a new slab for large buffers, if allowed by policy and possible.
*/
static void
mempool_new_slab(void)
{
struct mempool_large_slab *mls;
struct mempool_large_buf *mlb;
unsigned int count;
/*
* See if allocating a new slab would result in overrunning the
* configured maximum number of large buffers. Round the maximum,
* which is probably what the user intended.
*/
if (mempool_cur_buffers() + MEMPOOL_LARGE_COUNT / 2 >
mempool_max_buffers()) {
assert(mempool_nr_slabs > 0);
return;
}
/*
* If a previous allocation failed before during this timer interval,
* do not try again now.
*/
if (mempool_defer_alloc)
return;
/*
* Allocate the slab. Preallocate the memory, or we might crash later
* during low-memory conditions. If allocation fails, simply do
* nothing further. The caller will check the free lists.
*/
mls = (struct mempool_large_slab *)mmap(NULL,
sizeof(struct mempool_large_slab), PROT_READ | PROT_WRITE,
MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0);
if (mls == MAP_FAILED) {
if (mempool_nr_slabs == 0)
panic("unable to allocate initial memory pool");
/*
* Do not keep hammering VM with mmap requests when the system
* is out of memory. Try again after the next timer tick.
*/
mempool_defer_alloc = TRUE;
return;
}
/* Initialize the new slab. */
mls->mls_header.mh_flags = 0;
mls->mls_header.mh_inuse = 0;
mlb = mls->mls_buf;
LIST_INIT(&mls->mls_free);
for (count = 0; count < MEMPOOL_LARGE_COUNT; count++, mlb++) {
mlb->mlb_header = NULL;
mlb->mlb_header2 = &mls->mls_header;
LIST_INSERT_HEAD(&mls->mls_free, mlb, mlb_next);
}
LIST_INSERT_HEAD(&mempool_empty_slabs, mls, mls_next);
mempool_nr_slabs++;
mempool_nr_large += MEMPOOL_LARGE_COUNT;
}
/*
* Deallocate a slab for large buffers, if allowed.
*/
static void
mempool_destroy_slab(struct mempool_large_slab * mls)
{
assert(mempool_nr_slabs > 0);
assert(!(mls->mls_header.mh_flags & MHF_SMALL));
assert(mls->mls_header.mh_inuse == 0);
/* Never deallocate the last large slab. */
if (mempool_nr_slabs == 1)
return;
LIST_REMOVE(mls, mls_next);
if (munmap(mls, sizeof(*mls)) != 0)
panic("munmap failed: %d", -errno);
assert(mempool_nr_large > MEMPOOL_LARGE_COUNT);
mempool_nr_large -= MEMPOOL_LARGE_COUNT;
mempool_nr_slabs--;
}
/*
* Regular timer. Deallocate empty slabs already marked for deallocation, and
* mark any other empty slabs for deallocation.
*/
static void
mempool_tick(int arg __unused)
{
struct mempool_large_slab *mls, *tmls;
/*
* Go through all the empty slabs, destroying marked slabs and marking
* unmarked slabs.
*/
LIST_FOREACH_SAFE(mls, &mempool_empty_slabs, mls_next, tmls) {
if (mls->mls_header.mh_flags & MHF_MARKED)
mempool_destroy_slab(mls);
else
mls->mls_header.mh_flags |= MHF_MARKED;
}
/*
* If allocation failed during the last interval, allow a new attempt
* during the next.
*/
mempool_defer_alloc = FALSE;
/* Set the next timer. */
set_timer(&mempool_timer, MEMPOOL_TIMER_TICKS, mempool_tick, 0);
}
/*
* Initialize the memory pool module.
*/
void
mempool_init(void)
{
unsigned int slot;
/* These checks are for absolutely essential points. */
assert(sizeof(void *) == MEM_ALIGNMENT);
assert(sizeof(struct mempool_small_slab) <= MEMPOOL_LARGE_SIZE);
assert(offsetof(struct mempool_small_buf, msb_data) == sizeof(void *));
assert(offsetof(struct mempool_large_buf, mlb_data) == sizeof(void *));
/* Initialize module-local variables. */
LIST_INIT(&mempool_empty_slabs);
LIST_INIT(&mempool_partial_slabs);
LIST_INIT(&mempool_full_slabs);
TAILQ_INIT(&mempool_small_static_freelist);
TAILQ_INIT(&mempool_small_dynamic_freelist);
mempool_max_slabs = MEMPOOL_DEFAULT_MAX_SLABS;
mempool_nr_slabs = 0;
mempool_nr_large = 0;
mempool_used_large = 0;
mempool_used_small = 0;
mempool_defer_alloc = FALSE;
/* Initialize the static pool of small buffers. */
for (slot = 0; slot < __arraycount(mempool_small_pool); slot++)
mempool_prepare_small(&mempool_small_pool[slot],
TRUE /*is_static*/);
/*
* Allocate one large slab. The service needs at least one large slab
* for basic operation, and therefore will never deallocate the last.
*/
mempool_new_slab();
/* Set a regular low-frequency timer to deallocate unused slabs. */
set_timer(&mempool_timer, MEMPOOL_TIMER_TICKS, mempool_tick, 0);
/* Register the minix.lwip.mempool subtree. */
mibtree_register_lwip(&minix_lwip_mempool_node);
}
/*
* Return the total number of large buffers currently in the system, regardless
* of allocation status.
*/
unsigned int
mempool_cur_buffers(void)
{
return mempool_nr_large;
}
/*
* Return the maximum number of large buffers that the system has been allowed
* to allocate. Note that due to low-memory conditions, this maximum may not
* be allocated in practice even when desired.
*/
unsigned int
mempool_max_buffers(void)
{
if (mempool_max_slabs <= 1)
return MEMPOOL_LARGE_COUNT;
if ((size_t)mempool_max_slabs >
INT_MAX / sizeof(struct mempool_large_slab))
return INT_MAX / sizeof(struct mempool_large_slab);
return (size_t)mempool_max_slabs * MEMPOOL_LARGE_COUNT;
}
/*
* Allocate a large buffer, either by taking one off a free list or by
* allocating a new large slab. On success, return a pointer to the data area
* of the large buffer. This data area is exactly MEMPOOL_LARGE_SIZE bytes in
* size. If no large buffer could be allocated, return NULL.
*/
static void *
mempool_alloc_large(void)
{
struct mempool_large_slab *mls;
struct mempool_large_buf *mlb;
/*
* Find a large slab that has free large blocks. As is standard for
* slab allocation, favor partially used slabs over empty slabs for
* eventual consolidation. If both lists are empty, try allocating a
* new slab. If that fails, we are out of memory, and return NULL.
*/
if (!LIST_EMPTY(&mempool_partial_slabs))
mls = LIST_FIRST(&mempool_partial_slabs);
else {
if (LIST_EMPTY(&mempool_empty_slabs)) {
mempool_new_slab();
if (LIST_EMPTY(&mempool_empty_slabs))
return NULL; /* out of memory */
}
mls = LIST_FIRST(&mempool_empty_slabs);
}
/* Allocate a block from the slab that we picked. */
assert(mls != NULL);
assert(!LIST_EMPTY(&mls->mls_free));
mlb = LIST_FIRST(&mls->mls_free);
LIST_REMOVE(mlb, mlb_next);
assert(mlb->mlb_header == NULL);
assert(mlb->mlb_header2 == &mls->mls_header);
mlb->mlb_header = &mls->mls_header;
/*
* Adjust accounting for the large slab, which may involve moving it
* to another list.
*/
assert(mls->mls_header.mh_inuse < MEMPOOL_LARGE_COUNT);
mls->mls_header.mh_inuse++;
if (mls->mls_header.mh_inuse == MEMPOOL_LARGE_COUNT) {
LIST_REMOVE(mls, mls_next);
LIST_INSERT_HEAD(&mempool_full_slabs, mls, mls_next);
} else if (mls->mls_header.mh_inuse == 1) {
LIST_REMOVE(mls, mls_next);
LIST_INSERT_HEAD(&mempool_partial_slabs, mls, mls_next);
}
assert(mempool_used_large < mempool_nr_large);
mempool_used_large++;
/* Return the block's data area. */
return (void *)mlb->mlb_data;
}
/*
* Allocate a small buffer, either by taking one off a free list or by
* allocating a large buffer and splitting it up in new free small buffers. On
* success, return a pointer to the data area of the small buffer. This data
* area is exactly MEMPOOL_SMALL_SIZE bytes in size. If no small buffer could
* be allocated, return NULL.
*/
static void *
mempool_alloc_small(void)
{
struct mempool_small_slab *mss;
struct mempool_small_buf *msb;
struct mempool_header *mh;
/*
* Find a free small block and take it off the free list. Try the
* static free list before the dynamic one, so that after a peak in
* buffer usage we are likely to be able to free up the dynamic slabs
* quickly. If both lists are empty, try allocating a large block to
* divvy up into small blocks. If that fails, we are out of memory.
*/
if (!TAILQ_EMPTY(&mempool_small_static_freelist)) {
msb = TAILQ_FIRST(&mempool_small_static_freelist);
TAILQ_REMOVE(&mempool_small_static_freelist, msb, msb_next);
} else {
if (TAILQ_EMPTY(&mempool_small_dynamic_freelist)) {
mss =
(struct mempool_small_slab *)mempool_alloc_large();
if (mss == NULL)
return NULL; /* out of memory */
/* Initialize the small slab, including its blocks. */
mempool_prepare_small(mss, FALSE /*is_static*/);
}
msb = TAILQ_FIRST(&mempool_small_dynamic_freelist);
assert(msb != NULL);
TAILQ_REMOVE(&mempool_small_dynamic_freelist, msb, msb_next);
}
/* Mark the small block as allocated, and return its data area. */
assert(msb != NULL);
assert(msb->msb_header == NULL);
assert(msb->msb_header2 != NULL);
mh = msb->msb_header2;
msb->msb_header = mh;
assert(mh->mh_inuse < MEMPOOL_SMALL_COUNT);
mh->mh_inuse++;
mempool_used_small++;
return (void *)msb->msb_data;
}
/*
* Memory pool wrapper function for malloc() calls from lwIP.
*/
void *
mempool_malloc(size_t size)
{
/*
* It is currently expected that there will be allocation attempts for
* sizes larger than our large size, in particular for ICMP ping
* replies as described elsewhere. As such, we cannot print any
* warnings here. For now, refusing these excessive allocations should
* not be a problem in practice.
*/
if (size > MEMPOOL_LARGE_SIZE)
return NULL;
if (size <= MEMPOOL_SMALL_SIZE)
return mempool_alloc_small();
else
return mempool_alloc_large();
}
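/*
 * A sketch of how lwIP can be pointed at these wrappers, assuming the lwIP
 * 2.x option names (the actual lwipopts.h settings used by this service are
 * not part of this file):
 *
 *	#define MEM_LIBC_MALLOC		1
 *	#define mem_clib_malloc		mempool_malloc
 *	#define mem_clib_free		mempool_free
 *	#define mem_clib_calloc		mempool_calloc
 *
 * With MEM_LIBC_MALLOC enabled, lwIP's mem_malloc()/mem_free() become thin
 * wrappers around these hooks, so that all PBUF_RAM allocations end up in
 * this module.
 */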
/*
* Memory pool wrapper function for free() calls from lwIP.
*/
void
mempool_free(void * ptr)
{
struct mempool_large_slab *mls;
struct mempool_large_buf *mlb;
struct mempool_small_slab *mss;
struct mempool_small_buf *msb;
struct mempool_header *mh;
unsigned int count;
/*
* Get a pointer to the slab header, which is right before the data
* area for both large and small buffers. This pointer is NULL if the
* buffer is free, which would indicate that something is very wrong.
*/
ptr = (void *)((char *)ptr - sizeof(mh));
memcpy(&mh, ptr, sizeof(mh));
if (mh == NULL)
panic("mempool_free called on unallocated object!");
/*
* If the slab header says that the slab is for small buffers, deal
* with that case first. If we free up the last small buffer of a
* dynamically allocated small slab, we also free up the entire small
* slab, which is in fact the data area of a large buffer.
*/
if (mh->mh_flags & MHF_SMALL) {
/*
* Move the small buffer onto the appropriate small free list.
*/
msb = (struct mempool_small_buf *)ptr;
msb->msb_header2 = mh;
msb->msb_header = NULL;
/*
* Simple heuristic, unless the buffer is static: favor reuse
* of small buffers in containers that are already in use
* for other small buffers as well, for consolidation.
*/
if (mh->mh_flags & MHF_STATIC)
TAILQ_INSERT_HEAD(&mempool_small_static_freelist, msb,
msb_next);
else if (mh->mh_inuse > 1)
TAILQ_INSERT_HEAD(&mempool_small_dynamic_freelist, msb,
msb_next);
else
TAILQ_INSERT_TAIL(&mempool_small_dynamic_freelist, msb,
msb_next);
assert(mh->mh_inuse > 0);
mh->mh_inuse--;
assert(mempool_used_small > 0);
mempool_used_small--;
/*
* If the small buffer is statically allocated, or it was not
* the last allocated small buffer in its containing large
* buffer, then we are done.
*/
if (mh->mh_inuse > 0 || (mh->mh_flags & MHF_STATIC))
return;
/*
* Otherwise, free the containing large buffer as well. First,
* remove all its small buffers from the free list.
*/
mss = (struct mempool_small_slab *)mh;
msb = mss->mss_buf;
for (count = 0; count < MEMPOOL_SMALL_COUNT; count++, msb++) {
assert(msb->msb_header == NULL);
assert(msb->msb_header2 == mh);
TAILQ_REMOVE(&mempool_small_dynamic_freelist, msb,
msb_next);
}
/* Then, fall through to the large-buffer free code. */
ptr = (void *)((char *)mh - sizeof(mh));
memcpy(&mh, ptr, sizeof(mh));
assert(mh != NULL);
assert(!(mh->mh_flags & MHF_SMALL));
}
/*
* Move the large buffer onto the free list of the large slab to which
* it belongs.
*/
mls = (struct mempool_large_slab *)mh;
mlb = (struct mempool_large_buf *)ptr;
mlb->mlb_header2 = &mls->mls_header;
mlb->mlb_header = NULL;
LIST_INSERT_HEAD(&mls->mls_free, mlb, mlb_next);
/*
* Adjust accounting for the large slab, which may involve moving it
* to another list.
*/
assert(mls->mls_header.mh_inuse > 0);
mls->mls_header.mh_inuse--;
if (mls->mls_header.mh_inuse == 0) {
LIST_REMOVE(mls, mls_next);
LIST_INSERT_HEAD(&mempool_empty_slabs, mls, mls_next);
mls->mls_header.mh_flags &= ~MHF_MARKED;
} else if (mls->mls_header.mh_inuse == MEMPOOL_LARGE_COUNT - 1) {
LIST_REMOVE(mls, mls_next);
LIST_INSERT_HEAD(&mempool_partial_slabs, mls, mls_next);
}
assert(mempool_used_large > 0);
mempool_used_large--;
}
/*
* Memory pool wrapper function for calloc() calls from lwIP.
*/
void *
mempool_calloc(size_t num, size_t size)
{
void *ptr;
size_t total;
/*
* Standard overflow check. This can be improved, but it doesn't have
* to be, because in practice lwIP never calls calloc() anyway.
*/
if (num > 0 && size > 0 && (size_t)-1 / size < num)
return NULL;
total = num * size;
if ((ptr = mempool_malloc(total)) == NULL)
return NULL;
memset(ptr, 0, total);
return ptr;
}

141
minix/net/lwip/mibtree.c Normal file

@ -0,0 +1,141 @@
/* LWIP service - mibtree.c - sysctl support for net and minix.lwip trees */
/*
* This file acts as a dispatcher for the net.inet, net.inet6, and minix.lwip
* sysctl trees. It does not cover the other net.* trees; these are taken care
* of in other source files.
*/
#include "lwip.h"
#include <minix/sysctl.h>
#define MAX_PROTO 6 /* maximum # of INET protocols with subtrees */
static struct rmib_indir net_inet_indir[MAX_PROTO];
static unsigned int net_inet_indir_count = 0;
static struct rmib_node net_inet_node =
RMIB_SNODE(RMIB_RO, net_inet_indir, "inet", "PF_INET related settings");
#ifdef INET6
static struct rmib_indir net_inet6_indir[MAX_PROTO];
static unsigned int net_inet6_indir_count = 0;
static struct rmib_node net_inet6_node =
RMIB_SNODE(RMIB_RO, net_inet6_indir, "inet6", "PF_INET6 related settings");
#endif /* INET6 */
#define MAX_LWIP 4 /* maximum # of miscellaneous LWIP subtrees */
static struct rmib_indir minix_lwip_indir[MAX_LWIP];
static unsigned int minix_lwip_indir_count = 0;
static struct rmib_node minix_lwip_node =
RMIB_SNODE(RMIB_RO, minix_lwip_indir, "lwip",
"LWIP service information and settings");
/*
* Initialize the status module by registering the net.inet, net.inet6, and
* minix.lwip trees with the MIB service. Other modules must have added all
* subtrees to those trees through mibtree_register_*() before this point.
*/
void
mibtree_init(void)
{
const int inet_mib[] = { CTL_NET, PF_INET };
#ifdef INET6
const int inet6_mib[] = { CTL_NET, PF_INET6 };
#endif /* INET6 */
const int lwip_mib[] = { CTL_MINIX, MINIX_LWIP };
int r;
/*
* Register the "net.inet", "net.inet6", and "minix.lwip" subtrees with
* the MIB service.
*
* These calls only return local failures. Remote failures (in the MIB
* service) are silently ignored. So, we can safely panic on failure.
*/
if ((r = rmib_register(inet_mib, __arraycount(inet_mib),
&net_inet_node)) != OK)
panic("unable to register net.inet RMIB tree: %d", r);
#ifdef INET6
if ((r = rmib_register(inet6_mib, __arraycount(inet6_mib),
&net_inet6_node)) != OK)
panic("unable to register net.inet6 RMIB tree: %d", r);
#endif /* INET6 */
if ((r = rmib_register(lwip_mib, __arraycount(lwip_mib),
&minix_lwip_node)) != OK)
panic("unable to register minix.lwip RMIB tree: %d", r);
}
/*
* Add a subtree to the local net.inet or net.inet6 tree. This function must
* only be called *before* mibtree_init(), as the latter will register the
* final tree with the MIB service.
*/
void
mibtree_register_inet(int domain, int protocol, struct rmib_node * node)
{
struct rmib_node *parent;
struct rmib_indir *indir;
unsigned int i, *count;
switch (domain) {
case PF_INET:
parent = &net_inet_node;
indir = net_inet_indir;
count = &net_inet_indir_count;
break;
case PF_INET6:
#ifdef INET6
parent = &net_inet6_node;
indir = net_inet6_indir;
count = &net_inet6_indir_count;
break;
#else /* !INET6 */
return;
#endif /* !INET6 */
default:
panic("invalid domain %d", domain);
}
assert(*count < MAX_PROTO);
/* Insertion sort. */
for (i = 0; i < *count; i++) {
assert(indir[i].rindir_id != (unsigned int)protocol);
if (indir[i].rindir_id > (unsigned int)protocol)
break;
}
if (i < *count)
memmove(&indir[i + 1], &indir[i],
sizeof(indir[0]) * (*count - i));
indir[i].rindir_id = protocol;
indir[i].rindir_node = node;
parent->rnode_size = ++*count;
}
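/*
 * Hypothetical example (the actual protocol modules elsewhere in this commit
 * register their own tables): a UDP module would call, before mibtree_init(),
 *
 *	mibtree_register_inet(PF_INET, IPPROTO_UDP, &net_inet_udp_node);
 *
 * after which the given subtree appears to userland as net.inet.udp.
 */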
/*
* Add a miscellaneous subtree to the local minix.lwip tree. This function
* must only be called *before* mibtree_init(), as the latter will register the
* final tree with the MIB service. Note that the given subtrees are numbered
* arbitrarily. We use sparse trees here only to avoid having to declare
* external variables, which is a bit of a hack, but with the expected low
* number of miscellaneous subtrees there will be no performance penalty.
*/
void
mibtree_register_lwip(struct rmib_node * node)
{
unsigned int i;
i = minix_lwip_indir_count;
assert(i < __arraycount(minix_lwip_indir));
minix_lwip_indir[i].rindir_id = i;
minix_lwip_indir[i].rindir_node = node;
minix_lwip_node.rnode_size = ++minix_lwip_indir_count;
}

1019
minix/net/lwip/ndev.c Normal file

File diff suppressed because it is too large

33
minix/net/lwip/ndev.h Normal file

@ -0,0 +1,33 @@
#ifndef MINIX_NET_LWIP_NDEV_H
#define MINIX_NET_LWIP_NDEV_H
/* The maximum supported number of network device drivers. */
#define NR_NDEV 8
typedef uint32_t ndev_id_t;
struct ndev_hwaddr {
uint8_t nhwa_addr[NDEV_HWADDR_MAX];
};
struct ndev_conf {
uint32_t nconf_set; /* fields to set (NDEV_SET_) */
uint32_t nconf_mode; /* desired mode (NDEV_MODE_) */
struct ndev_hwaddr *nconf_mclist; /* multicast list pointer */
size_t nconf_mccount; /* multicast list count */
uint32_t nconf_caps; /* capabilities (NDEV_CAP_) */
uint32_t nconf_flags; /* flags to set (NDEV_FLAG_) */
uint32_t nconf_media; /* media selection (IFM_) */
struct ndev_hwaddr nconf_hwaddr; /* desired hardware address */
};
void ndev_init(void);
void ndev_check(void);
void ndev_process(const message * m_ptr, int ipc_status);
int ndev_conf(ndev_id_t id, const struct ndev_conf * nconf);
int ndev_send(ndev_id_t id, const struct pbuf * pbuf);
int ndev_can_recv(ndev_id_t id);
int ndev_recv(ndev_id_t id, struct pbuf * pbuf);
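/*
 * Illustrative use of ndev_conf(); NDEV_SET_MODE and NDEV_MODE_UP are assumed
 * names from the ndev protocol definitions, which are not part of this
 * header:
 *
 *	struct ndev_conf nconf;
 *
 *	memset(&nconf, 0, sizeof(nconf));
 *	nconf.nconf_set = NDEV_SET_MODE;
 *	nconf.nconf_mode = NDEV_MODE_UP;
 *	(void)ndev_conf(id, &nconf);
 *
 * Only the fields selected by nconf_set are applied as part of the request;
 * the other fields are ignored.
 */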
#endif /* !MINIX_NET_LWIP_NDEV_H */

154
minix/net/lwip/pchain.c Normal file

@ -0,0 +1,154 @@
/* LWIP service - pchain.c - pbuf chain utility functions */
#include "lwip.h"
/*
* Allocate a chain of pbuf buffers as though it were a PBUF_POOL allocation,
* except that each buffer is of type PBUF_RAM. Return the pbuf chain on
* success, or NULL on memory allocation failure.
*/
struct pbuf *
pchain_alloc(int layer, size_t size)
{
struct pbuf *pbuf, *phead, **pnext;
size_t chunk, left;
int offset = 0;
/*
* Check for length overflow. Note that we do this before prepending
* the header, because otherwise we could never send a full-sized
* (65535-byte) IP packet. This does mean that we are generating a
* pbuf chain that has over 64KB worth of allocated space, but our
 * header hiding ensures that tot_len stays under 64KB.  A check in
 * pbuf_header() prevents later header adjustments from lifting tot_len
 * over this limit.
*/
if (size > UINT16_MAX)
return NULL;
/*
* Unfortunately, we have no choice but to replicate this block from
* lwIP's pbuf_alloc() code. It is however unlikely that the offsets
* change for the currently supported layer types, and we do not need
* to support any layer types that we do not use ourselves.
*/
switch (layer) {
case PBUF_TRANSPORT:
offset = PBUF_LINK_ENCAPSULATION_HLEN + PBUF_LINK_HLEN +
PBUF_IP_HLEN + PBUF_TRANSPORT_HLEN;
break;
case PBUF_IP:
offset = PBUF_LINK_ENCAPSULATION_HLEN + PBUF_LINK_HLEN +
PBUF_IP_HLEN;
break;
case PBUF_LINK:
offset = PBUF_LINK_ENCAPSULATION_HLEN + PBUF_LINK_HLEN;
break;
case PBUF_RAW_TX:
offset = PBUF_LINK_ENCAPSULATION_HLEN;
break;
case PBUF_RAW:
offset = 0;
break;
default:
panic("invalid pbuf layer: %d", layer);
}
chunk = size + offset;
if (chunk > MEMPOOL_BUFSIZE)
chunk = MEMPOOL_BUFSIZE;
if ((phead = pbuf_alloc(PBUF_RAW, chunk, PBUF_RAM)) == NULL)
return NULL;
if (offset > 0)
util_pbuf_header(phead, -offset);
phead->tot_len = size;
pnext = &phead->next;
for (left = size - (chunk - offset); left > 0; left -= chunk) {
chunk = (left < MEMPOOL_BUFSIZE) ? left : MEMPOOL_BUFSIZE;
if ((pbuf = pbuf_alloc(PBUF_RAW, chunk, PBUF_RAM)) == NULL) {
/*
* Adjust tot_len to match the actual length of the
* chain so far, just in case pbuf_free() starts caring
* about this in the future.
*/
for (pbuf = phead; pbuf != NULL; pbuf = pbuf->next)
pbuf->tot_len -= left;
pbuf_free(phead);
return NULL;
}
pbuf->tot_len = left;
*pnext = pbuf;
pnext = &pbuf->next;
}
return phead;
}
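/*
 * Worked example, assuming lwIP's default header lengths (a combined
 * PBUF_TRANSPORT offset of 54 bytes) and a MEMPOOL_BUFSIZE of 512:
 * pchain_alloc(PBUF_TRANSPORT, 1000) produces a three-buffer chain.  The
 * first buffer stores 512 - 54 = 458 bytes of data behind the hidden header
 * space, and the remaining 542 bytes are spread over two more buffers of
 * 512 and 30 bytes.  The resulting tot_len of the chain head is exactly
 * 1000.
 */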
/*
* Given the (non-empty) chain of buffers 'pbuf', return a pointer to the
* 'next' field of the last buffer in the chain. This function is packet queue
* friendly. A packet queue is a queue of packet chains, where each chain is
* delimited using the 'tot_len' field. As a result, while the pointer
* returned is never NULL, the value pointed to by the returned pointer may or
 * may not be NULL (and will point to the next chain if not NULL).  As a
 * notable exception, in cases where the buffer type is a single PBUF_REF,
 * 'tot_len' may be zero and 'len' may be non-zero.  In such cases, the chain
 * consists of that single buffer only.  This function must handle that case
 * as well.
*/
struct pbuf **
pchain_end(struct pbuf * pbuf)
{
assert(pbuf != NULL);
while (pbuf->tot_len > pbuf->len) {
pbuf = pbuf->next;
assert(pbuf != NULL);
}
return &pbuf->next;
}
/*
* Given the (non-empty) chain of buffers 'pbuf', return a byte size estimation
* of the memory used by the chain, rounded up to pool buffer sizes. This
* function is packet queue friendly.
*/
size_t
pchain_size(struct pbuf * pbuf)
{
size_t size;
assert(pbuf != NULL);
/*
* Count the first buffer separately, as its length may be seriously
* off due to header hiding. While the caller should always provide
* exactly the same pbuf chain twice if it intends to get back the same
* size twice, this also protects against accidental size differences
* due to header hiding in that case.
*/
size = MEMPOOL_BUFSIZE;
/*
* Round up the size of the rest of the chain to whole buffers.
*/
if (pbuf->tot_len > pbuf->len) {
size += pbuf->tot_len - pbuf->len + MEMPOOL_BUFSIZE - 1;
size -= size % MEMPOOL_BUFSIZE;
}
return size;
}
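/*
 * Continuing the example above: for a 1000-byte chain whose first buffer
 * holds 458 bytes, this returns 512 + roundup(1000 - 458) = 512 + 1024 =
 * 1536 bytes, i.e., the equivalent of three full pool buffers.
 */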

1236
minix/net/lwip/pktsock.c Normal file

File diff suppressed because it is too large

63
minix/net/lwip/pktsock.h Normal file

@ -0,0 +1,63 @@
#ifndef MINIX_NET_LWIP_PKTSOCK_H
#define MINIX_NET_LWIP_PKTSOCK_H
#include "mcast.h"
/* Packet-level socket, shared by UDP and RAW. */
struct pktsock {
struct ipsock pkt_ipsock; /* IP socket object, MUST be first */
struct pbuf *pkt_rcvhead; /* receive buffer, first packet */
struct pbuf **pkt_rcvtailp; /* receive buffer, last ptr-ptr */
size_t pkt_rcvlen; /* receive buffer, length in bytes */
struct mcast_head pkt_mcast; /* multicast membership list */
ip6_addr_p_t pkt_srcaddr; /* IPV6_PKTINFO: source address */
uint32_t pkt_ifindex; /* IPV6_PKTINFO: interface index */
};
#define pktsock_get_ipsock(pkt) (&(pkt)->pkt_ipsock)
#define pktsock_get_ifindex(pkt) ((pkt)->pkt_ifindex)
/* Options when sending packets. */
struct pktopt {
uint8_t pkto_flags; /* packet send flags (PKTOF_) */
uint8_t pkto_tos; /* type of service for the packet */
uint8_t pkto_ttl; /* time-to-live for the packet */
uint8_t pkto_mcast_ttl; /* time-to-live for multicast packet */
ip6_addr_p_t pkto_srcaddr; /* IPV6_PKTINFO: source address */
unsigned int pkto_ifindex; /* IPV6_PKTINFO: interface index */
};
#define PKTOF_TTL 0x01 /* send packet with custom TTL value */
#define PKTOF_TOS 0x02 /* send packet with custom TOS value */
#define PKTOF_PKTINFO 0x04 /* send packet with src addr, on if. */
int pktsock_socket(struct pktsock * pkt, int domain, size_t sndbuf,
size_t rcvbuf, struct sock ** sockp);
int pktsock_test_input(struct pktsock * pkt, struct pbuf * pbuf);
void pktsock_input(struct pktsock * pkt, struct pbuf * pbuf,
const ip_addr_t * srcaddr, uint16_t port);
int pktsock_get_pktinfo(struct pktsock * pkt, struct pktopt * pkto,
struct ifdev ** ifdevp, ip_addr_t * src_addrp);
int pktsock_get_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl,
socklen_t ctl_len, struct pktopt * pkto);
int pktsock_get_data(struct pktsock * pkt, const struct sockdriver_data * data,
size_t len, struct pbuf * pbuf);
int pktsock_pre_recv(struct sock * sock, endpoint_t user_endpt, int flags);
int pktsock_recv(struct sock * sock, const struct sockdriver_data * data,
size_t len, size_t * off, const struct sockdriver_data * ctl,
socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr,
socklen_t * addr_len, endpoint_t user_endpt, int flags, size_t min,
int * rflags);
int pktsock_test_recv(struct sock * sock, size_t min, size_t * size);
void pktsock_set_mcaware(struct pktsock * pkt);
int pktsock_setsockopt(struct pktsock * pkt, int level, int name,
const struct sockdriver_data * data, socklen_t len,
struct ipopts * ipopts);
int pktsock_getsockopt(struct pktsock * pkt, int level, int name,
const struct sockdriver_data * data, socklen_t * len,
struct ipopts * ipopts);
void pktsock_shutdown(struct pktsock * pkt, unsigned int mask);
void pktsock_close(struct pktsock * pkt);
size_t pktsock_get_recvlen(struct pktsock * pkt);
#endif /* !MINIX_NET_LWIP_PKTSOCK_H */

1341
minix/net/lwip/rawsock.c Normal file

File diff suppressed because it is too large

1654
minix/net/lwip/route.c Normal file

File diff suppressed because it is too large

39
minix/net/lwip/route.h Normal file

@ -0,0 +1,39 @@
#ifndef MINIX_NET_LWIP_ROUTE_H
#define MINIX_NET_LWIP_ROUTE_H
#include <net/route.h>
struct route_entry;
struct rtsock_request;
void route_init(void);
int route_add(const ip_addr_t * addr, unsigned int prefix,
const ip_addr_t * gateway, struct ifdev * ifdev, unsigned int flags,
const struct rtsock_request * rtr);
int route_can_add(const ip_addr_t * addr, unsigned int prefix, int is_host);
struct route_entry *route_find(const ip_addr_t * addr, unsigned int prefix,
int is_host);
struct route_entry *route_lookup(const ip_addr_t * addr);
void route_delete(struct route_entry * route,
const struct rtsock_request * rtr);
void route_clear(struct ifdev * ifdev);
int route_process(unsigned int type, const struct sockaddr * dst,
const struct sockaddr * mask, const struct sockaddr * gateway,
const struct sockaddr * ifp, const struct sockaddr * ifa,
unsigned int flags, unsigned long inits,
const struct rt_metrics * rmx, const struct rtsock_request * rtr);
void route_get(const struct route_entry * route, union sockaddr_any * addr,
union sockaddr_any * mask, union sockaddr_any * gateway,
union sockaddr_any * ifp, union sockaddr_any * ifa,
struct ifdev ** ifdev, unsigned int * flags, unsigned int * use);
unsigned int route_get_flags(const struct route_entry * route);
struct ifdev *route_get_ifdev(const struct route_entry * route);
int route_is_ipv6(const struct route_entry * route);
struct route_entry *route_enum_v4(struct route_entry * last);
struct route_entry *route_enum_v6(struct route_entry * last);
int route_output_v4(struct ifdev * ifdev, const ip4_addr_t * ipaddr,
err_t * err);
int route_output_v6(struct ifdev * ifdev, const ip6_addr_t * ipaddr,
err_t * err);
#endif /* !MINIX_NET_LWIP_ROUTE_H */

1912
minix/net/lwip/rtsock.c Normal file

File diff suppressed because it is too large

32
minix/net/lwip/rtsock.h Normal file

@ -0,0 +1,32 @@
#ifndef MINIX_NET_LWIP_RTSOCK_H
#define MINIX_NET_LWIP_RTSOCK_H
#include "ifaddr.h"
#include "lldata.h"
struct route_entry;
struct rtsock_request;
void rtsock_init(void);
sockid_t rtsock_socket(int type, int protocol, struct sock ** sock,
const struct sockevent_ops ** ops);
void rtsock_msg_ifannounce(struct ifdev * ifdev, int arrival);
void rtsock_msg_ifinfo(struct ifdev * ifdev);
void rtsock_msg_addr_dl(struct ifdev * ifdev, unsigned int type,
ifaddr_dl_num_t num);
void rtsock_msg_addr_v4(struct ifdev * ifdev, unsigned int type,
ifaddr_v4_num_t num);
void rtsock_msg_addr_v6(struct ifdev * ifdev, unsigned int type,
ifaddr_v6_num_t num);
void rtsock_msg_miss(const struct sockaddr * addr);
void rtsock_msg_route(const struct route_entry * route, unsigned int type,
const struct rtsock_request * rtr);
void rtsock_msg_arp(lldata_arp_num_t num, unsigned int type,
const struct rtsock_request * rtr);
void rtsock_msg_ndp(lldata_ndp_num_t num, unsigned int type,
const struct rtsock_request * rtr);
#endif /* !MINIX_NET_LWIP_RTSOCK_H */

744
minix/net/lwip/rttree.c Normal file

@ -0,0 +1,744 @@
/* LWIP service - rttree.c - generic routing tree data structure */
/*
* This module implements the Net/3 binary radix (Patricia) tree as described
* in TCP/IP Illustrated Vol.2, with a few important changes. First and
* foremost, we make the assumption that all address masks are "normal", i.e.,
* they can be expressed in terms of a "prefix length" or "bit count", meaning
* that the first so many bits of the mask are set and the remaining bits are
* all clear. Based on this assumption, we store routing entries not just in
* leaf nodes, but rather in a node at the bit count of the routing entry's
* mask; this node may then also have children. As a result, instead of "leaf"
* and "internal" nodes, this module instead uses "data" and "link" nodes:
*
* - Data nodes are nodes with an associated routing entry. The data node
* structure is always the first field of its corresponding routing entry
* structure. Data nodes may have zero, one, or two children. Its children
* are always a refinement of the address mask in the routing entry.
* - Link nodes are nodes with no associated routing entry. They always have
* exactly two children. As with BSD's "internal" nodes: since the tree
* needs no more than one link node per routing entry, each routing entry
* structure contains a link node, which may be used anywhere in the tree.
*
* The result of this approach is that we do not use a linked list for each
* leaf, since entries with the same address and different masks are not stored
* as part of the same leaf node. There is however still one case where a
* linked list would be necessary: the coexistence of a full-mask network entry
* and a host entry (net/32 vs host for IPv4, net/128 vs host for IPv6). Since
* this tree implementation is not used for ARP/ND6 (host) entries, the need to
* support that case is not as high, and so it is currently not supported. It
 * can be added later if needed.  In that case, only the prototype of
 * rttree_lookup_exact() will have to be changed, since rttree_add() already
 * supports the difference by passing a full mask vs passing no mask at all.
*
* There are other differences with the BSD implementation, and certainly also
* more opportunities for improving performance. For now, the implementation
* should be good enough for its intended purpose.
*/
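/*
 * A small worked example, for illustration only.  After adding the IPv4
 * entries 0.0.0.0/0, 10.0.0.0/8, and 10.1.0.0/16 to an empty tree, each
 * entry is a data node that is a child of the previous, wider entry; no
 * link nodes are in use.  Adding 10.2.0.0/16 then requires a BRANCH:
 * 10.1.0.0 and 10.2.0.0 first differ at bit 14, so a link node with bit
 * count 14 is interposed below the 10.0.0.0/8 data node, with the two /16
 * data nodes as its children.
 */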
#include "lwip.h"
#include "rttree.h"
#define RTTREE_BITS_TO_BYTE(bits) ((bits) >> 3)
#define RTTREE_BITS_TO_SHIFT(bits) (7 - ((bits) & 7))
#define RTTREE_BITS_TO_BYTES(bits) (RTTREE_BITS_TO_BYTE((bits) + 7))
/*
* The given node is being added to the given routing tree, and just had its
* bit count assigned. Precompute any additional fields used for fast address
* access on the node.
*/
static void
rttree_precompute(struct rttree * tree __unused, struct rttree_node * node)
{
node->rtn_byte = RTTREE_BITS_TO_BYTE(node->rtn_bits);
node->rtn_shift = RTTREE_BITS_TO_SHIFT(node->rtn_bits);
}
/*
* For an operation on the routing tree 'tree', test whether the bit 'bit' is
* set or clear in 'addr'. Return 1 if the address has the bit set, 0 if it
* does not.
*/
static unsigned int
rttree_test(const struct rttree * tree __unused, const void * addr,
unsigned int bit)
{
unsigned int byte, shift;
byte = RTTREE_BITS_TO_BYTE(bit);
shift = RTTREE_BITS_TO_SHIFT(bit);
return (((const uint8_t *)addr)[byte] >> shift) & 1;
}
/*
* For an operation on the routing tree 'tree', test whether a particular bit
* as identified by the routing node 'node' is set or clear in 'address',
* effectively computing the side (left or right) to take when descending down
* the tree. Return 1 if the address has the bit set, 0 if it does not.
*/
static inline unsigned int
rttree_side(const struct rttree * tree __unused,
const struct rttree_node * node, const void * addr)
{
return (((const uint8_t *)addr)[node->rtn_byte] >>
node->rtn_shift) & 1;
}
/*
* Check for the routing tree 'tree' whether the routing entry 'entry' matches
* the address 'addr' exactly. Return TRUE or FALSE depending on the outcome.
* This function must be called only on entries that have already been
* determined to span the full bit width.
*/
static inline int
rttree_equals(const struct rttree * tree, const struct rttree_entry * entry,
const void * addr)
{
unsigned int bits;
bits = tree->rtt_bits;
assert(bits == entry->rte_data.rtn_bits);
return !memcmp(entry->rte_addr, addr, RTTREE_BITS_TO_BYTE(bits));
}
/*
* Check for the routing tree 'tree' whether the routing entry 'entry' matches
* the address 'addr'. Return TRUE if the address is matched by the entry's
* address and mask, or FALSE if not.
*/
static inline int
rttree_match(const struct rttree * tree, const struct rttree_entry * entry,
const void * addr)
{
const uint8_t *aptr, *aptr2, *mptr;
unsigned int bits, bytes;
if ((bits = entry->rte_data.rtn_bits) == 0)
return TRUE;
if ((mptr = (const uint8_t *)entry->rte_mask) == NULL)
return rttree_equals(tree, entry, addr);
aptr = (const uint8_t *)addr;
aptr2 = (const uint8_t *)entry->rte_addr;
for (bytes = RTTREE_BITS_TO_BYTES(bits); bytes > 0; bytes--) {
if ((*aptr & *mptr) != *aptr2)
return FALSE;
aptr++;
aptr2++;
mptr++;
}
return TRUE;
}
/*
* Find the first bit that differs between the two given addresses. Return the
* bit number if found, or the full bit width if the addresses are equal.
*/
static unsigned int
rttree_diff(const struct rttree * tree, const void * addr, const void * addr2)
{
const uint8_t *aptr, *aptr2;
unsigned int bit, i;
uint8_t b;
aptr = (const uint8_t *)addr;
aptr2 = (const uint8_t *)addr2;
for (bit = 0; bit < tree->rtt_bits; bit += NBBY, aptr++, aptr2++) {
if ((b = *aptr ^ *aptr2) != 0) {
for (i = 0; i < NBBY; i++)
if (b & (1 << (NBBY - i - 1)))
break;
return bit + i;
}
}
return bit;
}
/*
* Add a link node to the free list of the given routing tree, marking it as
* free in the process.
*/
static void
rttree_add_free(struct rttree * tree, struct rttree_node * node)
{
node->rtn_child[0] = NULL;
if ((node->rtn_child[1] = tree->rtt_free) != NULL)
node->rtn_child[1]->rtn_child[0] = node;
tree->rtt_free = node;
node->rtn_parent = NULL;
node->rtn_type = RTNT_FREE;
}
/*
* Remove the given free link node from the free list. The caller must already
* have verified that the node is on the free list, and has to change the node
* type as appropriate afterward.
*/
static void
rttree_del_free(struct rttree * tree, struct rttree_node * node)
{
assert(node->rtn_type == RTNT_FREE);
if (node->rtn_child[0] != NULL)
node->rtn_child[0]->rtn_child[1] = node->rtn_child[1];
else
tree->rtt_free = node->rtn_child[1];
if (node->rtn_child[1] != NULL)
node->rtn_child[1]->rtn_child[0] = node->rtn_child[0];
}
/*
* Obtain, remove, and return a free link node from the free list. This
* function must be called only when it is already known that the free list is
* not empty. The caller has to change the node type as appropriate afterward.
*/
static struct rttree_node *
rttree_get_free(struct rttree * tree)
{
struct rttree_node * node;
node = tree->rtt_free;
assert(node != NULL);
assert(node->rtn_type == RTNT_FREE);
rttree_del_free(tree, node);
return node;
}
/*
* Initialize the given routing tree, with the given address bit width.
*/
void
rttree_init(struct rttree * tree, unsigned int bits)
{
tree->rtt_root = NULL;
tree->rtt_free = NULL;
tree->rtt_bits = bits;
}
/*
* Look up the most narrow routing tree entry that matches the given address.
* Return the entry on success, or NULL if no matching entry is found.
*/
struct rttree_entry *
rttree_lookup_match(struct rttree * tree, const void * addr)
{
struct rttree_entry *entry, *best;
struct rttree_node *node;
unsigned int side;
/*
* The current implementation is "forward-tracking", testing all
* potentially matching entries while descending into the tree and
* remembering the "best" (narrowest matching) entry. The assumption
* here is that most lookups will end up returning the default route or
* another broad route, and thus quickly fail a narrower match and bail
* out early. This assumption is in part motivated by the fact that
* our routing trees do not store link-layer (ARP/ND6) entries. If
* desired, the implementation can easily be rewritten to do
* backtracking instead.
*/
best = NULL;
for (node = tree->rtt_root; node != NULL;
node = node->rtn_child[side]) {
if (node->rtn_type == RTNT_DATA) {
entry = (struct rttree_entry *)node;
if (!rttree_match(tree, entry, addr))
break;
best = entry;
}
side = rttree_side(tree, node, addr);
}
return best;
}
/*
* Look up a routing entry that is an exact match for the given (full) address.
* Return the entry if it was found, or NULL otherwise.
*/
struct rttree_entry *
rttree_lookup_host(struct rttree * tree, const void * addr)
{
struct rttree_entry *entry;
struct rttree_node *node;
unsigned int side;
for (node = tree->rtt_root; node != NULL;
node = node->rtn_child[side]) {
if (node->rtn_type == RTNT_DATA &&
node->rtn_bits == tree->rtt_bits) {
entry = (struct rttree_entry *)node;
if (rttree_equals(tree, entry, addr))
return entry;
break;
}
side = rttree_side(tree, node, addr);
}
return NULL;
}
/*
* Look up a routing entry that is an exact match for the given address and
* prefix length. Return the entry if found, or NULL otherwise.
*/
struct rttree_entry *
rttree_lookup_exact(struct rttree * tree, const void * addr,
unsigned int prefix)
{
struct rttree_entry *entry;
struct rttree_node *node;
unsigned int side;
for (node = tree->rtt_root; node != NULL && node->rtn_bits <= prefix;
node = node->rtn_child[side]) {
if (node->rtn_type == RTNT_DATA) {
entry = (struct rttree_entry *)node;
if (!rttree_match(tree, entry, addr))
return NULL;
if (node->rtn_bits == prefix)
return entry;
}
side = rttree_side(tree, node, addr);
}
return NULL;
}
/*
* Enumerate entries in the routing tree. If 'last' is NULL, return the first
* entry. Otherwise, return the next entry starting from 'last'. In both
* cases, if no (more) entries are present in the tree, return NULL. The order
* of the returned entries is stable across tree modifications and the function
* may be called multiple times on the same entry. More specifically, it is
* safe to continue enumeration from a previous entry after deleting its
* successor from the tree.
*/
struct rttree_entry *
rttree_enum(struct rttree * tree, struct rttree_entry * last)
{
struct rttree_node *node, *parent;
/*
* For the first query, we may have to return the tree root right away.
* For subsequent queries, we have to move ahead by at least one node.
*/
if (last == NULL) {
if ((node = tree->rtt_root) == NULL)
return NULL;
if (node->rtn_type == RTNT_DATA)
return (struct rttree_entry *)node;
} else
node = &last->rte_data;
/* A basic iterative pre-order binary-tree depth-first search. */
do {
assert(node != NULL);
/* Can we descend further, either left or right? */
if (node->rtn_child[0] != NULL)
node = node->rtn_child[0];
else if (node->rtn_child[1] != NULL)
node = node->rtn_child[1];
else {
/*
* No. Go back up the tree, until we can go right
 * where we went left before, or run out of tree.
*/
for (;; node = parent) {
if ((parent = node->rtn_parent) == NULL)
return NULL;
if (parent->rtn_child[0] == node &&
parent->rtn_child[1] != NULL) {
node = parent->rtn_child[1];
break;
}
}
}
/* Skip link nodes. */
} while (node->rtn_type != RTNT_DATA);
return (struct rttree_entry *)node;
}
/*
* Set the node 'node' to be part of tree 'tree', with type 'type' (either
* RTNT_DATA or RTNT_LINK) and a bit count of 'prefix'. The node is set to be
* a child of 'parent' on side 'side', unless 'parent' is NULL in which case
* the node is set to be the topmost node in the tree (and 'side' is ignored).
* The node's children are set to 'left' and 'right'; for each, if not NULL,
* its parent is set to 'node'.
*/
static void
rttree_set(struct rttree * tree, struct rttree_node * node, int type,
unsigned int prefix, struct rttree_node * parent, int side,
struct rttree_node * left, struct rttree_node * right)
{
assert(type == RTNT_DATA || type == RTNT_LINK);
assert(prefix <= tree->rtt_bits);
assert(side == 0 || side == 1);
node->rtn_type = type;
node->rtn_bits = prefix;
/* With rtn_bits assigned, precompute any derived fields. */
rttree_precompute(tree, node);
if ((node->rtn_parent = parent) != NULL)
parent->rtn_child[side] = node;
else
tree->rtt_root = node;
if ((node->rtn_child[0] = left) != NULL)
left->rtn_parent = node;
if ((node->rtn_child[1] = right) != NULL)
right->rtn_parent = node;
}
/*
* In the routing tree 'tree', replace old node 'onode' with new node 'node',
* setting the type of the latter to 'type'. The tree is updated accordingly,
* but it is left up to the caller to deal with the old node as appropriate.
*/
static void
rttree_replace(struct rttree * tree, struct rttree_node * onode,
struct rttree_node * node, int type)
{
struct rttree_node *parent;
unsigned int side;
/*
* Replacing one data node with another data node is not something that
* is currently being done, even if it would work.
*/
assert(onode->rtn_type != RTNT_DATA || node->rtn_type != RTNT_DATA);
assert(onode->rtn_child[0] != NULL);
assert(onode->rtn_child[1] != NULL);
parent = onode->rtn_parent;
side = (parent != NULL && parent->rtn_child[1] == onode);
rttree_set(tree, node, type, onode->rtn_bits, parent, side,
onode->rtn_child[0], onode->rtn_child[1]);
}
/*
* Add a new routing entry 'entry' to the routing tree 'tree'. The entry
* object will be initialized as a result. The address to add is given as
 * 'addr', and the address mask as 'mask'.  Both of those pointers must point
* to memory that is as long-lived as the routing entry; this is typically
* accomplished by storing them in a larger object that embeds 'entry'.
* However, 'mask' may be NULL, signifying a host type entry with an implied
* full mask. If not NULL, the given mask must be normalized, i.e., it must
* consist of a run of zero or more 1-bits followed by a remainder of only
* 0-bits. The number of 1-bits must also be given as a bit count 'prefix',
* even if 'mask' is NULL. The address must be normalized to its mask: no bits
* starting from bit 'prefix' must be set in 'addr'. Return OK if adding the
* routing entry succeeded, or EEXIST if an entry already exists for the
* combination of that address and mask. If the caller has already verified
* with rttree_lookup_exact() that no such entry exists, the call will succeed.
*/
int
rttree_add(struct rttree * tree, struct rttree_entry * entry,
const void * addr, const void * mask, unsigned int prefix)
{
struct rttree_node *node, *parent, *link;
struct rttree_entry *other_entry;
unsigned int bit, side, side2;
int match;
assert(mask != NULL || prefix == tree->rtt_bits);
/*
* We start by determining the path, bit count, and method of the
* addition. We do this with a lookup on the address, for the full
* address width--that is, not limited to the given prefix length. As
* a result, at some point we will find either a NULL pointer, or a
* data node with a width that is at least as large as the given prefix
* length. The NULL case is easy: we EXTEND the tree with our new
* entry wherever we ran into the NULL pointer.
*
* If instead we find a sufficiently wide data node, then we see if it
* is a match for the new address. If so, our new data node should
* either be INSERTed between two nodes along the path taken so far, or
 * REPLACE a link node along that path with the new data node.  If it is
 * not a match, then the action to take depends on whether the
* first differing bit falls within the given prefix length: if so, we
* have to BRANCH along the path, using a link node allocated for that
* differing bit; if not, we should use INSERT or REPLACE after all.
*
* As the only exceptional case, we might in fact find an entry for the
* exact same address and prefix length as what is being added. In the
* current design of the routing tree, this is always a failure case.
*/
parent = NULL;
side = 0;
other_entry = NULL;
for (node = tree->rtt_root; node != NULL;
node = node->rtn_child[side]) {
if (node->rtn_type == RTNT_DATA) {
other_entry = (struct rttree_entry *)node;
bit = rttree_diff(tree, other_entry->rte_addr, addr);
match = (bit >= node->rtn_bits);
/* Test whether the exact entry already exists. */
if (match && node->rtn_bits == prefix)
return EEXIST;
/*
* Test the INSERT/REPLACE and BRANCH cases. Note that
* this condition is in a terse, optimized form that
* does not map directly to the two different cases.
*/
if (!match || node->rtn_bits > prefix) {
if (bit > prefix)
bit = prefix;
break;
}
}
parent = node;
side = rttree_side(tree, node, addr);
}
/*
* At this point, addition is going to succeed no matter what. Start
* by initializing part of 'entry'. In particular, add the given
* entry's link node to the list of free link nodes, because the common
* case is that we end up not using it. If we do, we will just take it
* off again right away. The entry's data node will be initialized as
* part of the addition process below.
*/
entry->rte_addr = addr;
entry->rte_mask = mask;
rttree_add_free(tree, &entry->rte_link);
/*
* First deal with the EXTEND case. In that case we already know the
* intended parent and the side (left/right) for the addition.
*/
if (node == NULL) {
assert(parent == NULL || parent->rtn_bits < prefix);
assert(parent == NULL || parent->rtn_child[side] == NULL);
rttree_set(tree, &entry->rte_data, RTNT_DATA, prefix, parent,
side, NULL /*left*/, NULL /*right*/);
return OK;
}
/*
* For the other three cases, we now have to walk back along the path
* we have taken so far in order to find the correct insertion point.
*/
while (parent != NULL && parent->rtn_bits >= bit) {
node = parent;
parent = node->rtn_parent;
}
if (bit == prefix && node->rtn_bits == bit) {
/*
* The REPLACE case. Replace the link node 'node' with our new
* entry. Afterwards, mark the link node as free.
*/
assert(node->rtn_type != RTNT_DATA);
rttree_replace(tree, node, &entry->rte_data, RTNT_DATA);
rttree_add_free(tree, node);
} else if (bit == prefix) {
/*
* The INSERT case. Insert the data node between 'parent' and
* 'node'. Note that 'parent' may be NULL. We need to use the
* address we found earlier, as 'other_entry', to determine
* whether we should add 'node' to the left or right of the
* inserted data node.
*/
assert(node->rtn_bits > bit);
assert(parent == NULL || parent->rtn_bits < bit);
assert(other_entry != NULL);
side = (parent != NULL && parent->rtn_child[1] == node);
side2 = rttree_test(tree, other_entry->rte_addr, bit);
rttree_set(tree, &entry->rte_data, RTNT_DATA, prefix, parent,
side, (!side2) ? node : NULL, (side2) ? node : NULL);
} else {
/*
* The BRANCH case. In this case, it is impossible that we
* find a link node with a bit count equal to the first
* differing bit between the address we found and the address
* we want to insert: if such a node existed, we would have
* descended down its other child during the initial lookup.
*
	 * Interpose a link node between 'parent' and 'node' for bit
* 'bit', with its other child set to point to 'entry'. Again,
* we need to perform an additional bit test here, because even
* though we know that the address we found during the lookup
* differs from the given address at bit 'bit', we do not know
* the value of either bit yet.
*/
assert(bit < prefix);
assert(node->rtn_bits > bit);
assert(parent == NULL || parent->rtn_bits < bit);
link = rttree_get_free(tree);
side = (parent != NULL && parent->rtn_child[1] == node);
side2 = rttree_test(tree, addr, bit);
/* Use NULL for the data node we are about to add. */
rttree_set(tree, link, RTNT_LINK, bit, parent, side,
(side2) ? node : NULL, (!side2) ? node : NULL);
/* This addition will replace the NULL pointer again. */
rttree_set(tree, &entry->rte_data, RTNT_DATA, prefix, link,
side2, NULL /*left*/, NULL /*right*/);
}
return OK;
}
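To make the four addition cases concrete, here is a small worked example; it is illustrative only and not part of the original source:

/*
 * Worked example (8-bit addresses for brevity).  Starting from a tree
 * that contains only 0x00/0 at the root:
 *
 * - adding 0xc0/2 EXTENDs the tree: the lookup falls off the root's
 *   right child (bit 0 of the address is set), so the new data node is
 *   attached there;
 * - adding 0x80/1 next INSERTs: the lookup hits 0xc0/2, and the first
 *   differing bit (bit 1) equals the new prefix length, so 0x80/1 is
 *   placed between 0x00/0 and 0xc0/2;
 * - adding 0x80/2 instead (without 0x80/1 present) BRANCHes: bit 1 now
 *   falls within the new prefix length, so a link node for bit 1 is
 *   interposed above 0xc0/2, with 0x80/2 as its other child;
 * - adding 0x80/1 after that REPLACEs the bit-1 link node with the new
 *   data node, and the link node is freed up again.
 */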
/*
* Remove a particular node 'node' from the routing tree 'tree'. The given
 * node must have zero or one children.  As an integrity check only, if 'nonempty'
* is set, the node must have one child. If the node has one child, that child
* will be linked to the node's parent (or the tree root), thus cutting the
* node itself out of the tree. If the node has zero children, the
* corresponding slot in its parent (or the tree root) will be cleared. The
* function will return a pointer to the parent node if it too qualifies for
* removal afterwards, or NULL if no further removal action needs to be taken.
*/
static struct rttree_node *
rttree_remove(struct rttree * tree, struct rttree_node * node,
int nonempty __unused)
{
struct rttree_node *parent, *child;
unsigned int side;
if ((child = node->rtn_child[0]) == NULL)
child = node->rtn_child[1];
assert(child != NULL || !nonempty);
if ((parent = node->rtn_parent) != NULL) {
side = (parent->rtn_child[1] == node);
parent->rtn_child[side] = child;
if (child != NULL)
child->rtn_parent = parent;
else if (parent->rtn_type == RTNT_LINK)
return parent;
} else {
tree->rtt_root = child;
if (child != NULL)
child->rtn_parent = NULL;
}
return NULL;
}
/*
* Delete the routing entry 'entry' from the routing tree 'tree'. The entry
* must have been added before. This function always succeeds.
*/
void
rttree_delete(struct rttree * tree, struct rttree_entry * entry)
{
struct rttree_node *node, *link;
/*
* Remove the data node from the tree. If the data node also has two
* children, we have to replace it with a link node. Otherwise, we
* have to remove it and, if it has no children at all, possibly remove
* its parent as well.
*/
node = &entry->rte_data;
assert(node->rtn_type == RTNT_DATA);
if (node->rtn_child[0] != NULL && node->rtn_child[1] != NULL) {
/*
* The link node we allocate here may actually be the entry's
* own link node. We do not make an exception for that case
* here, as we have to deal with the entry's link node being in
* use a bit further down anyway.
*/
link = rttree_get_free(tree);
rttree_replace(tree, node, link, RTNT_LINK);
} else {
/*
* Remove the data node from the tree. If the node has no
* children, its removal may leave a link node with one child.
* That would be its original parent. That node must then also
* be removed from the tree, and freed up.
*/
link = rttree_remove(tree, node, FALSE /*nonempty*/);
if (link != NULL) {
(void)rttree_remove(tree, link, TRUE /*nonempty*/);
rttree_add_free(tree, link);
}
}
/*
* Remove the entry's link node from either the tree or the free list,
* depending on the type currently assigned to it. If it has to be
* removed from the tree, it must be replaced with another link node.
* There will always be enough link nodes available for this to work.
*/
node = &entry->rte_link;
if (node->rtn_type == RTNT_LINK) {
link = rttree_get_free(tree);
rttree_replace(tree, node, link, RTNT_LINK);
} else {
assert(node->rtn_type == RTNT_FREE);
rttree_del_free(tree, node);
}
}

minix/net/lwip/rttree.h Normal file

@ -0,0 +1,50 @@
#ifndef MINIX_NET_LWIP_RTTREE_H
#define MINIX_NET_LWIP_RTTREE_H
/* Routing table node structure. */
struct rttree_node {
	struct rttree_node *rtn_child[2];	/* left/right child nodes */
struct rttree_node *rtn_parent; /* parent node */
	uint8_t rtn_type;			/* node type (RTNT_) */
uint8_t rtn_bits; /* prefix bit count */
uint8_t rtn_byte; /* bits-derived byte index */
uint8_t rtn_shift; /* bits-derived shift count */
};
#define RTNT_DATA 0 /* data node (entry) */
#define RTNT_LINK 1 /* link node, in use */
#define RTNT_FREE 2 /* link node, free */
/* Routing table entry structure. */
struct rttree_entry {
struct rttree_node rte_data; /* data node - MUST be first */
struct rttree_node rte_link; /* link node */
const void *rte_addr; /* pointer to address */
const void *rte_mask; /* pointer to mask */
};
/* Routing table structure. */
struct rttree {
struct rttree_node *rtt_root; /* root of the route tree */
struct rttree_node *rtt_free; /* free internal nodes list */
uint8_t rtt_bits; /* number of bits in address */
};
#define rttree_get_addr(entry) ((entry)->rte_addr)
#define rttree_get_mask(entry) ((entry)->rte_mask)
#define rttree_get_prefix(entry) ((entry)->rte_data.rtn_bits)
void rttree_init(struct rttree * tree, unsigned int bits);
struct rttree_entry *rttree_lookup_match(struct rttree * tree,
const void * addr);
struct rttree_entry *rttree_lookup_host(struct rttree * tree,
const void * addr);
struct rttree_entry *rttree_lookup_exact(struct rttree * tree,
const void * addr, unsigned int prefix);
struct rttree_entry *rttree_enum(struct rttree * tree,
struct rttree_entry * entry);
int rttree_add(struct rttree * tree, struct rttree_entry * entry,
const void * addr, const void * mask, unsigned int prefix);
void rttree_delete(struct rttree * tree, struct rttree_entry * entry);
#endif /* !MINIX_NET_LWIP_RTTREE_H */
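As a usage illustration (not part of the commit): a caller embeds struct rttree_entry in a larger object, so that the address and mask storage outlives the entry. The 'struct route' type, its field names, and the function below are hypothetical:

/* Hypothetical caller-side sketch of the rttree API. */
struct route {
	struct rttree_entry r_entry;	/* rttree bookkeeping */
	uint8_t r_addr[4];		/* IPv4 address, long-lived storage */
	uint8_t r_mask[4];		/* normalized network mask */
};

static struct rttree route_tree;

static int
route_add_net10(struct route * route)
{

	rttree_init(&route_tree, 32);	/* 32-bit (IPv4) addresses */

	/* Add 10.0.0.0/8; both buffers must outlive the entry. */
	memcpy(route->r_addr, "\x0a\x00\x00\x00", 4);
	memcpy(route->r_mask, "\xff\x00\x00\x00", 4);
	if (rttree_add(&route_tree, &route->r_entry, route->r_addr,
	    route->r_mask, 8) != OK)
		return EEXIST;

	/* Longest-prefix matching now finds the new entry. */
	assert(rttree_lookup_match(&route_tree, route->r_addr) ==
	    &route->r_entry);
	return OK;
}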

minix/net/lwip/tcpisn.c Normal file

@ -0,0 +1,203 @@
/* LWIP service - tcpisn.c - TCP Initial Sequence Number generation */
/*
* This module implements the TCP ISN algorithm standardized in RFC 6528. It
* currently uses the current time, at clock tick granularity, as source for
* the 4-microsecond timer, and SHA256 as the hashing algorithm. As part of
* the input to the hash function, we use an "ISN secret" that can be set
* through the (hidden, root-only) net.inet.tcp.isn_secret sysctl(7) node.
* Ideally, the secret should remain the same across system reboots; it is left
* up to userland to take care of that.
*
* TODO: while this module provides the strongest possible implementation of
* the algorithm, it is also quite heavyweight. We should consider allowing
* for a more configurable level of strength, perhaps with the possibility for
* less powerful platforms to revert to simple use of a random number.
*/
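For reference, the formula standardized by RFC 6528, restated here as it maps onto this module (a hedged summary, not from the source):

/*
 * RFC 6528: ISN = M + F(localip, localport, remoteip, remoteport, secretkey)
 *
 * In this module, F() is SHA256 over the input block assembled in
 * lwip_hook_tcp_isn() below, truncated to the first 32 bits of output,
 * and M is derived from the wall-clock time at 4-microsecond (that is,
 * 250000 units per second) granularity.
 */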
#include "lwip.h"
#include "tcpisn.h"
#include <sys/sha2.h>
/*
* The TCP ISN hash input consists of the TCP 4-tuple of the new connection and
* a static secret. The 4-tuple consists of two IP addresses, at most 16 bytes
* (128 bits, for IPv6) each, and two port numbers, two bytes (16 bits) each.
* We use the SHA256 input block size of 64 bytes to avoid copying, so that
* leaves us with 28 bytes of room for the static secret. We use 16 bytes, and
* leave the rest blank. As a sidenote, while hardcoding sizes is not nice, we
* really need to get the layout exactly right in this case.
*/
#define TCPISN_TUPLE_LENGTH (16 * 2 + 2 * 2)
#if TCPISN_SECRET_LENGTH > (SHA256_BLOCK_LENGTH - TCPISN_TUPLE_LENGTH)
#error "TCP ISN secret length exceeds remainder of hash block"
#endif
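Concretely, the comment above implies the following layout of the 64-byte input block; this summary, including the byte ranges, is illustrative and not part of the source:

/*
 * Layout of tcpisn_input[] (offsets in bytes):
 *
 *	 0..15	local IP address (IPv4-mapped IPv6 format for IPv4)
 *	16..31	remote IP address
 *	32..33	local TCP port (network byte order)
 *	34..35	remote TCP port (network byte order)
 *	36..51	ISN secret (TCPISN_SECRET_LENGTH bytes)
 *	52..63	zero padding, up to SHA256_BLOCK_LENGTH bytes
 */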
/* We are using memchr() on this, so do not remove the '32' size here! */
static const uint8_t tcpisn_hextab[32] = "0123456789abcdef0123456789ABCDEF";
static uint8_t tcpisn_input[SHA256_BLOCK_LENGTH] __aligned(4);
static int tcpisn_set;
/*
* Initialize the TCP ISN module.
*/
void
tcpisn_init(void)
{
time_t boottime;
/*
* Part of the input to the hash function is kept as is between calls
* to the TCP ISN hook. In particular, we zero the entire input here,
 * so that the padding is zero.  We also zero the area where the secret
 * will be stored, and then copy in the system boot time as a last-resort
 * source of at least some minimal amount of unpredictability.
* The boot time is by no means sufficient though, so issue a warning
* if a TCP ISN is requested before an actual secret is set. Note that
* an actual secret will overwrite the boot time based pseudo-secret.
*/
memset(tcpisn_input, 0, sizeof(tcpisn_input));
(void)getuptime(NULL, NULL, &boottime);
memcpy(&tcpisn_input[TCPISN_TUPLE_LENGTH], &boottime,
sizeof(boottime));
tcpisn_set = FALSE;
}
/*
 * Set and/or retrieve the ISN secret.  In order to allow the secret to be
* set from the command line, this sysctl(7) node is a hex-encoded string.
*/
ssize_t
tcpisn_secret(struct rmib_call * call __unused,
struct rmib_node * node __unused, struct rmib_oldp * oldp,
struct rmib_newp * newp)
{
uint8_t secret[TCPISN_SECRET_HEX_LENGTH], byte, *p;
unsigned int i;
int r;
/* First copy out the old (current) ISN secret. */
if (oldp != NULL) {
for (i = 0; i < TCPISN_SECRET_LENGTH; i++) {
byte = tcpisn_input[TCPISN_TUPLE_LENGTH + i];
secret[i * 2] = tcpisn_hextab[byte >> 4];
secret[i * 2 + 1] = tcpisn_hextab[byte & 0xf];
}
secret[i * 2] = '\0';
assert(i * 2 + 1 == sizeof(secret));
if ((r = rmib_copyout(oldp, 0, secret, sizeof(secret))) < 0)
return r;
}
/*
* Then copy in the new ISN secret. We require the given string to be
* exactly as large as we need.
*/
if (newp != NULL) {
/* Copy in the user-given string. */
if ((r = rmib_copyin(newp, secret, sizeof(secret))) != OK)
return r;
		if (secret[sizeof(secret) - 1] != '\0')
return EINVAL;
/* Hex-decode the given string (in place). */
for (i = 0; i < TCPISN_SECRET_LENGTH; i++) {
if ((p = memchr(tcpisn_hextab, secret[i * 2],
sizeof(tcpisn_hextab))) == NULL)
return EINVAL;
secret[i] = ((uint8_t)(p - tcpisn_hextab) & 0xf) << 4;
if ((p = memchr(tcpisn_hextab, secret[i * 2 + 1],
sizeof(tcpisn_hextab))) == NULL)
return EINVAL;
secret[i] |= (uint8_t)(p - tcpisn_hextab) & 0xf;
}
/* Once fully validated, switch to the new secret. */
memcpy(&tcpisn_input[TCPISN_TUPLE_LENGTH], secret,
TCPISN_SECRET_LENGTH);
tcpisn_set = TRUE;
}
/* Return the length of the node. */
return sizeof(secret);
}
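A short encoding example may help; the byte values are arbitrary:

/*
 * Example: a secret whose first bytes are 0x01 0x23 0x45 is exchanged as
 * the string "012345..." of exactly 32 hexadecimal digits plus a '\0'
 * terminator.  Thanks to the doubled tcpisn_hextab[] above, both lower
 * and upper case digits are accepted on input; copy-out always produces
 * lower case.
 */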
/*
* Hook to generate an Initial Sequence Number (ISN) for a new TCP connection.
*/
uint32_t
lwip_hook_tcp_isn(const ip_addr_t * local_ip, uint16_t local_port,
const ip_addr_t * remote_ip, uint16_t remote_port)
{
uint8_t output[SHA256_DIGEST_LENGTH] __aligned(4);
SHA256_CTX ctx;
clock_t realtime;
time_t boottime;
uint32_t isn;
if (!tcpisn_set) {
printf("LWIP: warning, no TCP ISN secret has been set\n");
tcpisn_set = TRUE; /* print the warning only once */
}
if (IP_IS_V6(local_ip)) {
assert(IP_IS_V6(remote_ip));
memcpy(&tcpisn_input[0], &ip_2_ip6(local_ip)->addr, 16);
memcpy(&tcpisn_input[16], &ip_2_ip6(remote_ip)->addr, 16);
} else {
assert(IP_IS_V4(local_ip));
assert(IP_IS_V4(remote_ip));
/*
* Store IPv4 addresses as IPv4-mapped IPv6 addresses, even
* though lwIP will never give us an IPv4-mapped IPv6 address,
* so as to ensure completely disjoint address spaces and thus
* no potential abuse of IPv6 addresses in order to predict
* ISNs for IPv4 connections.
*/
memset(&tcpisn_input[0], 0, 10);
tcpisn_input[10] = 0xff;
tcpisn_input[11] = 0xff;
memcpy(&tcpisn_input[12], &ip_2_ip4(local_ip)->addr, 4);
memset(&tcpisn_input[16], 0, 10);
tcpisn_input[26] = 0xff;
tcpisn_input[27] = 0xff;
		memcpy(&tcpisn_input[28], &ip_2_ip4(remote_ip)->addr, 4);
}
tcpisn_input[32] = local_port >> 8;
tcpisn_input[33] = local_port & 0xff;
tcpisn_input[34] = remote_port >> 8;
tcpisn_input[35] = remote_port & 0xff;
/* The rest of the input (secret and padding) is already filled in. */
SHA256_Init(&ctx); /* this call zeroes a buffer we don't use.. */
SHA256_Update(&ctx, tcpisn_input, sizeof(tcpisn_input));
SHA256_Final(output, &ctx);
/* Arbitrarily take the first 32 bits from the generated hash. */
memcpy(&isn, output, sizeof(isn));
/*
* Add the current time in 4-microsecond units. The time value should
* be wall-clock accurate and stable even across system reboots and
* downtime. Do not precompute the boot time part: it may change.
*/
(void)getuptime(NULL, &realtime, &boottime);
isn += (uint32_t)boottime * 250000;
isn += (uint32_t)(((uint64_t)realtime * 250000) / sys_hz());
/* The result is the ISN to use for this connection. */
return isn;
}

minix/net/lwip/tcpisn.h Normal file

@ -0,0 +1,20 @@
#ifndef MINIX_NET_LWIP_TCPISN_H
#define MINIX_NET_LWIP_TCPISN_H
/*
* Length, in bytes, of the secret (random seed) that is used as part of the
* input to the hashing function that generates TCP Initial Sequence Numbers.
*/
#define TCPISN_SECRET_LENGTH 16
/*
* Size of the hexadecimal-string representation of the secret, including
* trailing null terminator.
*/
#define TCPISN_SECRET_HEX_LENGTH (TCPISN_SECRET_LENGTH * 2 + 1)
void tcpisn_init(void);
ssize_t tcpisn_secret(struct rmib_call * call, struct rmib_node * node,
struct rmib_oldp * oldp, struct rmib_newp * newp);
#endif /* !MINIX_NET_LWIP_TCPISN_H */

minix/net/lwip/tcpsock.c Normal file
(file diff suppressed because it is too large)

minix/net/lwip/udpsock.c Normal file

@ -0,0 +1,997 @@
/* LWIP service - udpsock.c - UDP sockets */
#include "lwip.h"
#include "ifaddr.h"
#include "pktsock.h"
#include "lwip/udp.h"
#include <netinet/udp.h>
#include <netinet/ip_var.h>
#include <netinet/udp_var.h>
/* The number of UDP sockets. Inherited from the lwIP configuration. */
#define NR_UDPSOCK MEMP_NUM_UDP_PCB
/*
* Outgoing packets are not getting buffered, so the send buffer size simply
* determines the maximum size for sent packets. The send buffer maximum is
* therefore limited to the maximum size of a single packet (64K-1 bytes),
* which is already enforced by lwIP's 16-bit length parameter to pbuf_alloc().
*
* The actual transmission may enforce a lower limit, though. The full packet
* size must not exceed the same 64K-1 limit, and that includes any headers
* that still have to be prepended to the given packet. The size of those
* headers depends on the socket type (IPv4/IPv6) and the IP_HDRINCL setting.
*/
#define UDP_MAX_PAYLOAD (UINT16_MAX)
#define UDP_SNDBUF_MIN 1 /* minimum UDP send buffer size */
#define UDP_SNDBUF_DEF 8192 /* default UDP send buffer size */
#define UDP_SNDBUF_MAX UDP_MAX_PAYLOAD /* maximum UDP send buffer size */
#define UDP_RCVBUF_MIN MEMPOOL_BUFSIZE /* minimum UDP receive buffer size */
#define UDP_RCVBUF_DEF 32768 /* default UDP receive buffer size */
#define UDP_RCVBUF_MAX 65536 /* maximum UDP receive buffer size */
static struct udpsock {
struct pktsock udp_pktsock; /* pkt socket, MUST be first */
struct udp_pcb *udp_pcb; /* lwIP UDP control block */
SIMPLEQ_ENTRY(udpsock) udp_next; /* next in free list */
} udp_array[NR_UDPSOCK];
static SIMPLEQ_HEAD(, udpsock) udp_freelist; /* list of free UDP sockets */
static const struct sockevent_ops udpsock_ops;
#define udpsock_get_sock(udp) (ipsock_get_sock(udpsock_get_ipsock(udp)))
#define udpsock_get_ipsock(udp) (pktsock_get_ipsock(&(udp)->udp_pktsock))
#define udpsock_is_ipv6(udp) (ipsock_is_ipv6(udpsock_get_ipsock(udp)))
#define udpsock_is_conn(udp) \
(udp_flags((udp)->udp_pcb) & UDP_FLAGS_CONNECTED)
static ssize_t udpsock_pcblist(struct rmib_call *, struct rmib_node *,
struct rmib_oldp *, struct rmib_newp *);
/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_UDP subtree. */
/* TODO: add many more and make some of them writable.. */
static struct rmib_node net_inet_udp_table[] = {
/* 1*/ [UDPCTL_CHECKSUM] = RMIB_INT(RMIB_RO, 1, "checksum",
"Compute UDP checksums"),
/* 2*/ [UDPCTL_SENDSPACE] = RMIB_INT(RMIB_RO, UDP_SNDBUF_DEF,
"sendspace",
"Default UDP send buffer size"),
/* 3*/ [UDPCTL_RECVSPACE] = RMIB_INT(RMIB_RO, UDP_RCVBUF_DEF,
"recvspace",
"Default UDP receive buffer size"),
/* 4*/ [UDPCTL_LOOPBACKCKSUM] = RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int),
loopif_cksum, "do_loopback_cksum",
"Perform UDP checksum on loopback"),
/*+0*/ [UDPCTL_MAXID] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0,
udpsock_pcblist, "pcblist",
"UDP protocol control block list"),
};
static struct rmib_node net_inet_udp_node =
RMIB_NODE(RMIB_RO, net_inet_udp_table, "udp", "UDPv4 related settings");
static struct rmib_node net_inet6_udp6_node =
RMIB_NODE(RMIB_RO, net_inet_udp_table, "udp6", "UDPv6 related settings");
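Once registered by udpsock_init() below, the table surfaces as the following sysctl(7) nodes; this summary is illustrative:

/*
 * Resulting sysctl(7) nodes:
 *
 *	net.inet.udp.checksum          = 1     (read-only)
 *	net.inet.udp.sendspace         = 8192  (UDP_SNDBUF_DEF, read-only)
 *	net.inet.udp.recvspace         = 32768 (UDP_RCVBUF_DEF, read-only)
 *	net.inet.udp.do_loopback_cksum         (read-write)
 *	net.inet.udp.pcblist                   (read-only)
 *
 * with the same set repeated under net.inet6.udp6.
 */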
/*
* Initialize the UDP sockets module.
*/
void
udpsock_init(void)
{
unsigned int slot;
/* Initialize the list of free UDP sockets. */
SIMPLEQ_INIT(&udp_freelist);
for (slot = 0; slot < __arraycount(udp_array); slot++)
SIMPLEQ_INSERT_TAIL(&udp_freelist, &udp_array[slot], udp_next);
/* Register the net.inet.udp and net.inet6.udp6 RMIB subtrees. */
mibtree_register_inet(PF_INET, IPPROTO_UDP, &net_inet_udp_node);
mibtree_register_inet(PF_INET6, IPPROTO_UDP, &net_inet6_udp6_node);
}
/*
* A packet has arrived on a UDP socket. We own the given packet buffer, and
* so we must free it if we do not want to keep it.
*/
static void
udpsock_input(void * arg, struct udp_pcb * pcb __unused, struct pbuf * pbuf,
const ip_addr_t * ipaddr, uint16_t port)
{
struct udpsock *udp = (struct udpsock *)arg;
/* All UDP input processing is handled by pktsock. */
pktsock_input(&udp->udp_pktsock, pbuf, ipaddr, port);
}
/*
* Create a UDP socket.
*/
sockid_t
udpsock_socket(int domain, int protocol, struct sock ** sockp,
const struct sockevent_ops ** ops)
{
struct udpsock *udp;
unsigned int flags;
uint8_t ip_type;
switch (protocol) {
case 0:
case IPPROTO_UDP:
break;
/* NetBSD does not support IPPROTO_UDPLITE, even though lwIP does. */
default:
return EPROTONOSUPPORT;
}
if (SIMPLEQ_EMPTY(&udp_freelist))
return ENOBUFS;
udp = SIMPLEQ_FIRST(&udp_freelist);
ip_type = pktsock_socket(&udp->udp_pktsock, domain, UDP_SNDBUF_DEF,
UDP_RCVBUF_DEF, sockp);
/* We should have enough PCBs so this call should not fail.. */
if ((udp->udp_pcb = udp_new_ip_type(ip_type)) == NULL)
return ENOBUFS;
udp_recv(udp->udp_pcb, udpsock_input, (void *)udp);
/* By default, the multicast TTL is 1 and looping is enabled. */
udp_set_multicast_ttl(udp->udp_pcb, 1);
flags = udp_flags(udp->udp_pcb);
udp_setflags(udp->udp_pcb, flags | UDP_FLAGS_MULTICAST_LOOP);
SIMPLEQ_REMOVE_HEAD(&udp_freelist, udp_next);
*ops = &udpsock_ops;
return SOCKID_UDP | (sockid_t)(udp - udp_array);
}
/*
* Bind a UDP socket to a local address.
*/
static int
udpsock_bind(struct sock * sock, const struct sockaddr * addr,
socklen_t addr_len, endpoint_t user_endpt)
{
struct udpsock *udp = (struct udpsock *)sock;
ip_addr_t ipaddr;
uint16_t port;
err_t err;
int r;
if ((r = ipsock_get_src_addr(udpsock_get_ipsock(udp), addr, addr_len,
user_endpt, &udp->udp_pcb->local_ip, udp->udp_pcb->local_port,
TRUE /*allow_mcast*/, &ipaddr, &port)) != OK)
return r;
err = udp_bind(udp->udp_pcb, &ipaddr, port);
return util_convert_err(err);
}
/*
* Connect a UDP socket to a remote address.
*/
static int
udpsock_connect(struct sock * sock, const struct sockaddr * addr,
socklen_t addr_len, endpoint_t user_endpt __unused)
{
struct udpsock *udp = (struct udpsock *)sock;
struct ifdev *ifdev;
const ip_addr_t *src_addr;
ip_addr_t dst_addr;
uint16_t dst_port;
uint32_t ifindex, ifindex2;
err_t err;
int r;
/*
	 * One may "unconnect" a socket by providing an address with family
* AF_UNSPEC. Providing an <any>:0 address does not achieve the same.
*/
if (addr_is_unspec(addr, addr_len)) {
udp_disconnect(udp->udp_pcb);
return OK;
}
if ((r = ipsock_get_dst_addr(udpsock_get_ipsock(udp), addr,
addr_len, &udp->udp_pcb->local_ip, &dst_addr, &dst_port)) != OK)
return r;
/*
* Bind explicitly to a source address if the PCB is not bound to one
* yet. This is expected in the BSD socket API, but lwIP does not do
* it for us.
*/
if (ip_addr_isany(&udp->udp_pcb->local_ip)) {
/* Help the multicast case a bit, if possible. */
ifdev = NULL;
if (ip_addr_ismulticast(&dst_addr)) {
ifindex = pktsock_get_ifindex(&udp->udp_pktsock);
ifindex2 = udp_get_multicast_netif_index(udp->udp_pcb);
if (ifindex == 0)
ifindex = ifindex2;
if (ifindex != 0) {
ifdev = ifdev_get_by_index(ifindex);
if (ifdev == NULL)
return ENXIO;
}
}
src_addr = ifaddr_select(&dst_addr, ifdev, NULL /*ifdevp*/);
if (src_addr == NULL)
return EHOSTUNREACH;
err = udp_bind(udp->udp_pcb, src_addr,
udp->udp_pcb->local_port);
if (err != ERR_OK)
return util_convert_err(err);
}
/*
* Connecting a UDP socket serves two main purposes: 1) the socket uses
* the address as destination when sending, and 2) the socket receives
* packets from only the connected address.
*/
err = udp_connect(udp->udp_pcb, &dst_addr, dst_port);
if (err != ERR_OK)
return util_convert_err(err);
return OK;
}
/*
* Perform preliminary checks on a send request.
*/
static int
udpsock_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused,
const struct sockaddr * addr, socklen_t addr_len __unused,
endpoint_t user_endpt __unused, int flags)
{
struct udpsock *udp = (struct udpsock *)sock;
if ((flags & ~MSG_DONTROUTE) != 0)
return EOPNOTSUPP;
if (!udpsock_is_conn(udp) && addr == NULL)
return EDESTADDRREQ;
/*
* This is only one part of the length check. The rest is done from
* udpsock_send(), once we have more information.
*/
if (len > ipsock_get_sndbuf(udpsock_get_ipsock(udp)))
return EMSGSIZE;
return OK;
}
/*
* Swap IP-level options between the UDP PCB and the packet options structure,
* for all options that have their flag set in the packet options structure.
* This function is called twice when sending a packet. The result is that the
* flagged options are overridden for only the packet being sent.
*/
static void
udpsock_swap_opt(struct udpsock * udp, struct pktopt * pkto)
{
uint8_t tos, ttl, mcast_ttl;
if (pkto->pkto_flags & PKTOF_TOS) {
tos = udp->udp_pcb->tos;
udp->udp_pcb->tos = pkto->pkto_tos;
pkto->pkto_tos = tos;
}
if (pkto->pkto_flags & PKTOF_TTL) {
ttl = udp->udp_pcb->ttl;
mcast_ttl = udp_get_multicast_ttl(udp->udp_pcb);
udp->udp_pcb->ttl = pkto->pkto_ttl;
udp_set_multicast_ttl(udp->udp_pcb, pkto->pkto_mcast_ttl);
pkto->pkto_ttl = ttl;
pkto->pkto_mcast_ttl = mcast_ttl;
}
}
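The intended call pattern is symmetric, as udpsock_send() below demonstrates:

/*
 * Intended call pattern (taken from udpsock_send() below):
 *
 *	udpsock_swap_opt(udp, &pktopt);
 *	err = udp_sendto_if_src(udp->udp_pcb, pbuf, ...);
 *	udpsock_swap_opt(udp, &pktopt);
 *
 * The first swap applies the per-packet overrides to the PCB; the second
 * swap restores the PCB's original values from the same structure.
 */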
/*
* Send a packet on a UDP socket.
*/
static int
udpsock_send(struct sock * sock, const struct sockdriver_data * data,
size_t len, size_t * off, const struct sockdriver_data * ctl,
socklen_t ctl_len, socklen_t * ctl_off __unused,
const struct sockaddr * addr, socklen_t addr_len,
endpoint_t user_endpt __unused, int flags, size_t min __unused)
{
struct udpsock *udp = (struct udpsock *)sock;
struct pktopt pktopt;
struct pbuf *pbuf;
struct ifdev *ifdev;
struct netif *netif;
const ip_addr_t *src_addrp, *dst_addrp;
ip_addr_t src_addr, dst_addr; /* for storage only; not always used! */
uint16_t dst_port;
uint32_t ifindex;
size_t hdrlen;
err_t err;
int r;
/* Copy in and parse any packet options. */
pktopt.pkto_flags = 0;
if ((r = pktsock_get_ctl(&udp->udp_pktsock, ctl, ctl_len,
&pktopt)) != OK)
return r;
/*
* The code below will both determine an outgoing interface and a
* source address for the packet. Even though lwIP could do this for
* us in some cases, there are other cases where we must do so
	 * ourselves, for two main reasons: 1) the possibility that either or
* both have been provided through IPV6_PKTINFO, and 2) our intent to
* detect and stop zone violations for (combinations of) scoped IPv6
* addresses. As a result, it is easier to simply take over the
	 * selection tasks from lwIP in their entirety.
*
* Much of the same applies to rawsock_send() as well. Functional
* differences (e.g. IP_HDRINCL support) as well as the PCB accesses in
* the code make it hard to merge the two into a single pktsock copy.
* Please do keep the two in sync as much as possible.
*/
/*
* Start by checking whether the source address and/or the outgoing
* interface are overridden using sticky and/or ancillary options. The
* call to pktsock_get_pktinfo(), if successful, will either set
* 'ifdev' to NULL, in which case there is no override, or it will set
* 'ifdev' to the outgoing interface to use, and (only) in that case
* also fill 'src_addr', with an address that may either be a locally
* owned unicast address or the unspecified ('any') address. If it is
* a unicast address, that is the source address to use for the packet.
* Otherwise, fall back to the address to which the socket is bound,
* which may also be the unspecified address or even a multicast
	 * address.  In those cases we will pick a source address further below.
*/
if ((r = pktsock_get_pktinfo(&udp->udp_pktsock, &pktopt, &ifdev,
&src_addr)) != OK)
return r;
if (ifdev != NULL && !ip_addr_isany(&src_addr)) {
/* This is guaranteed to be a proper local unicast address. */
src_addrp = &src_addr;
} else {
src_addrp = &udp->udp_pcb->local_ip;
/*
* If the socket is bound to a multicast address, use the
* unspecified ('any') address as source address instead, until
* we select a real source address (further below). This
* substitution keeps the rest of the code a bit simpler.
*/
if (ip_addr_ismulticast(src_addrp))
src_addrp = IP46_ADDR_ANY(IP_GET_TYPE(src_addrp));
}
/*
* Determine the destination address to use. If the socket is
* connected, always ignore any address provided in the send call.
*/
if (!udpsock_is_conn(udp)) {
assert(addr != NULL); /* already checked in pre_send */
if ((r = ipsock_get_dst_addr(udpsock_get_ipsock(udp), addr,
addr_len, src_addrp, &dst_addr, &dst_port)) != OK)
return r;
dst_addrp = &dst_addr;
} else {
dst_addrp = &udp->udp_pcb->remote_ip;
dst_port = udp->udp_pcb->remote_port;
}
/*
* If the destination is a multicast address, select the outgoing
* interface based on the multicast interface index, if one is set.
* This must be done here in order to allow the code further below to
* detect zone violations, because if we leave this selection to lwIP,
* it will not perform zone violation detection at all. Also note that
* this case must *not* override an interface index already specified
* using IPV6_PKTINFO, as per RFC 3542 Sec. 6.7.
*/
if (ifdev == NULL && ip_addr_ismulticast(dst_addrp)) {
ifindex = udp_get_multicast_netif_index(udp->udp_pcb);
if (ifindex != NETIF_NO_INDEX)
ifdev = ifdev_get_by_index(ifindex); /* (may fail) */
}
/*
	 * If an interface has already been determined by now, the send operation
* will bypass routing. In that case, we must perform our own checks
* on address zone violations, because those will not be made anywhere
* else. Subsequent steps below will never introduce violations.
*/
if (ifdev != NULL && IP_IS_V6(dst_addrp)) {
if (ifaddr_is_zone_mismatch(ip_2_ip6(dst_addrp), ifdev))
return EHOSTUNREACH;
if (IP_IS_V6(src_addrp) &&
ifaddr_is_zone_mismatch(ip_2_ip6(src_addrp), ifdev))
return EHOSTUNREACH;
}
/*
* If we do not yet have an interface at this point, perform a route
* lookup to determine the outgoing interface. Unless MSG_DONTROUTE is
* set (which covers SO_DONTROUTE as well), in which case we look for a
* local subnet that matches the destination address.
*/
if (ifdev == NULL) {
if (!(flags & MSG_DONTROUTE)) {
/*
* ip_route() should never be called with an
* IPADDR_TYPE_ANY type address. This is a lwIP-
* internal requirement; while we override both routing
* functions, we do not deviate from it.
*/
if (IP_IS_ANY_TYPE_VAL(*src_addrp))
src_addrp =
IP46_ADDR_ANY(IP_GET_TYPE(dst_addrp));
/* Perform the route lookup. */
if ((netif = ip_route(src_addrp, dst_addrp)) == NULL)
return EHOSTUNREACH;
ifdev = netif_get_ifdev(netif);
} else {
if ((ifdev = ifaddr_map_by_subnet(dst_addrp)) == NULL)
return EHOSTUNREACH;
}
}
/*
* At this point we have an outgoing interface. If we do not have a
* source address yet, pick one now.
*/
assert(ifdev != NULL);
if (ip_addr_isany(src_addrp)) {
src_addrp = ifaddr_select(dst_addrp, ifdev, NULL /*ifdevp*/);
if (src_addrp == NULL)
return EHOSTUNREACH;
}
/*
* Now that we know the full conditions of what we are about to send,
* check whether the packet size leaves enough room for lwIP to prepend
* headers. If so, allocate a chain of pbufs for the packet.
*/
assert(len <= UDP_MAX_PAYLOAD);
if (IP_IS_V6(dst_addrp))
hdrlen = IP6_HLEN + UDP_HLEN;
else
hdrlen = IP_HLEN + UDP_HLEN;
if (hdrlen + len > UDP_MAX_PAYLOAD)
return EMSGSIZE;
if ((pbuf = pchain_alloc(PBUF_TRANSPORT, len)) == NULL)
return ENOBUFS;
/* Copy in the packet data. */
if ((r = pktsock_get_data(&udp->udp_pktsock, data, len, pbuf)) != OK) {
pbuf_free(pbuf);
return r;
}
/*
* Set broadcast/multicast flags for accounting purposes. Only the
* multicast flag is used for output accounting, but for loopback
* traffic, both flags are copied and used for input accounting and
* setting MSG_MCAST/MSG_BCAST.
*/
if (ip_addr_ismulticast(dst_addrp))
pbuf->flags |= PBUF_FLAG_LLMCAST;
else if (ip_addr_isbroadcast(dst_addrp, ifdev_get_netif(ifdev)))
pbuf->flags |= PBUF_FLAG_LLBCAST;
/* Send the packet. */
udpsock_swap_opt(udp, &pktopt);
assert(!ip_addr_isany(src_addrp));
assert(!ip_addr_ismulticast(src_addrp));
err = udp_sendto_if_src(udp->udp_pcb, pbuf, dst_addrp, dst_port,
ifdev_get_netif(ifdev), src_addrp);
udpsock_swap_opt(udp, &pktopt);
/* Free the pbuf, as a copy has been made. */
pbuf_free(pbuf);
/*
* On success, make sure to return the size of the sent packet as well.
* As an aside: ctl_off need not be updated, as it is not returned.
*/
if ((r = util_convert_err(err)) == OK)
*off = len;
return r;
}
/*
* Update the set of flag-type socket options on a UDP socket.
*/
static void
udpsock_setsockmask(struct sock * sock, unsigned int mask)
{
struct udpsock *udp = (struct udpsock *)sock;
if (mask & SO_REUSEADDR)
ip_set_option(udp->udp_pcb, SOF_REUSEADDR);
else
ip_reset_option(udp->udp_pcb, SOF_REUSEADDR);
if (mask & SO_BROADCAST)
ip_set_option(udp->udp_pcb, SOF_BROADCAST);
else
ip_reset_option(udp->udp_pcb, SOF_BROADCAST);
}
/*
* Prepare a helper structure for IP-level option processing.
*/
static void
udpsock_get_ipopts(struct udpsock * udp, struct ipopts * ipopts)
{
ipopts->local_ip = &udp->udp_pcb->local_ip;
ipopts->remote_ip = &udp->udp_pcb->remote_ip;
ipopts->tos = &udp->udp_pcb->tos;
ipopts->ttl = &udp->udp_pcb->ttl;
ipopts->sndmin = UDP_SNDBUF_MIN;
ipopts->sndmax = UDP_SNDBUF_MAX;
ipopts->rcvmin = UDP_RCVBUF_MIN;
ipopts->rcvmax = UDP_RCVBUF_MAX;
}
/*
* Set socket options on a UDP socket.
*/
static int
udpsock_setsockopt(struct sock * sock, int level, int name,
const struct sockdriver_data * data, socklen_t len)
{
struct udpsock *udp = (struct udpsock *)sock;
struct ipopts ipopts;
ip_addr_t ipaddr;
struct in_addr in_addr;
struct ifdev *ifdev;
unsigned int flags;
uint32_t ifindex;
uint8_t byte;
int r, val;
/*
* Unfortunately, we have to duplicate most of the multicast options
* rather than sharing them with rawsock at the pktsock level. The
	 * reason is that each of the PCBs has its own multicast abstraction
* functions and so we cannot merge the rest. Same for getsockopt.
*/
switch (level) {
case IPPROTO_IP:
if (udpsock_is_ipv6(udp))
break;
switch (name) {
case IP_MULTICAST_IF:
pktsock_set_mcaware(&udp->udp_pktsock);
if ((r = sockdriver_copyin_opt(data, &in_addr,
sizeof(in_addr), len)) != OK)
return r;
ip_addr_set_ip4_u32(&ipaddr, in_addr.s_addr);
if ((ifdev = ifaddr_map_by_addr(&ipaddr)) == NULL)
return EADDRNOTAVAIL;
udp_set_multicast_netif_index(udp->udp_pcb,
ifdev_get_index(ifdev));
return OK;
case IP_MULTICAST_LOOP:
pktsock_set_mcaware(&udp->udp_pktsock);
if ((r = sockdriver_copyin_opt(data, &byte,
sizeof(byte), len)) != OK)
return r;
flags = udp_flags(udp->udp_pcb);
if (byte)
flags |= UDP_FLAGS_MULTICAST_LOOP;
else
flags &= ~UDP_FLAGS_MULTICAST_LOOP;
udp_setflags(udp->udp_pcb, flags);
return OK;
case IP_MULTICAST_TTL:
pktsock_set_mcaware(&udp->udp_pktsock);
if ((r = sockdriver_copyin_opt(data, &byte,
sizeof(byte), len)) != OK)
return r;
udp_set_multicast_ttl(udp->udp_pcb, byte);
return OK;
}
break;
case IPPROTO_IPV6:
if (!udpsock_is_ipv6(udp))
break;
switch (name) {
case IPV6_MULTICAST_IF:
pktsock_set_mcaware(&udp->udp_pktsock);
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val != 0) {
ifindex = (uint32_t)val;
ifdev = ifdev_get_by_index(ifindex);
if (ifdev == NULL)
return ENXIO;
} else
ifindex = NETIF_NO_INDEX;
udp_set_multicast_netif_index(udp->udp_pcb, ifindex);
return OK;
case IPV6_MULTICAST_LOOP:
pktsock_set_mcaware(&udp->udp_pktsock);
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val < 0 || val > 1)
return EINVAL;
flags = udp_flags(udp->udp_pcb);
if (val)
flags |= UDP_FLAGS_MULTICAST_LOOP;
else
flags &= ~UDP_FLAGS_MULTICAST_LOOP;
/*
* lwIP's IPv6 functionality does not actually check
* this flag at all yet. We set it in the hope that
* one day this will magically start working.
*/
udp_setflags(udp->udp_pcb, flags);
return OK;
case IPV6_MULTICAST_HOPS:
pktsock_set_mcaware(&udp->udp_pktsock);
if ((r = sockdriver_copyin_opt(data, &val, sizeof(val),
len)) != OK)
return r;
if (val < -1 || val > UINT8_MAX)
return EINVAL;
if (val == -1)
val = 1;
udp_set_multicast_ttl(udp->udp_pcb, val);
return OK;
}
break;
}
/* Handle all other options at the packet or IP level. */
udpsock_get_ipopts(udp, &ipopts);
return pktsock_setsockopt(&udp->udp_pktsock, level, name, data, len,
&ipopts);
}
/*
* Retrieve socket options on a UDP socket.
*/
static int
udpsock_getsockopt(struct sock * sock, int level, int name,
const struct sockdriver_data * data, socklen_t * len)
{
struct udpsock *udp = (struct udpsock *)sock;
struct ipopts ipopts;
const ip4_addr_t *ip4addr;
struct in_addr in_addr;
struct ifdev *ifdev;
unsigned int flags;
uint32_t ifindex;
uint8_t byte;
int val;
switch (level) {
case IPPROTO_IP:
if (udpsock_is_ipv6(udp))
break;
switch (name) {
case IP_MULTICAST_IF:
ifindex = udp_get_multicast_netif_index(udp->udp_pcb);
/*
* Map back from the interface index to the IPv4
* address assigned to the corresponding interface.
* Should this not work out, return the 'any' address.
*/
if (ifindex != NETIF_NO_INDEX &&
(ifdev = ifdev_get_by_index(ifindex)) != NULL) {
ip4addr =
netif_ip4_addr(ifdev_get_netif(ifdev));
in_addr.s_addr = ip4_addr_get_u32(ip4addr);
} else
in_addr.s_addr = PP_HTONL(INADDR_ANY);
return sockdriver_copyout_opt(data, &in_addr,
sizeof(in_addr), len);
case IP_MULTICAST_LOOP:
flags = udp_flags(udp->udp_pcb);
byte = !!(flags & UDP_FLAGS_MULTICAST_LOOP);
return sockdriver_copyout_opt(data, &byte,
sizeof(byte), len);
case IP_MULTICAST_TTL:
byte = udp_get_multicast_ttl(udp->udp_pcb);
return sockdriver_copyout_opt(data, &byte,
sizeof(byte), len);
}
break;
case IPPROTO_IPV6:
if (!udpsock_is_ipv6(udp))
break;
switch (name) {
case IPV6_MULTICAST_IF:
ifindex = udp_get_multicast_netif_index(udp->udp_pcb);
val = (int)ifindex;
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case IPV6_MULTICAST_LOOP:
flags = udp_flags(udp->udp_pcb);
val = !!(flags & UDP_FLAGS_MULTICAST_LOOP);
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
case IPV6_MULTICAST_HOPS:
val = udp_get_multicast_ttl(udp->udp_pcb);
return sockdriver_copyout_opt(data, &val, sizeof(val),
len);
}
break;
}
/* Handle all other options at the packet or IP level. */
udpsock_get_ipopts(udp, &ipopts);
return pktsock_getsockopt(&udp->udp_pktsock, level, name, data, len,
&ipopts);
}
/*
* Retrieve the local socket address of a UDP socket.
*/
static int
udpsock_getsockname(struct sock * sock, struct sockaddr * addr,
socklen_t * addr_len)
{
struct udpsock *udp = (struct udpsock *)sock;
ipsock_put_addr(udpsock_get_ipsock(udp), addr, addr_len,
&udp->udp_pcb->local_ip, udp->udp_pcb->local_port);
return OK;
}
/*
* Retrieve the remote socket address of a UDP socket.
*/
static int
udpsock_getpeername(struct sock * sock, struct sockaddr * addr,
socklen_t * addr_len)
{
struct udpsock *udp = (struct udpsock *)sock;
if (!udpsock_is_conn(udp))
return ENOTCONN;
ipsock_put_addr(udpsock_get_ipsock(udp), addr, addr_len,
&udp->udp_pcb->remote_ip, udp->udp_pcb->remote_port);
return OK;
}
/*
* Shut down a UDP socket for reading and/or writing.
*/
static int
udpsock_shutdown(struct sock * sock, unsigned int mask)
{
struct udpsock *udp = (struct udpsock *)sock;
if (mask & SFL_SHUT_RD)
udp_recv(udp->udp_pcb, NULL, NULL);
pktsock_shutdown(&udp->udp_pktsock, mask);
return OK;
}
/*
* Close a UDP socket.
*/
static int
udpsock_close(struct sock * sock, int force __unused)
{
struct udpsock *udp = (struct udpsock *)sock;
udp_recv(udp->udp_pcb, NULL, NULL);
udp_remove(udp->udp_pcb);
udp->udp_pcb = NULL;
pktsock_close(&udp->udp_pktsock);
return OK;
}
/*
* Free up a closed UDP socket.
*/
static void
udpsock_free(struct sock * sock)
{
struct udpsock *udp = (struct udpsock *)sock;
assert(udp->udp_pcb == NULL);
SIMPLEQ_INSERT_HEAD(&udp_freelist, udp, udp_next);
}
/*
* Fill the given kinfo_pcb sysctl(7) structure with information about the UDP
* PCB identified by the given pointer.
*/
static void
udpsock_get_info(struct kinfo_pcb * ki, const void * ptr)
{
const struct udp_pcb *pcb = (const struct udp_pcb *)ptr;
struct udpsock *udp;
ki->ki_type = SOCK_DGRAM;
/*
* All UDP sockets should be created by this module, but protect
	 * ourselves against the case that this is not true anyway.
*/
if (pcb->recv_arg != NULL) {
udp = (struct udpsock *)pcb->recv_arg;
assert(udp >= udp_array &&
udp < &udp_array[__arraycount(udp_array)]);
} else
udp = NULL;
ipsock_get_info(ki, &pcb->local_ip, pcb->local_port, &pcb->remote_ip,
pcb->remote_port);
if (udp != NULL) {
/* TODO: change this so that sockstat(1) may work one day. */
ki->ki_sockaddr = (uint64_t)(uintptr_t)udpsock_get_sock(udp);
ki->ki_rcvq = pktsock_get_recvlen(&udp->udp_pktsock);
}
}
/*
* Given either NULL or a previously returned UDP PCB pointer, return the first
* or next UDP PCB pointer, or NULL if there are no more. Skip UDP PCBs that
* are not bound to an address, as there is no use reporting them.
*/
static const void *
udpsock_enum(const void * last)
{
const struct udp_pcb *pcb;
if (last != NULL)
pcb = (const void *)((const struct udp_pcb *)last)->next;
else
pcb = (const void *)udp_pcbs;
while (pcb != NULL && pcb->local_port == 0)
pcb = pcb->next;
return pcb;
}
/*
* Obtain the list of UDP protocol control blocks, for sysctl(7).
*/
static ssize_t
udpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused,
struct rmib_oldp * oldp, struct rmib_newp * newp __unused)
{
return util_pcblist(call, oldp, udpsock_enum, udpsock_get_info);
}
static const struct sockevent_ops udpsock_ops = {
.sop_bind = udpsock_bind,
.sop_connect = udpsock_connect,
.sop_pre_send = udpsock_pre_send,
.sop_send = udpsock_send,
.sop_pre_recv = pktsock_pre_recv,
.sop_recv = pktsock_recv,
.sop_test_recv = pktsock_test_recv,
.sop_ioctl = ifconf_ioctl,
.sop_setsockmask = udpsock_setsockmask,
.sop_setsockopt = udpsock_setsockopt,
.sop_getsockopt = udpsock_getsockopt,
.sop_getsockname = udpsock_getsockname,
.sop_getpeername = udpsock_getpeername,
.sop_shutdown = udpsock_shutdown,
.sop_close = udpsock_close,
.sop_free = udpsock_free
};

minix/net/lwip/util.c Normal file

@ -0,0 +1,251 @@
/* LWIP service - util.c - shared utility functions */
#include "lwip.h"
#define US 1000000 /* number of microseconds per second */
/*
* Convert the given timeval structure to a number of clock ticks, checking
* whether the given structure is valid and whether the resulting number of
* ticks can be expressed as a (relative) clock ticks value. Upon success,
* return OK, with the number of clock ticks stored in 'ticksp'. Upon failure,
* return a negative error code that may be returned to userland directly. In
* that case, the contents of 'ticksp' are left unchanged.
*
* TODO: move this function into libsys and remove other redundant copies.
*/
int
util_timeval_to_ticks(const struct timeval * tv, clock_t * ticksp)
{
clock_t ticks;
if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= US)
return EINVAL;
if (tv->tv_sec >= TMRDIFF_MAX / sys_hz())
return EDOM;
ticks = tv->tv_sec * sys_hz() + (tv->tv_usec * sys_hz() + US - 1) / US;
assert(ticks <= TMRDIFF_MAX);
*ticksp = ticks;
return OK;
}
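A quick worked example of the rounding behavior, assuming a hypothetical 100 Hz system clock:

/*
 * Worked example (sys_hz() == 100): for tv = { 2, 15000 },
 *
 *	ticks = 2 * 100 + (15000 * 100 + 999999) / 1000000 = 200 + 2 = 202
 *
 * that is, the 15 ms fraction is rounded up to two whole 10 ms ticks.
 */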
/*
* Convert the given number of clock ticks to a timeval structure. This
* function never fails.
*/
void
util_ticks_to_timeval(clock_t ticks, struct timeval * tv)
{
memset(tv, 0, sizeof(*tv));
tv->tv_sec = ticks / sys_hz();
tv->tv_usec = (ticks % sys_hz()) * US / sys_hz();
}
/*
* Copy data between a user process and a chain of buffers. If the 'copy_in'
* flag is set, the data will be copied in from the user process to the given
* chain of buffers; otherwise, the data will be copied out from the given
* buffer chain to the user process. The 'data' parameter is a sockdriver-
* supplied structure identifying the remote source or destination of the data.
* The 'len' parameter contains the number of bytes to copy, and 'off' contains
* the offset into the remote source or destination. 'pbuf' is a pointer to
* the buffer chain, and 'skip' is the number of bytes to skip in the first
* buffer on the chain. Return OK on success, or a negative error code if the
* copy operation failed. This function is packet queue friendly.
*/
int
util_copy_data(const struct sockdriver_data * data, size_t len, size_t off,
const struct pbuf * pbuf, size_t skip, int copy_in)
{
iovec_t iov[SOCKDRIVER_IOV_MAX];
unsigned int i;
size_t sub, chunk;
int r;
while (len > 0) {
sub = 0;
for (i = 0; len > 0 && i < __arraycount(iov); i++) {
assert(pbuf != NULL);
chunk = (size_t)pbuf->len - skip;
if (chunk > len)
chunk = len;
iov[i].iov_addr = (vir_bytes)pbuf->payload + skip;
iov[i].iov_size = chunk;
sub += chunk;
len -= chunk;
pbuf = pbuf->next;
skip = 0;
}
if (copy_in)
r = sockdriver_vcopyin(data, off, iov, i);
else
r = sockdriver_vcopyout(data, off, iov, i);
if (r != OK)
return r;
off += sub;
}
return OK;
}
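For illustration, a typical copy-out call as a receive path might issue it; the variable names are hypothetical:

/*
 * Typical copy-out usage: copy the first 'len' bytes of packet payload
 * from the pbuf chain 'pbuf' out to the user process identified by the
 * sockdriver-supplied 'data':
 *
 *	if ((r = util_copy_data(data, len, 0, pbuf, 0, FALSE)) != OK)
 *		return r;
 */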
/*
* Copy from a vector of (local) buffers to a single (local) buffer. Return
* the total number of copied bytes on success, or E2BIG if not all of the
 * results could be stored in the given buffer.
*/
ssize_t
util_coalesce(char * ptr, size_t max, const iovec_t * iov, unsigned int iovcnt)
{
size_t off, size;
for (off = 0; iovcnt > 0; iov++, iovcnt--) {
if ((size = iov->iov_size) > max)
return E2BIG;
memcpy(&ptr[off], (void *)iov->iov_addr, size);
off += size;
max -= size;
}
return off;
}
/*
* Return TRUE if the given endpoint has superuser privileges, FALSE otherwise.
*/
int
util_is_root(endpoint_t endpt)
{
return (getnuid(endpt) == ROOT_EUID);
}
/*
* Convert a lwIP-provided error code (of type err_t) to a negative MINIX 3
* error code.
*/
int
util_convert_err(err_t err)
{
switch (err) {
case ERR_OK: return OK;
case ERR_MEM: return ENOMEM;
case ERR_BUF: return ENOBUFS;
case ERR_TIMEOUT: return ETIMEDOUT;
case ERR_RTE: return EHOSTUNREACH;
case ERR_VAL: return EINVAL;
case ERR_USE: return EADDRINUSE;
case ERR_ALREADY: return EALREADY;
case ERR_ISCONN: return EISCONN;
case ERR_CONN: return ENOTCONN;
case ERR_IF: return ENETDOWN;
case ERR_ABRT: return ECONNABORTED;
case ERR_RST: return ECONNRESET;
case ERR_INPROGRESS: return EINPROGRESS; /* should not be thrown */
case ERR_WOULDBLOCK: return EWOULDBLOCK; /* should not be thrown */
case ERR_ARG: return EINVAL;
case ERR_CLSD: /* should be caught as separate case */
default: /* should have a case here */
		printf("LWIP: unexpected error from lwIP: %d\n", err);
return EGENERIC;
}
}
/*
* Obtain the list of protocol control blocks for a particular domain and
* protocol. The call may be used for requesting either IPv4 or IPv6 PCBs,
* based on the path used to get here. It is used for TCP, UDP, and RAW PCBs.
*/
ssize_t
util_pcblist(struct rmib_call * call, struct rmib_oldp * oldp,
const void *(*enum_proc)(const void *),
void (*get_info_proc)(struct kinfo_pcb *, const void *))
{
const void *pcb;
ip_addr_t local_ip;
struct kinfo_pcb ki;
ssize_t off;
int r, size, max, domain, protocol;
if (call->call_namelen != 4)
return EINVAL;
/* The first two added name fields are not used. */
size = call->call_name[2];
if (size < 0 || (size_t)size > sizeof(ki))
return EINVAL;
if (size == 0)
size = sizeof(ki);
max = call->call_name[3];
domain = call->call_oname[1];
protocol = call->call_oname[2];
off = 0;
for (pcb = enum_proc(NULL); pcb != NULL; pcb = enum_proc(pcb)) {
/* Filter on IPv4/IPv6. */
memcpy(&local_ip, &((const struct ip_pcb *)pcb)->local_ip,
sizeof(local_ip));
/*
* lwIP does not support IPv6 sockets with IPv4-mapped IPv6
* addresses, and requires that those be represented as IPv4
* sockets instead. We perform the appropriate conversions to
* make that work in general, but here we only have the lwIP
* PCB to go on, and that PCB may not even have an associated
* sock data structure. As a result, we have to report IPv6
* sockets with IPv4-mapped IPv6 addresses as IPv4 sockets
* here. There is little room for improvement until lwIP
* allows us to store a "this is really an IPv6 socket" flag in
* its PCBs. As documented in the ipsock module, a partial
* solution would for example cause TCP sockets to "jump" from
* the IPv6 listing to the IPv4 listing when entering TIME_WAIT
* state. The jumping already occurs now for sockets that are
* getting bound, but that is not as problematic.
*/
if ((domain == AF_INET) != IP_IS_V4(&local_ip))
continue;
if (rmib_inrange(oldp, off)) {
memset(&ki, 0, sizeof(ki));
ki.ki_pcbaddr = (uint64_t)(uintptr_t)pcb;
ki.ki_ppcbaddr = (uint64_t)(uintptr_t)pcb;
ki.ki_family = domain;
ki.ki_protocol = protocol;
get_info_proc(&ki, pcb);
if ((r = rmib_copyout(oldp, off, &ki, size)) < OK)
return r;
}
off += size;
if (max > 0 && --max == 0)
break;
}
/*
* Margin to limit the possible effects of the inherent race condition
* between receiving just the data size and receiving the actual data.
*/
if (oldp == NULL)
off += PCB_SLOP * size;
return off;
}

minix/net/lwip/util.h Normal file

@ -0,0 +1,27 @@
#ifndef MINIX_NET_LWIP_UTIL_H
#define MINIX_NET_LWIP_UTIL_H
/* util.c */
int util_timeval_to_ticks(const struct timeval * tv, clock_t * ticksp);
void util_ticks_to_timeval(clock_t ticks, struct timeval * tv);
int util_copy_data(const struct sockdriver_data * data, size_t len, size_t off,
const struct pbuf * pbuf, size_t skip, int copy_in);
ssize_t util_coalesce(char * buf, size_t max, const iovec_t * iov,
unsigned int iovcnt);
int util_convert_err(err_t err);
int util_is_root(endpoint_t user_endpt);
ssize_t util_pcblist(struct rmib_call * call, struct rmib_oldp * oldp,
const void *(*enum_proc)(const void *),
void (*get_info_proc)(struct kinfo_pcb *, const void *));
/*
* In our code, pbuf header adjustments should never fail. This wrapper checks
* that the pbuf_header() call succeeds, and panics otherwise.
*/
#define util_pbuf_header(pbuf,incr) \
do { \
if (pbuf_header((pbuf), (incr))) \
panic("unexpected pbuf header adjustment failure"); \
} while (0)
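As an illustrative use (relying on lwIP's IP_HLEN and UDP_HLEN constants), hiding already-parsed headers so that the payload pointer lands on the application data:

/*
 * Example: skip the IPv4 and UDP headers of a received packet, so that
 * pbuf->payload points at the application data:
 *
 *	util_pbuf_header(pbuf, -(int)(IP_HLEN + UDP_HLEN));
 */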
#endif /* !MINIX_NET_LWIP_UTIL_H */


@ -75,6 +75,7 @@ static struct mib_node mib_minix_table[] = {
"mib", "MIB service information"),
/* 2*/ [MINIX_PROC] = MIB_NODE(_P | _RO, mib_minix_proc_table,
"proc", "Process information for ProcFS"),
/* 3*/ /* MINIX_LWIP is mounted through RMIB and thus not present here. */
};
/*


@ -17,12 +17,115 @@
#include <net/gen/psip_io.h>
#include <arpa/inet.h>
#include <net/route.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#include <net80211/ieee80211_ioctl.h>
const char *
net_ioctl_name(unsigned long req)
{
switch (req) {
NAME(FIONREAD);
/* sys/sockio.h */
NAME(SIOCSHIWAT); /* TODO: print argument */
NAME(SIOCGHIWAT); /* TODO: print argument */
NAME(SIOCSLOWAT); /* TODO: print argument */
NAME(SIOCGLOWAT); /* TODO: print argument */
NAME(SIOCSPGRP); /* TODO: print argument */
NAME(SIOCGPGRP); /* TODO: print argument */
NAME(SIOCADDRT); /* TODO: print argument */
NAME(SIOCDELRT); /* TODO: print argument */
NAME(SIOCSIFADDR); /* TODO: print argument */
NAME(SIOCGIFADDR); /* TODO: print argument */
NAME(SIOCSIFDSTADDR); /* TODO: print argument */
NAME(SIOCGIFDSTADDR); /* TODO: print argument */
NAME(SIOCSIFFLAGS); /* TODO: print argument */
NAME(SIOCGIFFLAGS); /* TODO: print argument */
NAME(SIOCGIFBRDADDR); /* TODO: print argument */
NAME(SIOCSIFBRDADDR); /* TODO: print argument */
NAME(SIOCGIFCONF); /* TODO: print argument */
NAME(SIOCGIFNETMASK); /* TODO: print argument */
NAME(SIOCSIFNETMASK); /* TODO: print argument */
NAME(SIOCGIFMETRIC); /* TODO: print argument */
NAME(SIOCSIFMETRIC); /* TODO: print argument */
NAME(SIOCDIFADDR); /* TODO: print argument */
NAME(SIOCAIFADDR); /* TODO: print argument */
NAME(SIOCGIFALIAS); /* TODO: print argument */
NAME(SIOCGIFAFLAG_IN); /* TODO: print argument */
NAME(SIOCALIFADDR); /* TODO: print argument */
NAME(SIOCGLIFADDR); /* TODO: print argument */
NAME(SIOCDLIFADDR); /* TODO: print argument */
NAME(SIOCSIFADDRPREF); /* TODO: print argument */
NAME(SIOCGIFADDRPREF); /* TODO: print argument */
NAME(SIOCADDMULTI); /* TODO: print argument */
NAME(SIOCDELMULTI); /* TODO: print argument */
NAME(SIOCSIFMEDIA); /* TODO: print argument */
NAME(SIOCGIFMEDIA); /* TODO: print argument */
NAME(SIOCSIFGENERIC); /* TODO: print argument */
NAME(SIOCGIFGENERIC); /* TODO: print argument */
NAME(SIOCSIFPHYADDR); /* TODO: print argument */
NAME(SIOCGIFPSRCADDR); /* TODO: print argument */
NAME(SIOCGIFPDSTADDR); /* TODO: print argument */
NAME(SIOCDIFPHYADDR); /* TODO: print argument */
NAME(SIOCSLIFPHYADDR); /* TODO: print argument */
NAME(SIOCGLIFPHYADDR); /* TODO: print argument */
NAME(SIOCSIFMTU); /* TODO: print argument */
NAME(SIOCGIFMTU); /* TODO: print argument */
NAME(SIOCSDRVSPEC); /* TODO: print argument */
NAME(SIOCGDRVSPEC); /* TODO: print argument */
NAME(SIOCIFCREATE); /* TODO: print argument */
NAME(SIOCIFDESTROY); /* TODO: print argument */
NAME(SIOCIFGCLONERS); /* TODO: print argument */
NAME(SIOCGIFDLT); /* TODO: print argument */
NAME(SIOCGIFCAP); /* TODO: print argument */
NAME(SIOCSIFCAP); /* TODO: print argument */
NAME(SIOCSVH); /* TODO: print argument */
NAME(SIOCGVH); /* TODO: print argument */
NAME(SIOCINITIFADDR); /* TODO: print argument */
NAME(SIOCGIFDATA); /* TODO: print argument */
NAME(SIOCZIFDATA); /* TODO: print argument */
NAME(SIOCGLINKSTR); /* TODO: print argument */
NAME(SIOCSLINKSTR); /* TODO: print argument */
NAME(SIOCGETHERCAP); /* TODO: print argument */
NAME(SIOCGIFINDEX); /* TODO: print argument */
NAME(SIOCSETPFSYNC); /* TODO: print argument */
NAME(SIOCGETPFSYNC); /* TODO: print argument */
/* netinet6/in6_var.h */
NAME(SIOCSIFADDR_IN6); /* TODO: print argument */
NAME(SIOCGIFADDR_IN6); /* TODO: print argument */
NAME(SIOCGIFDSTADDR_IN6); /* TODO: print argument */
NAME(SIOCGIFNETMASK_IN6); /* TODO: print argument */
NAME(SIOCDIFADDR_IN6); /* TODO: print argument */
NAME(SIOCGIFPSRCADDR_IN6); /* TODO: print argument */
NAME(SIOCGIFPDSTADDR_IN6); /* TODO: print argument */
NAME(SIOCGIFAFLAG_IN6); /* TODO: print argument */
NAME(SIOCGDRLST_IN6); /* TODO: print argument */
NAME(SIOCSNDFLUSH_IN6); /* TODO: print argument */
NAME(SIOCGNBRINFO_IN6); /* TODO: print argument */
NAME(SIOCSRTRFLUSH_IN6); /* TODO: print argument */
NAME(SIOCGIFSTAT_IN6); /* TODO: print argument */
NAME(SIOCGIFSTAT_ICMP6); /* TODO: print argument */
NAME(SIOCSDEFIFACE_IN6); /* TODO: print argument */
NAME(SIOCGDEFIFACE_IN6); /* TODO: print argument */
NAME(SIOCSIFINFO_FLAGS); /* TODO: print argument */
NAME(SIOCSIFPREFIX_IN6); /* TODO: print argument */
NAME(SIOCGIFPREFIX_IN6); /* TODO: print argument */
NAME(SIOCDIFPREFIX_IN6); /* TODO: print argument */
NAME(SIOCAIFPREFIX_IN6); /* TODO: print argument */
NAME(SIOCCIFPREFIX_IN6); /* TODO: print argument */
NAME(SIOCGIFALIFETIME_IN6); /* TODO: print argument */
NAME(SIOCAIFADDR_IN6); /* TODO: print argument */
NAME(SIOCGIFINFO_IN6); /* TODO: print argument */
NAME(SIOCSIFINFO_IN6); /* TODO: print argument */
NAME(SIOCSIFPHYADDR_IN6); /* TODO: print argument */
NAME(SIOCAADDRCTL_POLICY); /* TODO: print argument */
NAME(SIOCDADDRCTL_POLICY); /* TODO: print argument */
/* net80211/ieee80211_ioctl.h */
NAME(SIOCS80211NWID); /* TODO: print argument */
NAME(SIOCG80211NWID); /* TODO: print argument */
/* old MINIX inet ioctls */
NAME(NWIOSETHOPT); /* TODO: print argument */
NAME(NWIOGETHOPT); /* TODO: print argument */
NAME(NWIOGETHSTAT); /* TODO: print argument */