From f6513902aa7e4390e2b1e08ec6cfa1887bc37b21 Mon Sep 17 00:00:00 2001 From: Eirikr Hinngart <151315375+Oichkatzelesfrettschen@users.noreply.github.com> Date: Sat, 31 May 2025 17:06:32 -0700 Subject: [PATCH] =?UTF-8?q?Remove=20NetBSD=20references=20=E2=80=93=20batc?= =?UTF-8?q?h=201?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- minix/lib/libc/Makefile.inc | 4 - minix/lib/libc/arch/arm/Makefile.inc | 5 - minix/lib/libc/arch/arm/sys/Makefile.inc | 29 - minix/lib/libc/arch/i386/Makefile.inc | 9 - minix/lib/libc/arch/i386/sys/Makefile.inc | 29 - minix/lib/libc/sys/Makefile.inc | 38 - minix/lib/libc/sys/__sysctl.c | 40 - minix/lib/libc/sys/stack_utils.c | 172 - minix/lib/libddekit/build/ddekit/Makefile | 13 - .../build/ddekit_usb_client/Makefile | 12 - .../build/ddekit_usb_server/Makefile | 11 - .../dist/src/include/netif/ppp/pppoe.h | 179 -- minix/lib/liblwip/dist/src/netif/ppp/pppoe.c | 1191 ------- minix/lib/liblwip/lib/Makefile | 21 - ...ly-control-IP-forwarding-at-run-time.patch | 101 - minix/lib/libsys/Makefile | 148 - minix/lib/libsys/arch/earm/Makefile.inc | 15 - minix/net/lwip/Makefile | 42 - minix/net/lwip/addr.c | 699 ----- minix/net/lwip/addrpol.c | 143 - minix/net/lwip/bpf_filter.c | 561 ---- minix/net/lwip/bpfdev.c | 1365 -------- minix/net/lwip/ethif.c | 1718 ---------- minix/net/lwip/ifaddr.c | 2224 ------------- minix/net/lwip/ifconf.c | 930 ------ minix/net/lwip/ifdev.c | 1064 ------- minix/net/lwip/ifdev.h | 155 - minix/net/lwip/lldata.c | 584 ---- minix/net/lwip/loopif.c | 420 --- minix/net/lwip/lwip.h | 130 - minix/net/lwip/pktsock.c | 1236 -------- minix/net/lwip/rawsock.c | 1341 -------- minix/net/lwip/route.c | 1654 ---------- minix/net/lwip/rtsock.c | 1912 ----------- minix/net/lwip/tcpsock.c | 2793 ----------------- minix/net/lwip/udpsock.c | 997 ------ minix/net/uds/io.c | 1805 ----------- minix/net/uds/stat.c | 186 -- minix/net/uds/uds.c | 1417 --------- minix/tests/Makefile | 136 - minix/tests/kernel/sys_padconf/Makefile | 16 - minix/tests/test27.c | 310 -- minix/tests/test28.c | 405 --- releasetools/Makefile | 109 - releasetools/arm_sdimage.sh | 209 -- releasetools/pkgsrc_cdimage.sh | 52 - releasetools/x86_cdimage.sh | 91 - releasetools/x86_hdimage.sh | 136 - releasetools/x86_ramimage.sh | 57 - releasetools/x86_usbimage.sh | 105 - 50 files changed, 27019 deletions(-) delete mode 100644 minix/lib/libc/Makefile.inc delete mode 100644 minix/lib/libc/arch/arm/Makefile.inc delete mode 100644 minix/lib/libc/arch/arm/sys/Makefile.inc delete mode 100644 minix/lib/libc/arch/i386/Makefile.inc delete mode 100644 minix/lib/libc/arch/i386/sys/Makefile.inc delete mode 100644 minix/lib/libc/sys/Makefile.inc delete mode 100644 minix/lib/libc/sys/__sysctl.c delete mode 100644 minix/lib/libc/sys/stack_utils.c delete mode 100644 minix/lib/libddekit/build/ddekit/Makefile delete mode 100644 minix/lib/libddekit/build/ddekit_usb_client/Makefile delete mode 100644 minix/lib/libddekit/build/ddekit_usb_server/Makefile delete mode 100644 minix/lib/liblwip/dist/src/include/netif/ppp/pppoe.h delete mode 100644 minix/lib/liblwip/dist/src/netif/ppp/pppoe.c delete mode 100644 minix/lib/liblwip/lib/Makefile delete mode 100644 minix/lib/liblwip/patches/0002-MINIX-3-only-control-IP-forwarding-at-run-time.patch delete mode 100644 minix/lib/libsys/Makefile delete mode 100644 minix/lib/libsys/arch/earm/Makefile.inc delete mode 100644 minix/net/lwip/Makefile delete mode 100644 minix/net/lwip/addr.c delete mode 100644 minix/net/lwip/addrpol.c delete mode 100644 minix/net/lwip/bpf_filter.c delete mode 100644 minix/net/lwip/bpfdev.c delete mode 100644 minix/net/lwip/ethif.c delete mode 100644 minix/net/lwip/ifaddr.c delete mode 100644 minix/net/lwip/ifconf.c delete mode 100644 minix/net/lwip/ifdev.c delete mode 100644 minix/net/lwip/ifdev.h delete mode 100644 minix/net/lwip/lldata.c delete mode 100644 minix/net/lwip/loopif.c delete mode 100644 minix/net/lwip/lwip.h delete mode 100644 minix/net/lwip/pktsock.c delete mode 100644 minix/net/lwip/rawsock.c delete mode 100644 minix/net/lwip/route.c delete mode 100644 minix/net/lwip/rtsock.c delete mode 100644 minix/net/lwip/tcpsock.c delete mode 100644 minix/net/lwip/udpsock.c delete mode 100644 minix/net/uds/io.c delete mode 100644 minix/net/uds/stat.c delete mode 100644 minix/net/uds/uds.c delete mode 100644 minix/tests/Makefile delete mode 100644 minix/tests/kernel/sys_padconf/Makefile delete mode 100644 minix/tests/test27.c delete mode 100644 minix/tests/test28.c delete mode 100644 releasetools/Makefile delete mode 100755 releasetools/arm_sdimage.sh delete mode 100755 releasetools/pkgsrc_cdimage.sh delete mode 100755 releasetools/x86_cdimage.sh delete mode 100755 releasetools/x86_hdimage.sh delete mode 100755 releasetools/x86_ramimage.sh delete mode 100755 releasetools/x86_usbimage.sh diff --git a/minix/lib/libc/Makefile.inc b/minix/lib/libc/Makefile.inc deleted file mode 100644 index b283f6ea7..000000000 --- a/minix/lib/libc/Makefile.inc +++ /dev/null @@ -1,4 +0,0 @@ -# MINIX Specifics sources -.PATH: ${NETBSDSRCDIR}/minix/lib/libc - -SRCS+= configfile.c mtab.c stderr.c diff --git a/minix/lib/libc/arch/arm/Makefile.inc b/minix/lib/libc/arch/arm/Makefile.inc deleted file mode 100644 index dfa289c9b..000000000 --- a/minix/lib/libc/arch/arm/Makefile.inc +++ /dev/null @@ -1,5 +0,0 @@ -C_HERE=${NETBSDSRCDIR}/minix/lib/libc/arch/${ARCHSUBDIR} -.PATH: ${C_HERE} - -SRCS+= get_bp.S \ - read_tsc.c diff --git a/minix/lib/libc/arch/arm/sys/Makefile.inc b/minix/lib/libc/arch/arm/sys/Makefile.inc deleted file mode 100644 index 4bea415f2..000000000 --- a/minix/lib/libc/arch/arm/sys/Makefile.inc +++ /dev/null @@ -1,29 +0,0 @@ -# rts sources -HERE=${NETBSDSRCDIR}/minix/lib/libc/arch/${ARCHSUBDIR}/sys -.PATH: ${HERE} - -TMP=ucontextoffsets.h.tmp -CF=${HERE}/ucontextoffsets.cf - -INCS+=ucontextoffsets.h - -ucontext.o: ucontextoffsets.h - -SRCS+= \ - __sigreturn.S \ - _do_kernel_call_intr.S \ - _ipc.S \ - brksize.S \ - ipc_minix_kerninfo.S \ - ucontext.S - -ucontextoffsets.h: ${CF} -ucontextoffsets.h: ${NETBSDSRCDIR}/sys/sys/ucontext.h -ucontextoffsets.h: ${NETBSDSRCDIR}/minix/include/arch/${MACHINE_ARCH}/include/stackframe.h -ucontextoffsets.h: - ${_MKTARGET_CREATE} - ${TOOL_CAT} ${CF} | \ - ${TOOL_GENASSYM} -- ${CC} ${CFLAGS:N-Wa,*} \ - ${CPPFLAGS} ${PROF} ${GENASSYM_CPPFLAGS} >$TMP && \ - mv -f $TMP $@ - diff --git a/minix/lib/libc/arch/i386/Makefile.inc b/minix/lib/libc/arch/i386/Makefile.inc deleted file mode 100644 index 6476ea62e..000000000 --- a/minix/lib/libc/arch/i386/Makefile.inc +++ /dev/null @@ -1,9 +0,0 @@ -C_HERE=${NETBSDSRCDIR}/minix/lib/libc/arch/${ARCHSUBDIR} -.PATH: ${C_HERE} - -SRCS+= _cpuid.S \ - get_bp.S \ - getprocessor.S \ - read_tsc.S \ - _cpufeature.c - diff --git a/minix/lib/libc/arch/i386/sys/Makefile.inc b/minix/lib/libc/arch/i386/sys/Makefile.inc deleted file mode 100644 index 4bea415f2..000000000 --- a/minix/lib/libc/arch/i386/sys/Makefile.inc +++ /dev/null @@ -1,29 +0,0 @@ -# rts sources -HERE=${NETBSDSRCDIR}/minix/lib/libc/arch/${ARCHSUBDIR}/sys -.PATH: ${HERE} - -TMP=ucontextoffsets.h.tmp -CF=${HERE}/ucontextoffsets.cf - -INCS+=ucontextoffsets.h - -ucontext.o: ucontextoffsets.h - -SRCS+= \ - __sigreturn.S \ - _do_kernel_call_intr.S \ - _ipc.S \ - brksize.S \ - ipc_minix_kerninfo.S \ - ucontext.S - -ucontextoffsets.h: ${CF} -ucontextoffsets.h: ${NETBSDSRCDIR}/sys/sys/ucontext.h -ucontextoffsets.h: ${NETBSDSRCDIR}/minix/include/arch/${MACHINE_ARCH}/include/stackframe.h -ucontextoffsets.h: - ${_MKTARGET_CREATE} - ${TOOL_CAT} ${CF} | \ - ${TOOL_GENASSYM} -- ${CC} ${CFLAGS:N-Wa,*} \ - ${CPPFLAGS} ${PROF} ${GENASSYM_CPPFLAGS} >$TMP && \ - mv -f $TMP $@ - diff --git a/minix/lib/libc/sys/Makefile.inc b/minix/lib/libc/sys/Makefile.inc deleted file mode 100644 index 096107be8..000000000 --- a/minix/lib/libc/sys/Makefile.inc +++ /dev/null @@ -1,38 +0,0 @@ -.PATH: ${NETBSDSRCDIR}/minix/lib/libc/sys - -SRCS+= accept.c access.c adjtime.c bind.c brk.c sbrk.c m_closefrom.c getsid.c \ - chdir.c chmod.c fchmod.c chown.c fchown.c chroot.c close.c \ - clock_getres.c clock_gettime.c clock_settime.c \ - connect.c dup.c dup2.c execve.c fcntl.c flock.c fpathconf.c fork.c \ - fstatfs.c fstatvfs.c fsync.c ftruncate.c gcov_flush_sys.c getdents.c \ - getegid.c getgid.c \ - getgroups.c getitimer.c setitimer.c __getlogin.c getpeername.c \ - getpgrp.c getpid.c getppid.c priority.c getrlimit.c getsockname.c \ - getsockopt.c setsockopt.c gettimeofday.c geteuid.c getuid.c \ - getvfsstat.c \ - ioctl.c issetugid.c kill.c link.c listen.c loadname.c lseek.c \ - minix_rs.c mkdir.c mkfifo.c mknod.c mmap.c mount.c nanosleep.c \ - open.c pathconf.c pipe.c poll.c posix_spawn.c pread.c ptrace.c pwrite.c \ - read.c readlink.c reboot.c recvfrom.c recvmsg.c rename.c \ - rmdir.c select.c sem.c sendmsg.c sendto.c setgroups.c setsid.c \ - setgid.c settimeofday.c setuid.c shmat.c shmctl.c shmget.c stime.c \ - vectorio.c shutdown.c sigaction.c sigpending.c sigreturn.c sigsuspend.c\ - sigprocmask.c socket.c socketpair.c stat.c statvfs.c svrctl.c \ - symlink.c \ - sync.c syscall.c truncate.c umask.c unlink.c \ - wait4.c write.c \ - utimensat.c utimes.c futimes.c lutimes.c futimens.c \ - _exit.c _ucontext.c environ.c __getcwd.c vfork.c sizeup.c init.c \ - getrusage.c setrlimit.c setpgid.c __sysctl.c - -# Minix specific syscalls / utils. -SRCS+= kernel_utils.c sprofile.c stack_utils.c _mcontext.c - -# Emulation for missing lchown/lchmod/lchflags -OBJS+= lchflags.o lchmod.o lchown.o -lchflags.go lchflags.o lchflags.pico lchflags.bc: ${NETBSDSRCDIR}/tools/compat/lchflags.c -lchmod.go lchmod.o lchmod.pico lchmod.bc: ${NETBSDSRCDIR}/tools/compat/lchmod.c -lchown.go lchown.o lchown.pico lchown.bc: ${NETBSDSRCDIR}/tools/compat/lchown.c - -.include "${NETBSDSRCDIR}/minix/lib/libc/arch/${ARCHSUBDIR}/sys/Makefile.inc" -.include "${NETBSDSRCDIR}/minix/lib/libc/arch/${ARCHSUBDIR}/Makefile.inc" diff --git a/minix/lib/libc/sys/__sysctl.c b/minix/lib/libc/sys/__sysctl.c deleted file mode 100644 index fd02bfa66..000000000 --- a/minix/lib/libc/sys/__sysctl.c +++ /dev/null @@ -1,40 +0,0 @@ -#include -#include -#include "namespace.h" -#include "extern.h" -#include - -/* - * The sysctl(2) system call, handled by the MIB service. - */ -int -__sysctl(const int * name, unsigned int namelen, void * oldp, size_t * oldlenp, - const void * newp, size_t newlen) -{ - message m; - int r; - - memset(&m, 0, sizeof(m)); - m.m_lc_mib_sysctl.oldp = (vir_bytes)oldp; - m.m_lc_mib_sysctl.oldlen = (oldlenp != NULL) ? *oldlenp : 0; - m.m_lc_mib_sysctl.newp = (vir_bytes)newp; - m.m_lc_mib_sysctl.newlen = newlen; - m.m_lc_mib_sysctl.namelen = namelen; - m.m_lc_mib_sysctl.namep = (vir_bytes)name; - if (namelen <= CTL_SHORTNAME) - memcpy(m.m_lc_mib_sysctl.name, name, sizeof(*name) * namelen); - - r = _syscall(MIB_PROC_NR, MIB_SYSCTL, &m); - - /* - * We copy the NetBSD behavior of replying with the old length also if - * the call failed, typically with ENOMEM. This is undocumented - * behavior, but unfortunately relied on by sysctl(8) and other NetBSD - * userland code. If the call failed at the IPC level, the resulting - * value will be garbage, but it should then not be used anyway. - */ - if (oldlenp != NULL) - *oldlenp = m.m_mib_lc_sysctl.oldlen; - - return r; -} diff --git a/minix/lib/libc/sys/stack_utils.c b/minix/lib/libc/sys/stack_utils.c deleted file mode 100644 index 0c8ad9e01..000000000 --- a/minix/lib/libc/sys/stack_utils.c +++ /dev/null @@ -1,172 +0,0 @@ -/* Utilities to generate a proper C stack. - * - * Author: Lionel A. Sambuc. - */ - -#define _MINIX_SYSTEM - -#include -#include "namespace.h" -#include - -#include -#include -#include -#include -#include - -/* Create a stack image that only needs to be patched up slightly by - * the kernel to be used for the process to be executed. - * - * Every pointers are stored here as offset from the frame base, and - * will be adapted as required for the new process address space. - * - * The following parameters are passed by register to either __start - * for static binaries, or _rtld_start for dynamic ones: - * *fct, *ObjEntry, *ps_string - * - * The following stack layout is expected by _rtld(): - * - * | XXXXXXXXXX | 0x0000_00000 - * | ... | - * | ... | Top of the stack - * | argc | - * | *argv1 | points to the first char of the argv1 - * | ... | - * | *argvN | - * | NULL | - * | *env1 | - * | ... | - * | *envN | - * | NULL | - * | ElfAuxV1 | - * | ... | - * | ElfAuxVX | - * | AuxExecName| fully resolve executable name, as an ASCIIZ string, - * at most PMEF_EXECNAMELEN1 long. - * - * Here we put first the strings, then word-align, then ps_strings, to - * comply with the expected layout of NetBSD. This seems to matter for - * the NetBSD ps command, so let's make sure we are compatible... - * - * | strings | Maybe followed by some padding to word-align. - * | **argv | \ - * | argc | +---> ps_string structure content. - * | **env | | - * | envc | / - * | sigcode | On NetBSD, there may be a compatibility stub here, - * +------------+ for native code, it is not present. - * Stack Base , 0xF000_0000, descending stack. - */ - -/* The minimum size of the frame is composed of: - * argc, the NULL terminator for argv as well as one for - * environ, the ELF Aux vectors, executable name and the - * ps_strings struct. */ -#define STACK_MIN_SZ \ -( \ - sizeof(int) + sizeof(void *) * 2 + \ - sizeof(AuxInfo) * PMEF_AUXVECTORS + PMEF_EXECNAMELEN1 + \ - sizeof(struct ps_strings) \ -) - -/***************************************************************************** - * Computes stack size, argc, envc, for a given set of path, argv, envp. * - *****************************************************************************/ -void minix_stack_params(const char *path, char * const *argv, char * const *envp, - size_t *stack_size, char *overflow, int *argc, int *envc) -{ - char * const *p; - size_t const min_size = STACK_MIN_SZ; - - *stack_size = min_size; /* Size of the new initial stack. */ - *overflow = 0; /* No overflow yet. */ - *argc = 0; /* Argument count. */ - *envc = 0; /* Environment count */ - - /* Compute and add the size required to store argv and env. */ - for (p = argv; *p != NULL; p++) { - size_t const n = sizeof(*p) + strlen(*p) + 1; - *stack_size += n; - if (*stack_size < n) { - *overflow = 1; - } - (*argc)++; - } - - for (p = envp; p && *p != NULL; p++) { - size_t const n = sizeof(*p) + strlen(*p) + 1; - *stack_size += n; - if (*stack_size < n) { - *overflow = 1; - } - (*envc)++; - } - - /* Compute the aligned frame size. */ - *stack_size = (*stack_size + sizeof(void *) - 1) & - ~(sizeof(void *) - 1); - - if (*stack_size < min_size) { - /* This is possible only in case of overflow. */ - *overflow = 1; - } -} - -/***************************************************************************** - * Generate a stack in the buffer frame, ready to be used. * - *****************************************************************************/ -void minix_stack_fill(const char *path, int argc, char * const *argv, - int envc, char * const *envp, size_t stack_size, char *frame, - int *vsp, struct ps_strings **psp) -{ - char * const *p; - - /* Frame pointers (a.k.a stack pointer within the buffer in current - * address space.) */ - char *fp; /* byte aligned */ - char **fpw; /* word aligned */ - - size_t const min_size = STACK_MIN_SZ; - - /* Virtual address of the stack pointer, in new memory space. */ - *vsp = minix_get_user_sp() - stack_size; - - /* Fill in the frame now. */ - fpw = (char **) frame; - *fpw++ = (char *) argc; - - /* The strings themselves are stored after the aux vectors, - * cf. top comment. */ - fp = frame + (min_size - sizeof(struct ps_strings)) + - (envc + argc) * sizeof(char *); - - /* Fill in argv and the environment, as well as copy the strings - * themselves. */ - for (p = argv; *p != NULL; p++) { - size_t const n = strlen(*p) + 1; - *fpw++= (char *)(*vsp + (fp - frame)); - memcpy(fp, *p, n); - fp += n; - } - *fpw++ = NULL; - - for (p = envp; p && *p != NULL; p++) { - size_t const n = strlen(*p) + 1; - *fpw++= (char *)(*vsp + (fp - frame)); - memcpy(fp, *p, n); - fp += n; - } - *fpw++ = NULL; - - /* Padding, because of the stack alignement. */ - while ((size_t)fp % sizeof(void *)) *fp++= 0; - - /* Fill in the ps_string struct*/ - *psp = (struct ps_strings *) fp; - - (*psp)->ps_argvstr = (char **)(*vsp + sizeof(argc)); - (*psp)->ps_nargvstr = argc; - (*psp)->ps_envstr = (*psp)->ps_argvstr + argc + 1; - (*psp)->ps_nenvstr = envc; -} diff --git a/minix/lib/libddekit/build/ddekit/Makefile b/minix/lib/libddekit/build/ddekit/Makefile deleted file mode 100644 index 5197f67c9..000000000 --- a/minix/lib/libddekit/build/ddekit/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -LIB= ddekit - -SRC_DIR = ${NETBSDSRCDIR}/minix/lib/libddekit/src - -VPATH = ${SRC_DIR} - -SRCS = pci.c printf.c mem.c pgtab.c dde.c initcall.c thread.c condvar.c \ - lock.c semaphore.c timer.c panic.c irq.c resource.c msg_queue.c - -CPPFLAGS += -D_NETBSD_SOURCE -D_MINIX_SYSTEM - -.include - diff --git a/minix/lib/libddekit/build/ddekit_usb_client/Makefile b/minix/lib/libddekit/build/ddekit_usb_client/Makefile deleted file mode 100644 index 8e56c51d4..000000000 --- a/minix/lib/libddekit/build/ddekit_usb_client/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -LIB= ddekit_usb_client - -SRC_DIR = ${NETBSDSRCDIR}/minix/lib/libddekit/src - -VPATH = ${SRC_DIR} - -SRCS = usb_client.c - -CFLAGS += -Wall - -.include - diff --git a/minix/lib/libddekit/build/ddekit_usb_server/Makefile b/minix/lib/libddekit/build/ddekit_usb_server/Makefile deleted file mode 100644 index 9f2aff757..000000000 --- a/minix/lib/libddekit/build/ddekit_usb_server/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -LIB= ddekit_usb_server - -SRC_DIR = ${NETBSDSRCDIR}/minix/lib/libddekit/src - -VPATH = ${SRC_DIR} - -SRCS = usb_server.c - -CFLAGS += -Wall - -.include diff --git a/minix/lib/liblwip/dist/src/include/netif/ppp/pppoe.h b/minix/lib/liblwip/dist/src/include/netif/ppp/pppoe.h deleted file mode 100644 index 9f8f2892b..000000000 --- a/minix/lib/liblwip/dist/src/include/netif/ppp/pppoe.h +++ /dev/null @@ -1,179 +0,0 @@ -/***************************************************************************** -* pppoe.h - PPP Over Ethernet implementation for lwIP. -* -* Copyright (c) 2006 by Marc Boucher, Services Informatiques (MBSI) inc. -* -* The authors hereby grant permission to use, copy, modify, distribute, -* and license this software and its documentation for any purpose, provided -* that existing copyright notices are retained in all copies and that this -* notice and the following disclaimer are included verbatim in any -* distributions. No written agreement, license, or royalty fee is required -* for any of the authorized uses. -* -* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS *AS IS* AND ANY EXPRESS OR -* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -* IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -* -****************************************************************************** -* REVISION HISTORY -* -* 06-01-01 Marc Boucher -* Ported to lwIP. -*****************************************************************************/ - - - -/* based on NetBSD: if_pppoe.c,v 1.64 2006/01/31 23:50:15 martin Exp */ - -/*- - * Copyright (c) 2002 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by Martin Husemann . - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ -#include "netif/ppp/ppp_opts.h" -#if PPP_SUPPORT && PPPOE_SUPPORT /* don't build if not configured for use in lwipopts.h */ - -#ifndef PPP_OE_H -#define PPP_OE_H - -#include "ppp.h" -#include "lwip/etharp.h" - -#ifdef PACK_STRUCT_USE_INCLUDES -# include "arch/bpstruct.h" -#endif -PACK_STRUCT_BEGIN -struct pppoehdr { - PACK_STRUCT_FLD_8(u8_t vertype); - PACK_STRUCT_FLD_8(u8_t code); - PACK_STRUCT_FIELD(u16_t session); - PACK_STRUCT_FIELD(u16_t plen); -} PACK_STRUCT_STRUCT; -PACK_STRUCT_END -#ifdef PACK_STRUCT_USE_INCLUDES -# include "arch/epstruct.h" -#endif - -#ifdef PACK_STRUCT_USE_INCLUDES -# include "arch/bpstruct.h" -#endif -PACK_STRUCT_BEGIN -struct pppoetag { - PACK_STRUCT_FIELD(u16_t tag); - PACK_STRUCT_FIELD(u16_t len); -} PACK_STRUCT_STRUCT; -PACK_STRUCT_END -#ifdef PACK_STRUCT_USE_INCLUDES -# include "arch/epstruct.h" -#endif - - -#define PPPOE_STATE_INITIAL 0 -#define PPPOE_STATE_PADI_SENT 1 -#define PPPOE_STATE_PADR_SENT 2 -#define PPPOE_STATE_SESSION 3 -/* passive */ -#define PPPOE_STATE_PADO_SENT 1 - -#define PPPOE_HEADERLEN sizeof(struct pppoehdr) -#define PPPOE_VERTYPE 0x11 /* VER=1, TYPE = 1 */ - -#define PPPOE_TAG_EOL 0x0000 /* end of list */ -#define PPPOE_TAG_SNAME 0x0101 /* service name */ -#define PPPOE_TAG_ACNAME 0x0102 /* access concentrator name */ -#define PPPOE_TAG_HUNIQUE 0x0103 /* host unique */ -#define PPPOE_TAG_ACCOOKIE 0x0104 /* AC cookie */ -#define PPPOE_TAG_VENDOR 0x0105 /* vendor specific */ -#define PPPOE_TAG_RELAYSID 0x0110 /* relay session id */ -#define PPPOE_TAG_SNAME_ERR 0x0201 /* service name error */ -#define PPPOE_TAG_ACSYS_ERR 0x0202 /* AC system error */ -#define PPPOE_TAG_GENERIC_ERR 0x0203 /* gerneric error */ - -#define PPPOE_CODE_PADI 0x09 /* Active Discovery Initiation */ -#define PPPOE_CODE_PADO 0x07 /* Active Discovery Offer */ -#define PPPOE_CODE_PADR 0x19 /* Active Discovery Request */ -#define PPPOE_CODE_PADS 0x65 /* Active Discovery Session confirmation */ -#define PPPOE_CODE_PADT 0xA7 /* Active Discovery Terminate */ - -#ifndef PPPOE_MAX_AC_COOKIE_LEN -#define PPPOE_MAX_AC_COOKIE_LEN 64 -#endif - -struct pppoe_softc { - struct pppoe_softc *next; - struct netif *sc_ethif; /* ethernet interface we are using */ - ppp_pcb *pcb; /* PPP PCB */ - - struct eth_addr sc_dest; /* hardware address of concentrator */ - u16_t sc_session; /* PPPoE session id */ - u8_t sc_state; /* discovery phase or session connected */ - -#ifdef PPPOE_TODO - u8_t *sc_service_name; /* if != NULL: requested name of service */ - u8_t *sc_concentrator_name; /* if != NULL: requested concentrator id */ -#endif /* PPPOE_TODO */ - u8_t sc_ac_cookie[PPPOE_MAX_AC_COOKIE_LEN]; /* content of AC cookie we must echo back */ - u8_t sc_ac_cookie_len; /* length of cookie data */ -#ifdef PPPOE_SERVER - u8_t *sc_hunique; /* content of host unique we must echo back */ - u8_t sc_hunique_len; /* length of host unique */ -#endif - u8_t sc_padi_retried; /* number of PADI retries already done */ - u8_t sc_padr_retried; /* number of PADR retries already done */ -}; - - -#define pppoe_init() /* compatibility define, no initialization needed */ - -ppp_pcb *pppoe_create(struct netif *pppif, - struct netif *ethif, - const char *service_name, const char *concentrator_name, - ppp_link_status_cb_fn link_status_cb, void *ctx_cb); - -/* - * Functions called from lwIP - * DO NOT CALL FROM lwIP USER APPLICATION. - */ -void pppoe_disc_input(struct netif *netif, struct pbuf *p); -void pppoe_data_input(struct netif *netif, struct pbuf *p); - -#endif /* PPP_OE_H */ - -#endif /* PPP_SUPPORT && PPPOE_SUPPORT */ diff --git a/minix/lib/liblwip/dist/src/netif/ppp/pppoe.c b/minix/lib/liblwip/dist/src/netif/ppp/pppoe.c deleted file mode 100644 index eabfa4d04..000000000 --- a/minix/lib/liblwip/dist/src/netif/ppp/pppoe.c +++ /dev/null @@ -1,1191 +0,0 @@ -/***************************************************************************** -* pppoe.c - PPP Over Ethernet implementation for lwIP. -* -* Copyright (c) 2006 by Marc Boucher, Services Informatiques (MBSI) inc. -* -* The authors hereby grant permission to use, copy, modify, distribute, -* and license this software and its documentation for any purpose, provided -* that existing copyright notices are retained in all copies and that this -* notice and the following disclaimer are included verbatim in any -* distributions. No written agreement, license, or royalty fee is required -* for any of the authorized uses. -* -* THIS SOFTWARE IS PROVIDED BY THE CONTRIBUTORS *AS IS* AND ANY EXPRESS OR -* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES -* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. -* IN NO EVENT SHALL THE CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, -* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT -* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF -* THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -* -****************************************************************************** -* REVISION HISTORY -* -* 06-01-01 Marc Boucher -* Ported to lwIP. -*****************************************************************************/ - - - -/* based on NetBSD: if_pppoe.c,v 1.64 2006/01/31 23:50:15 martin Exp */ - -/*- - * Copyright (c) 2002 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by Martin Husemann . - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - -#include "netif/ppp/ppp_opts.h" -#if PPP_SUPPORT && PPPOE_SUPPORT /* don't build if not configured for use in lwipopts.h */ - -#if 0 /* UNUSED */ -#include -#include -#endif /* UNUSED */ - -#include "lwip/timeouts.h" -#include "lwip/memp.h" -#include "lwip/stats.h" -#include "lwip/snmp.h" - -#include "netif/ethernet.h" -#include "netif/ppp/ppp_impl.h" -#include "netif/ppp/lcp.h" -#include "netif/ppp/ipcp.h" -#include "netif/ppp/pppoe.h" - -/* Memory pool */ -LWIP_MEMPOOL_DECLARE(PPPOE_IF, MEMP_NUM_PPPOE_INTERFACES, sizeof(struct pppoe_softc), "PPPOE_IF") - -/* Add a 16 bit unsigned value to a buffer pointed to by PTR */ -#define PPPOE_ADD_16(PTR, VAL) \ - *(PTR)++ = (u8_t)((VAL) / 256); \ - *(PTR)++ = (u8_t)((VAL) % 256) - -/* Add a complete PPPoE header to the buffer pointed to by PTR */ -#define PPPOE_ADD_HEADER(PTR, CODE, SESS, LEN) \ - *(PTR)++ = PPPOE_VERTYPE; \ - *(PTR)++ = (CODE); \ - PPPOE_ADD_16(PTR, SESS); \ - PPPOE_ADD_16(PTR, LEN) - -#define PPPOE_DISC_TIMEOUT (5*1000) /* base for quick timeout calculation */ -#define PPPOE_SLOW_RETRY (60*1000) /* persistent retry interval */ -#define PPPOE_DISC_MAXPADI 4 /* retry PADI four times (quickly) */ -#define PPPOE_DISC_MAXPADR 2 /* retry PADR twice */ - -#ifdef PPPOE_SERVER -#error "PPPOE_SERVER is not yet supported under lwIP!" -/* from if_spppsubr.c */ -#define IFF_PASSIVE IFF_LINK0 /* wait passively for connection */ -#endif - -#define PPPOE_ERRORSTRING_LEN 64 - - -/* callbacks called from PPP core */ -static err_t pppoe_write(ppp_pcb *ppp, void *ctx, struct pbuf *p); -static err_t pppoe_netif_output(ppp_pcb *ppp, void *ctx, struct pbuf *p, u_short protocol); -static void pppoe_connect(ppp_pcb *ppp, void *ctx); -static void pppoe_disconnect(ppp_pcb *ppp, void *ctx); -static err_t pppoe_destroy(ppp_pcb *ppp, void *ctx); - -/* management routines */ -static void pppoe_abort_connect(struct pppoe_softc *); -#if 0 /* UNUSED */ -static void pppoe_clear_softc(struct pppoe_softc *, const char *); -#endif /* UNUSED */ - -/* internal timeout handling */ -static void pppoe_timeout(void *); - -/* sending actual protocol controll packets */ -static err_t pppoe_send_padi(struct pppoe_softc *); -static err_t pppoe_send_padr(struct pppoe_softc *); -#ifdef PPPOE_SERVER -static err_t pppoe_send_pado(struct pppoe_softc *); -static err_t pppoe_send_pads(struct pppoe_softc *); -#endif -static err_t pppoe_send_padt(struct netif *, u_int, const u8_t *); - -/* internal helper functions */ -static err_t pppoe_xmit(struct pppoe_softc *sc, struct pbuf *pb); -static struct pppoe_softc* pppoe_find_softc_by_session(u_int session, struct netif *rcvif); -static struct pppoe_softc* pppoe_find_softc_by_hunique(u8_t *token, size_t len, struct netif *rcvif); - -/** linked list of created pppoe interfaces */ -static struct pppoe_softc *pppoe_softc_list; - -/* Callbacks structure for PPP core */ -static const struct link_callbacks pppoe_callbacks = { - pppoe_connect, -#if PPP_SERVER - NULL, -#endif /* PPP_SERVER */ - pppoe_disconnect, - pppoe_destroy, - pppoe_write, - pppoe_netif_output, - NULL, - NULL -}; - -/* - * Create a new PPP Over Ethernet (PPPoE) connection. - * - * Return 0 on success, an error code on failure. - */ -ppp_pcb *pppoe_create(struct netif *pppif, - struct netif *ethif, - const char *service_name, const char *concentrator_name, - ppp_link_status_cb_fn link_status_cb, void *ctx_cb) -{ - ppp_pcb *ppp; - struct pppoe_softc *sc; - LWIP_UNUSED_ARG(service_name); - LWIP_UNUSED_ARG(concentrator_name); - - sc = (struct pppoe_softc *)LWIP_MEMPOOL_ALLOC(PPPOE_IF); - if (sc == NULL) { - return NULL; - } - - ppp = ppp_new(pppif, &pppoe_callbacks, sc, link_status_cb, ctx_cb); - if (ppp == NULL) { - LWIP_MEMPOOL_FREE(PPPOE_IF, sc); - return NULL; - } - - memset(sc, 0, sizeof(struct pppoe_softc)); - sc->pcb = ppp; - sc->sc_ethif = ethif; - /* put the new interface at the head of the list */ - sc->next = pppoe_softc_list; - pppoe_softc_list = sc; - return ppp; -} - -/* Called by PPP core */ -static err_t pppoe_write(ppp_pcb *ppp, void *ctx, struct pbuf *p) { - struct pppoe_softc *sc = (struct pppoe_softc *)ctx; - struct pbuf *ph; /* Ethernet + PPPoE header */ - err_t ret; -#if MIB2_STATS - u16_t tot_len; -#else /* MIB2_STATS */ - LWIP_UNUSED_ARG(ppp); -#endif /* MIB2_STATS */ - - /* skip address & flags */ - pbuf_header(p, -(s16_t)2); - - ph = pbuf_alloc(PBUF_LINK, (u16_t)(PPPOE_HEADERLEN), PBUF_RAM); - if(!ph) { - LINK_STATS_INC(link.memerr); - LINK_STATS_INC(link.proterr); - MIB2_STATS_NETIF_INC(ppp->netif, ifoutdiscards); - pbuf_free(p); - return ERR_MEM; - } - - pbuf_header(ph, -(s16_t)PPPOE_HEADERLEN); /* hide PPPoE header */ - pbuf_cat(ph, p); -#if MIB2_STATS - tot_len = ph->tot_len; -#endif /* MIB2_STATS */ - - ret = pppoe_xmit(sc, ph); - if (ret != ERR_OK) { - LINK_STATS_INC(link.err); - MIB2_STATS_NETIF_INC(ppp->netif, ifoutdiscards); - return ret; - } - - MIB2_STATS_NETIF_ADD(ppp->netif, ifoutoctets, (u16_t)tot_len); - MIB2_STATS_NETIF_INC(ppp->netif, ifoutucastpkts); - LINK_STATS_INC(link.xmit); - return ERR_OK; -} - -/* Called by PPP core */ -static err_t pppoe_netif_output(ppp_pcb *ppp, void *ctx, struct pbuf *p, u_short protocol) { - struct pppoe_softc *sc = (struct pppoe_softc *)ctx; - struct pbuf *pb; - u8_t *pl; - err_t err; -#if MIB2_STATS - u16_t tot_len; -#else /* MIB2_STATS */ - LWIP_UNUSED_ARG(ppp); -#endif /* MIB2_STATS */ - - /* @todo: try to use pbuf_header() here! */ - pb = pbuf_alloc(PBUF_LINK, PPPOE_HEADERLEN + sizeof(protocol), PBUF_RAM); - if(!pb) { - LINK_STATS_INC(link.memerr); - LINK_STATS_INC(link.proterr); - MIB2_STATS_NETIF_INC(ppp->netif, ifoutdiscards); - return ERR_MEM; - } - - pbuf_header(pb, -(s16_t)PPPOE_HEADERLEN); - - pl = (u8_t*)pb->payload; - PUTSHORT(protocol, pl); - - pbuf_chain(pb, p); -#if MIB2_STATS - tot_len = pb->tot_len; -#endif /* MIB2_STATS */ - - if( (err = pppoe_xmit(sc, pb)) != ERR_OK) { - LINK_STATS_INC(link.err); - MIB2_STATS_NETIF_INC(ppp->netif, ifoutdiscards); - return err; - } - - MIB2_STATS_NETIF_ADD(ppp->netif, ifoutoctets, tot_len); - MIB2_STATS_NETIF_INC(ppp->netif, ifoutucastpkts); - LINK_STATS_INC(link.xmit); - return ERR_OK; -} - -static err_t -pppoe_destroy(ppp_pcb *ppp, void *ctx) -{ - struct pppoe_softc *sc = (struct pppoe_softc *)ctx; - struct pppoe_softc **copp, *freep; - LWIP_UNUSED_ARG(ppp); - - sys_untimeout(pppoe_timeout, sc); - - /* remove interface from list */ - for (copp = &pppoe_softc_list; (freep = *copp); copp = &freep->next) { - if (freep == sc) { - *copp = freep->next; - break; - } - } - -#ifdef PPPOE_TODO - if (sc->sc_concentrator_name) { - mem_free(sc->sc_concentrator_name); - } - if (sc->sc_service_name) { - mem_free(sc->sc_service_name); - } -#endif /* PPPOE_TODO */ - LWIP_MEMPOOL_FREE(PPPOE_IF, sc); - - return ERR_OK; -} - -/* - * Find the interface handling the specified session. - * Note: O(number of sessions open), this is a client-side only, mean - * and lean implementation, so number of open sessions typically should - * be 1. - */ -static struct pppoe_softc* pppoe_find_softc_by_session(u_int session, struct netif *rcvif) { - struct pppoe_softc *sc; - - for (sc = pppoe_softc_list; sc != NULL; sc = sc->next) { - if (sc->sc_state == PPPOE_STATE_SESSION - && sc->sc_session == session - && sc->sc_ethif == rcvif) { - return sc; - } - } - return NULL; -} - -/* Check host unique token passed and return appropriate softc pointer, - * or NULL if token is bogus. */ -static struct pppoe_softc* pppoe_find_softc_by_hunique(u8_t *token, size_t len, struct netif *rcvif) { - struct pppoe_softc *sc, *t; - - if (len != sizeof sc) { - return NULL; - } - MEMCPY(&t, token, len); - - for (sc = pppoe_softc_list; sc != NULL; sc = sc->next) { - if (sc == t) { - break; - } - } - - if (sc == NULL) { - PPPDEBUG(LOG_DEBUG, ("pppoe: alien host unique tag, no session found\n")); - return NULL; - } - - /* should be safe to access *sc now */ - if (sc->sc_state < PPPOE_STATE_PADI_SENT || sc->sc_state >= PPPOE_STATE_SESSION) { - PPPDEBUG(LOG_DEBUG, ("%c%c%"U16_F": host unique tag found, but it belongs to a connection in state %d\n", - sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, sc->sc_state)); - return NULL; - } - if (sc->sc_ethif != rcvif) { - PPPDEBUG(LOG_DEBUG, ("%c%c%"U16_F": wrong interface, not accepting host unique\n", - sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num)); - return NULL; - } - return sc; -} - -/* analyze and handle a single received packet while not in session state */ -void -pppoe_disc_input(struct netif *netif, struct pbuf *pb) -{ - u16_t tag, len; - u16_t session, plen; - struct pppoe_softc *sc; -#if PPP_DEBUG - const char *err_msg = NULL; -#endif /* PPP_DEBUG */ - u8_t *ac_cookie; - u16_t ac_cookie_len; -#ifdef PPPOE_SERVER - u8_t *hunique; - size_t hunique_len; -#endif - struct pppoehdr *ph; - struct pppoetag pt; - int off, err; - struct eth_hdr *ethhdr; - - /* don't do anything if there is not a single PPPoE instance */ - if (pppoe_softc_list == NULL) { - pbuf_free(pb); - return; - } - - pb = ppp_singlebuf(pb); - - if (pb->len < sizeof(*ethhdr)) { - goto done; - } - ethhdr = (struct eth_hdr *)pb->payload; - off = sizeof(*ethhdr); - - ac_cookie = NULL; - ac_cookie_len = 0; -#ifdef PPPOE_SERVER - hunique = NULL; - hunique_len = 0; -#endif - session = 0; - if (pb->len - off < (u16_t)PPPOE_HEADERLEN) { - PPPDEBUG(LOG_DEBUG, ("pppoe: packet too short: %d\n", pb->len)); - goto done; - } - - ph = (struct pppoehdr *) (ethhdr + 1); - if (ph->vertype != PPPOE_VERTYPE) { - PPPDEBUG(LOG_DEBUG, ("pppoe: unknown version/type packet: 0x%x\n", ph->vertype)); - goto done; - } - session = lwip_ntohs(ph->session); - plen = lwip_ntohs(ph->plen); - off += sizeof(*ph); - - if (plen + off > pb->len) { - PPPDEBUG(LOG_DEBUG, ("pppoe: packet content does not fit: data available = %d, packet size = %u\n", - pb->len - off, plen)); - goto done; - } - if(pb->tot_len == pb->len) { - pb->tot_len = pb->len = (u16_t)off + plen; /* ignore trailing garbage */ - } - tag = 0; - len = 0; - sc = NULL; - while (off + sizeof(pt) <= pb->len) { - MEMCPY(&pt, (u8_t*)pb->payload + off, sizeof(pt)); - tag = lwip_ntohs(pt.tag); - len = lwip_ntohs(pt.len); - if (off + sizeof(pt) + len > pb->len) { - PPPDEBUG(LOG_DEBUG, ("pppoe: tag 0x%x len 0x%x is too long\n", tag, len)); - goto done; - } - switch (tag) { - case PPPOE_TAG_EOL: - goto breakbreak; - case PPPOE_TAG_SNAME: - break; /* ignored */ - case PPPOE_TAG_ACNAME: - break; /* ignored */ - case PPPOE_TAG_HUNIQUE: - if (sc != NULL) { - break; - } -#ifdef PPPOE_SERVER - hunique = (u8_t*)pb->payload + off + sizeof(pt); - hunique_len = len; -#endif - sc = pppoe_find_softc_by_hunique((u8_t*)pb->payload + off + sizeof(pt), len, netif); - break; - case PPPOE_TAG_ACCOOKIE: - if (ac_cookie == NULL) { - if (len > PPPOE_MAX_AC_COOKIE_LEN) { - PPPDEBUG(LOG_DEBUG, ("pppoe: AC cookie is too long: len = %d, max = %d\n", len, PPPOE_MAX_AC_COOKIE_LEN)); - goto done; - } - ac_cookie = (u8_t*)pb->payload + off + sizeof(pt); - ac_cookie_len = len; - } - break; -#if PPP_DEBUG - case PPPOE_TAG_SNAME_ERR: - err_msg = "SERVICE NAME ERROR"; - break; - case PPPOE_TAG_ACSYS_ERR: - err_msg = "AC SYSTEM ERROR"; - break; - case PPPOE_TAG_GENERIC_ERR: - err_msg = "GENERIC ERROR"; - break; -#endif /* PPP_DEBUG */ - default: - break; - } -#if PPP_DEBUG - if (err_msg != NULL) { - char error_tmp[PPPOE_ERRORSTRING_LEN]; - u16_t error_len = LWIP_MIN(len, sizeof(error_tmp)-1); - strncpy(error_tmp, (char*)pb->payload + off + sizeof(pt), error_len); - error_tmp[error_len] = '\0'; - if (sc) { - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F": %s: %s\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, err_msg, error_tmp)); - } else { - PPPDEBUG(LOG_DEBUG, ("pppoe: %s: %s\n", err_msg, error_tmp)); - } - } -#endif /* PPP_DEBUG */ - off += sizeof(pt) + len; - } - -breakbreak:; - switch (ph->code) { - case PPPOE_CODE_PADI: -#ifdef PPPOE_SERVER - /* - * got service name, concentrator name, and/or host unique. - * ignore if we have no interfaces with IFF_PASSIVE|IFF_UP. - */ - if (LIST_EMPTY(&pppoe_softc_list)) { - goto done; - } - LIST_FOREACH(sc, &pppoe_softc_list, sc_list) { - if (!(sc->sc_sppp.pp_if.if_flags & IFF_UP)) { - continue; - } - if (!(sc->sc_sppp.pp_if.if_flags & IFF_PASSIVE)) { - continue; - } - if (sc->sc_state == PPPOE_STATE_INITIAL) { - break; - } - } - if (sc == NULL) { - /* PPPDEBUG(LOG_DEBUG, ("pppoe: free passive interface is not found\n")); */ - goto done; - } - if (hunique) { - if (sc->sc_hunique) { - mem_free(sc->sc_hunique); - } - sc->sc_hunique = mem_malloc(hunique_len); - if (sc->sc_hunique == NULL) { - goto done; - } - sc->sc_hunique_len = hunique_len; - MEMCPY(sc->sc_hunique, hunique, hunique_len); - } - MEMCPY(&sc->sc_dest, eh->ether_shost, sizeof sc->sc_dest); - sc->sc_state = PPPOE_STATE_PADO_SENT; - pppoe_send_pado(sc); - break; -#endif /* PPPOE_SERVER */ - case PPPOE_CODE_PADR: -#ifdef PPPOE_SERVER - /* - * get sc from ac_cookie if IFF_PASSIVE - */ - if (ac_cookie == NULL) { - /* be quiet if there is not a single pppoe instance */ - PPPDEBUG(LOG_DEBUG, ("pppoe: received PADR but not includes ac_cookie\n")); - goto done; - } - sc = pppoe_find_softc_by_hunique(ac_cookie, ac_cookie_len, netif); - if (sc == NULL) { - /* be quiet if there is not a single pppoe instance */ - if (!LIST_EMPTY(&pppoe_softc_list)) { - PPPDEBUG(LOG_DEBUG, ("pppoe: received PADR but could not find request for it\n")); - } - goto done; - } - if (sc->sc_state != PPPOE_STATE_PADO_SENT) { - PPPDEBUG(LOG_DEBUG, ("%c%c%"U16_F": received unexpected PADR\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num)); - goto done; - } - if (hunique) { - if (sc->sc_hunique) { - mem_free(sc->sc_hunique); - } - sc->sc_hunique = mem_malloc(hunique_len); - if (sc->sc_hunique == NULL) { - goto done; - } - sc->sc_hunique_len = hunique_len; - MEMCPY(sc->sc_hunique, hunique, hunique_len); - } - pppoe_send_pads(sc); - sc->sc_state = PPPOE_STATE_SESSION; - ppp_start(sc->pcb); /* notify upper layers */ - break; -#else - /* ignore, we are no access concentrator */ - goto done; -#endif /* PPPOE_SERVER */ - case PPPOE_CODE_PADO: - if (sc == NULL) { - /* be quiet if there is not a single pppoe instance */ - if (pppoe_softc_list != NULL) { - PPPDEBUG(LOG_DEBUG, ("pppoe: received PADO but could not find request for it\n")); - } - goto done; - } - if (sc->sc_state != PPPOE_STATE_PADI_SENT) { - PPPDEBUG(LOG_DEBUG, ("%c%c%"U16_F": received unexpected PADO\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num)); - goto done; - } - if (ac_cookie) { - sc->sc_ac_cookie_len = ac_cookie_len; - MEMCPY(sc->sc_ac_cookie, ac_cookie, ac_cookie_len); - } - MEMCPY(&sc->sc_dest, ethhdr->src.addr, sizeof(sc->sc_dest.addr)); - sys_untimeout(pppoe_timeout, sc); - sc->sc_padr_retried = 0; - sc->sc_state = PPPOE_STATE_PADR_SENT; - if ((err = pppoe_send_padr(sc)) != 0) { - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F": failed to send PADR, error=%d\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, err)); - } - sys_timeout(PPPOE_DISC_TIMEOUT * (1 + sc->sc_padr_retried), pppoe_timeout, sc); - break; - case PPPOE_CODE_PADS: - if (sc == NULL) { - goto done; - } - sc->sc_session = session; - sys_untimeout(pppoe_timeout, sc); - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F": session 0x%x connected\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, session)); - sc->sc_state = PPPOE_STATE_SESSION; - ppp_start(sc->pcb); /* notify upper layers */ - break; - case PPPOE_CODE_PADT: - /* Don't disconnect here, we let the LCP Echo/Reply find the fact - * that PPP session is down. Asking the PPP stack to end the session - * require strict checking about the PPP phase to prevent endless - * disconnection loops. - */ -#if 0 /* UNUSED */ - if (sc == NULL) { /* PADT frames are rarely sent with a hunique tag, this is actually almost always true */ - goto done; - } - pppoe_clear_softc(sc, "received PADT"); -#endif /* UNUSED */ - break; - default: - if(sc) { - PPPDEBUG(LOG_DEBUG, ("%c%c%"U16_F": unknown code (0x%"X16_F") session = 0x%"X16_F"\n", - sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, - (u16_t)ph->code, session)); - } else { - PPPDEBUG(LOG_DEBUG, ("pppoe: unknown code (0x%"X16_F") session = 0x%"X16_F"\n", (u16_t)ph->code, session)); - } - break; - } - -done: - pbuf_free(pb); - return; -} - -void -pppoe_data_input(struct netif *netif, struct pbuf *pb) -{ - u16_t session, plen; - struct pppoe_softc *sc; - struct pppoehdr *ph; -#ifdef PPPOE_TERM_UNKNOWN_SESSIONS - u8_t shost[ETHER_ADDR_LEN]; -#endif - -#ifdef PPPOE_TERM_UNKNOWN_SESSIONS - MEMCPY(shost, ((struct eth_hdr *)pb->payload)->src.addr, sizeof(shost)); -#endif - if (pbuf_header(pb, -(s16_t)sizeof(struct eth_hdr)) != 0) { - /* bail out */ - PPPDEBUG(LOG_ERR, ("pppoe_data_input: pbuf_header failed\n")); - LINK_STATS_INC(link.lenerr); - goto drop; - } - - if (pb->len < sizeof(*ph)) { - PPPDEBUG(LOG_DEBUG, ("pppoe_data_input: could not get PPPoE header\n")); - goto drop; - } - ph = (struct pppoehdr *)pb->payload; - - if (ph->vertype != PPPOE_VERTYPE) { - PPPDEBUG(LOG_DEBUG, ("pppoe (data): unknown version/type packet: 0x%x\n", ph->vertype)); - goto drop; - } - if (ph->code != 0) { - goto drop; - } - - session = lwip_ntohs(ph->session); - sc = pppoe_find_softc_by_session(session, netif); - if (sc == NULL) { -#ifdef PPPOE_TERM_UNKNOWN_SESSIONS - PPPDEBUG(LOG_DEBUG, ("pppoe: input for unknown session 0x%x, sending PADT\n", session)); - pppoe_send_padt(netif, session, shost); -#endif - goto drop; - } - - plen = lwip_ntohs(ph->plen); - - if (pbuf_header(pb, -(s16_t)(PPPOE_HEADERLEN)) != 0) { - /* bail out */ - PPPDEBUG(LOG_ERR, ("pppoe_data_input: pbuf_header PPPOE_HEADERLEN failed\n")); - LINK_STATS_INC(link.lenerr); - goto drop; - } - - PPPDEBUG(LOG_DEBUG, ("pppoe_data_input: %c%c%"U16_F": pkthdr.len=%d, pppoe.len=%d\n", - sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, - pb->len, plen)); - - if (pb->tot_len < plen) { - goto drop; - } - - /* Dispatch the packet thereby consuming it. */ - ppp_input(sc->pcb, pb); - return; - -drop: - pbuf_free(pb); -} - -static err_t -pppoe_output(struct pppoe_softc *sc, struct pbuf *pb) -{ - struct eth_hdr *ethhdr; - u16_t etype; - err_t res; - - /* make room for Ethernet header - should not fail */ - if (pbuf_header(pb, (s16_t)(sizeof(struct eth_hdr))) != 0) { - /* bail out */ - PPPDEBUG(LOG_ERR, ("pppoe: %c%c%"U16_F": pppoe_output: could not allocate room for Ethernet header\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num)); - LINK_STATS_INC(link.lenerr); - pbuf_free(pb); - return ERR_BUF; - } - ethhdr = (struct eth_hdr *)pb->payload; - etype = sc->sc_state == PPPOE_STATE_SESSION ? ETHTYPE_PPPOE : ETHTYPE_PPPOEDISC; - ethhdr->type = lwip_htons(etype); - MEMCPY(ðhdr->dest.addr, &sc->sc_dest.addr, sizeof(ethhdr->dest.addr)); - MEMCPY(ðhdr->src.addr, &sc->sc_ethif->hwaddr, sizeof(ethhdr->src.addr)); - - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F" (%x) state=%d, session=0x%x output -> %02"X16_F":%02"X16_F":%02"X16_F":%02"X16_F":%02"X16_F":%02"X16_F", len=%d\n", - sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, etype, - sc->sc_state, sc->sc_session, - sc->sc_dest.addr[0], sc->sc_dest.addr[1], sc->sc_dest.addr[2], sc->sc_dest.addr[3], sc->sc_dest.addr[4], sc->sc_dest.addr[5], - pb->tot_len)); - - res = sc->sc_ethif->linkoutput(sc->sc_ethif, pb); - - pbuf_free(pb); - - return res; -} - -static err_t -pppoe_send_padi(struct pppoe_softc *sc) -{ - struct pbuf *pb; - u8_t *p; - int len; -#ifdef PPPOE_TODO - int l1 = 0, l2 = 0; /* XXX: gcc */ -#endif /* PPPOE_TODO */ - - /* calculate length of frame (excluding ethernet header + pppoe header) */ - len = 2 + 2 + 2 + 2 + sizeof sc; /* service name tag is required, host unique is send too */ -#ifdef PPPOE_TODO - if (sc->sc_service_name != NULL) { - l1 = (int)strlen(sc->sc_service_name); - len += l1; - } - if (sc->sc_concentrator_name != NULL) { - l2 = (int)strlen(sc->sc_concentrator_name); - len += 2 + 2 + l2; - } -#endif /* PPPOE_TODO */ - LWIP_ASSERT("sizeof(struct eth_hdr) + PPPOE_HEADERLEN + len <= 0xffff", - sizeof(struct eth_hdr) + PPPOE_HEADERLEN + len <= 0xffff); - - /* allocate a buffer */ - pb = pbuf_alloc(PBUF_LINK, (u16_t)(PPPOE_HEADERLEN + len), PBUF_RAM); - if (!pb) { - return ERR_MEM; - } - LWIP_ASSERT("pb->tot_len == pb->len", pb->tot_len == pb->len); - - p = (u8_t*)pb->payload; - /* fill in pkt */ - PPPOE_ADD_HEADER(p, PPPOE_CODE_PADI, 0, (u16_t)len); - PPPOE_ADD_16(p, PPPOE_TAG_SNAME); -#ifdef PPPOE_TODO - if (sc->sc_service_name != NULL) { - PPPOE_ADD_16(p, l1); - MEMCPY(p, sc->sc_service_name, l1); - p += l1; - } else -#endif /* PPPOE_TODO */ - { - PPPOE_ADD_16(p, 0); - } -#ifdef PPPOE_TODO - if (sc->sc_concentrator_name != NULL) { - PPPOE_ADD_16(p, PPPOE_TAG_ACNAME); - PPPOE_ADD_16(p, l2); - MEMCPY(p, sc->sc_concentrator_name, l2); - p += l2; - } -#endif /* PPPOE_TODO */ - PPPOE_ADD_16(p, PPPOE_TAG_HUNIQUE); - PPPOE_ADD_16(p, sizeof(sc)); - MEMCPY(p, &sc, sizeof sc); - - /* send pkt */ - return pppoe_output(sc, pb); -} - -static void -pppoe_timeout(void *arg) -{ - u32_t retry_wait; - int err; - struct pppoe_softc *sc = (struct pppoe_softc*)arg; - - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F": timeout\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num)); - - switch (sc->sc_state) { - case PPPOE_STATE_PADI_SENT: - /* - * We have two basic ways of retrying: - * - Quick retry mode: try a few times in short sequence - * - Slow retry mode: we already had a connection successfully - * established and will try infinitely (without user - * intervention) - * We only enter slow retry mode if IFF_LINK1 (aka autodial) - * is not set. - */ - if (sc->sc_padi_retried < 0xff) { - sc->sc_padi_retried++; - } - if (!sc->pcb->settings.persist && sc->sc_padi_retried >= PPPOE_DISC_MAXPADI) { -#if 0 - if ((sc->sc_sppp.pp_if.if_flags & IFF_LINK1) == 0) { - /* slow retry mode */ - retry_wait = PPPOE_SLOW_RETRY; - } else -#endif - { - pppoe_abort_connect(sc); - return; - } - } - /* initialize for quick retry mode */ - retry_wait = LWIP_MIN(PPPOE_DISC_TIMEOUT * sc->sc_padi_retried, PPPOE_SLOW_RETRY); - if ((err = pppoe_send_padi(sc)) != 0) { - sc->sc_padi_retried--; - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F": failed to transmit PADI, error=%d\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, err)); - } - sys_timeout(retry_wait, pppoe_timeout, sc); - break; - - case PPPOE_STATE_PADR_SENT: - sc->sc_padr_retried++; - if (sc->sc_padr_retried >= PPPOE_DISC_MAXPADR) { - MEMCPY(&sc->sc_dest, ethbroadcast.addr, sizeof(sc->sc_dest)); - sc->sc_state = PPPOE_STATE_PADI_SENT; - sc->sc_padr_retried = 0; - if ((err = pppoe_send_padi(sc)) != 0) { - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F": failed to send PADI, error=%d\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, err)); - } - sys_timeout(PPPOE_DISC_TIMEOUT * (1 + sc->sc_padi_retried), pppoe_timeout, sc); - return; - } - if ((err = pppoe_send_padr(sc)) != 0) { - sc->sc_padr_retried--; - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F": failed to send PADR, error=%d\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, err)); - } - sys_timeout(PPPOE_DISC_TIMEOUT * (1 + sc->sc_padr_retried), pppoe_timeout, sc); - break; - default: - return; /* all done, work in peace */ - } -} - -/* Start a connection (i.e. initiate discovery phase) */ -static void -pppoe_connect(ppp_pcb *ppp, void *ctx) -{ - err_t err; - struct pppoe_softc *sc = (struct pppoe_softc *)ctx; - lcp_options *lcp_wo; - lcp_options *lcp_ao; -#if PPP_IPV4_SUPPORT && VJ_SUPPORT - ipcp_options *ipcp_wo; - ipcp_options *ipcp_ao; -#endif /* PPP_IPV4_SUPPORT && VJ_SUPPORT */ - - sc->sc_session = 0; - sc->sc_ac_cookie_len = 0; - sc->sc_padi_retried = 0; - sc->sc_padr_retried = 0; - /* changed to real address later */ - MEMCPY(&sc->sc_dest, ethbroadcast.addr, sizeof(sc->sc_dest)); -#ifdef PPPOE_SERVER - /* wait PADI if IFF_PASSIVE */ - if ((sc->sc_sppp.pp_if.if_flags & IFF_PASSIVE)) { - return 0; - } -#endif - - lcp_wo = &ppp->lcp_wantoptions; - lcp_wo->mru = sc->sc_ethif->mtu-PPPOE_HEADERLEN-2; /* two byte PPP protocol discriminator, then IP data */ - lcp_wo->neg_asyncmap = 0; - lcp_wo->neg_pcompression = 0; - lcp_wo->neg_accompression = 0; - lcp_wo->passive = 0; - lcp_wo->silent = 0; - - lcp_ao = &ppp->lcp_allowoptions; - lcp_ao->mru = sc->sc_ethif->mtu-PPPOE_HEADERLEN-2; /* two byte PPP protocol discriminator, then IP data */ - lcp_ao->neg_asyncmap = 0; - lcp_ao->neg_pcompression = 0; - lcp_ao->neg_accompression = 0; - -#if PPP_IPV4_SUPPORT && VJ_SUPPORT - ipcp_wo = &ppp->ipcp_wantoptions; - ipcp_wo->neg_vj = 0; - ipcp_wo->old_vj = 0; - - ipcp_ao = &ppp->ipcp_allowoptions; - ipcp_ao->neg_vj = 0; - ipcp_ao->old_vj = 0; -#endif /* PPP_IPV4_SUPPORT && VJ_SUPPORT */ - - /* save state, in case we fail to send PADI */ - sc->sc_state = PPPOE_STATE_PADI_SENT; - if ((err = pppoe_send_padi(sc)) != 0) { - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F": failed to send PADI, error=%d\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, err)); - } - sys_timeout(PPPOE_DISC_TIMEOUT, pppoe_timeout, sc); -} - -/* disconnect */ -static void -pppoe_disconnect(ppp_pcb *ppp, void *ctx) -{ - struct pppoe_softc *sc = (struct pppoe_softc *)ctx; - - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F": disconnecting\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num)); - if (sc->sc_state == PPPOE_STATE_SESSION) { - pppoe_send_padt(sc->sc_ethif, sc->sc_session, (const u8_t *)&sc->sc_dest); - } - - /* stop any timer, disconnect can be called while initiating is in progress */ - sys_untimeout(pppoe_timeout, sc); - sc->sc_state = PPPOE_STATE_INITIAL; -#ifdef PPPOE_SERVER - if (sc->sc_hunique) { - mem_free(sc->sc_hunique); - sc->sc_hunique = NULL; /* probably not necessary, if state is initial we shouldn't have to access hunique anyway */ - } - sc->sc_hunique_len = 0; /* probably not necessary, if state is initial we shouldn't have to access hunique anyway */ -#endif - ppp_link_end(ppp); /* notify upper layers */ - return; -} - -/* Connection attempt aborted */ -static void -pppoe_abort_connect(struct pppoe_softc *sc) -{ - PPPDEBUG(LOG_DEBUG, ("%c%c%"U16_F": could not establish connection\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num)); - sc->sc_state = PPPOE_STATE_INITIAL; - ppp_link_failed(sc->pcb); /* notify upper layers */ -} - -/* Send a PADR packet */ -static err_t -pppoe_send_padr(struct pppoe_softc *sc) -{ - struct pbuf *pb; - u8_t *p; - size_t len; -#ifdef PPPOE_TODO - size_t l1 = 0; /* XXX: gcc */ -#endif /* PPPOE_TODO */ - - len = 2 + 2 + 2 + 2 + sizeof(sc); /* service name, host unique */ -#ifdef PPPOE_TODO - if (sc->sc_service_name != NULL) { /* service name tag maybe empty */ - l1 = strlen(sc->sc_service_name); - len += l1; - } -#endif /* PPPOE_TODO */ - if (sc->sc_ac_cookie_len > 0) { - len += 2 + 2 + sc->sc_ac_cookie_len; /* AC cookie */ - } - LWIP_ASSERT("sizeof(struct eth_hdr) + PPPOE_HEADERLEN + len <= 0xffff", - sizeof(struct eth_hdr) + PPPOE_HEADERLEN + len <= 0xffff); - pb = pbuf_alloc(PBUF_LINK, (u16_t)(PPPOE_HEADERLEN + len), PBUF_RAM); - if (!pb) { - return ERR_MEM; - } - LWIP_ASSERT("pb->tot_len == pb->len", pb->tot_len == pb->len); - p = (u8_t*)pb->payload; - PPPOE_ADD_HEADER(p, PPPOE_CODE_PADR, 0, len); - PPPOE_ADD_16(p, PPPOE_TAG_SNAME); -#ifdef PPPOE_TODO - if (sc->sc_service_name != NULL) { - PPPOE_ADD_16(p, l1); - MEMCPY(p, sc->sc_service_name, l1); - p += l1; - } else -#endif /* PPPOE_TODO */ - { - PPPOE_ADD_16(p, 0); - } - if (sc->sc_ac_cookie_len > 0) { - PPPOE_ADD_16(p, PPPOE_TAG_ACCOOKIE); - PPPOE_ADD_16(p, sc->sc_ac_cookie_len); - MEMCPY(p, sc->sc_ac_cookie, sc->sc_ac_cookie_len); - p += sc->sc_ac_cookie_len; - } - PPPOE_ADD_16(p, PPPOE_TAG_HUNIQUE); - PPPOE_ADD_16(p, sizeof(sc)); - MEMCPY(p, &sc, sizeof sc); - - return pppoe_output(sc, pb); -} - -/* send a PADT packet */ -static err_t -pppoe_send_padt(struct netif *outgoing_if, u_int session, const u8_t *dest) -{ - struct pbuf *pb; - struct eth_hdr *ethhdr; - err_t res; - u8_t *p; - - pb = pbuf_alloc(PBUF_LINK, (u16_t)(PPPOE_HEADERLEN), PBUF_RAM); - if (!pb) { - return ERR_MEM; - } - LWIP_ASSERT("pb->tot_len == pb->len", pb->tot_len == pb->len); - - pbuf_header(pb, (s16_t)sizeof(struct eth_hdr)); - ethhdr = (struct eth_hdr *)pb->payload; - ethhdr->type = PP_HTONS(ETHTYPE_PPPOEDISC); - MEMCPY(ðhdr->dest.addr, dest, sizeof(ethhdr->dest.addr)); - MEMCPY(ðhdr->src.addr, &outgoing_if->hwaddr, sizeof(ethhdr->src.addr)); - - p = (u8_t*)(ethhdr + 1); - PPPOE_ADD_HEADER(p, PPPOE_CODE_PADT, session, 0); - - res = outgoing_if->linkoutput(outgoing_if, pb); - - pbuf_free(pb); - - return res; -} - -#ifdef PPPOE_SERVER -static err_t -pppoe_send_pado(struct pppoe_softc *sc) -{ - struct pbuf *pb; - u8_t *p; - size_t len; - - /* calc length */ - len = 0; - /* include ac_cookie */ - len += 2 + 2 + sizeof(sc); - /* include hunique */ - len += 2 + 2 + sc->sc_hunique_len; - pb = pbuf_alloc(PBUF_LINK, (u16_t)(PPPOE_HEADERLEN + len), PBUF_RAM); - if (!pb) { - return ERR_MEM; - } - LWIP_ASSERT("pb->tot_len == pb->len", pb->tot_len == pb->len); - p = (u8_t*)pb->payload; - PPPOE_ADD_HEADER(p, PPPOE_CODE_PADO, 0, len); - PPPOE_ADD_16(p, PPPOE_TAG_ACCOOKIE); - PPPOE_ADD_16(p, sizeof(sc)); - MEMCPY(p, &sc, sizeof(sc)); - p += sizeof(sc); - PPPOE_ADD_16(p, PPPOE_TAG_HUNIQUE); - PPPOE_ADD_16(p, sc->sc_hunique_len); - MEMCPY(p, sc->sc_hunique, sc->sc_hunique_len); - return pppoe_output(sc, pb); -} - -static err_t -pppoe_send_pads(struct pppoe_softc *sc) -{ - struct pbuf *pb; - u8_t *p; - size_t len, l1 = 0; /* XXX: gcc */ - - sc->sc_session = mono_time.tv_sec % 0xff + 1; - /* calc length */ - len = 0; - /* include hunique */ - len += 2 + 2 + 2 + 2 + sc->sc_hunique_len; /* service name, host unique*/ - if (sc->sc_service_name != NULL) { /* service name tag maybe empty */ - l1 = strlen(sc->sc_service_name); - len += l1; - } - pb = pbuf_alloc(PBUF_LINK, (u16_t)(PPPOE_HEADERLEN + len), PBUF_RAM); - if (!pb) { - return ERR_MEM; - } - LWIP_ASSERT("pb->tot_len == pb->len", pb->tot_len == pb->len); - p = (u8_t*)pb->payload; - PPPOE_ADD_HEADER(p, PPPOE_CODE_PADS, sc->sc_session, len); - PPPOE_ADD_16(p, PPPOE_TAG_SNAME); - if (sc->sc_service_name != NULL) { - PPPOE_ADD_16(p, l1); - MEMCPY(p, sc->sc_service_name, l1); - p += l1; - } else { - PPPOE_ADD_16(p, 0); - } - PPPOE_ADD_16(p, PPPOE_TAG_HUNIQUE); - PPPOE_ADD_16(p, sc->sc_hunique_len); - MEMCPY(p, sc->sc_hunique, sc->sc_hunique_len); - return pppoe_output(sc, pb); -} -#endif - -static err_t -pppoe_xmit(struct pppoe_softc *sc, struct pbuf *pb) -{ - u8_t *p; - size_t len; - - len = pb->tot_len; - - /* make room for PPPoE header - should not fail */ - if (pbuf_header(pb, (s16_t)(PPPOE_HEADERLEN)) != 0) { - /* bail out */ - PPPDEBUG(LOG_ERR, ("pppoe: %c%c%"U16_F": pppoe_xmit: could not allocate room for PPPoE header\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num)); - LINK_STATS_INC(link.lenerr); - pbuf_free(pb); - return ERR_BUF; - } - - p = (u8_t*)pb->payload; - PPPOE_ADD_HEADER(p, 0, sc->sc_session, len); - - return pppoe_output(sc, pb); -} - -#if 0 /*def PFIL_HOOKS*/ -static int -pppoe_ifattach_hook(void *arg, struct pbuf **mp, struct netif *ifp, int dir) -{ - struct pppoe_softc *sc; - int s; - - if (mp != (struct pbuf **)PFIL_IFNET_DETACH) { - return 0; - } - - LIST_FOREACH(sc, &pppoe_softc_list, sc_list) { - if (sc->sc_ethif != ifp) { - continue; - } - if (sc->sc_sppp.pp_if.if_flags & IFF_UP) { - sc->sc_sppp.pp_if.if_flags &= ~(IFF_UP|IFF_RUNNING); - PPPDEBUG(LOG_DEBUG, ("%c%c%"U16_F": ethernet interface detached, going down\n", - sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num)); - } - sc->sc_ethif = NULL; - pppoe_clear_softc(sc, "ethernet interface detached"); - } - - return 0; -} -#endif - -#if 0 /* UNUSED */ -static void -pppoe_clear_softc(struct pppoe_softc *sc, const char *message) -{ - LWIP_UNUSED_ARG(message); - - /* stop timer */ - sys_untimeout(pppoe_timeout, sc); - PPPDEBUG(LOG_DEBUG, ("pppoe: %c%c%"U16_F": session 0x%x terminated, %s\n", sc->sc_ethif->name[0], sc->sc_ethif->name[1], sc->sc_ethif->num, sc->sc_session, message)); - sc->sc_state = PPPOE_STATE_INITIAL; - ppp_link_end(sc->pcb); /* notify upper layers - /!\ dangerous /!\ - see pppoe_disc_input() */ -} -#endif /* UNUSED */ -#endif /* PPP_SUPPORT && PPPOE_SUPPORT */ diff --git a/minix/lib/liblwip/lib/Makefile b/minix/lib/liblwip/lib/Makefile deleted file mode 100644 index 3ac649f57..000000000 --- a/minix/lib/liblwip/lib/Makefile +++ /dev/null @@ -1,21 +0,0 @@ - -.include - -DISTDIR= ${NETBSDSRCDIR}/minix/lib/liblwip/dist -SRCDIR= ${DISTDIR}/src - -LIB=lwip - -.PATH: ${SRCDIR}/core ${SRCDIR}/core/ipv4 ${SRCDIR}/core/ipv6 -.include "${.CURDIR}/core/Makefile.inc" - -.PATH: ${SRCDIR}/netif -.include "${.CURDIR}/netif/Makefile.inc" - -CPPFLAGS+= -D_MINIX_SYSTEM -CPPFLAGS+= -I${SRCDIR}/include -I${.CURDIR} -CPPFLAGS+= -Wno-empty-body - -WARNS?= 5 - -.include diff --git a/minix/lib/liblwip/patches/0002-MINIX-3-only-control-IP-forwarding-at-run-time.patch b/minix/lib/liblwip/patches/0002-MINIX-3-only-control-IP-forwarding-at-run-time.patch deleted file mode 100644 index 947a3d227..000000000 --- a/minix/lib/liblwip/patches/0002-MINIX-3-only-control-IP-forwarding-at-run-time.patch +++ /dev/null @@ -1,101 +0,0 @@ -From 7dd690e2c3f3350f5fd647ca52c3fdcc8ef17f4e Mon Sep 17 00:00:00 2001 -From: David van Moolenbroek -Date: Thu, 2 Feb 2017 18:21:57 +0000 -Subject: [PATCH 2/4] MINIX 3 only: control IP forwarding at run time - -The lwIP core supports IPv4 and IPv6 packet forwarding, but allows -this functionality to be enabled or disabled at compile time only. -For MINIX 3, this is not enough, as NetBSD userland (including the -network RC script) expects to be able to control this setting at run -time. - -This patch adds run-time control over IPv4 and IPv6 forwarding with -the addition of two variables, lwip_ip4_forward and lwip_ip6_forward. -These variables are defined in the LWIP service and declared for lwIP -in arch/cc.h. The variables may be changed at any time. Any non-zero -value indicates that packets of the corresponding IP version should be -forwarded. - -In addition, change lwIP such that if IPv6 forwarding is enabled, -meaning that the node acts as a (minimal, currently non RFC compliant) -router, the following adjustments are made (see RFC 4861): - -- ICMPv6 Redirect messages are not accepted; -- ICMPv6 Neighbor Advertisement messages carry the Router flag. ---- - src/core/ipv4/ip4.c | 7 +++++++ - src/core/ipv6/ip6.c | 7 +++++++ - src/core/ipv6/nd6.c | 14 ++++++++++++++ - 3 files changed, 28 insertions(+) - -diff --git a/src/core/ipv4/ip4.c b/src/core/ipv4/ip4.c -index d2b1751..d2fde03 100644 ---- a/src/core/ipv4/ip4.c -+++ b/src/core/ipv4/ip4.c -@@ -272,6 +272,13 @@ ip4_forward(struct pbuf *p, struct ip_hdr *iphdr, struct netif *inp) - { - struct netif *netif; - -+#if defined(__minix) -+ /* MINIX 3 only: forward packets only when enabled through configuration. */ -+ if (!lwip_ip4_forward) { -+ return; -+ } -+#endif /* defined(__minix) */ -+ - PERF_START; - LWIP_UNUSED_ARG(inp); - -diff --git a/src/core/ipv6/ip6.c b/src/core/ipv6/ip6.c -index 88d998b..24ecaaa 100644 ---- a/src/core/ipv6/ip6.c -+++ b/src/core/ipv6/ip6.c -@@ -367,6 +367,13 @@ ip6_forward(struct pbuf *p, struct ip6_hdr *iphdr, struct netif *inp) - { - struct netif *netif; - -+#if defined(__minix) -+ /* MINIX 3 only: forward packets only when enabled through configuration. */ -+ if (!lwip_ip6_forward) { -+ return; -+ } -+#endif /* defined(__minix) */ -+ - /* do not forward link-local or loopback addresses */ - if (ip6_addr_islinklocal(ip6_current_dest_addr()) || - ip6_addr_isloopback(ip6_current_dest_addr())) { -diff --git a/src/core/ipv6/nd6.c b/src/core/ipv6/nd6.c -index 0122d99..bd121f5 100644 ---- a/src/core/ipv6/nd6.c -+++ b/src/core/ipv6/nd6.c -@@ -790,6 +790,14 @@ nd6_input(struct pbuf *p, struct netif *inp) - struct lladdr_option *lladdr_opt; - ip6_addr_t destination_address, target_address; - -+#if defined(__minix) -+ /* MINIX 3 only: if forwarding is enabled, do not accept redirects. */ -+ if (!lwip_ip6_forward) { -+ pbuf_free(p); -+ return; -+ } -+#endif /* defined(__minix) */ -+ - /* Check that Redir header fits in packet. */ - if (p->len < sizeof(struct redirect_header)) { - /* @todo debug message */ -@@ -1259,6 +1267,12 @@ nd6_send_na(struct netif *netif, const ip6_addr_t *target_addr, u8_t flags) - na_hdr->code = 0; - na_hdr->chksum = 0; - na_hdr->flags = flags & 0xf0; -+#if defined(__minix) -+ /* MINIX 3 only: if forwarding is enabled, set the router bit. */ -+ if (lwip_ip6_forward) { -+ na_hdr->flags |= ND6_FLAG_ROUTER; -+ } -+#endif /* defined(__minix) */ - na_hdr->reserved[0] = 0; - na_hdr->reserved[1] = 0; - na_hdr->reserved[2] = 0; --- -2.5.2 - diff --git a/minix/lib/libsys/Makefile b/minix/lib/libsys/Makefile deleted file mode 100644 index bff698453..000000000 --- a/minix/lib/libsys/Makefile +++ /dev/null @@ -1,148 +0,0 @@ -# Makefile for libsys -.include - -CPPFLAGS+= -D_MINIX_SYSTEM -D_SYSTEM - -LIB= sys - -CFLAGS+= -fno-builtin - -.include "arch/${MACHINE_ARCH}/Makefile.inc" - -SRCS+= \ - alloc_util.c \ - assert.c \ - asynsend.c \ - clock_time.c \ - closenb.c \ - copyfd.c \ - cpuavg.c \ - ds.c \ - env_get_prm.c \ - env_panic.c \ - env_parse.c \ - fkey_ctl.c \ - getepinfo.c \ - getprocnr.c \ - getticks.c \ - getsysinfo.c \ - getuptime.c \ - kernel_call.c \ - kprintf.c \ - kputc.c \ - kputs.c \ - mapdriver.c \ - optset.c \ - panic.c \ - proceventmask.c \ - rmib.c \ - safecopies.c \ - sched_start.c \ - sched_stop.c \ - sef.c \ - sef_fi.c \ - sef_init.c \ - sef_liveupdate.c \ - sef_llvm.c \ - sef_ping.c \ - sef_signal.c \ - sef_st.c \ - socketpath.c \ - sqrt_approx.c \ - srv_fork.c \ - srv_kill.c \ - stacktrace.c \ - sys_abort.c \ - sys_clear.c \ - sys_diagctl.c \ - sys_endsig.c \ - sys_exec.c \ - sys_exit.c \ - sys_fork.c \ - sys_getinfo.c \ - sys_getsig.c \ - sys_hz.c \ - sys_irqctl.c \ - sys_kill.c \ - sys_mcontext.c \ - sys_memset.c \ - sys_padconf.c \ - sys_physcopy.c \ - sys_privctl.c \ - sys_runctl.c \ - sys_safecopy.c \ - sys_safememset.c \ - sys_schedctl.c \ - sys_schedule.c \ - sys_setalarm.c \ - sys_setgrant.c \ - sys_settime.c \ - sys_sigreturn.c \ - sys_sigsend.c \ - sys_sprof.c \ - sys_statectl.c \ - sys_stime.c \ - sys_times.c \ - sys_trace.c \ - sys_umap.c \ - sys_update.c \ - sys_vircopy.c \ - sys_vmctl.c \ - sys_vsafecopy.c \ - sys_vtimer.c \ - sys_vumap.c \ - taskcall.c \ - tickdelay.c \ - timers.c \ - vm_cache.c \ - vm_exit.c \ - vm_fork.c \ - vm_getrusage.c \ - vm_info.c \ - vm_map_phys.c \ - vm_memctl.c \ - vm_prepare.c \ - vm_procctl.c \ - vm_set_priv.c \ - vm_update.c - -.if ${MKPCI} != "no" -SRCS+= pci_attr_r16.c \ - pci_attr_r32.c \ - pci_attr_r8.c \ - pci_attr_w16.c \ - pci_attr_w32.c \ - pci_attr_w8.c \ - pci_del_acl.c \ - pci_dev_name.c \ - pci_find_dev.c \ - pci_first_dev.c \ - pci_get_bar.c \ - pci_ids.c \ - pci_init.c \ - pci_next_dev.c \ - pci_rescan_bus.c \ - pci_reserve.c \ - pci_set_acl.c \ - pci_slot_name.c -.endif - -.if ${MKCOVERAGE} != "no" -SRCS+= gcov.c \ - sef_gcov.c \ - llvm_gcov.c -CPPFLAGS+= -DUSE_COVERAGE -.endif - -.if ${USE_LIVEUPDATE} != "no" -CPPFLAGS+= -DUSE_LIVEUPDATE -.endif - -.if ${USE_SYSDEBUG} != "no" -CPPFLAGS+= -DUSE_SYSDEBUG -.endif - -CPPFLAGS.sched_start.c+= -I${NETBSDSRCDIR}/minix -CPPFLAGS.sef_st.c+= -I${NETBSDSRCDIR}/minix - -.include diff --git a/minix/lib/libsys/arch/earm/Makefile.inc b/minix/lib/libsys/arch/earm/Makefile.inc deleted file mode 100644 index 55fd2bced..000000000 --- a/minix/lib/libsys/arch/earm/Makefile.inc +++ /dev/null @@ -1,15 +0,0 @@ -# Makefile for arch-dependent libsys code -.include - -HERE=${.CURDIR}/arch/${MACHINE_ARCH} -.PATH: ${HERE} - -SRCS+= \ - frclock_util.c \ - spin.c \ - tsc_util.c - -CPPFLAGS+= -I${HERE}/../../ -CPPFLAGS+= -I${NETBSDSRCDIR} -I${NETBSDSRCDIR}/kernel/arch/${MACHINE_ARCH}/ - - diff --git a/minix/net/lwip/Makefile b/minix/net/lwip/Makefile deleted file mode 100644 index 15083862b..000000000 --- a/minix/net/lwip/Makefile +++ /dev/null @@ -1,42 +0,0 @@ -# Makefile for the lwIP TCP/IP socket driver service (LWIP) - -.include - -PROG= lwip -SRCS= lwip.c mempool.c pchain.c addr.c addrpol.c tcpisn.c mcast.c ipsock.c \ - pktsock.c tcpsock.c udpsock.c rawsock.c ifdev.c ifaddr.c loopif.c \ - ethif.c ndev.c rttree.c route.c rtsock.c lnksock.c lldata.c mibtree.c \ - ifconf.c bpfdev.c bpf_filter.c util.c - -FILES=${PROG}.conf -FILESNAME=${PROG} -FILESDIR= /etc/system.conf.d - -CPPFLAGS+= -I${NETBSDSRCDIR}/minix/lib/liblwip/dist/src/include -CPPFLAGS+= -I${NETBSDSRCDIR}/minix/lib/liblwip/lib - -# Disabling USE_INET6 only superficially hides IPv6 support in the service. -.if (${USE_INET6} != "no") -CPPFLAGS+= -DINET6 -.endif - -# Some warnings are the result of usage of lwIP macros. We must not generate -# errors for those, but even producing the warnings is not helpful, so we -# disable them altogether. -CPPFLAGS+= -Wno-address - -DPADD+= ${LIBLWIP} ${LIBSOCKEVENT} ${LIBSOCKDRIVER} ${LIBCHARDRIVER} \ - ${LIBSYS} ${LIBTIMERS} -LDADD+= -llwip -lsockevent -lsockdriver -lchardriver -lsys -ltimers - -WARNS?= 5 - -.if defined(__MINIX) -.if !empty(DBG:M-Og) || !empty(CFLAGS:M-Og) || \ - !empty(DBG:M-g) || !empty(CFLAGS:M-g) -#LSC: -Wno-maybe-uninitialized while compiling with -DNDEBUG -Og -CWARNFLAGS.gcc+= -Wno-maybe-uninitialized -.endif -.endif # defined(__MINIX) - -.include diff --git a/minix/net/lwip/addr.c b/minix/net/lwip/addr.c deleted file mode 100644 index 246a31fa8..000000000 --- a/minix/net/lwip/addr.c +++ /dev/null @@ -1,699 +0,0 @@ -/* LWIP service - addr.c - socket address verification and conversion */ - -#include "lwip.h" - -/* - * Return TRUE if the given socket address is of type AF_UNSPEC, or FALSE - * otherwise. - */ -int -addr_is_unspec(const struct sockaddr * addr, socklen_t addr_len) -{ - - return (addr_len >= offsetof(struct sockaddr, sa_data) && - addr->sa_family == AF_UNSPEC); -} - -/* - * Check whether the given multicast address is generally valid. This check - * should not be moved into addr_get_inet(), as we do not want to forbid - * creating routes for such addresses, for example. We do however apply the - * restrictions here to all provided source and destination addresses. Return - * TRUE if the address is an acceptable multicast address, or FALSE otherwise. - */ -int -addr_is_valid_multicast(const ip_addr_t * ipaddr) -{ - uint8_t scope; - - assert(ip_addr_ismulticast(ipaddr)); - - /* We apply restrictions to IPv6 multicast addresses only. */ - if (IP_IS_V6(ipaddr)) { - scope = ip6_addr_multicast_scope(ip_2_ip6(ipaddr)); - - if (scope == IP6_MULTICAST_SCOPE_RESERVED0 || - scope == IP6_MULTICAST_SCOPE_RESERVEDF) - return FALSE; - - /* - * We do not impose restrictions on the three defined embedded - * flags, even though we put no effort into supporting them, - * especially in terms of automatically creating routes for - * all cases. We do force the fourth flag to be zero. - * Unfortunately there is no lwIP macro to check for this flag. - */ - if (ip_2_ip6(ipaddr)->addr[0] & PP_HTONL(0x00800000UL)) - return FALSE; - - /* Prevent KAME-embedded zone IDs from entering the system. */ - if (ip6_addr_has_scope(ip_2_ip6(ipaddr), IP6_UNKNOWN) && - (ip_2_ip6(ipaddr)->addr[0] & PP_HTONL(0x0000ffffUL))) - return FALSE; - } - - return TRUE; -} - -/* - * Load a sockaddr structure, as copied from userland, as a lwIP-style IP - * address and (optionally) a port number. The expected type of IP address is - * given as 'type', which must be one of IPADDR_TYPE_{V4,ANY,V6}. If it is - * IPADDR_TYPE_V4, 'addr' is expected to point to a sockaddr_in structure. If - * it is IPADDR_TYPE_{ANY,V6}, 'addr' is expected to point to a sockaddr_in6 - * structure. For the _ANY case, the result will be an _ANY address only if it - * is the unspecified (all-zeroes) address and a _V6 address in all other - * cases. For the _V6 case, the result will always be a _V6 address. The - * length of the structure pointed to by 'addr' is given as 'addr_len'. If the - * boolean 'kame' flag is set, addresses will be interpreted to be KAME style, - * meaning that for scoped IPv6 addresses, the zone is embedded in the address - * rather than given in sin6_scope_id. On success, store the resulting IP - * address in 'ipaddr'. If 'port' is not NULL, store the port number in it; - * otherwise, ignore the port number. On any parsing failure, return an - * appropriate negative error code. - */ -int -addr_get_inet(const struct sockaddr * addr, socklen_t addr_len, uint8_t type, - ip_addr_t * ipaddr, int kame, uint16_t * port) -{ - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - ip6_addr_t *ip6addr; - uint32_t ifindex; - - switch (type) { - case IPADDR_TYPE_V4: - if (addr_len != sizeof(sin)) - return EINVAL; - - /* - * Getting around strict aliasing problems. Oh, the irony of - * doing an extra memcpy so that the compiler can do a better - * job at optimizing.. - */ - memcpy(&sin, addr, sizeof(sin)); - - if (sin.sin_family != AF_INET) - return EAFNOSUPPORT; - - ip_addr_set_ip4_u32(ipaddr, sin.sin_addr.s_addr); - - if (port != NULL) - *port = ntohs(sin.sin_port); - - return OK; - - case IPADDR_TYPE_ANY: - case IPADDR_TYPE_V6: - if (addr_len != sizeof(sin6)) - return EINVAL; - - /* Again, strict aliasing.. */ - memcpy(&sin6, addr, sizeof(sin6)); - - if (sin6.sin6_family != AF_INET6) - return EAFNOSUPPORT; - - memset(ipaddr, 0, sizeof(*ipaddr)); - - /* - * This is a bit ugly, but NetBSD does not expose s6_addr32 and - * s6_addr is a series of bytes, which is a mismatch for lwIP. - * The alternative would be another memcpy.. - */ - ip6addr = ip_2_ip6(ipaddr); - assert(sizeof(ip6addr->addr) == sizeof(sin6.sin6_addr)); - memcpy(ip6addr->addr, &sin6.sin6_addr, sizeof(ip6addr->addr)); - - /* - * If the address may have a scope, extract the zone ID. - * Where the zone ID is depends on the 'kame' parameter: KAME- - * style addresses have it embedded within the address, whereas - * non-KAME addresses use the (misnamed) sin6_scope_id field. - */ - if (ip6_addr_has_scope(ip6addr, IP6_UNKNOWN)) { - if (kame) { - ifindex = - ntohl(ip6addr->addr[0]) & 0x0000ffffUL; - - ip6addr->addr[0] &= PP_HTONL(0xffff0000UL); - } else { - /* - * Reject KAME-style addresses for normal - * socket calls, to save ourselves the trouble - * of mixed address styles elsewhere. - */ - if (ip6addr->addr[0] & PP_HTONL(0x0000ffffUL)) - return EINVAL; - - ifindex = sin6.sin6_scope_id; - } - - /* - * Reject invalid zone IDs. This also enforces that - * no zone IDs wider than eight bits enter the system. - * As a side effect, it is not possible to add routes - * for invalid zones, but that should be no problem. - */ - if (ifindex != 0 && - ifdev_get_by_index(ifindex) == NULL) - return ENXIO; - - ip6_addr_set_zone(ip6addr, ifindex); - } else - ip6_addr_clear_zone(ip6addr); - - /* - * Set the type to ANY if it was ANY and the address itself is - * ANY as well. Otherwise, we are binding to a specific IPv6 - * address, so IPV6_V6ONLY stops being relevant and we should - * leave the address set to V6. Destination addresses for ANY - * are set to V6 elsewhere. - */ - if (type == IPADDR_TYPE_ANY && ip6_addr_isany(ip6addr)) - IP_SET_TYPE(ipaddr, type); - else - IP_SET_TYPE(ipaddr, IPADDR_TYPE_V6); - - if (port != NULL) - *port = ntohs(sin6.sin6_port); - - return OK; - - default: - return EAFNOSUPPORT; - } -} - -/* - * Store an lwIP-style IP address and port number as a sockaddr structure - * (sockaddr_in or sockaddr_in6, depending on the given IP address) to be - * copied to userland. The result is stored in the buffer pointed to by - * 'addr'. Before the call, 'addr_len' must be set to the size of this buffer. - * This is an internal check to prevent buffer overflows, and must not be used - * to validate input, since a mismatch will trigger a panic. After the call, - * 'addr_len' will be set to the size of the resulting structure. The lwIP- - * style address is given as 'ipaddr'. If the boolean 'kame' flag is set, the - * address will be stored KAME-style, meaning that for scoped IPv6 addresses, - * the address zone will be stored embedded in the address rather than in - * sin6_scope_id. If relevant, 'port' contains the port number in host-byte - * order; otherwise it should be set to zone. - */ -void -addr_put_inet(struct sockaddr * addr, socklen_t * addr_len, - const ip_addr_t * ipaddr, int kame, uint16_t port) -{ - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - const ip6_addr_t *ip6addr; - uint32_t zone; - - switch (IP_GET_TYPE(ipaddr)) { - case IPADDR_TYPE_V4: - if (*addr_len < sizeof(sin)) - panic("provided address buffer too small"); - - memset(&sin, 0, sizeof(sin)); - - sin.sin_len = sizeof(sin); - sin.sin_family = AF_INET; - sin.sin_port = htons(port); - sin.sin_addr.s_addr = ip_addr_get_ip4_u32(ipaddr); - - memcpy(addr, &sin, sizeof(sin)); - *addr_len = sizeof(sin); - - break; - - case IPADDR_TYPE_ANY: - case IPADDR_TYPE_V6: - if (*addr_len < sizeof(sin6)) - panic("provided address buffer too small"); - - ip6addr = ip_2_ip6(ipaddr); - - memset(&sin6, 0, sizeof(sin6)); - - sin6.sin6_len = sizeof(sin6); - sin6.sin6_family = AF_INET6; - sin6.sin6_port = htons(port); - memcpy(&sin6.sin6_addr, ip6addr->addr, sizeof(sin6.sin6_addr)); - - /* - * If the IPv6 address has a zone set, it must be scoped, and - * we put the zone in the result. It may occur that a scoped - * IPv6 address does not have a zone here though, for example - * if packet routing fails for sendto() with a zoneless address - * on an unbound socket, resulting in an RTM_MISS message. In - * such cases, simply leave the zone index blank in the result. - */ - if (ip6_addr_has_zone(ip6addr)) { - assert(ip6_addr_has_scope(ip6addr, IP6_UNKNOWN)); - - zone = ip6_addr_zone(ip6addr); - assert(zone <= UINT8_MAX); - - if (kame) - sin6.sin6_addr.s6_addr[3] = zone; - else - sin6.sin6_scope_id = zone; - } - - memcpy(addr, &sin6, sizeof(sin6)); - *addr_len = sizeof(sin6); - - break; - - default: - panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr)); - } -} - -/* - * Load a link-layer sockaddr structure (sockaddr_dl), as copied from userland, - * and return the contained name and/or hardware address. The address is - * provided as 'addr', with length 'addr_len'. On success, return OK. If - * 'name' is not NULL, it must be of size 'name_max', and will be used to store - * the (null-terminated) interface name in the given structure if present, or - * the empty string if not. If 'hwaddr' is not NULL, it will be used to store - * the hardware address in the given structure, which must in that case be - * present and exactly 'hwaddr_len' bytes long. On any parsing failure, return - * an appropriate negative error code. - */ -int -addr_get_link(const struct sockaddr * addr, socklen_t addr_len, char * name, - size_t name_max, uint8_t * hwaddr, size_t hwaddr_len) -{ - struct sockaddr_dlx sdlx; - size_t nlen, alen; - - if (addr_len < offsetof(struct sockaddr_dlx, sdlx_data)) - return EINVAL; - - /* - * We cannot prevent callers from passing in massively oversized - * sockaddr_dl structure. However, we insist that all the actual data - * be contained within the size of our sockaddr_dlx version. - */ - if (addr_len > sizeof(sdlx)) - addr_len = sizeof(sdlx); - - memcpy(&sdlx, addr, addr_len); - - if (sdlx.sdlx_family != AF_LINK) - return EAFNOSUPPORT; - - /* Address selectors are not currently supported. */ - if (sdlx.sdlx_slen != 0) - return EINVAL; - - nlen = (size_t)sdlx.sdlx_nlen; - alen = (size_t)sdlx.sdlx_alen; - - /* The nlen and alen fields are 8-bit, so no risks of overflow here. */ - if (addr_len < offsetof(struct sockaddr_dlx, sdlx_data) + nlen + alen) - return EINVAL; - - /* - * Copy out the name, truncating it if needed. The name in the - * sockaddr is not null terminated, so we have to do that. If the - * sockaddr has no name, copy out an empty name. - */ - if (name != NULL) { - assert(name_max > 0); - - if (name_max > nlen + 1) - name_max = nlen + 1; - - memcpy(name, sdlx.sdlx_data, name_max - 1); - name[name_max - 1] = '\0'; - } - - /* - * Copy over the hardware address. For simplicity, we require that the - * caller specify the exact hardware address length. - */ - if (hwaddr != NULL) { - if (alen != hwaddr_len) - return EINVAL; - - memcpy(hwaddr, sdlx.sdlx_data + nlen, hwaddr_len); - } - - return OK; -} - -/* - * Store a link-layer sockaddr structure (sockaddr_dl), to be copied to - * userland. The result is stored in the buffer pointed to by 'addr'. Before - * the call, 'addr_len' must be set to the size of this buffer. This is an - * internal check to prevent buffer overflows, and must not be used to validate - * input, since a mismatch will trigger a panic. After the call, 'addr_len' - * will be set to the size of the resulting structure. The given interface - * index 'ifindex' and (IFT_) interface type 'type' will always be stored in - * the resulting structure. If 'name' is not NULL, it must be a null- - * terminated interface name string which will be included in the structure. - * If 'hwaddr' is not NULL, it must be a hardware address of length - * 'hwaddr_len', which will also be included in the structure. - */ -void -addr_put_link(struct sockaddr * addr, socklen_t * addr_len, uint32_t ifindex, - uint32_t type, const char * name, const uint8_t * hwaddr, - size_t hwaddr_len) -{ - struct sockaddr_dlx sdlx; - size_t name_len; - socklen_t len; - - name_len = (name != NULL) ? strlen(name) : 0; - - if (hwaddr == NULL) - hwaddr_len = 0; - - assert(name_len < IFNAMSIZ); - assert(hwaddr_len <= NETIF_MAX_HWADDR_LEN); - - len = offsetof(struct sockaddr_dlx, sdlx_data) + name_len + hwaddr_len; - - if (*addr_len < len) - panic("provided address buffer too small"); - - memset(&sdlx, 0, sizeof(sdlx)); - sdlx.sdlx_len = len; - sdlx.sdlx_family = AF_LINK; - sdlx.sdlx_index = ifindex; - sdlx.sdlx_type = type; - sdlx.sdlx_nlen = name_len; - sdlx.sdlx_alen = hwaddr_len; - if (name_len > 0) - memcpy(sdlx.sdlx_data, name, name_len); - if (hwaddr_len > 0) - memcpy(sdlx.sdlx_data + name_len, hwaddr, hwaddr_len); - - memcpy(addr, &sdlx, len); - *addr_len = len; -} - -/* - * Convert an IPv4 or IPv6 netmask, given as sockaddr structure 'addr', to a - * prefix length. The length of the sockaddr structure is given as 'addr_len'. - * For consistency with addr_get_inet(), the expected address type is given as - * 'type', and must be either IPADDR_TYPE_V4 or IPADDR_TYPE_V6. On success, - * return OK with the number of set prefix bits returned in 'prefix', and - * optionally with a lwIP representation of the netmask stored in 'ipaddr' (if - * not NULL). On failure, return an appropriate negative error code. Note - * that this function does not support compressed IPv4 network masks; such - * addresses must be expanded before a call to this function. - */ -int -addr_get_netmask(const struct sockaddr * addr, socklen_t addr_len, - uint8_t type, unsigned int * prefix, ip_addr_t * ipaddr) -{ - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - unsigned int byte, bit; - uint32_t val; - - switch (type) { - case IPADDR_TYPE_V4: - if (addr_len != sizeof(sin)) - return EINVAL; - - memcpy(&sin, addr, sizeof(sin)); - - if (sin.sin_family != AF_INET) - return EAFNOSUPPORT; - - val = ntohl(sin.sin_addr.s_addr); - - /* Find the first zero bit. */ - for (bit = 0; bit < IP4_BITS; bit++) - if (!(val & (1 << (IP4_BITS - bit - 1)))) - break; - - *prefix = bit; - - /* All bits after the first zero bit must also be zero. */ - if (bit < IP4_BITS && - (val & ((1 << (IP4_BITS - bit - 1)) - 1))) - return EINVAL; - - if (ipaddr != NULL) - ip_addr_set_ip4_u32(ipaddr, sin.sin_addr.s_addr); - - return OK; - - case IPADDR_TYPE_V6: - if (addr_len != sizeof(sin6)) - return EINVAL; - - memcpy(&sin6, addr, sizeof(sin6)); - - if (sin6.sin6_family != AF_INET6) - return EAFNOSUPPORT; - - /* Find the first zero bit. */ - for (byte = 0; byte < __arraycount(sin6.sin6_addr.s6_addr); - byte++) - if (sin6.sin6_addr.s6_addr[byte] != 0xff) - break; - - /* If all bits are set, there is nothing more to do. */ - if (byte == __arraycount(sin6.sin6_addr.s6_addr)) { - *prefix = __arraycount(sin6.sin6_addr.s6_addr) * NBBY; - - return OK; - } - - for (bit = 0; bit < NBBY; bit++) - if (!(sin6.sin6_addr.s6_addr[byte] & - (1 << (NBBY - bit - 1)))) - break; - - *prefix = byte * NBBY + bit; - - /* All bits after the first zero bit must also be zero. */ - if (bit < NBBY && (sin6.sin6_addr.s6_addr[byte] & - ((1 << (NBBY - bit - 1)) - 1))) - return EINVAL; - - for (byte++; byte < __arraycount(sin6.sin6_addr.s6_addr); - byte++) - if (sin6.sin6_addr.s6_addr[byte] != 0) - return EINVAL; - - if (ipaddr != NULL) { - ip_addr_set_zero_ip6(ipaddr); - - memcpy(ip_2_ip6(ipaddr)->addr, &sin6.sin6_addr, - sizeof(ip_2_ip6(ipaddr)->addr)); - } - - return OK; - - default: - panic("unknown IP address type: %u", type); - } -} - -/* - * Generate a raw network mask based on the given prefix length. - */ -void -addr_make_netmask(uint8_t * addr, socklen_t addr_len, unsigned int prefix) -{ - unsigned int byte, bit; - - byte = prefix / NBBY; - bit = prefix % NBBY; - - assert(byte + !!bit <= addr_len); - - if (byte > 0) - memset(addr, 0xff, byte); - if (bit != 0) - addr[byte++] = (uint8_t)(0xff << (NBBY - bit)); - if (byte < addr_len) - memset(&addr[byte], 0, addr_len - byte); -} - -/* - * Store a network mask as a sockaddr structure, in 'addr'. Before the call, - * 'addr_len' must be set to the memory size of 'addr'. The address type is - * given as 'type', and must be either IPADDR_TYPE_V4 or IPADDR_TYPE_V6. The - * prefix length from which to generate the network mask is given as 'prefix'. - * Upon return, 'addr_len' is set to the size of the resulting sockaddr - * structure. - */ -void -addr_put_netmask(struct sockaddr * addr, socklen_t * addr_len, uint8_t type, - unsigned int prefix) -{ - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - - switch (type) { - case IPADDR_TYPE_V4: - if (*addr_len < sizeof(sin)) - panic("provided address buffer too small"); - - assert(prefix <= IP4_BITS); - - memset(&sin, 0, sizeof(sin)); - sin.sin_len = sizeof(sin); - sin.sin_family = AF_INET; - - addr_make_netmask((uint8_t *)&sin.sin_addr.s_addr, - sizeof(sin.sin_addr.s_addr), prefix); - - memcpy(addr, &sin, sizeof(sin)); - *addr_len = sizeof(sin); - - break; - - case IPADDR_TYPE_V6: - if (*addr_len < sizeof(sin6)) - panic("provided address buffer too small"); - - assert(prefix <= IP6_BITS); - - memset(&sin6, 0, sizeof(sin6)); - sin6.sin6_len = sizeof(sin6); - sin6.sin6_family = AF_INET6; - - addr_make_netmask(sin6.sin6_addr.s6_addr, - sizeof(sin6.sin6_addr.s6_addr), prefix); - - memcpy(addr, &sin6, sizeof(sin6)); - *addr_len = sizeof(sin6); - - break; - - default: - panic("unknown IP address type: %u", type); - } -} - -/* - * Normalize the given address in 'src' to the given number of prefix bits, - * setting all other bits to zero. Return the result in 'dst'. - */ -void -addr_normalize(ip_addr_t * dst, const ip_addr_t * src, unsigned int prefix) -{ -#if !defined(NDEBUG) - unsigned int addr_len; -#endif /* !defined(NDEBUG) */ - unsigned int byte, bit; - const uint8_t *srcaddr; - uint8_t type, *dstaddr; - - type = IP_GET_TYPE(src); - - memset(dst, 0, sizeof(*dst)); - IP_SET_TYPE(dst, type); - - switch (type) { - case IPADDR_TYPE_V4: - srcaddr = (const uint8_t *)&ip_2_ip4(src)->addr; - dstaddr = (uint8_t *)&ip_2_ip4(dst)->addr; -#if !defined(NDEBUG) - addr_len = sizeof(ip_2_ip4(src)->addr); -#endif /* !defined(NDEBUG) */ - - break; - - case IPADDR_TYPE_V6: - ip6_addr_set_zone(ip_2_ip6(dst), ip6_addr_zone(ip_2_ip6(src))); - - srcaddr = (const uint8_t *)&ip_2_ip6(src)->addr; - dstaddr = (uint8_t *)&ip_2_ip6(dst)->addr; -#if !defined(NDEBUG) - addr_len = sizeof(ip_2_ip6(src)->addr); -#endif /* !defined(NDEBUG) */ - - break; - - default: - panic("unknown IP address type: %u", type); - } - - byte = prefix / NBBY; - bit = prefix % NBBY; - - assert(byte + !!bit <= addr_len); - - if (byte > 0) - memcpy(dstaddr, srcaddr, byte); - if (bit != 0) { - dstaddr[byte] = - srcaddr[byte] & (uint8_t)(0xff << (NBBY - bit)); - byte++; - } -} - -/* - * Return the number of common bits between the given two addresses, up to the - * given maximum. Thus, return a value between 0 and 'max' inclusive. - */ -unsigned int -addr_get_common_bits(const ip_addr_t * ipaddr1, const ip_addr_t * ipaddr2, - unsigned int max) -{ - unsigned int addr_len, prefix, bit; - const uint8_t *addr1, *addr2; - uint8_t byte; - - switch (IP_GET_TYPE(ipaddr1)) { - case IPADDR_TYPE_V4: - assert(IP_IS_V4(ipaddr2)); - - addr1 = (const uint8_t *)&ip_2_ip4(ipaddr1)->addr; - addr2 = (const uint8_t *)&ip_2_ip4(ipaddr2)->addr; - addr_len = sizeof(ip_2_ip4(ipaddr1)->addr); - - break; - - case IPADDR_TYPE_V6: - assert(IP_IS_V6(ipaddr2)); - - addr1 = (const uint8_t *)&ip_2_ip6(ipaddr1)->addr; - addr2 = (const uint8_t *)&ip_2_ip6(ipaddr2)->addr; - addr_len = sizeof(ip_2_ip6(ipaddr1)->addr); - - break; - - default: - panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr1)); - } - - if (addr_len > max * NBBY) - addr_len = max * NBBY; - - prefix = 0; - - for (prefix = 0; addr_len > 0; addr1++, addr2++, prefix += NBBY) { - if ((byte = (*addr1 ^ *addr2)) != 0) { - /* TODO: see if we want a lookup table for this. */ - for (bit = 0; bit < NBBY; bit++, prefix++) - if (byte & (1 << (NBBY - bit - 1))) - break; - break; - } - } - - if (prefix > max) - prefix = max; - - return prefix; -} - -/* - * Convert the given IPv4 address to an IPv4-mapped IPv6 address. - */ -void -addr_make_v4mapped_v6(ip_addr_t * dst, const ip4_addr_t * src) -{ - - IP_ADDR6(dst, 0, 0, PP_HTONL(0x0000ffffUL), ip4_addr_get_u32(src)); -} diff --git a/minix/net/lwip/addrpol.c b/minix/net/lwip/addrpol.c deleted file mode 100644 index 144f02c1c..000000000 --- a/minix/net/lwip/addrpol.c +++ /dev/null @@ -1,143 +0,0 @@ -/* LWIP service - addrpol.c - address policy table and values */ -/* - * The main purpose of this module is to implement the address policy table - * described in RFC 6724. In general, the policy table is used for two - * purposes: source address selection, which is part of this service, and - * destination address selection, which is implemented in libc. NetBSD 7, the - * version that MINIX 3 is synced against at this moment, does not actually - * implement the libc part yet, though. That will change with NetBSD 8, where - * libc uses sysctl(7) to obtain the kernel's policy table, which itself can be - * changed with the new ip6addrctl(8) utility. Once we resync to NetBSD 8, we - * will also have to support this new functionality, and this module is where - * it would be implemented. Since NetBSD 7 is even lacking the necessary - * definitions, we cannot do that ahead of time, though. Thus, until then, - * this module is rather simple, as it only implements a static policy table - * used for source address selection. No changes beyond this module should be - * necessary, e.g. we are purposely not caching labels for local addresses. - */ - -#include "lwip.h" - -/* - * Address policy table. Currently hardcoded to the default of RFC 6724. - * Sorted by prefix length, so that the first match is always also the longest. - */ -static const struct { - ip_addr_t ipaddr; - unsigned int prefix; - int precedence; - int label; -} addrpol_table[] = { - { IPADDR6_INIT_HOST(0, 0, 0, 1), 128, 50, 0 }, - { IPADDR6_INIT_HOST(0, 0, 0x0000ffffUL, 0), 96, 35, 4 }, - { IPADDR6_INIT_HOST(0, 0, 0, 0), 96, 1, 3 }, - { IPADDR6_INIT_HOST(0x20010000UL, 0, 0, 0), 32, 5, 5 }, - { IPADDR6_INIT_HOST(0x20020000UL, 0, 0, 0), 16, 30, 2 }, - { IPADDR6_INIT_HOST(0x3ffe0000UL, 0, 0, 0), 16, 1, 12 }, - { IPADDR6_INIT_HOST(0xfec00000UL, 0, 0, 0), 10, 1, 11 }, - { IPADDR6_INIT_HOST(0xfc000000UL, 0, 0, 0), 7, 3, 13 }, - { IPADDR6_INIT_HOST(0, 0, 0, 0), 0, 40, 1 } -}; - -/* - * Obtain the label value for the given IP address from the address policy - * table. Currently only IPv6 addresses may be given. This function is linear - * in number of address policy table entries, requiring a relatively expensive - * normalization operation for each entry, so it should not be called lightly. - * Its results should not be cached beyond local contexts either, because the - * policy table itself may be changed from userland (in the future). - * - * TODO: convert IPv4 addresses to IPv4-mapped IPv6 addresses. - * TODO: embed the interface index in link-local addresses. - */ -int -addrpol_get_label(const ip_addr_t * iporig) -{ - ip_addr_t ipaddr; - unsigned int i; - - assert(IP_IS_V6(iporig)); - - /* - * The policy table is sorted by prefix length such that the first - * match is also the one with the longest prefix, and as such the best. - */ - for (i = 0; i < __arraycount(addrpol_table); i++) { - addr_normalize(&ipaddr, iporig, addrpol_table[i].prefix); - - if (ip_addr_cmp(&addrpol_table[i].ipaddr, &ipaddr)) - return addrpol_table[i].label; - } - - /* - * We cannot possibly get here with the default policy table, because - * the last entry will always match. It is not clear what we should - * return if there is no matching entry, though. For now, we return - * the default label value for the default (::/0) entry, which is 1. - */ - return 1; -} - -/* - * Return an opaque positive value (possibly zero) that represents the scope of - * the given IP address. A larger value indicates a wider scope. The 'is_src' - * flag indicates whether the address is a source or a destination address, - * which affects the value returned for unknown addresses. A scope is a direct - * function of only the given address, so the result may be cached on a per- - * address basis without risking invalidation at any point in time. - */ -int -addrpol_get_scope(const ip_addr_t * ipaddr, int is_src) -{ - const ip6_addr_t *ip6addr; - - /* - * For now, all IPv4 addresses are considered global. This function is - * currently called only for IPv6 addresses anyway. - */ - if (IP_IS_V4(ipaddr)) - return IP6_MULTICAST_SCOPE_GLOBAL; - - assert(IP_IS_V6(ipaddr)); - - ip6addr = ip_2_ip6(ipaddr); - - /* - * These are ordered not by ascending scope, but (roughly) by expected - * likeliness to match, for performance reasons. - */ - if (ip6_addr_isglobal(ip6addr)) - return IP6_MULTICAST_SCOPE_GLOBAL; - - if (ip6_addr_islinklocal(ip6addr) || ip6_addr_isloopback(ip6addr)) - return IP6_MULTICAST_SCOPE_LINK_LOCAL; - - /* - * We deliberately deviate from RFC 6724 Sec. 3.1 by considering - * Unique-Local Addresses (ULAs) to be of smaller scope than global - * addresses, to avoid that during source address selection, a - * preferred ULA is picked over a deprecated global address when given - * a global address as destination, as that would likely result in - * broken two-way communication. - */ - if (ip6_addr_isuniquelocal(ip6addr)) - return IP6_MULTICAST_SCOPE_ORGANIZATION_LOCAL; - - if (ip6_addr_ismulticast(ip6addr)) - return ip6_addr_multicast_scope(ip6addr); - - /* Site-local addresses are deprecated. */ - if (ip6_addr_issitelocal(ip6addr)) - return IP6_MULTICAST_SCOPE_SITE_LOCAL; - - /* - * If the address is a source address, give it a scope beyond global to - * make sure that a "real" global address is picked first. If the - * address is a destination address, give it a global scope so as to - * pick "real" global addresses over unknown-scope source addresses. - */ - if (is_src) - return IP6_MULTICAST_SCOPE_RESERVEDF; /* greater than GLOBAL */ - else - return IP6_MULTICAST_SCOPE_GLOBAL; -} diff --git a/minix/net/lwip/bpf_filter.c b/minix/net/lwip/bpf_filter.c deleted file mode 100644 index 8c0efca6f..000000000 --- a/minix/net/lwip/bpf_filter.c +++ /dev/null @@ -1,561 +0,0 @@ -/* LWIP service - bpf_filter.c - Berkeley Packet Filter core implementation */ -/* - * This is basically a drop-in replacement of NetBSD's bpf_filter.c, which - * itself can be compiled for either the NetBSD kernel or for userland. On - * MINIX 3, we would like to perform certain checks that NetBSD implements only - * for its kernel (e.g., memory store access validation) while replacing the - * NetBSD kernel specifics with our own (pbuf instead of mbuf, no BPF contexts - * for now, etc.). As a result, it is easier to reimplement the whole thing, - * because there is not all that much to it. - * - * Support for the standard BSD API allows us to run standard tests against - * this module from userland, where _MINIX_SYSTEM is not defined. MINIX 3 - * specific extensions are enabled only if _MINIX_SYSTEM is defined. - */ -#include -#include -#include -#include - -#ifdef _MINIX_SYSTEM -#include "lwip.h" - -/* - * Obtain an unsigned 32-bit value in network byte order from the pbuf chain - * 'pbuf' at offset 'k'. The given offset is guaranteed to be within bounds. - */ -static uint32_t -bpf_get32_ext(const struct pbuf * pbuf, uint32_t k) -{ - uint32_t val; - unsigned int i; - - /* - * Find the pbuf that contains the first byte. We expect that most - * filters will operate only on the headers of packets, so that we - * mostly avoid going through this O(n) loop. Since only the superuser - * can open BPF devices at all, we need not be worried about abuse in - * this regard. However, it turns out that this loop is particularly - * CPU-intensive after all, we can probably improve it by caching the - * last visited pbuf, as read locality is likely high. - */ - while (k >= pbuf->len) { - k -= pbuf->len; - pbuf = pbuf->next; - assert(pbuf != NULL); - } - - /* - * We assume that every pbuf has some data, but we make no assumptions - * about any minimum amount of data per pbuf. Therefore, we may have - * to take the bytes from anywhere between one and four pbufs. - * Hopefully the compiler will unroll this loop for us. - */ - val = (uint32_t)(((u_char *)pbuf->payload)[k]) << 24; - - for (i = 0; i < 3; i++) { - if (k >= (uint32_t)pbuf->len - 1) { - k = 0; - pbuf = pbuf->next; - assert(pbuf != NULL); - } else - k++; - val = (val << 8) | (uint32_t)(((u_char *)pbuf->payload)[k]); - } - - return val; -} - -/* - * Obtain an unsigned 16-bit value in network byte order from the pbuf chain - * 'pbuf' at offset 'k'. The given offset is guaranteed to be within bounds. - */ -static uint32_t -bpf_get16_ext(const struct pbuf * pbuf, uint32_t k) -{ - - /* As above. */ - while (k >= pbuf->len) { - k -= pbuf->len; - pbuf = pbuf->next; - assert(pbuf != NULL); - } - - /* - * There are only two possible cases to cover here: either the two - * bytes are in the same pbuf, or they are in subsequent ones. - */ - if (k < (uint32_t)pbuf->len - 1) { - return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) | - (uint32_t)(((u_char *)pbuf->next->payload)[k + 1]); - } else { - assert(pbuf->next != NULL); - return ((uint32_t)(((u_char *)pbuf->payload)[k]) << 8) | - (uint32_t)(((u_char *)pbuf->next->payload)[0]); - } -} - -/* - * Obtain an unsigned 8-bit value from the pbuf chain 'pbuf' at offset 'k'. - * The given offset is guaranteed to be within bounds. - */ -static uint32_t -bpf_get8_ext(const struct pbuf * pbuf, uint32_t k) -{ - - /* As above. */ - while (k >= pbuf->len) { - k -= pbuf->len; - pbuf = pbuf->next; - assert(pbuf != NULL); - } - - return (uint32_t)(((u_char *)pbuf->payload)[k]); -} - -#endif /* _MINIX_SYSTEM */ - -/* - * Execute a BPF filter program on (the first part of) a packet, and return the - * maximum size of the packet that should be delivered to the filter owner. - * - * The 'pc' parameter points to an array of BPF instructions that together form - * the filter program to be executed. If 'pc' is NULL, the packet is fully - * accepted. Otherwise, the given program MUST have passed a previous call to - * bpf_validate(). Not doing so will allow for arbitrary memory access. - * - * The 'packet' array contains up to the whole packet. The value of 'total' - * denotes the total length of the packet; 'len' contains the size of the array - * 'packet'. Chunked storage of the packet is not supported at this time. - * - * If executing the program succeeds, the return value is the maximum number of - * bytes from the packet to be delivered. The return value may exceed the full - * packet size. If the number of bytes returned is zero, the packet is to be - * ignored. If the program fails to execute properly and return a value, a - * value of zero is returned as well, thus also indicating that the packet - * should be ignored. This is intentional: it saves filter programs from - * having to perform explicit checks on the packet they are filtering. - */ -u_int -bpf_filter(const struct bpf_insn * pc, const u_char * packet, u_int total, - u_int len) -#ifdef _MINIX_SYSTEM -{ - - return bpf_filter_ext(pc, NULL /*pbuf*/, packet, total, len); -} - -u_int -bpf_filter_ext(const struct bpf_insn * pc, const struct pbuf * pbuf, - const u_char * packet, u_int total, u_int len) -#endif /* _MINIX_SYSTEM */ -{ - uint32_t k, a, x, mem[BPF_MEMWORDS]; - - /* An empty program accepts all packets. */ - if (pc == NULL) - return UINT_MAX; - - /* - * We need not clear 'mem': the checker guarantees that each memory - * store word is always written before it is read. - */ - a = 0; - x = 0; - - /* Execute the program. */ - for (;; pc++) { - k = pc->k; - - switch (pc->code) { - case BPF_LD+BPF_W+BPF_IND: /* A <- P[X+k:4] */ - if (k + x < k) - return 0; - k += x; - /* FALLTHROUGH */ - case BPF_LD+BPF_W+BPF_ABS: /* A <- P[k:4] */ - /* - * 'k' may have any value, so check bounds in such a - * way that 'k' cannot possibly overflow and wrap. - */ - if (len >= 3 && k < len - 3) - a = ((uint32_t)packet[k] << 24) | - ((uint32_t)packet[k + 1] << 16) | - ((uint32_t)packet[k + 2] << 8) | - (uint32_t)packet[k + 3]; -#ifdef _MINIX_SYSTEM - else if (total >= 3 && k < total - 3) - a = bpf_get32_ext(pbuf, k); -#endif /* _MINIX_SYSTEM */ - else - return 0; - break; - case BPF_LD+BPF_H+BPF_IND: /* A <- P[X+k:2] */ - if (k + x < k) - return 0; - k += x; - /* FALLTHROUGH */ - case BPF_LD+BPF_H+BPF_ABS: /* A <- P[k:2] */ - /* As above. */ - if (len >= 1 && k < len - 1) - a = ((uint32_t)packet[k] << 8) | - (uint32_t)packet[k + 1]; -#ifdef _MINIX_SYSTEM - else if (total >= 1 && k < total - 1) - a = bpf_get16_ext(pbuf, k); -#endif /* _MINIX_SYSTEM */ - else - return 0; - break; - case BPF_LD+BPF_B+BPF_IND: /* A <- P[X+k:1] */ - if (k + x < k) - return 0; - k += x; - /* FALLTHROUGH */ - case BPF_LD+BPF_B+BPF_ABS: /* A <- P[k:1] */ - if (k < len) - a = (uint32_t)packet[k]; -#ifdef _MINIX_SYSTEM - else if (k < total) - a = bpf_get8_ext(pbuf, k); -#endif /* _MINIX_SYSTEM */ - else - return 0; - break; - case BPF_LD+BPF_W+BPF_LEN: /* A <- len */ - a = total; - break; - case BPF_LD+BPF_IMM: /* A <- k */ - a = k; - break; - case BPF_LD+BPF_MEM: /* A <- M[k] */ - a = mem[k]; - break; - - case BPF_LDX+BPF_IMM: /* X <- k */ - x = k; - break; - case BPF_LDX+BPF_MEM: /* X <- M[k] */ - x = mem[k]; - break; - case BPF_LDX+BPF_LEN: /* X <- len */ - x = total; - break; - case BPF_LDX+BPF_B+BPF_MSH: /* X <- 4*(P[k:1]&0xf) */ - if (k < len) - x = ((uint32_t)packet[k] & 0xf) << 2; -#ifdef _MINIX_SYSTEM - else if (k < total) - x = (bpf_get8_ext(pbuf, k) & 0xf) << 2; -#endif /* _MINIX_SYSTEM */ - else - return 0; - break; - - case BPF_ST: /* M[k] <- A */ - mem[k] = a; - break; - - case BPF_STX: /* M[k] <- X */ - mem[k] = x; - break; - - case BPF_ALU+BPF_ADD+BPF_K: /* A <- A + k */ - a += k; - break; - case BPF_ALU+BPF_SUB+BPF_K: /* A <- A - k */ - a -= k; - break; - case BPF_ALU+BPF_MUL+BPF_K: /* A <- A * k */ - a *= k; - break; - case BPF_ALU+BPF_DIV+BPF_K: /* A <- A / k */ - a /= k; - break; - case BPF_ALU+BPF_MOD+BPF_K: /* A <- A % k */ - a %= k; - break; - case BPF_ALU+BPF_AND+BPF_K: /* A <- A & k */ - a &= k; - break; - case BPF_ALU+BPF_OR+BPF_K: /* A <- A | k */ - a |= k; - break; - case BPF_ALU+BPF_XOR+BPF_K: /* A <- A ^ k */ - a ^= k; - break; - case BPF_ALU+BPF_LSH+BPF_K: /* A <- A << k */ - a <<= k; - break; - case BPF_ALU+BPF_RSH+BPF_K: /* A <- A >> k */ - a >>= k; - break; - case BPF_ALU+BPF_ADD+BPF_X: /* A <- A + X */ - a += x; - break; - case BPF_ALU+BPF_SUB+BPF_X: /* A <- A - X */ - a -= x; - break; - case BPF_ALU+BPF_MUL+BPF_X: /* A <- A * X */ - a *= x; - break; - case BPF_ALU+BPF_DIV+BPF_X: /* A <- A / X */ - if (x == 0) - return 0; - a /= x; - break; - case BPF_ALU+BPF_MOD+BPF_X: /* A <- A % X */ - if (x == 0) - return 0; - a %= x; - break; - case BPF_ALU+BPF_AND+BPF_X: /* A <- A & X */ - a &= x; - break; - case BPF_ALU+BPF_OR+BPF_X: /* A <- A | X */ - a |= x; - break; - case BPF_ALU+BPF_XOR+BPF_X: /* A <- A ^ X */ - a ^= x; - break; - case BPF_ALU+BPF_LSH+BPF_X: /* A <- A << X */ - if (x >= 32) - return 0; - a <<= x; - break; - case BPF_ALU+BPF_RSH+BPF_X: /* A <- A >> X */ - if (x >= 32) - return 0; - a >>= x; - break; - case BPF_ALU+BPF_NEG: /* A <- -A */ - a = -a; - break; - - case BPF_JMP+BPF_JA: /* pc += k */ - pc += k; - break; - case BPF_JMP+BPF_JGT+BPF_K: /* pc += (A > k) ? jt : jf */ - pc += (a > k) ? pc->jt : pc->jf; - break; - case BPF_JMP+BPF_JGE+BPF_K: /* pc += (A >= k) ? jt : jf */ - pc += (a >= k) ? pc->jt : pc->jf; - break; - case BPF_JMP+BPF_JEQ+BPF_K: /* pc += (A == k) ? jt : jf */ - pc += (a == k) ? pc->jt : pc->jf; - break; - case BPF_JMP+BPF_JSET+BPF_K: /* pc += (A & k) ? jt : jf */ - pc += (a & k) ? pc->jt : pc->jf; - break; - case BPF_JMP+BPF_JGT+BPF_X: /* pc += (A > X) ? jt : jf */ - pc += (a > x) ? pc->jt : pc->jf; - break; - case BPF_JMP+BPF_JGE+BPF_X: /* pc += (A >= X) ? jt : jf */ - pc += (a >= x) ? pc->jt : pc->jf; - break; - case BPF_JMP+BPF_JEQ+BPF_X: /* pc += (A == X) ? jt : jf */ - pc += (a == x) ? pc->jt : pc->jf; - break; - case BPF_JMP+BPF_JSET+BPF_X: /* pc += (A & X) ? jt : jf */ - pc += (a & x) ? pc->jt : pc->jf; - break; - - case BPF_RET+BPF_A: /* accept A bytes */ - return a; - case BPF_RET+BPF_K: /* accept K bytes */ - return k; - - case BPF_MISC+BPF_TAX: /* X <- A */ - x = a; - break; - case BPF_MISC+BPF_TXA: /* A <- X */ - a = x; - break; - - default: /* unknown instruction */ - return 0; - } - } - - /* NOTREACHED */ -} - -/* - * In order to avoid having to perform explicit memory allocation, we store - * some validation state on the stack, using data types that are as small as - * possible for the current definitions. The data types, and in fact the whole - * assumption that we can store the state on the stack, may need to be revised - * if certain constants are increased in the future. As of writing, the - * validation routine uses a little over 1KB of stack memory. - */ -#if BPF_MEMWORDS <= 16 /* value as of writing: 16 */ -typedef uint16_t meminv_t; -#else -#error "increased BPF_MEMWORDS may require code revision" -#endif - -#if BPF_MAXINSNS > 2048 /* value as of writing: 512 */ -#error "increased BPF_MAXINSNS may require code revision" -#endif - -/* - * Verify that the given filter program is safe to execute, by performing as - * many static validity checks as possible. The program is given as 'insns', - * which must be an array of 'ninsns' BPF instructions. Unlike bpf_filter(), - * this function does not accept empty filter programs. The function returns 1 - * if the program was successfully validated, or 0 if the program should not be - * accepted. - */ -int -bpf_validate(const struct bpf_insn * insns, int ninsns) -{ - bitchunk_t reachable[BITMAP_CHUNKS(BPF_MAXINSNS)]; - meminv_t invalid, meminv[BPF_MAXINSNS]; - const struct bpf_insn *insn; - u_int pc, count, target; - int advance; - - if (insns == NULL || ninsns <= 0 || ninsns > BPF_MAXINSNS) - return 0; - count = (u_int)ninsns; - - memset(reachable, 0, sizeof(reachable[0]) * BITMAP_CHUNKS(count)); - memset(meminv, 0, sizeof(meminv[0]) * count); - - SET_BIT(reachable, 0); - meminv[0] = (meminv_t)~0; - - for (pc = 0; pc < count; pc++) { - /* We completely ignore instructions that are not reachable. */ - if (!GET_BIT(reachable, pc)) - continue; - - invalid = meminv[pc]; - advance = 1; - - insn = &insns[pc]; - - switch (insn->code) { - case BPF_LD+BPF_W+BPF_ABS: - case BPF_LD+BPF_H+BPF_ABS: - case BPF_LD+BPF_B+BPF_ABS: - case BPF_LD+BPF_W+BPF_IND: - case BPF_LD+BPF_H+BPF_IND: - case BPF_LD+BPF_B+BPF_IND: - case BPF_LD+BPF_LEN: - case BPF_LD+BPF_IMM: - case BPF_LDX+BPF_IMM: - case BPF_LDX+BPF_LEN: - case BPF_LDX+BPF_B+BPF_MSH: - case BPF_ALU+BPF_ADD+BPF_K: - case BPF_ALU+BPF_SUB+BPF_K: - case BPF_ALU+BPF_MUL+BPF_K: - case BPF_ALU+BPF_AND+BPF_K: - case BPF_ALU+BPF_OR+BPF_K: - case BPF_ALU+BPF_XOR+BPF_K: - case BPF_ALU+BPF_ADD+BPF_X: - case BPF_ALU+BPF_SUB+BPF_X: - case BPF_ALU+BPF_MUL+BPF_X: - case BPF_ALU+BPF_DIV+BPF_X: - case BPF_ALU+BPF_MOD+BPF_X: - case BPF_ALU+BPF_AND+BPF_X: - case BPF_ALU+BPF_OR+BPF_X: - case BPF_ALU+BPF_XOR+BPF_X: - case BPF_ALU+BPF_LSH+BPF_X: - case BPF_ALU+BPF_RSH+BPF_X: - case BPF_ALU+BPF_NEG: - case BPF_MISC+BPF_TAX: - case BPF_MISC+BPF_TXA: - /* Nothing we can check for these. */ - break; - case BPF_ALU+BPF_DIV+BPF_K: - case BPF_ALU+BPF_MOD+BPF_K: - /* No division by zero. */ - if (insn->k == 0) - return 0; - break; - case BPF_ALU+BPF_LSH+BPF_K: - case BPF_ALU+BPF_RSH+BPF_K: - /* Do not invoke undefined behavior. */ - if (insn->k >= 32) - return 0; - break; - case BPF_LD+BPF_MEM: - case BPF_LDX+BPF_MEM: - /* - * Only allow loading words that have been stored in - * all execution paths leading up to this instruction. - */ - if (insn->k >= BPF_MEMWORDS || - (invalid & (1 << insn->k))) - return 0; - break; - case BPF_ST: - case BPF_STX: - if (insn->k >= BPF_MEMWORDS) - return 0; - invalid &= ~(1 << insn->k); - break; - case BPF_JMP+BPF_JA: - /* - * Make sure that the target instruction of the jump is - * still part of the program, and mark it as reachable. - */ - if (insn->k >= count - pc - 1) - return 0; - target = pc + insn->k + 1; - SET_BIT(reachable, target); - meminv[target] |= invalid; - advance = 0; - break; - case BPF_JMP+BPF_JGT+BPF_K: - case BPF_JMP+BPF_JGE+BPF_K: - case BPF_JMP+BPF_JEQ+BPF_K: - case BPF_JMP+BPF_JSET+BPF_K: - case BPF_JMP+BPF_JGT+BPF_X: - case BPF_JMP+BPF_JGE+BPF_X: - case BPF_JMP+BPF_JEQ+BPF_X: - case BPF_JMP+BPF_JSET+BPF_X: - /* - * Make sure that both target instructions are still - * part of the program, and mark both as reachable. - * There is no chance that the additions will overflow. - */ - target = pc + insn->jt + 1; - if (target >= count) - return 0; - SET_BIT(reachable, target); - meminv[target] |= invalid; - - target = pc + insn->jf + 1; - if (target >= count) - return 0; - SET_BIT(reachable, target); - meminv[target] |= invalid; - - advance = 0; - break; - case BPF_RET+BPF_A: - case BPF_RET+BPF_K: - advance = 0; - break; - default: - return 0; - } - - /* - * After most instructions, we simply advance to the next. For - * one thing, this means that there must be a next instruction - * at all. - */ - if (advance) { - if (pc + 1 == count) - return 0; - SET_BIT(reachable, pc + 1); - meminv[pc + 1] |= invalid; - } - } - - /* The program has passed all our basic tests. */ - return 1; -} diff --git a/minix/net/lwip/bpfdev.c b/minix/net/lwip/bpfdev.c deleted file mode 100644 index 3e12c8dac..000000000 --- a/minix/net/lwip/bpfdev.c +++ /dev/null @@ -1,1365 +0,0 @@ -/* LWIP service - bpfdev.c - Berkeley Packet Filter (/dev/bpf) interface */ -/* - * BPF is a cloning device: opening /dev/bpf returns a new BPF device which is - * independent from any other opened BPF devices. We assume that each BPF - * device is used by one single user process, and this implementation therefore - * does not support multiple concurrent device calls on the same BPF device. - * - * Packet buffering basically follows the BSD model: each BPF device that is - * configured (that is, it has been attached to an interface) has two buffers, - * each of the configured size: a store buffer, where new packets are stored, - * and a hold buffer, which is typically full and awaiting retrieval through a - * read call from userland. The buffers are swapped ("rotated") when the store - * buffer is filled up and the hold buffer is empty - if the hold buffer is not - * empty is not empty either, additional packets are dropped. - * - * These buffers are allocated when the BPF device is attached to an interface. - * The interface may later disappear, in which case the BPF device is detached - * from it, allowing any final packets to be read before read requests start - * returning I/O errors. The buffers are freed only when the device is closed. - */ - -#include "lwip.h" -#include "bpfdev.h" - -#include -#include -#include -#include -#include - -/* - * Make sure that our implementation matches the BPF version in the NetBSD - * headers. If they change the version number, we may have to make changes - * here accordingly. - */ -#if BPF_MAJOR_VERSION != 1 || BPF_MINOR_VERSION != 1 -#error "NetBSD BPF version has changed" -#endif - -/* The number of BPF devices. */ -#define NR_BPFDEV 16 - -/* BPF receive buffer size: allowed range and default. */ -#define BPF_BUF_MIN BPF_WORDALIGN(sizeof(struct bpf_hdr)) -#define BPF_BUF_DEF 32768 -#define BPF_BUF_MAX 262144 - -/* - * By opening /dev/bpf, one will obtain a cloned device with a different minor - * number, which maps to one of the BPF devices. - */ -#define BPFDEV_MINOR 0 /* minor number of /dev/bpf */ -#define BPFDEV_BASE_MINOR 1 /* base minor number for BPF devices */ - -static struct bpfdev { - struct bpfdev_link bpf_link; /* structure link, MUST be first */ - TAILQ_ENTRY(bpfdev) bpf_next; /* next on free or interface list */ - struct ifdev *bpf_ifdev; /* associated interface, or NULL */ - unsigned int bpf_flags; /* flags (BPFF_) */ - size_t bpf_size; /* size of packet buffers */ - char *bpf_sbuf; /* store buffer (mmap'd, or NULL) */ - char *bpf_hbuf; /* hold buffer (mmap'd, or NULL) */ - size_t bpf_slen; /* used part of store buffer */ - size_t bpf_hlen; /* used part of hold buffer */ - struct bpf_insn *bpf_filter; /* verified BPF filter, or NULL */ - size_t bpf_filterlen; /* length of filter, for munmap */ - pid_t bpf_pid; /* process ID of last using process */ - clock_t bpf_timeout; /* timeout for read calls (0 = none) */ - struct { /* state for pending read request */ - endpoint_t br_endpt; /* reading endpoint, or NONE */ - cp_grant_id_t br_grant; /* grant for reader's buffer */ - cdev_id_t br_id; /* read request identifier */ - minix_timer_t br_timer; /* timer for read timeout */ - } bpf_read; - struct { /* state for pending select request */ - endpoint_t bs_endpt; /* selecting endpoint, or NONE */ - unsigned int bs_selops; /* pending select operations */ - } bpf_select; - struct { /* packet capture statistics */ - uint64_t bs_recv; /* # of packets run through filter */ - uint64_t bs_drop; /* # of packets dropped: buffer full */ - uint64_t bs_capt; /* # of packets accepted by filter */ - } bpf_stat; -} bpf_array[NR_BPFDEV]; - -#define BPFF_IN_USE 0x01 /* this BPF device object is in use */ -#define BPFF_PROMISC 0x02 /* promiscuous mode enabled */ -#define BPFF_IMMEDIATE 0x04 /* immediate mode is enabled */ -#define BPFF_SEESENT 0x08 /* also process host-sent packets */ -#define BPFF_HDRCMPLT 0x10 /* do not fill in link-layer source */ -#define BPFF_FEEDBACK 0x20 /* feed back written packet as input */ - -static TAILQ_HEAD(, bpfdev_link) bpfl_freelist; /* list of free BPF devices */ - -static struct bpf_stat bpf_stat; - -static ssize_t bpfdev_peers(struct rmib_call *, struct rmib_node *, - struct rmib_oldp *, struct rmib_newp *); - -/* The CTL_NET NET_BPF subtree. All nodes are dynamically numbered. */ -static struct rmib_node net_bpf_table[] = { - RMIB_INT(RMIB_RO, BPF_BUF_MAX, "maxbufsize", - "Maximum size for data capture buffer"), /* TODO: read-write */ - RMIB_STRUCT(RMIB_RO, sizeof(bpf_stat), &bpf_stat, "stats", - "BPF stats"), - RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, bpfdev_peers, "peers", - "BPF peers"), -}; - -static struct rmib_node net_bpf_node = - RMIB_NODE(RMIB_RO, net_bpf_table, "bpf", "BPF options"); - -/* - * Initialize the BPF module. - */ -void -bpfdev_init(void) -{ - const int mib[] = { CTL_NET, NET_BPF }; - unsigned int slot; - int r; - - /* Initialize data structures. */ - TAILQ_INIT(&bpfl_freelist); - - for (slot = 0; slot < __arraycount(bpf_array); slot++) { - bpf_array[slot].bpf_flags = 0; - - TAILQ_INSERT_TAIL(&bpfl_freelist, &bpf_array[slot].bpf_link, - bpfl_next); - } - - memset(&bpf_stat, 0, sizeof(bpf_stat)); - - /* Register the "net.bpf" subtree with the MIB service. */ - if ((r = rmib_register(mib, __arraycount(mib), &net_bpf_node)) != OK) - panic("unable to register net.bpf RMIB tree: %d", r); -} - -/* - * Given a BPF device object, return the corresponding minor number. - */ -static devminor_t -bpfdev_get_minor(struct bpfdev * bpfdev) -{ - - assert(bpfdev != NULL); - - return BPFDEV_BASE_MINOR + (devminor_t)(bpfdev - bpf_array); -} - -/* - * Given a minor number, return the corresponding BPF device object, or NULL if - * the minor number does not identify a BPF device. - */ -static struct bpfdev * -bpfdev_get_by_minor(devminor_t minor) -{ - - if (minor < BPFDEV_BASE_MINOR || - (unsigned int)minor >= BPFDEV_BASE_MINOR + __arraycount(bpf_array)) - return NULL; - - return &bpf_array[minor - BPFDEV_BASE_MINOR]; -} - -/* - * Open a BPF device, returning a cloned device instance. - */ -static int -bpfdev_open(devminor_t minor, int access __unused, endpoint_t user_endpt) -{ - struct bpfdev_link *bpfl; - struct bpfdev *bpf; - - /* Disallow opening cloned devices through device nodes. */ - if (minor != BPFDEV_MINOR) - return ENXIO; - - if (TAILQ_EMPTY(&bpfl_freelist)) - return ENOBUFS; - - bpfl = TAILQ_FIRST(&bpfl_freelist); - TAILQ_REMOVE(&bpfl_freelist, bpfl, bpfl_next); - - bpf = (struct bpfdev *)bpfl; - - memset(bpf, 0, sizeof(*bpf)); - - bpf->bpf_flags = BPFF_IN_USE | BPFF_SEESENT; - bpf->bpf_size = BPF_BUF_DEF; - bpf->bpf_pid = getnpid(user_endpt); - bpf->bpf_read.br_endpt = NONE; - bpf->bpf_select.bs_endpt = NONE; - - return CDEV_CLONED | bpfdev_get_minor(bpf); -} - -/* - * Close a BPF device. - */ -static int -bpfdev_close(devminor_t minor) -{ - struct bpfdev *bpf; - - if ((bpf = bpfdev_get_by_minor(minor)) == NULL) - return EINVAL; - - /* - * There cannot possibly be a pending read request, so we never need to - * cancel the read timer from here either. - */ - assert(bpf->bpf_read.br_endpt == NONE); - - if (bpf->bpf_sbuf != NULL) { - assert(bpf->bpf_hbuf != NULL); - - if (munmap(bpf->bpf_sbuf, bpf->bpf_size) != 0) - panic("munmap failed: %d", -errno); - if (munmap(bpf->bpf_hbuf, bpf->bpf_size) != 0) - panic("munmap failed: %d", -errno); - - bpf->bpf_sbuf = NULL; - bpf->bpf_hbuf = NULL; - } else - assert(bpf->bpf_hbuf == NULL); - - if (bpf->bpf_filter != NULL) { - assert(bpf->bpf_filterlen > 0); - - if (munmap(bpf->bpf_filter, bpf->bpf_filterlen) != 0) - panic("munmap failed: %d", -errno); - - bpf->bpf_filter = NULL; - } - - /* - * If the BPF device was attached to an interface, and that interface - * has not disappeared in the meantime, detach from it now. - */ - if (bpf->bpf_ifdev != NULL) { - if (bpf->bpf_flags & BPFF_PROMISC) - ifdev_clear_promisc(bpf->bpf_ifdev); - - ifdev_detach_bpf(bpf->bpf_ifdev, &bpf->bpf_link); - - bpf->bpf_ifdev = NULL; - } - - bpf->bpf_flags = 0; /* mark as no longer in use */ - - TAILQ_INSERT_HEAD(&bpfl_freelist, &bpf->bpf_link, bpfl_next); - - return OK; -} - -/* - * Rotate buffers for the BPF device, by swapping the store buffer and the hold - * buffer. - */ -static void -bpfdev_rotate(struct bpfdev * bpf) -{ - char *buf; - size_t len; - - /* - * When rotating, the store buffer may or may not be empty, but the - * hold buffer must always be empty. - */ - assert(bpf->bpf_hlen == 0); - - buf = bpf->bpf_sbuf; - len = bpf->bpf_slen; - bpf->bpf_sbuf = bpf->bpf_hbuf; - bpf->bpf_slen = bpf->bpf_hlen; - bpf->bpf_hbuf = buf; - bpf->bpf_hlen = len; -} - -/* - * Test whether any of the given select operations are ready on the BPF device, - * and return the set of ready operations. - */ -static unsigned int -bpfdev_test_select(struct bpfdev * bpf, unsigned int ops) -{ - unsigned int ready_ops; - - ready_ops = 0; - - /* - * The BPF device is ready for reading if the hold buffer is not empty - * (i.e.: the store buffer has been filled up completely and was - * therefore rotated) or if immediate mode is set and the store buffer - * is not empty (i.e.: any packet is available at all). In the latter - * case, the buffers will be rotated during the read. We do not - * support applying the read timeout to selects and maintaining state - * between the select and the following read, because despite that - * libpcap claims that it is the right behavior, that is just insane. - */ - if (ops & CDEV_OP_RD) { - if (bpf->bpf_ifdev == NULL) - ready_ops |= CDEV_OP_RD; - else if (bpf->bpf_hlen > 0) - ready_ops |= CDEV_OP_RD; - else if ((bpf->bpf_flags & BPFF_IMMEDIATE) && - bpf->bpf_slen > 0) - ready_ops |= CDEV_OP_RD; - } - - if (ops & CDEV_OP_WR) - ready_ops |= CDEV_OP_WR; - - return ready_ops; -} - -/* - * There has been a state change on the BPF device. If now possible, resume a - * pending select query, if any. - */ -static void -bpfdev_resume_select(struct bpfdev * bpf) -{ - unsigned int ops, ready_ops; - endpoint_t endpt; - - /* First see if there is a pending select request at all. */ - if ((endpt = bpf->bpf_select.bs_endpt) == NONE) - return; - ops = bpf->bpf_select.bs_selops; - - assert(ops != 0); - - /* Then see if any of the pending operations are now ready. */ - if ((ready_ops = bpfdev_test_select(bpf, ops)) == 0) - return; - - /* If so, notify VFS about the ready operations. */ - chardriver_reply_select(bpf->bpf_select.bs_endpt, - bpfdev_get_minor(bpf), ready_ops); - - /* - * Forget about the ready operations. If that leaves no pending - * operations, forget about the select request altogether. - */ - if ((bpf->bpf_select.bs_selops &= ~ready_ops) == 0) - bpf->bpf_select.bs_endpt = NONE; -} - -/* - * There has been a state change on the BPF device. If now possible, resume a - * pending read request, if any. If the call is a result of a timeout, - * 'is_timeout' is set. In that case, the read request must be resumed with an - * EAGAIN error if no packets are available, and the running timer must be - * canceled. Otherwise, the resumption is due to a full buffer or a - * disappeared interface, and 'is_timeout' is not set. In this case, the read - * request must be resumed with an I/O error if no packets are available. - */ -static void -bpfdev_resume_read(struct bpfdev * bpf, int is_timeout) -{ - ssize_t r; - - assert(bpf->bpf_read.br_endpt != NONE); - - /* - * If the hold buffer is still empty, see if the store buffer has - * any packets to copy out. - */ - if (bpf->bpf_hlen == 0) - bpfdev_rotate(bpf); - - /* Return any available packets, or otherwise an error. */ - if (bpf->bpf_hlen > 0) { - assert(bpf->bpf_hlen <= bpf->bpf_size); - - r = sys_safecopyto(bpf->bpf_read.br_endpt, - bpf->bpf_read.br_grant, 0, (vir_bytes)bpf->bpf_hbuf, - bpf->bpf_hlen); - - if (r == OK) { - r = (ssize_t)bpf->bpf_hlen; - - bpf->bpf_hlen = 0; - - assert(bpf->bpf_slen != bpf->bpf_size); - - /* - * Allow readers to get the last packets after the - * interface has disappeared, before getting errors. - */ - if (bpf->bpf_ifdev == NULL) - bpfdev_rotate(bpf); - } - } else - r = (is_timeout) ? EAGAIN : EIO; - - chardriver_reply_task(bpf->bpf_read.br_endpt, bpf->bpf_read.br_id, r); - - bpf->bpf_read.br_endpt = NONE; - - /* Was there still a timer running? Then cancel it now. */ - if (bpf->bpf_timeout > 0 && !is_timeout) - cancel_timer(&bpf->bpf_read.br_timer); -} - -/* - * A read timeout has triggered for the BPF device. Wake up the pending read - * request. - */ -static void -bpfdev_timeout(int arg) -{ - struct bpfdev *bpf; - - assert(arg >= 0 && (unsigned int)arg < __arraycount(bpf_array)); - - bpf = &bpf_array[arg]; - - assert(bpf->bpf_read.br_endpt != NONE); - - bpfdev_resume_read(bpf, TRUE /*is_timeout*/); -} - -/* - * Read from a BPF device. - */ -static ssize_t -bpfdev_read(devminor_t minor, uint64_t position, endpoint_t endpt, - cp_grant_id_t grant, size_t size, int flags, cdev_id_t id) -{ - struct bpfdev *bpf; - ssize_t r; - int suspend; - - if ((bpf = bpfdev_get_by_minor(minor)) == NULL) - return EINVAL; - - /* Allow only one read call at a time. */ - if (bpf->bpf_read.br_endpt != NONE) - return EIO; - - /* Has this BPF device been configured at all yet? */ - if (bpf->bpf_sbuf == NULL) - return EINVAL; - - /* - * Does the read call size match the entire buffer size? This is a - * ridiculous requirement but it makes our job quite a bit easier.. - */ - if (size != bpf->bpf_size) - return EINVAL; - - /* - * Following standard receive semantics, if the interface is gone, - * return all the packets that were pending before returning an error. - * This requires extra buffer rotations after read completion, too. - */ - if (bpf->bpf_ifdev == NULL && bpf->bpf_hlen == 0) - return EIO; - - /* - * If immediate mode is not enabled, we should always suspend the read - * call if the hold buffer is empty. If immediate mode is enabled, we - * should only suspend the read call if both buffers are empty, and - * return data from the hold buffer or otherwise the store buffer, - * whichever is not empty. A non-blocking call behaves as though - * immediate mode is enabled, except it will return EAGAIN instead of - * suspending the read call if both buffers are empty. Thus, we may - * have to rotate buffers for both immediate mode and non-blocking - * calls. The latter is necessary for libpcap to behave correctly. - */ - if ((flags & CDEV_NONBLOCK) || (bpf->bpf_flags & BPFF_IMMEDIATE)) - suspend = (bpf->bpf_hlen == 0 && bpf->bpf_slen == 0); - else - suspend = (bpf->bpf_hlen == 0); - - if (suspend) { - if (flags & CDEV_NONBLOCK) - return EAGAIN; - - /* Suspend the read call for later. */ - bpf->bpf_read.br_endpt = endpt; - bpf->bpf_read.br_grant = grant; - bpf->bpf_read.br_id = id; - - /* Set a timer if requested. */ - if (bpf->bpf_timeout > 0) - set_timer(&bpf->bpf_read.br_timer, bpf->bpf_timeout, - bpfdev_timeout, (int)(bpf - bpf_array)); - - return EDONTREPLY; - } - - /* If we get here, either buffer has data; rotate buffers if needed. */ - if (bpf->bpf_hlen == 0) - bpfdev_rotate(bpf); - assert(bpf->bpf_hlen > 0); - - if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)bpf->bpf_hbuf, - bpf->bpf_hlen)) != OK) - return r; - - r = (ssize_t)bpf->bpf_hlen; - - bpf->bpf_hlen = 0; - - /* - * If the store buffer is exactly full, rotate it now. Also, if the - * interface has disappeared, the store buffer will never fill up. - * Rotate it so that the application will get any remaining data before - * getting errors about the interface being gone. - */ - if (bpf->bpf_slen == bpf->bpf_size || bpf->bpf_ifdev == NULL) - bpfdev_rotate(bpf); - - return r; -} - -/* - * Write to a BPF device. - */ -static ssize_t -bpfdev_write(devminor_t minor, uint64_t position, endpoint_t endpt, - cp_grant_id_t grant, size_t size, int flags, cdev_id_t id) -{ - struct bpfdev *bpf; - struct pbuf *pbuf, *pptr, *pcopy; - size_t off; - err_t err; - int r; - - if ((bpf = bpfdev_get_by_minor(minor)) == NULL) - return EINVAL; - - if (bpf->bpf_ifdev == NULL) - return EINVAL; - - /* VFS skips zero-sized I/O calls right now, but that may change. */ - if (size == 0) - return 0; /* nothing to do */ - - if (size > ifdev_get_hdrlen(bpf->bpf_ifdev) + - ifdev_get_mtu(bpf->bpf_ifdev)) - return EMSGSIZE; - - if ((pbuf = pchain_alloc(PBUF_LINK, size)) == NULL) - return ENOMEM; - - /* TODO: turn this into a series of vector copies. */ - off = 0; - for (pptr = pbuf; pptr != NULL; pptr = pptr->next) { - if ((r = sys_safecopyfrom(endpt, grant, off, - (vir_bytes)pptr->payload, pptr->len)) != OK) { - pbuf_free(pbuf); - - return r; - } - off += pptr->len; - } - assert(off == size); - - /* - * In feedback mode, we cannot use the same packet buffers for both - * output and input, so make a copy. We do this before calling the - * output function, which may change part of the buffers, because the - * BSDs take this approach as well. - */ - if (bpf->bpf_flags & BPFF_FEEDBACK) { - if ((pcopy = pchain_alloc(PBUF_LINK, size)) == NULL) { - pbuf_free(pbuf); - - return ENOMEM; - } - - if (pbuf_copy(pcopy, pbuf) != ERR_OK) - panic("unexpected pbuf copy failure"); - } else - pcopy = NULL; - - /* Pass in the packet as output, and free it again. */ - err = ifdev_output(bpf->bpf_ifdev, pbuf, NULL /*netif*/, - TRUE /*to_bpf*/, !!(bpf->bpf_flags & BPFF_HDRCMPLT)); - - pbuf_free(pbuf); - - /* In feedback mode, pass in the copy as input, if output succeeded. */ - if (err == ERR_OK && (bpf->bpf_flags & BPFF_FEEDBACK)) - ifdev_input(bpf->bpf_ifdev, pcopy, NULL /*netif*/, - FALSE /*to_bpf*/); - else if (pcopy != NULL) - pbuf_free(pcopy); - - return (err == ERR_OK) ? (ssize_t)size : util_convert_err(err); -} - -/* - * Attach a BPF device to a network interface, using the interface name given - * in an ifreq structure. As side effect, allocate hold and store buffers for - * the device. These buffers will stay allocated until the device is closed, - * even though the interface may disappear before that. Return OK if the BPF - * device was successfully attached to the interface, or a negative error code - * otherwise. - */ -static int -bpfdev_attach(struct bpfdev * bpf, struct ifreq * ifr) -{ - struct ifdev *ifdev; - void *sbuf, *hbuf; - - /* Find the interface with the given name. */ - ifr->ifr_name[sizeof(ifr->ifr_name) - 1] = '\0'; - if ((ifdev = ifdev_find_by_name(ifr->ifr_name)) == NULL) - return ENXIO; - - /* - * Allocate a store buffer and a hold buffer. Preallocate the memory, - * or we might get killed later during low-memory conditions. - */ - if ((sbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) - return ENOMEM; - - if ((hbuf = (char *)mmap(NULL, bpf->bpf_size, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_PRIVATE | MAP_PREALLOC, -1, 0)) == MAP_FAILED) { - (void)munmap(sbuf, bpf->bpf_size); - - return ENOMEM; - } - - bpf->bpf_ifdev = ifdev; - bpf->bpf_sbuf = sbuf; - bpf->bpf_hbuf = hbuf; - assert(bpf->bpf_slen == 0); - assert(bpf->bpf_hlen == 0); - - ifdev_attach_bpf(ifdev, &bpf->bpf_link); - - return OK; -} - -/* - * Detach the BPF device from its interface, which is about to disappear. - */ -void -bpfdev_detach(struct bpfdev_link * bpfl) -{ - struct bpfdev *bpf = (struct bpfdev *)bpfl; - - assert(bpf->bpf_flags & BPFF_IN_USE); - assert(bpf->bpf_ifdev != NULL); - - /* - * We deliberately leave the buffers allocated here, for two reasons: - * - * 1) it lets applications to read any last packets in the buffers; - * 2) it prevents reattaching the BPF device to another interface. - */ - bpf->bpf_ifdev = NULL; - - /* - * Resume pending read and select requests, returning any data left, - * or an error if none. - */ - if (bpf->bpf_hlen == 0) - bpfdev_rotate(bpf); - - if (bpf->bpf_read.br_endpt != NONE) - bpfdev_resume_read(bpf, FALSE /*is_timeout*/); - - bpfdev_resume_select(bpf); -} - -/* - * Flush the given BPF device, resetting its buffer contents and statistics - * counters. - */ -static void -bpfdev_flush(struct bpfdev * bpf) -{ - - bpf->bpf_slen = 0; - bpf->bpf_hlen = 0; - - bpf->bpf_stat.bs_recv = 0; - bpf->bpf_stat.bs_drop = 0; - bpf->bpf_stat.bs_capt = 0; -} - -/* - * Install a filter program on the BPF device. A new filter replaces any old - * one. A zero-sized filter simply clears a previous filter. On success, - * perform a flush and return OK. On failure, return a negative error code - * without making any modifications to the current filter. - */ -static int -bpfdev_setfilter(struct bpfdev * bpf, endpoint_t endpt, cp_grant_id_t grant) -{ - struct bpf_insn *filter; - unsigned int count; - size_t len; - int r; - - if ((r = sys_safecopyfrom(endpt, grant, - offsetof(struct minix_bpf_program, mbf_len), (vir_bytes)&count, - sizeof(count))) != OK) - return r; - - if (count > BPF_MAXINSNS) - return EINVAL; - len = count * sizeof(struct bpf_insn); - - if (len > 0) { - if ((filter = (struct bpf_insn *)mmap(NULL, len, - PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0)) == - MAP_FAILED) - return ENOMEM; - - if ((r = sys_safecopyfrom(endpt, grant, - offsetof(struct minix_bpf_program, mbf_insns), - (vir_bytes)filter, len)) != OK) { - (void)munmap(filter, len); - - return r; - } - - if (!bpf_validate(filter, count)) { - (void)munmap(filter, len); - - return EINVAL; - } - } else - filter = NULL; - - if (bpf->bpf_filter != NULL) - (void)munmap(bpf->bpf_filter, bpf->bpf_filterlen); - - bpf->bpf_filter = filter; - bpf->bpf_filterlen = len; - - bpfdev_flush(bpf); - - return OK; -} - -/* - * Process an I/O control request on the BPF device. - */ -static int -bpfdev_ioctl(devminor_t minor, unsigned long request, endpoint_t endpt, - cp_grant_id_t grant, int flags, endpoint_t user_endpt, cdev_id_t id) -{ - struct bpfdev *bpf; - struct bpf_stat bs; - struct bpf_version bv; - struct bpf_dltlist bfl; - struct timeval tv; - struct ifreq ifr; - unsigned int uval; - int r, val; - - if ((bpf = bpfdev_get_by_minor(minor)) == NULL) - return EINVAL; - - /* - * We do not support multiple concurrent requests in this module. That - * not only means that we forbid a read(2) call on a BPF device object - * while another read(2) is already pending: we also disallow IOCTL - * IOCTL calls while such a read(2) call is in progress. This - * restriction should never be a problem for user programs, and allows - * us to rely on the fact that that no settings can change between the - * start and end of any read call. As a side note, pending select(2) - * queries may be similarly affected, and will also not be fully - * accurate if any options are changed while pending. - */ - if (bpf->bpf_read.br_endpt != NONE) - return EIO; - - bpf->bpf_pid = getnpid(user_endpt); - - /* These are in order of the NetBSD BIOC.. IOCTL numbers. */ - switch (request) { - case BIOCGBLEN: - uval = bpf->bpf_size; - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval)); - - case BIOCSBLEN: - if (bpf->bpf_sbuf != NULL) - return EINVAL; - - if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval))) != OK) - return r; - - if (uval < BPF_BUF_MIN) - uval = BPF_BUF_MIN; - else if (uval > BPF_BUF_MAX) - uval = BPF_BUF_MAX; - - /* Is this the right thing to do? It doesn't matter for us. */ - uval = BPF_WORDALIGN(uval); - - if ((r = sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval))) != OK) - return r; - - bpf->bpf_size = uval; - - return OK; - - case MINIX_BIOCSETF: - return bpfdev_setfilter(bpf, endpt, grant); - - case BIOCPROMISC: - if (bpf->bpf_ifdev == NULL) - return EINVAL; - - if (!(bpf->bpf_flags & BPFF_PROMISC)) { - if (!ifdev_set_promisc(bpf->bpf_ifdev)) - return EINVAL; - - bpf->bpf_flags |= BPFF_PROMISC; - } - - return OK; - - case BIOCFLUSH: - bpfdev_flush(bpf); - - return OK; - - case BIOCGDLT: - if (bpf->bpf_ifdev == NULL) - return EINVAL; - - /* TODO: support for type configuration per BPF device. */ - uval = ifdev_get_dlt(bpf->bpf_ifdev); - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval)); - - case BIOCGETIF: - if (bpf->bpf_ifdev == NULL) - return EINVAL; - - memset(&ifr, 0, sizeof(ifr)); - strlcpy(ifr.ifr_name, ifdev_get_name(bpf->bpf_ifdev), - sizeof(ifr.ifr_name)); - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&ifr, - sizeof(ifr)); - - case BIOCSETIF: - /* - * Test on the presence of a buffer rather than on an interface - * since the latter may disappear and thus be reset to NULL, in - * which case we do not want to allow rebinding to another. - */ - if (bpf->bpf_sbuf != NULL) - return EINVAL; - - if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&ifr, - sizeof(ifr))) != OK) - return r; - - return bpfdev_attach(bpf, &ifr); - - case BIOCGSTATS: - /* - * Why do we not embed a bpf_stat structure directly in the - * BPF device structure? Well, bpf_stat has massive padding.. - */ - memset(&bs, 0, sizeof(bs)); - bs.bs_recv = bpf->bpf_stat.bs_recv; - bs.bs_drop = bpf->bpf_stat.bs_drop; - bs.bs_capt = bpf->bpf_stat.bs_capt; - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bs, - sizeof(bs)); - - case BIOCIMMEDIATE: - if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval))) != OK) - return r; - - if (uval) - bpf->bpf_flags |= BPFF_IMMEDIATE; - else - bpf->bpf_flags &= ~BPFF_IMMEDIATE; - - return OK; - - case BIOCVERSION: - memset(&bv, 0, sizeof(bv)); - bv.bv_major = BPF_MAJOR_VERSION; - bv.bv_minor = BPF_MINOR_VERSION; - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bv, - sizeof(bv)); - - case BIOCGHDRCMPLT: - uval = !!(bpf->bpf_flags & BPFF_HDRCMPLT); - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval)); - - case BIOCSHDRCMPLT: - if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval))) != OK) - return r; - - if (uval) - bpf->bpf_flags |= BPFF_HDRCMPLT; - else - bpf->bpf_flags &= ~BPFF_HDRCMPLT; - - return OK; - - case BIOCSDLT: - if (bpf->bpf_ifdev == NULL) - return EINVAL; - - if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval))) != OK) - return r; - - /* TODO: support for type configuration per BPF device. */ - if (uval != ifdev_get_dlt(bpf->bpf_ifdev)) - return EINVAL; - - return OK; - - case MINIX_BIOCGDLTLIST: - if (bpf->bpf_ifdev == NULL) - return EINVAL; - - if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&bfl, - sizeof(bfl))) != OK) - return r; - - if (bfl.bfl_list != NULL) { - if (bfl.bfl_len < 1) - return ENOMEM; - - /* - * Copy out the 'list', which consists of one entry. - * If we were to produce multiple entries, we would - * have to check against the MINIX_BPF_MAXDLT limit. - */ - uval = ifdev_get_dlt(bpf->bpf_ifdev); - - if ((r = sys_safecopyto(endpt, grant, - offsetof(struct minix_bpf_dltlist, mbfl_list), - (vir_bytes)&uval, sizeof(uval))) != OK) - return r; - } - bfl.bfl_len = 1; - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&bfl, - sizeof(bfl)); - - case BIOCGSEESENT: - uval = !!(bpf->bpf_flags & BPFF_SEESENT); - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval)); - - case BIOCSSEESENT: - if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval))) != OK) - return r; - - if (uval) - bpf->bpf_flags |= BPFF_SEESENT; - else - bpf->bpf_flags &= ~BPFF_SEESENT; - - return OK; - - case BIOCSRTIMEOUT: - if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&tv, - sizeof(tv))) != OK) - return r; - - if ((r = util_timeval_to_ticks(&tv, &bpf->bpf_timeout)) != OK) - return r; - - return OK; - - case BIOCGRTIMEOUT: - util_ticks_to_timeval(bpf->bpf_timeout, &tv); - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&tv, - sizeof(tv)); - - case BIOCGFEEDBACK: - uval = !!(bpf->bpf_flags & BPFF_FEEDBACK); - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval)); - - case BIOCSFEEDBACK: - if ((r = sys_safecopyfrom(endpt, grant, 0, (vir_bytes)&uval, - sizeof(uval))) != OK) - return r; - - if (uval) - bpf->bpf_flags |= BPFF_FEEDBACK; - else - bpf->bpf_flags &= ~BPFF_FEEDBACK; - - return OK; - - case FIONREAD: - val = 0; - if (bpf->bpf_hlen > 0) - val = bpf->bpf_hlen; - else if ((bpf->bpf_flags & BPFF_IMMEDIATE) && - bpf->bpf_slen > 0) - val = bpf->bpf_slen; - else - val = 0; - - return sys_safecopyto(endpt, grant, 0, (vir_bytes)&val, - sizeof(val)); - - default: - return ENOTTY; - } -} - -/* - * Cancel a previously suspended request on a BPF device. Since only read - * requests may be suspended (select is handled differently), the cancel - * request must be for a read request. Note that character devices currently - * (still) behave slightly differently from socket devices here: while socket - * drivers are supposed to respond to the original request, character drivers - * must respond to the original request from the cancel callback. - */ -static int -bpfdev_cancel(devminor_t minor, endpoint_t endpt, cdev_id_t id) -{ - struct bpfdev *bpf; - - if ((bpf = bpfdev_get_by_minor(minor)) == NULL) - return EDONTREPLY; - - /* Is this a cancel request for the currently pending read request? */ - if (bpf->bpf_read.br_endpt != endpt || bpf->bpf_read.br_id != id) - return EDONTREPLY; - - /* If so, cancel the read request. */ - if (bpf->bpf_timeout > 0) - cancel_timer(&bpf->bpf_read.br_timer); - - bpf->bpf_read.br_endpt = NONE; - - return EINTR; /* the return value for the canceled read request */ -} - -/* - * Perform a select query on a BPF device. - */ -static int -bpfdev_select(devminor_t minor, unsigned int ops, endpoint_t endpt) -{ - struct bpfdev *bpf; - unsigned int r, notify; - - if ((bpf = bpfdev_get_by_minor(minor)) == NULL) - return EINVAL; - - notify = (ops & CDEV_NOTIFY); - ops &= (CDEV_OP_RD | CDEV_OP_WR | CDEV_OP_ERR); - - r = bpfdev_test_select(bpf, ops); - - /* - * For the operations that were not immediately ready, if requested, - * save the select request for later. - */ - ops &= ~r; - - if (ops != 0 && notify) { - if (bpf->bpf_select.bs_endpt != NONE) { - /* Merge in the operations with any earlier request. */ - if (bpf->bpf_select.bs_endpt != endpt) - return EIO; - bpf->bpf_select.bs_selops |= ops; - } else { - bpf->bpf_select.bs_endpt = endpt; - bpf->bpf_select.bs_selops = ops; - } - } - - return r; -} - -/* - * Process an incoming packet on the interface to which the given BPF device is - * attached. If the packet passes the filter (if any), store as much as - * requested of it in the store buffer, rotating buffers if needed and resuming - * suspended read and select requests as appropriate. This function is also - * called through bpfdev_output() below. - */ -void -bpfdev_input(struct bpfdev_link * bpfl, const struct pbuf * pbuf) -{ - struct bpfdev *bpf = (struct bpfdev *)bpfl; - struct timespec ts; - struct bpf_hdr bh; - const struct pbuf *pptr; - size_t caplen, hdrlen, totlen, off, chunk; - int hfull; - - /* - * Apparently bs_recv is the counter of packets that were run through - * the filter, not the number of packets that were or could be received - * by the user (which is what I got from the manual page.. oh well). - */ - bpf->bpf_stat.bs_recv++; - bpf_stat.bs_recv++; - - /* - * Run the packet through the BPF device's filter to see whether the - * packet should be stored and if so, how much of it. If no filter is - * set, all packets will be stored in their entirety. - */ - caplen = bpf_filter_ext(bpf->bpf_filter, pbuf, (u_char *)pbuf->payload, - pbuf->tot_len, pbuf->len); - - if (caplen == 0) - return; /* no match; ignore packet */ - - if (caplen > pbuf->tot_len) - caplen = pbuf->tot_len; - - /* Truncate packet entries to the full size of the buffers. */ - hdrlen = BPF_WORDALIGN(sizeof(bh)); - totlen = BPF_WORDALIGN(hdrlen + caplen); - - if (totlen > bpf->bpf_size) { - totlen = bpf->bpf_size; - caplen = totlen - hdrlen; - } - assert(totlen >= hdrlen); - - bpf->bpf_stat.bs_capt++; - bpf_stat.bs_capt++; - - assert(bpf->bpf_sbuf != NULL); - if (totlen > bpf->bpf_size - bpf->bpf_slen) { - /* - * If the store buffer is full and the hold buffer is not - * empty, we cannot swap the two buffers, and so we must drop - * the current packet. - */ - if (bpf->bpf_hlen > 0) { - bpf->bpf_stat.bs_drop++; - bpf_stat.bs_drop++; - - return; - } - - /* - * Rotate the buffers: the hold buffer will now be "full" and - * ready to be read - it may not actually be entirely full, but - * we could not fit this packet and we are not going to deliver - * packets out of order.. - */ - bpfdev_rotate(bpf); - - hfull = TRUE; - } else - hfull = FALSE; - - /* - * Retrieve the capture time for the packet. Ideally this would be - * done only once per accepted packet, but we do not expect many BPF - * devices to be receiving the same packets often enough to make that - * worth it. - */ - clock_time(&ts); - - /* - * Copy the packet into the store buffer, including a newly generated - * header. Zero any padding areas, even if strictly not necessary. - */ - memset(&bh, 0, sizeof(bh)); - bh.bh_tstamp.tv_sec = ts.tv_sec; - bh.bh_tstamp.tv_usec = ts.tv_nsec / 1000; - bh.bh_caplen = caplen; - bh.bh_datalen = pbuf->tot_len; - bh.bh_hdrlen = hdrlen; - - assert(bpf->bpf_sbuf != NULL); - off = bpf->bpf_slen; - - memcpy(&bpf->bpf_sbuf[off], &bh, sizeof(bh)); - if (hdrlen > sizeof(bh)) - memset(&bpf->bpf_sbuf[off + sizeof(bh)], 0, - hdrlen - sizeof(bh)); - off += hdrlen; - - for (pptr = pbuf; pptr != NULL && caplen > 0; pptr = pptr->next) { - chunk = pptr->len; - if (chunk > caplen) - chunk = caplen; - - memcpy(&bpf->bpf_sbuf[off], pptr->payload, chunk); - - off += chunk; - caplen -= chunk; - } - - assert(off <= bpf->bpf_slen + totlen); - if (bpf->bpf_slen + totlen > off) - memset(&bpf->bpf_sbuf[off], 0, bpf->bpf_slen + totlen - off); - - bpf->bpf_slen += totlen; - - /* - * Edge case: if the hold buffer is empty and the store buffer is now - * exactly full, rotate buffers so that the packets can be read - * immediately, without waiting for the next packet to cause rotation. - */ - if (bpf->bpf_hlen == 0 && bpf->bpf_slen == bpf->bpf_size) { - bpfdev_rotate(bpf); - - hfull = TRUE; - } - - /* - * If the hold buffer is now full, or if immediate mode is enabled, - * then we now have data to deliver to userland. See if we can wake up - * any read or select call (either but not both here). - */ - if (hfull || (bpf->bpf_flags & BPFF_IMMEDIATE)) { - if (bpf->bpf_read.br_endpt != NONE) - bpfdev_resume_read(bpf, FALSE /*is_timeout*/); - else - bpfdev_resume_select(bpf); - } -} - -/* - * Process an outgoing packet on the interface to which the given BPF device is - * attached. If the BPF device is configured to capture outgoing packets as - * well, attempt to capture the packet as per bpfdev_input(). - */ -void -bpfdev_output(struct bpfdev_link * bpfl, const struct pbuf * pbuf) -{ - struct bpfdev *bpf = (struct bpfdev *)bpfl; - - if (bpf->bpf_flags & BPFF_SEESENT) - bpfdev_input(bpfl, pbuf); -} - -/* - * Fill the given 'bde' structure with information about BPF device 'bpf'. - */ -static void -bpfdev_get_info(struct bpf_d_ext * bde, const struct bpfdev * bpf) -{ - - bde->bde_bufsize = bpf->bpf_size; - bde->bde_promisc = !!(bpf->bpf_flags & BPFF_PROMISC); - bde->bde_state = BPF_IDLE; - bde->bde_immediate = !!(bpf->bpf_flags & BPFF_IMMEDIATE); - bde->bde_hdrcmplt = !!(bpf->bpf_flags & BPFF_HDRCMPLT); - bde->bde_seesent = !!(bpf->bpf_flags & BPFF_SEESENT); - /* - * NetBSD updates the process ID upon device open, close, ioctl, and - * poll. From those, only open and ioctl make sense for us. Sadly - * there is no way to indicate "no known PID" to netstat(1), so we - * cannot even save just the endpoint and look up the corresponding PID - * later, since the user process may be gone by then. - */ - bde->bde_pid = bpf->bpf_pid; - bde->bde_rcount = bpf->bpf_stat.bs_recv; - bde->bde_dcount = bpf->bpf_stat.bs_drop; - bde->bde_ccount = bpf->bpf_stat.bs_capt; - if (bpf->bpf_ifdev != NULL) - strlcpy(bde->bde_ifname, ifdev_get_name(bpf->bpf_ifdev), - sizeof(bde->bde_ifname)); -} - -/* - * Obtain statistics about open BPF devices ("peers"). This node may be - * accessed by the superuser only. Used by netstat(1). - */ -static ssize_t -bpfdev_peers(struct rmib_call * call, struct rmib_node * node __unused, - struct rmib_oldp * oldp, struct rmib_newp * newp __unused) -{ - struct bpfdev *bpf; - struct bpf_d_ext bde; - unsigned int slot; - ssize_t off; - int r, size, max; - - if (!(call->call_flags & RMIB_FLAG_AUTH)) - return EPERM; - - if (call->call_namelen != 2) - return EINVAL; - - size = call->call_name[0]; - if (size < 0 || (size_t)size > sizeof(bde)) - return EINVAL; - if (size == 0) - size = sizeof(bde); - max = call->call_name[1]; - - off = 0; - - for (slot = 0; slot < __arraycount(bpf_array); slot++) { - bpf = &bpf_array[slot]; - - if (!(bpf->bpf_flags & BPFF_IN_USE)) - continue; - - if (rmib_inrange(oldp, off)) { - memset(&bde, 0, sizeof(bde)); - - bpfdev_get_info(&bde, bpf); - - if ((r = rmib_copyout(oldp, off, &bde, size)) < 0) - return r; - } - - off += sizeof(bde); - if (max > 0 && --max == 0) - break; - } - - /* No slack needed: netstat(1) resizes its buffer as needed. */ - return off; -} - -static const struct chardriver bpfdev_tab = { - .cdr_open = bpfdev_open, - .cdr_close = bpfdev_close, - .cdr_read = bpfdev_read, - .cdr_write = bpfdev_write, - .cdr_ioctl = bpfdev_ioctl, - .cdr_cancel = bpfdev_cancel, - .cdr_select = bpfdev_select -}; - -/* - * Process a character driver request. Since the LWIP service offers character - * devices for BPF only, it must be a request for a BPF device. - */ -void -bpfdev_process(message * m_ptr, int ipc_status) -{ - - chardriver_process(&bpfdev_tab, m_ptr, ipc_status); -} diff --git a/minix/net/lwip/ethif.c b/minix/net/lwip/ethif.c deleted file mode 100644 index 863b12e48..000000000 --- a/minix/net/lwip/ethif.c +++ /dev/null @@ -1,1718 +0,0 @@ -/* LWIP service - ethif.c - ethernet interfaces */ -/* - * The most important aspect of this module is to maintain a send queue for the - * interface. This send queue consists of packets to send. At times, the user - * may request a change to the driver configuration. While configuration - * requests would ideally be enqueued in the send queue, this has proven too - * problematic to work in practice, especially since out-of-memory conditions - * may prevent configuration requests from being accepted immediately in such a - * model. Instead, we take a simple and blunt approach: configuration requests - * "cut in line" and thus take precedence over pending packets in the send - * queue. This may not always be entirely correct: for example, packets may be - * transmitted with the old ethernet address after the network device has - * already been reconfigured to receive from a new ethernet address. However, - * this should not be a real problem, and we take care explicitly of perhaps - * the most problematic case: packets not getting checksummed due to checksum - * offloading configuration changes. - * - * Even with this blunt approach, we maintain three concurrent configurations: - * the active, the pending, and the wanted configuration. The active one is - * the last known active configuration at the network driver. It used not only - * to report whether the device is in RUNNING state, but also to replay the - * active configuration to a restarted driver. The pending configuration is - * a partially new configuration that has been given to ndev to send to the - * driver, but not yet acknowledged by the driver. Finally, the wanted - * configuration is the latest one that has yet to be given to ndev. - * - * Each configuration has a bitmask indicating which part of the configuration - * has changed, in order to limit work on the driver side. This is also the - * reason that the pending and wanted configurations are separate: if e.g. a - * media change is pending at the driver, and the user also requests a mode - * change, we do not want the media change to be repeated after it has been - * acknowleged by the driver, just to change the mode as well. In this example - * the pending configuration will have NDEV_SET_MEDIA set, and the wanted - * configuration will have NDEV_SET_MODE set. Once acknowledged, the pending - * bitmask is cleared and the wanted bitmask is tested to see if another - * configuration change should be given to ndev. Technically, this could lead - * to starvation of actual packet transmission, but we expect configuration - * changes to be very rare, since they are always user initiated. - * - * It is important to note for understanding the code that for some fields - * (mode, flags, caps), the three configurations are cascading: even though the - * wanted configuration may not have NDEV_SET_MODE set, its mode field will - * still contain the most recently requested mode; that is, the mode in the - * pending configuration if that one has NDEV_SET_MODE set, or otherwise the - * mode in the active configuration. For that reason, we carefully merge - * configuration requests into the next level (wanted -> pending -> active), - * updating just the fields that have been changed by the previous level. This - * approach simplifies obtaining current values a lot, but is not very obvious. - * - * Also, we never send multiple configuration requests at once, even though - * ndev would let us do that: we use a single array for the list of multicast - * ethernet addresses that we send to the driver, which the driver may retrieve - * (using a memory grant) at any time. We necessarily recompute the multicast - * list before sending a configuration request, and thus, sending multiple - * requests at once may lead to the driver retrieving a corrupted list. - */ - -#include "lwip.h" -#include "ethif.h" - -#include "lwip/etharp.h" -#include "lwip/ethip6.h" -#include "lwip/igmp.h" -#include "lwip/mld6.h" - -#include - -#define ETHIF_MAX_MTU 1500 /* maximum MTU value for ethernet */ -#define ETHIF_DEF_MTU ETHIF_MAX_MTU /* default MTU value that we use */ - -#define ETHIF_MCAST_MAX 8 /* maximum number of multicast addresses */ - -struct ethif { - struct ifdev ethif_ifdev; /* interface device, MUST be first */ - ndev_id_t ethif_ndev; /* network device ID */ - unsigned int ethif_flags; /* interface flags (ETHIFF_) */ - uint32_t ethif_caps; /* driver capabilities (NDEV_CAPS_) */ - uint32_t ethif_media; /* driver-reported media type (IFM_) */ - struct ndev_conf ethif_active; /* active configuration (at driver) */ - struct ndev_conf ethif_pending; /* pending configuration (at ndev) */ - struct ndev_conf ethif_wanted; /* desired configuration (waiting) */ - struct ndev_hwaddr ethif_mclist[ETHIF_MCAST_MAX]; /* multicast list */ - struct { /* send queue (packet/conf refs) */ - struct pbuf *es_head; /* first (oldest) request reference */ - struct pbuf **es_unsentp; /* ptr-ptr to first unsent request */ - struct pbuf **es_tailp; /* ptr-ptr for adding new requests */ - unsigned int es_count; /* buffer count, see ETHIF_PBUF_.. */ - } ethif_snd; - struct { /* receive queue (packets) */ - struct pbuf *er_head; /* first (oldest) request buffer */ - struct pbuf **er_tailp; /* ptr-ptr for adding new requests */ - } ethif_rcv; - SIMPLEQ_ENTRY(ethif) ethif_next; /* next in free list */ -} ethif_array[NR_NDEV]; /* any other value would be suboptimal */ - -#define ethif_get_name(ethif) (ifdev_get_name(&(ethif)->ethif_ifdev)) -#define ethif_get_netif(ethif) (ifdev_get_netif(&(ethif)->ethif_ifdev)) - -#define ETHIFF_DISABLED 0x01 /* driver has disappeared */ -#define ETHIFF_FIRST_CONF 0x02 /* first configuration request sent */ - -/* - * Send queue limit settings. Both are counted in number of pbuf objects. - * ETHIF_PBUF_MIN is the minimum number of pbuf objects that can always be - * enqueued on a particular interface's send queue. It should be at least the - * number of pbufs for one single packet after being reduced to the ndev limit, - * so NDEV_IOV_MAX (8) is a natural fit. The ETHIF_PBUF_MAX_n values define - * the maximum number of pbufs that may be used by all interface send queues - * combined, whichever of the two is smaller. The resulting number must be set - * fairly high, because at any time there may be a lot of active TCP sockets - * that all generate a (multi-pbuf) packet as a result of a clock tick. It is - * currently a function of the size of the buffer pool, capped to a value that - * is a function of the number of TCP sockets (assuming one packet per socket; - * up to MSS/BUFSIZE+1 data pbufs, one header pbuf, one extra as margin). The - * difference between the per-interface guaranteed minimum and the global - * maximum is what makes up a pool of "spares", which are really just tokens - * allowing for enqueuing of that many pbufs. - */ -#define ETHIF_PBUF_MIN (NDEV_IOV_MAX) -#define ETHIF_PBUF_MAX_1 (mempool_cur_buffers() >> 1) -#define ETHIF_PBUF_MAX_2 (NR_TCPSOCK * (TCP_MSS / MEMPOOL_BUFSIZE + 3)) - -static unsigned int ethif_spares; - -static SIMPLEQ_HEAD(, ethif) ethif_freelist; /* free ethif objects */ - -static const struct ifdev_ops ethif_ops; - -#ifdef INET6 -static ip6_addr_t ethif_ip6addr_allnodes_ll; -#endif /* INET6 */ - -/* - * Initialize the ethernet interfaces module. - */ -void -ethif_init(void) -{ - unsigned int slot; - - /* Initialize the list of free ethif objects. */ - SIMPLEQ_INIT(ðif_freelist); - - for (slot = 0; slot < __arraycount(ethif_array); slot++) - SIMPLEQ_INSERT_TAIL(ðif_freelist, ðif_array[slot], - ethif_next); - - /* Initialize the number of in-use spare tokens. */ - ethif_spares = 0; - -#ifdef INET6 - /* Preinitialize the link-local all-nodes IPv6 multicast address. */ - ip6_addr_set_allnodes_linklocal(ðif_ip6addr_allnodes_ll); -#endif /* INET6 */ -} - -/* - * As the result of some event, the NetBSD-style interface flags for this - * interface may have changed. Recompute and update the flags as appropriate. - */ -static void -ethif_update_ifflags(struct ethif * ethif) -{ - unsigned int ifflags; - - ifflags = ifdev_get_ifflags(ðif->ethif_ifdev); - - /* These are the flags that we might update here. */ - ifflags &= ~(IFF_RUNNING | IFF_ALLMULTI); - - /* - * For us, the RUNNING flag indicates that -as far as we know- the - * network device is fully operational and has its I/O engines running. - * This is a reflection of the current state, not of any intention, so - * we look at the active configuration here. We use the same approach - * for one other receive state flags here (ALLMULTI). - */ - if ((ethif->ethif_flags & - (ETHIFF_DISABLED | ETHIFF_FIRST_CONF)) == 0 && - ethif->ethif_active.nconf_mode != NDEV_MODE_DOWN) { - ifflags |= IFF_RUNNING; - - if (ethif->ethif_active.nconf_mode & NDEV_MODE_MCAST_ALL) - ifflags |= IFF_ALLMULTI; - } - - ifdev_update_ifflags(ðif->ethif_ifdev, ifflags); -} - -/* - * Add a multicast hardware receive address into the set of hardware addresses - * in the given configuration, if the given address is not already in the - * configuration's set. Adjust the configuration's mode as needed. Return - * TRUE If the address was added, and FALSE if the address could not be added - * due to a full list (of 'max' elements), in which case the mode is changed - * from receiving from listed multicast addresses to receiving from all - * multicast addresses. - */ -static int -ethif_add_mcast(struct ndev_conf * nconf, unsigned int max, - struct ndev_hwaddr * hwaddr) -{ - unsigned int slot; - - /* - * See if the hardware address is already in the list we produced so - * far. This makes the multicast list generation O(n^2) but we do not - * expect many entries nor is the list size large anyway. - */ - for (slot = 0; slot < nconf->nconf_mccount; slot++) - if (!memcmp(&nconf->nconf_mclist[slot], hwaddr, - sizeof(*hwaddr))) - return TRUE; - - if (nconf->nconf_mccount < max) { - memcpy(&nconf->nconf_mclist[slot], hwaddr, sizeof(*hwaddr)); - nconf->nconf_mccount++; - - nconf->nconf_mode |= NDEV_MODE_MCAST_LIST; - - return TRUE; - } else { - nconf->nconf_mode &= ~NDEV_MODE_MCAST_LIST; - nconf->nconf_mode |= NDEV_MODE_MCAST_ALL; - - return FALSE; - } -} - -/* - * Add the ethernet hardware address derived from the given IPv4 multicast - * address, to the list of multicast addresses. - */ -static int -ethif_add_mcast_v4(struct ndev_conf * nconf, unsigned int max, - const ip4_addr_t * ip4addr) -{ - struct ndev_hwaddr hwaddr; - - /* 01:00:05:xx:xx:xx with the lower 23 bits of the IPv4 address. */ - hwaddr.nhwa_addr[0] = LL_IP4_MULTICAST_ADDR_0; - hwaddr.nhwa_addr[1] = LL_IP4_MULTICAST_ADDR_1; - hwaddr.nhwa_addr[2] = LL_IP4_MULTICAST_ADDR_2; - hwaddr.nhwa_addr[3] = (ip4_addr_get_u32(ip4addr) >> 16) & 0x7f; - hwaddr.nhwa_addr[4] = (ip4_addr_get_u32(ip4addr) >> 8) & 0xff; - hwaddr.nhwa_addr[5] = (ip4_addr_get_u32(ip4addr) >> 0) & 0xff; - - return ethif_add_mcast(nconf, max, &hwaddr); -} - -/* - * Add the ethernet hardware address derived from the given IPv6 multicast - * address, to the list of multicast addresses. - */ -static int -ethif_add_mcast_v6(struct ndev_conf * nconf, unsigned int max, - const ip6_addr_t * ip6addr) -{ - struct ndev_hwaddr hwaddr; - - /* 33:33:xx:xx:xx:xx with the lower 32 bits of the IPv6 address. */ - hwaddr.nhwa_addr[0] = LL_IP6_MULTICAST_ADDR_0; - hwaddr.nhwa_addr[1] = LL_IP6_MULTICAST_ADDR_1; - memcpy(&hwaddr.nhwa_addr[2], &ip6addr->addr[3], sizeof(uint32_t)); - - return ethif_add_mcast(nconf, max, &hwaddr); -} - -/* - * Set up the multicast mode for a configuration that is to be sent to a - * network driver, generating a multicast receive address list for the driver - * as applicable. - */ -static void -ethif_gen_mcast(struct ethif * ethif, struct ndev_conf * nconf) -{ - struct igmp_group *group4; - struct mld_group *group6; - unsigned int max; - - /* Make sure that multicast is supported at all for this interface. */ - if (!(ethif->ethif_caps & NDEV_CAP_MCAST)) - return; - - /* Make sure the mode is being (re)configured to be up. */ - if (!(nconf->nconf_set & NDEV_SET_MODE) || - nconf->nconf_mode == NDEV_MODE_DOWN) - return; - - /* Recompute the desired multicast flags. */ - nconf->nconf_mode &= ~(NDEV_MODE_MCAST_LIST | NDEV_MODE_MCAST_ALL); - - /* If promiscuous mode is enabled, receive all multicast packets. */ - if (nconf->nconf_mode & NDEV_MODE_PROMISC) { - nconf->nconf_mode |= NDEV_MODE_MCAST_ALL; - - return; - } - - /* - * Map all IGMP/MLD6 multicast addresses to ethernet addresses, merging - * any duplicates to save slots. We have to add the MLD6 all-nodes - * multicast address ourselves, which also means the list is never - * empty unless compiling with USE_INET6=no. If the list is too small - * for all addresses, opt to receive all multicast packets instead. - */ - nconf->nconf_mclist = ethif->ethif_mclist; - nconf->nconf_mccount = 0; - max = __arraycount(ethif->ethif_mclist); - - for (group4 = netif_igmp_data(ethif_get_netif(ethif)); group4 != NULL; - group4 = group4->next) - if (!ethif_add_mcast_v4(nconf, max, &group4->group_address)) - return; - -#ifdef INET6 - if (!ethif_add_mcast_v6(nconf, max, ðif_ip6addr_allnodes_ll)) - return; -#endif /* INET6 */ - - for (group6 = netif_mld6_data(ethif_get_netif(ethif)); group6 != NULL; - group6 = group6->next) - if (!ethif_add_mcast_v6(nconf, max, &group6->group_address)) - return; -} - -/* - * Merge a source configuration into a destination configuration, copying any - * fields intended to be set from the source into the destination and clearing - * the "set" mask in the source, without changing the source fields, so that - * the source will reflect the destination's contents. - */ -static void -ethif_merge_conf(struct ndev_conf * dconf, struct ndev_conf * sconf) -{ - - dconf->nconf_set |= sconf->nconf_set; - - if (sconf->nconf_set & NDEV_SET_MODE) - dconf->nconf_mode = sconf->nconf_mode; - if (sconf->nconf_set & NDEV_SET_CAPS) - dconf->nconf_caps = sconf->nconf_caps; - if (sconf->nconf_set & NDEV_SET_FLAGS) - dconf->nconf_flags = sconf->nconf_flags; - if (sconf->nconf_set & NDEV_SET_MEDIA) - dconf->nconf_media = sconf->nconf_media; - if (sconf->nconf_set & NDEV_SET_HWADDR) - memcpy(&dconf->nconf_hwaddr, &sconf->nconf_hwaddr, - sizeof(dconf->nconf_hwaddr)); - - sconf->nconf_set = 0; -} - -/* - * Return TRUE if we can and should try to pass a configuration request to the - * ndev layer on this interface, or FALSE otherwise. - */ -static int -ethif_can_conf(struct ethif * ethif) -{ - - /* Is there a configuration change waiting? The common case is no. */ - if (ethif->ethif_wanted.nconf_set == 0) - return FALSE; - - /* - * Is there a configuration change pending already? Then wait for it - * to be acknowledged first. - */ - if (ethif->ethif_pending.nconf_set != 0) - return FALSE; - - /* Make sure the interface is in the appropriate state. */ - if (ethif->ethif_flags & ETHIFF_DISABLED) - return FALSE; - - /* First let all current packet send requests finish. */ - return (ethif->ethif_snd.es_unsentp == ðif->ethif_snd.es_head); -} - -/* - * Return TRUE if we can and should try to pass the next unsent packet send - * request to the ndev layer on this interface, or FALSE otherwise. - */ -static int -ethif_can_send(struct ethif * ethif) -{ - - /* Is there anything to hand to ndev at all? The common case is no. */ - if (*ethif->ethif_snd.es_unsentp == NULL) - return FALSE; - - /* - * Is there a configuration change pending? Then we cannot send - * packets yet. Always let all configuration changes through first. - */ - if (ethif->ethif_pending.nconf_set != 0 || - ethif->ethif_wanted.nconf_set != 0) - return FALSE; - - /* Make sure the interface is in the appropriate state. */ - if ((ethif->ethif_flags & (ETHIFF_DISABLED | ETHIFF_FIRST_CONF)) != 0) - return FALSE; - - return TRUE; -} - -/* - * Return TRUE if we can and should try to receive packets on this interface - * and are ready to accept received packets, or FALSE otherwise. - */ -static int -ethif_can_recv(struct ethif * ethif) -{ - - if ((ethif->ethif_flags & (ETHIFF_DISABLED | ETHIFF_FIRST_CONF)) != 0) - return FALSE; - - /* - * We do not check the link status here. There is no reason not to - * spawn receive requests, or accept received packets, while the link - * is reported to be down. - */ - return ifdev_is_up(ðif->ethif_ifdev); -} - -/* - * Polling function, invoked after each message loop iteration. Check whether - * any configuration change or packets can be sent to the driver, and whether - * any new packet receive requests can be enqueued at the driver. - */ -static void -ethif_poll(struct ifdev * ifdev) -{ - struct ethif *ethif = (struct ethif *)ifdev; - struct pbuf *pbuf, *pref; - - /* - * If a configuration request is desired, see if we can send it to the - * driver now. Otherwise, attempt to send any packets if possible. - * In both cases, a failure of the ndev call indicates that we should - * try again later. - */ - if (ethif_can_conf(ethif)) { - ethif_gen_mcast(ethif, ðif->ethif_wanted); - - /* - * On success, move the wanted configuration into the pending - * slot. Otherwise, try again on the next poll iteration. - */ - if (ndev_conf(ethif->ethif_ndev, ðif->ethif_wanted) == OK) - ethif_merge_conf(ðif->ethif_pending, - ðif->ethif_wanted); - } else { - while (ethif_can_send(ethif)) { - pref = *ethif->ethif_snd.es_unsentp; - - if (pref->type == PBUF_REF) - pbuf = (struct pbuf *)pref->payload; - else - pbuf = pref; - - if (ndev_send(ethif->ethif_ndev, pbuf) == OK) - ethif->ethif_snd.es_unsentp = - pchain_end(pref); - else - break; - } - } - - /* - * Attempt to create additional receive requests for the driver, if - * applicable. We currently do not set a limit on the maximum number - * of concurrently pending receive requests here, because the maximum - * in ndev is already quite low. That may have to be changed one day. - */ - while (ethif_can_recv(ethif) && ndev_can_recv(ethif->ethif_ndev)) { - /* - * Allocate a buffer for the network device driver to copy the - * received packet into. Allocation may fail if no buffers are - * available at this time; in that case simply try again later. - * We add room for a VLAN tag even though we do not support - * such tags just yet. - */ - if ((pbuf = pchain_alloc(PBUF_RAW, ETH_PAD_LEN + ETH_HDR_LEN + - ETHIF_MAX_MTU + NDEV_ETH_PACKET_TAG)) == NULL) - break; - - /* - * Effectively throw away two bytes in order to align TCP/IP - * header fields to 32 bits. See the short discussion in - * lwipopts.h as to why we are not using lwIP's ETH_PAD_SIZE. - */ - util_pbuf_header(pbuf, -ETH_PAD_LEN); - - /* - * Send the request to the driver. This may still fail due to - * grant allocation failure, in which case we try again later. - */ - if (ndev_recv(ethif->ethif_ndev, pbuf) != OK) { - pbuf_free(pbuf); - - break; - } - - /* - * Hold on to the packet buffer until the receive request - * completes or is aborted, or the driver disappears. - */ - *ethif->ethif_rcv.er_tailp = pbuf; - ethif->ethif_rcv.er_tailp = pchain_end(pbuf); - } -} - -/* - * Complete the link-layer header of the packet by filling in a source address. - * This is relevant for BPF-generated packets only, and thus we can safely - * modify the given pbuf. - */ -static void -ethif_hdrcmplt(struct ifdev * ifdev, struct pbuf * pbuf) -{ - struct netif *netif; - - /* Make sure there is an ethernet packet header at all. */ - if (pbuf->len < ETH_HDR_LEN) - return; - - netif = ifdev_get_netif(ifdev); - - /* - * Insert the source ethernet address into the packet. The source - * address is located right after the destination address at the start - * of the packet. - */ - memcpy((uint8_t *)pbuf->payload + netif->hwaddr_len, netif->hwaddr, - netif->hwaddr_len); -} - -/* - * Return TRUE if the given additional number of spare tokens may be used, or - * FALSE if the limit has been reached. Each spare token represents one - * enqueued pbuf. The limit must be such that we do not impede normal traffic - * but also do not spend the entire buffer pool on enqueued packets. - */ -static int -ethif_can_spare(unsigned int spares) -{ - unsigned int max; - - /* - * Use the configured maximum, which depends on the current size of the - * buffer pool. - */ - max = ETHIF_PBUF_MAX_1; - - /* - * However, limit the total to a value based on the maximum number of - * TCP packets that can, in the worst case, be expected to queue up at - * any single moment. - */ - if (max > ETHIF_PBUF_MAX_2) - max = ETHIF_PBUF_MAX_2; - - return (spares + ethif_spares <= max - ETHIF_PBUF_MIN * NR_NDEV); -} - -/* - * Process a packet as output on an ethernet interface. - */ -static err_t -ethif_output(struct ifdev * ifdev, struct pbuf * pbuf, struct netif * netif) -{ - struct ethif *ethif = (struct ethif *)ifdev; - struct pbuf *pref, *pcopy; - size_t padding; - unsigned int count, spares; - - /* Packets must never be sent on behalf of another interface. */ - assert(netif == NULL); - - /* - * The caller already rejects packets while the interface or link is - * down. We do want to keep enqueuing packets while the driver is - * restarting, so do not check ETHIFF_DISABLED or ETHIFF_FIRST_CONF. - */ - - /* - * Reject oversized packets immediately. This should not happen. - * Undersized packets are padded below. - */ - if (pbuf->tot_len > NDEV_ETH_PACKET_MAX) { - printf("LWIP: attempt to send oversized ethernet packet " - "(size %u)\n", pbuf->tot_len); - util_stacktrace(); - - return ERR_MEM; - } - - /* - * The original lwIP idea for processing output packets is that we make - * a copy of the packet here, so that lwIP is free to do whatever it - * wants with the original packet (e.g., keep on the TCP retransmission - * queue). More recently, lwIP has made progress towards allowing the - * packet to be referenced only, decreasing the reference count only - * once the packet has been actually sent. For many embedded systems, - * that change now allows zero-copy transmission with direct DMA from - * the provided packet buffer. We are not so lucky: we have to make an - * additional inter-process copy anyway. We do however use the same - * referencing system to avoid having to make yet another copy of the - * packet here. - * - * There was previously a check on (pbuf->ref > 1) here, to ensure that - * we would never enqueue packets that are retransmitted while we were - * still in the process of sending the initial copy. Now that for ARP - * and NDP queuing, packets are referenced rather than copied (lwIP - * patch #9272), we can no longer perform that check: packets may - * legitimately have a reference count of 2 at this point. The second - * reference will be dropped by the caller immediately after we return. - */ - - /* - * There are two cases in which we need to make a copy of the packet - * after all: - * - * 1) in the case that the packet needs to be padded in order to reach - * the minimum ethernet packet size (for drivers' convenience); - * 2) in the (much more exceptional) case that the given pbuf chain - * exceeds the maximum vector size for network driver requests. - */ - if (NDEV_ETH_PACKET_MIN > pbuf->tot_len) - padding = NDEV_ETH_PACKET_MIN - pbuf->tot_len; - else - padding = 0; - - count = pbuf_clen(pbuf); - - if (padding != 0 || count > NDEV_IOV_MAX) { - pcopy = pchain_alloc(PBUF_RAW, pbuf->tot_len + padding); - if (pcopy == NULL) { - ifdev_output_drop(ifdev); - - return ERR_MEM; - } - - if (pbuf_copy(pcopy, pbuf) != ERR_OK) - panic("unexpected pbuf copy failure"); - - if (padding > 0) { - /* - * This restriction can be lifted if needed, but it - * involves hairy pbuf traversal and our standard pool - * size should be way in excess of the minimum packet - * size. - */ - assert(pcopy->len == pbuf->tot_len + padding); - - memset((char *)pcopy->payload + pbuf->tot_len, 0, - padding); - } - - count = pbuf_clen(pcopy); - assert(count <= NDEV_IOV_MAX); - - pbuf = pcopy; - } else - pcopy = NULL; - - /* - * Restrict the size of the send queue, so that it will not exhaust the - * buffer pool. - */ - if (ethif->ethif_snd.es_count >= ETHIF_PBUF_MIN) - spares = count; - else if (ethif->ethif_snd.es_count + count > ETHIF_PBUF_MIN) - spares = ethif->ethif_snd.es_count + count - ETHIF_PBUF_MIN; - else - spares = 0; - - if (spares > 0 && !ethif_can_spare(spares)) { - if (pcopy != NULL) - pbuf_free(pcopy); - - ifdev_output_drop(ifdev); - - return ERR_MEM; - } - - /* - * A side effect of the referencing approach is that we cannot touch - * the last pbuf's "next" pointer. Thus, we need another way of - * linking together the buffers on the send queue. We use a linked - * list of PBUF_REF-type buffers for this instead. However, do this - * only when we have not made a copy of the original pbuf, because then - * we might as well use the copy instead. - */ - if (pcopy == NULL) { - if ((pref = pbuf_alloc(PBUF_RAW, 0, PBUF_REF)) == NULL) { - ifdev_output_drop(ifdev); - - return ERR_MEM; - } - - pbuf_ref(pbuf); - - pref->payload = pbuf; - pref->tot_len = 0; - pref->len = count; - } else - pref = pcopy; - - /* If the send queue was empty so far, set the IFF_OACTIVE flag. */ - if (ethif->ethif_snd.es_head == NULL) - ifdev_update_ifflags(ðif->ethif_ifdev, - ifdev_get_ifflags(ðif->ethif_ifdev) | IFF_OACTIVE); - - /* - * Enqueue the packet on the send queue. It will be sent from the - * polling function as soon as possible. TODO: see if sending it from - * here makes any performance difference at all. - */ - *ethif->ethif_snd.es_tailp = pref; - ethif->ethif_snd.es_tailp = pchain_end(pref); - - ethif->ethif_snd.es_count += count; - ethif_spares += spares; - - return ERR_OK; -} - -/* - * Transmit an ethernet packet on an ethernet interface, as requested by lwIP. - */ -static err_t -ethif_linkoutput(struct netif * netif, struct pbuf * pbuf) -{ - struct ifdev *ifdev = netif_get_ifdev(netif); - - /* - * Let ifdev make the callback to our output function, so that it can - * pass the packet to BPF devices and generically update statistics. - */ - return ifdev_output(ifdev, pbuf, NULL /*netif*/, TRUE /*to_bpf*/, - TRUE /*hdrcmplt*/); -} - -/* - * The multicast address list has changed. See to it that the change will make - * it to the network driver at some point. - */ -static err_t -ethif_set_mcast(struct ethif * ethif) -{ - - /* - * Simply generate a mode change request, unless the interface is down. - * Once the mode change request is about to be sent to the driver, we - * will recompute the multicast settings. - */ - if (ifdev_is_up(ðif->ethif_ifdev)) - ethif->ethif_wanted.nconf_set |= NDEV_SET_MODE; - - return ERR_OK; -} - -/* - * An IPv4 multicast address has been added to or removed from the list of IPv4 - * multicast addresses. - */ -static err_t -ethif_set_mcast_v4(struct netif * netif, const ip4_addr_t * group __unused, - enum netif_mac_filter_action action __unused) -{ - - return ethif_set_mcast((struct ethif *)netif_get_ifdev(netif)); -} - -/* - * An IPv6 multicast address has been added to or removed from the list of IPv6 - * multicast addresses. - */ -static err_t -ethif_set_mcast_v6(struct netif * netif, const ip6_addr_t * group __unused, - enum netif_mac_filter_action action __unused) -{ - - return ethif_set_mcast((struct ethif *)netif_get_ifdev(netif)); -} - -/* - * Initialization function for an ethernet-type netif interface, called from - * lwIP at interface creation time. - */ -static err_t -ethif_init_netif(struct ifdev * ifdev, struct netif * netif) -{ - struct ethif *ethif = (struct ethif *)ifdev; - - /* - * Fill in a dummy name. Since it is only two characters, do not - * bother trying to reuse part of the given name. If this name is ever - * actually used anywhere, the dummy should suffice for debugging. - */ - netif->name[0] = 'e'; - netif->name[1] = 'n'; - - netif->linkoutput = ethif_linkoutput; - - memset(netif->hwaddr, 0, sizeof(netif->hwaddr)); - - /* - * Set the netif flags, partially based on the capabilities reported by - * the network device driver. The reason that we do this now is that - * lwIP tests for some of these flags and starts appropriate submodules - * (e.g., IGMP) right after returning from this function. If we set - * the flags later, we also have to take over management of those - * submodules, which is something we'd rather avoid. For this reason - * in particular, we also do not support capability mask changes after - * driver restarts - see ethif_enable(). - */ - netif->flags = NETIF_FLAG_ETHARP | NETIF_FLAG_ETHERNET; - - if (ethif->ethif_caps & NDEV_CAP_BCAST) - netif->flags |= NETIF_FLAG_BROADCAST; - - if (ethif->ethif_caps & NDEV_CAP_MCAST) { - /* The IGMP code adds the all-stations multicast entry. */ - netif->igmp_mac_filter = ethif_set_mcast_v4; - - netif->flags |= NETIF_FLAG_IGMP; - - /* For MLD6 we have to add the all-nodes entry ourselves. */ - netif->mld_mac_filter = ethif_set_mcast_v6; - - netif->flags |= NETIF_FLAG_MLD6; - } - - return ERR_OK; -} - -/* - * The ndev layer reports that a new network device driver has appeared, with - * the given ndev identifier, a driver-given name, and a certain set of - * capabilities. Create a new ethernet interface object for it. On success, - * return a pointer to the object (for later callbacks from ndev). In that - * case, the ndev layer will always immediately call ethif_enable() afterwards. - * On failure, return NULL, in which case ndev will forget about the driver. - */ -struct ethif * -ethif_add(ndev_id_t id, const char * name, uint32_t caps) -{ - struct ethif *ethif; - unsigned int ifflags; - int r; - - /* - * First make sure that the interface name is valid, unique, and not - * reserved for virtual interface types. - */ - if ((r = ifdev_check_name(name, NULL /*vtype_slot*/)) != OK) { - /* - * There is some risk in printing bad stuff, but this may help - * in preventing serious driver writer frustration.. - */ - printf("LWIP: invalid driver name '%s' (%d)\n", name, r); - - return NULL; - } - - /* Then see if there is a free ethernet interface object available. */ - if (SIMPLEQ_EMPTY(ðif_freelist)) { - printf("LWIP: out of slots for driver name '%s'\n", name); - - return NULL; - } - - /* - * All good; set up the interface. First initialize the object, since - * adding the interface to lwIP might spawn some activity right away. - */ - ethif = SIMPLEQ_FIRST(ðif_freelist); - SIMPLEQ_REMOVE_HEAD(ðif_freelist, ethif_next); - - /* Initialize the ethif structure. */ - memset(ethif, 0, sizeof(*ethif)); - ethif->ethif_ndev = id; - ethif->ethif_flags = ETHIFF_DISABLED; - ethif->ethif_caps = caps; - - ethif->ethif_snd.es_head = NULL; - ethif->ethif_snd.es_unsentp = ðif->ethif_snd.es_head; - ethif->ethif_snd.es_tailp = ðif->ethif_snd.es_head; - ethif->ethif_snd.es_count = 0; - - ethif->ethif_rcv.er_head = NULL; - ethif->ethif_rcv.er_tailp = ðif->ethif_rcv.er_head; - - /* - * Set all the three configurations to the same initial values. Since - * any change to the configuration will go through all three, this - * allows us to obtain various parts of the status (in particular, the - * mode, flags, enabled capabilities, and media type selection) from - * any of the three without having to consult the others. Note that - * the hardware address is set to a indeterminate initial value, as it - * is left to the network driver unless specifically overridden. - */ - ethif->ethif_active.nconf_set = 0; - ethif->ethif_active.nconf_mode = NDEV_MODE_DOWN; - ethif->ethif_active.nconf_flags = 0; - ethif->ethif_active.nconf_caps = 0; - ethif->ethif_active.nconf_media = - IFM_MAKEWORD(IFM_ETHER, IFM_AUTO, 0, 0); - memcpy(ðif->ethif_pending, ðif->ethif_active, - sizeof(ethif->ethif_pending)); - memcpy(ðif->ethif_wanted, ðif->ethif_pending, - sizeof(ethif->ethif_wanted)); - - /* - * Compute the initial NetBSD-style interface flags. The IFF_SIMPLEX - * interface flag is always enabled because we do not support network - * drivers that are receiving their own packets. In particular, lwIP - * currently does not deal well with receiving back its own multicast - * packets, which leads to IPv6 DAD failures. The other two flags - * (IFF_BROADCAST, IFF_MULTICAST) denote capabilities, not enabled - * receipt modes. - */ - ifflags = IFF_SIMPLEX; - if (caps & NDEV_CAP_BCAST) - ifflags |= IFF_BROADCAST; - if (caps & NDEV_CAP_MCAST) - ifflags |= IFF_MULTICAST; - - /* Finally, add the interface to ifdev and lwIP. This cannot fail. */ - ifdev_add(ðif->ethif_ifdev, name, ifflags, IFT_ETHER, ETH_HDR_LEN, - ETHARP_HWADDR_LEN, DLT_EN10MB, ETHIF_DEF_MTU, - ND6_IFF_PERFORMNUD | ND6_IFF_AUTO_LINKLOCAL, ðif_ops); - - return ethif; -} - -/* - * The link status and/or media type of an ethernet interface has changed. - */ -static void -ethif_set_status(struct ethif * ethif, uint32_t link, uint32_t media) -{ - unsigned int iflink; - - /* We save the media type locally for now. */ - ethif->ethif_media = media; - - /* Let the ifdev module handle the details of the link change. */ - switch (link) { - case NDEV_LINK_UP: iflink = LINK_STATE_UP; break; - case NDEV_LINK_DOWN: iflink = LINK_STATE_DOWN; break; - default: iflink = LINK_STATE_UNKNOWN; break; - } - - ifdev_update_link(ðif->ethif_ifdev, iflink); -} - -/* - * The ndev layer reports that a previously added or disabled network device - * driver has been (re)enabled. Start by initializing the driver. Return TRUE - * if the interface could indeed be enabled, or FALSE if it should be forgotten - * altogether after all. - */ -int -ethif_enable(struct ethif * ethif, const char * name, - const struct ndev_hwaddr * hwaddr, uint8_t hwaddr_len, uint32_t caps, - uint32_t link, uint32_t media) -{ - int r; - - assert(ethif->ethif_flags & ETHIFF_DISABLED); - - /* - * One disadvantage of keeping service labels and ethernet driver names - * disjunct is that the ethernet driver may mess with its name between - * restarts. Ultimately we may end up renaming our ethernet drivers - * such that their labels match their names, in which case we no longer - * need the drivers themselves to produce a name, and we can retire - * this check. - */ - if (name != NULL && strcmp(ethif_get_name(ethif), name)) { - printf("LWIP: driver '%s' restarted with name '%s'\n", - ethif_get_name(ethif), name); - - return FALSE; - } - - /* - * The hardware address length is just a sanity check for now. After - * the initialization reply, we assume the same length is used for all - * addresses, which is also the maximum, namely 48 bits (six bytes). - */ - if (hwaddr_len != ETHARP_HWADDR_LEN) { - printf("LWIP: driver '%s' reports hwaddr length %u\n", - ethif_get_name(ethif), hwaddr_len); - - return FALSE; - } - - /* - * If the driver has changed its available capabilities as a result of - * a restart, we have a problem: we may already have configured the - * interface's netif object to make use of of some of those - * capabilities. TODO: we can deal with some cases (e.g., disappearing - * checksum offloading capabilities) with some effort, and with other - * cases (e.g., disappearing multicast support) with a LOT more effort. - */ - if (ethif->ethif_caps != caps) { - printf("LWIP: driver '%s' changed capabilities\n", - ethif_get_name(ethif)); - - return FALSE; - } - - /* - * Set the hardware address on the interface, unless a request is - * currently pending to change it, in which case the new address has - * been set already and we do not want to revert that change. If not, - * we always set the address, because it may have changed as part of a - * driver restart and we do not want to get out of sync with it, nor - * can we necessarily change it back. - */ - if (!(ethif->ethif_active.nconf_set & NDEV_SET_HWADDR) && - !(ethif->ethif_pending.nconf_set & NDEV_SET_HWADDR)) - ifdev_update_hwaddr(ðif->ethif_ifdev, hwaddr->nhwa_addr, - (name == NULL) /*is_factory*/); - - /* - * At this point, only one more thing can fail: it is possible that we - * do not manage to send the first configuration request due to memory - * shortage. This is extremely unlikely to happen, so send the conf - * request first and forget the entire driver if it fails. - */ - /* - * Always generate a new multicast list before sending a configuration - * request, and at no other time (since there may be a grant for it). - */ - ethif_gen_mcast(ethif, ðif->ethif_active); - - if ((r = ndev_conf(ethif->ethif_ndev, ðif->ethif_active)) != OK) { - printf("LWIP: sending first configuration to '%s' failed " - "(%d)\n", ethif_get_name(ethif), r); - - return FALSE; - } - - ethif_set_status(ethif, link, media); - - ethif->ethif_flags &= ~ETHIFF_DISABLED; - ethif->ethif_flags |= ETHIFF_FIRST_CONF; - - return TRUE; -} - -/* - * The configuration change stored in the "pending" slot of the given ethif - * object has been acknowledged by the network device driver (or the driver has - * died, see ethif_disable()). Apply changes to the "active" slot of the given - * ethif object, as well as previously delayed changes to lwIP through netif. - */ -static void -ethif_post_conf(struct ethif * ethif) -{ - struct ndev_conf *nconf; - unsigned int flags; - - nconf = ðif->ethif_pending; - - /* - * Now that the driver configuration has changed, we know that the - * new checksum settings will be applied to all sent and received - * packets, and we can disable checksumming flags in netif as desired. - * Enabling checksumming flags has already been done earlier on. - */ - if (nconf->nconf_set & NDEV_SET_CAPS) { - flags = ethif_get_netif(ethif)->chksum_flags; - - if (nconf->nconf_caps & NDEV_CAP_CS_IP4_TX) - flags &= ~NETIF_CHECKSUM_GEN_IP; - if (nconf->nconf_caps & NDEV_CAP_CS_IP4_RX) - flags &= ~NETIF_CHECKSUM_CHECK_IP; - if (nconf->nconf_caps & NDEV_CAP_CS_UDP_TX) - flags &= ~NETIF_CHECKSUM_GEN_UDP; - if (nconf->nconf_caps & NDEV_CAP_CS_UDP_RX) - flags &= ~NETIF_CHECKSUM_CHECK_UDP; - if (nconf->nconf_caps & NDEV_CAP_CS_TCP_TX) - flags &= ~NETIF_CHECKSUM_GEN_TCP; - if (nconf->nconf_caps & NDEV_CAP_CS_TCP_RX) - flags &= ~NETIF_CHECKSUM_CHECK_TCP; - - NETIF_SET_CHECKSUM_CTRL(ethif_get_netif(ethif), flags); - } - - /* - * Merge any individual parts of the now acknowledged configuration - * changes into the active configuration. The result is that we are - * able to reapply these changes at any time should the network driver - * be restarted. In addition, by only setting bits for fields that - * have actually changed, we can later tell whether the user wanted the - * change or ethif should just take over what the driver reports after - * a restart; this is important for HW-address and media settings. - */ - ethif_merge_conf(ðif->ethif_active, ðif->ethif_pending); -} - -/* - * All receive requests have been canceled at the ndev layer, because the - * network device driver has been restarted or shut down. Clear the receive - * queue, freeing any packets in it. - */ -static void -ethif_drain(struct ethif * ethif) -{ - struct pbuf *pbuf, **pnext; - - while ((pbuf = ethif->ethif_rcv.er_head) != NULL) { - pnext = pchain_end(pbuf); - - if ((ethif->ethif_rcv.er_head = *pnext) == NULL) - ethif->ethif_rcv.er_tailp = ðif->ethif_rcv.er_head; - - *pnext = NULL; - pbuf_free(pbuf); - } -} - -/* - * The network device driver has stopped working (i.e., crashed), but has not - * been shut down completely, and is expect to come back later. - */ -void -ethif_disable(struct ethif * ethif) -{ - - /* - * We assume, optimistically, that a new instance of the driver will be - * brought up soon after which we can continue operating as before. As - * such, we do not want to change most of the user-visible state until - * we know for sure that our optimism was in vain. In particular, we - * do *not* want to change the following parts of the state here: - * - * - the contents of the send queue; - * - the state of the interface (up or down); - * - the state and media type of the physical link. - * - * The main user-visible indication of the crash will be that the - * interface does not have the IFF_RUNNING flag set. - */ - - /* - * If a configuration request was pending, it will be lost now. Highly - * unintuitively, make the requested configuration the *active* one, - * just as though the request completed successfully. This works, - * because once the driver comes back, the active configuration will be - * replayed as initial configuration. Therefore, by pretending that - * the current request went through, we ensure that it too will be sent - * to the new instance--before anything else is allowed to happen. - */ - if (ethif->ethif_pending.nconf_set != 0) - ethif_post_conf(ethif); - - /* - * Any packet send requests have been lost, too, and likewise forgotten - * by ndev. Thus, we need to forget that we sent any packets, so that - * they will be resent after the driver comes back up. That *may* - * cause packet duplication, but that is preferable over packet loss. - */ - ethif->ethif_snd.es_unsentp = ðif->ethif_snd.es_head; - - /* - * We fully restart the receive queue, because all receive requests - * have been forgotten by ndev as well now and it is easier to simply - * reconstruct the receive queue in its entirety later on. - */ - ethif_drain(ethif); - - /* Make sure we do not attempt to initiate new requests for now. */ - ethif->ethif_flags &= ~ETHIFF_FIRST_CONF; - ethif->ethif_flags |= ETHIFF_DISABLED; -} - -/* - * Dequeue and discard the packet at the head of the send queue. - */ -static void -ethif_dequeue_send(struct ethif * ethif) -{ - struct pbuf *pref, *pbuf, **pnext; - unsigned int count, spares; - - /* - * The send queue is a linked list of reference buffers, each of which - * links to the actual packet. Dequeue the first reference buffer. - */ - pref = ethif->ethif_snd.es_head; - assert(pref != NULL); - - pnext = pchain_end(pref); - - if (ethif->ethif_snd.es_unsentp == pnext) - ethif->ethif_snd.es_unsentp = ðif->ethif_snd.es_head; - - if ((ethif->ethif_snd.es_head = *pnext) == NULL) - ethif->ethif_snd.es_tailp = ðif->ethif_snd.es_head; - - /* Do this before possibly calling pbuf_clen() below.. */ - *pnext = NULL; - - /* - * If we never made a copy of the original packet, we now have it - * pointed to by a reference buffer. If so, decrease the reference - * count of the actual packet, thereby freeing it if lwIP itself was - * already done with. Otherwise, the copy of the packet is the - * reference buffer itself. In both cases we need to free that buffer. - */ - if (pref->type == PBUF_REF) { - pbuf = (struct pbuf *)pref->payload; - - pbuf_free(pbuf); - - count = pref->len; - } else - count = pbuf_clen(pref); - - assert(count > 0); - assert(ethif->ethif_snd.es_count >= count); - ethif->ethif_snd.es_count -= count; - - if (ethif->ethif_snd.es_count >= ETHIF_PBUF_MIN) - spares = count; - else if (ethif->ethif_snd.es_count + count > ETHIF_PBUF_MIN) - spares = ethif->ethif_snd.es_count + count - ETHIF_PBUF_MIN; - else - spares = 0; - - assert(ethif_spares >= spares); - ethif_spares -= spares; - - /* Free the reference buffer as well. */ - pbuf_free(pref); - - /* If the send queue is now empty, clear the IFF_OACTIVE flag. */ - if (ethif->ethif_snd.es_head == NULL) - ifdev_update_ifflags(ðif->ethif_ifdev, - ifdev_get_ifflags(ðif->ethif_ifdev) & ~IFF_OACTIVE); -} - -/* - * The ndev layer reports that a network device driver has been permanently - * shut down. Remove the corresponding ethernet interface from the system. - */ -void -ethif_remove(struct ethif * ethif) -{ - int r; - - /* Clear the send and receive queues. */ - while (ethif->ethif_snd.es_head != NULL) - ethif_dequeue_send(ethif); - - ethif_drain(ethif); - - /* Let the ifdev module deal with most other removal aspects. */ - if ((r = ifdev_remove(ðif->ethif_ifdev)) != OK) - panic("unable to remove ethernet interface: %d", r); - - /* Finally, readd the ethif object to the free list. */ - SIMPLEQ_INSERT_HEAD(ðif_freelist, ethif, ethif_next); -} - -/* - * The ndev layer reports that the (oldest) pending configuration request has - * completed with the given result. - */ -void -ethif_configured(struct ethif * ethif, int32_t result) -{ - - /* - * The driver is not supposed to return failure in response to a - * configure result. If it does, we have no proper way to recover, as - * we may already have applied part of the new configuration to netif. - * For now, just report failure and then pretend success. - */ - if (result < 0) { - printf("LWIP: driver '%s' replied with conf result %d\n", - ethif_get_name(ethif), result); - - result = 0; - } - - if (ethif->ethif_flags & ETHIFF_FIRST_CONF) - ethif->ethif_flags &= ~ETHIFF_FIRST_CONF; - else - ethif_post_conf(ethif); - - /* - * For now, the result is simply a boolean value indicating whether the - * driver is using the all-multicast receive mode instead of the - * multicast-list receive mode. We can turn it into a bitmap later. - */ - if (result != 0) { - ethif->ethif_active.nconf_mode &= ~NDEV_MODE_MCAST_LIST; - ethif->ethif_active.nconf_mode |= NDEV_MODE_MCAST_ALL; - } - - /* The interface flags may have changed now, so update them. */ - ethif_update_ifflags(ethif); - - /* Regular operation will resume from the polling function. */ -} - -/* - * The ndev layer reports that the first packet on the send queue has been - * successfully transmitted with 'result' set to OK, or dropped if 'result' is - * negative. The latter may happen if the interface was taken down while there - * were still packets in transit. - */ -void -ethif_sent(struct ethif * ethif, int32_t result) -{ - - ethif_dequeue_send(ethif); - - if (result < 0) - ifdev_output_drop(ðif->ethif_ifdev); - - /* More requests may be sent from the polling function now. */ -} - -/* - * The ndev layer reports that the first buffer on the receive queue has been - * filled with a packet of 'result' bytes, or if 'result' is negative, the - * receive request has been aborted. - */ -void -ethif_received(struct ethif * ethif, int32_t result) -{ - struct pbuf *pbuf, *pwalk, **pnext; - size_t left; - - /* - * Start by removing the first buffer chain off the receive queue. The - * ndev layer guarantees that there ever was a receive request at all. - */ - if ((pbuf = ethif->ethif_rcv.er_head) == NULL) - panic("driver received packet but queue empty"); - - pnext = pchain_end(pbuf); - - if ((ethif->ethif_rcv.er_head = *pnext) == NULL) - ethif->ethif_rcv.er_tailp = ðif->ethif_rcv.er_head; - *pnext = NULL; - - /* Decide if we can and should deliver a packet to the layers above. */ - if (result <= 0 || !ethif_can_recv(ethif)) { - pbuf_free(pbuf); - - return; - } - - if (result > pbuf->tot_len) { - printf("LWIP: driver '%s' returned bad packet size (%zd)\n", - ethif_get_name(ethif), (ssize_t)result); - - pbuf_free(pbuf); - - return; - } - - /* - * The packet often does not use all of the buffers, or at least not - * all of the last buffer. Adjust lengths for the buffers that contain - * part of the packet, and free the remaining (unused) buffers, if any. - */ - left = (size_t)result; - - for (pwalk = pbuf; ; pwalk = pwalk->next) { - pwalk->tot_len = left; - if (pwalk->len > left) - pwalk->len = left; - left -= pwalk->len; - if (left == 0) - break; - } - - if (pwalk->next != NULL) { - pbuf_free(pwalk->next); - - pwalk->next = NULL; - } - - /* - * Finally, hand off the packet to the layers above. We go through - * ifdev so that it can pass the packet to BPF devices and update - * statistics and all that. - */ - ifdev_input(ðif->ethif_ifdev, pbuf, NULL /*netif*/, - TRUE /*to_bpf*/); -} - -/* - * The ndev layer reports a network driver status update. If anything has - * changed since the last status, we may have to take action. The given - * statistics counters are relative to the previous status report. - */ -void -ethif_status(struct ethif * ethif, uint32_t link, uint32_t media, - uint32_t oerror, uint32_t coll, uint32_t ierror, uint32_t iqdrop) -{ - struct if_data *ifdata; - - ethif_set_status(ethif, link, media); - - ifdata = ifdev_get_ifdata(ðif->ethif_ifdev); - ifdata->ifi_oerrors += oerror; - ifdata->ifi_collisions += coll; - ifdata->ifi_ierrors += ierror; - ifdata->ifi_iqdrops += iqdrop; -} - -/* - * Set NetBSD-style interface flags (IFF_) for an ethernet interface. - */ -static int -ethif_set_ifflags(struct ifdev * ifdev, unsigned int ifflags) -{ - struct ethif *ethif = (struct ethif *)ifdev; - uint32_t mode, flags; - - /* - * We do not support IFF_NOARP at this time, because lwIP does not: the - * idea of IFF_NOARP is that only static ARP entries are used, but lwIP - * does not support separating static from dynamic ARP operation. The - * flag does not appear to be particularly widely used anyway. - */ - if ((ifflags & ~(IFF_UP | IFF_DEBUG | IFF_LINK0 | IFF_LINK1 | - IFF_LINK2)) != 0) - return EINVAL; - - mode = ethif->ethif_wanted.nconf_mode; - if ((ifflags & IFF_UP) && mode == NDEV_MODE_DOWN) { - mode = NDEV_MODE_UP; - - /* Always enable broadcast receipt when supported. */ - if (ethif->ethif_caps & NDEV_CAP_BCAST) - mode |= NDEV_MODE_BCAST; - - if (ifdev_is_promisc(ifdev)) - mode |= NDEV_MODE_PROMISC; - - /* - * The multicast flags will be set right before we send the - * request to the driver. - */ - } else if (!(ifflags & IFF_UP) && mode != NDEV_MODE_DOWN) - ethif->ethif_wanted.nconf_mode = NDEV_MODE_DOWN; - - if (mode != ethif->ethif_wanted.nconf_mode) { - ethif->ethif_wanted.nconf_mode = mode; - ethif->ethif_wanted.nconf_set |= NDEV_SET_MODE; - } - - /* - * Some of the interface flags (UP, DEBUG, PROMISC, LINK[0-2]) are a - * reflection of the intended state as set by userland before, so that - * a userland utility will never not see the flag it just set (or the - * other way around). These flags therefore do not necessarily reflect - * what is actually going on at that moment. We cannot have both. - */ - flags = 0; - if (ifflags & IFF_DEBUG) - flags |= NDEV_FLAG_DEBUG; - if (ifflags & IFF_LINK0) - flags |= NDEV_FLAG_LINK0; - if (ifflags & IFF_LINK1) - flags |= NDEV_FLAG_LINK1; - if (ifflags & IFF_LINK2) - flags |= NDEV_FLAG_LINK2; - - if (flags != ethif->ethif_wanted.nconf_flags) { - ethif->ethif_wanted.nconf_flags = flags; - ethif->ethif_wanted.nconf_set |= NDEV_SET_FLAGS; - } - - /* The changes will be picked up from the polling function. */ - return OK; -} - -/* - * Convert a bitmask of ndev-layer capabilities (NDEV_CAP_) to NetBSD-style - * interface capabilities (IFCAP_). - */ -static uint64_t -ethif_cap_to_ifcap(uint32_t caps) -{ - uint64_t ifcap; - - ifcap = 0; - if (caps & NDEV_CAP_CS_IP4_TX) - ifcap |= IFCAP_CSUM_IPv4_Tx; - if (caps & NDEV_CAP_CS_IP4_RX) - ifcap |= IFCAP_CSUM_IPv4_Rx; - if (caps & NDEV_CAP_CS_UDP_TX) - ifcap |= IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv6_Tx; - if (caps & NDEV_CAP_CS_UDP_RX) - ifcap |= IFCAP_CSUM_UDPv4_Rx | IFCAP_CSUM_UDPv6_Rx; - if (caps & NDEV_CAP_CS_TCP_TX) - ifcap |= IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv6_Tx; - if (caps & NDEV_CAP_CS_TCP_RX) - ifcap |= IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_TCPv6_Rx; - - return ifcap; -} - -/* - * Retrieve potential and enabled NetBSD-style interface capabilities (IFCAP_). - */ -static void -ethif_get_ifcap(struct ifdev * ifdev, uint64_t * ifcap, uint64_t * ifena) -{ - struct ethif *ethif = (struct ethif *)ifdev; - - *ifcap = ethif_cap_to_ifcap(ethif->ethif_caps); - *ifena = ethif_cap_to_ifcap(ethif->ethif_wanted.nconf_caps); -} - -/* - * Set NetBSD-style enabled interface capabilities (IFCAP_). - */ -static int -ethif_set_ifcap(struct ifdev * ifdev, uint64_t ifcap) -{ - struct ethif *ethif = (struct ethif *)ifdev; - unsigned int flags; - uint32_t caps; - - if (ifcap & ~(IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx | - IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv6_Tx | - IFCAP_CSUM_UDPv4_Rx | IFCAP_CSUM_UDPv6_Rx | - IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv6_Tx | - IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_TCPv6_Rx)) - return EINVAL; - - /* - * Some IPv4/IPv6 flags need to be set together in order to be picked - * up. Unfortunately, that is all we can do given that lwIP does not - * distinguish IPv4/IPv6 when it comes to TCP/UDP checksum flags. - */ - caps = 0; - if (ifcap & IFCAP_CSUM_IPv4_Tx) - caps |= NDEV_CAP_CS_IP4_TX; - if (ifcap & IFCAP_CSUM_IPv4_Rx) - caps |= NDEV_CAP_CS_IP4_RX; - if ((ifcap & (IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv6_Tx)) == - (IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv6_Tx)) - caps |= NDEV_CAP_CS_UDP_TX; - if ((ifcap & (IFCAP_CSUM_UDPv4_Rx | IFCAP_CSUM_UDPv6_Rx)) == - (IFCAP_CSUM_UDPv4_Rx | IFCAP_CSUM_UDPv6_Rx)) - caps |= NDEV_CAP_CS_UDP_RX; - if ((ifcap & (IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv6_Tx)) == - (IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv6_Tx)) - caps |= NDEV_CAP_CS_TCP_TX; - if ((ifcap & (IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_TCPv6_Rx)) == - (IFCAP_CSUM_TCPv4_Rx | IFCAP_CSUM_TCPv6_Rx)) - caps |= NDEV_CAP_CS_TCP_RX; - - /* - * When changing checksumming capabilities, we have to make sure that - * we only ever checksum too much and never too little. This means - * that we enable any checksum options in netif here, and disable any - * checksum options in netif only after driver configuration. - * - * Note that we have to draw the line somewhere with this kind of - * self-protection, and that line is short of TCP retransmission: we - * see it as lwIP's job to compute checksums for retransmitted TCP - * packets if they were saved across checksum changes. Even though - * lwIP may not care, there is little we can do about that anyway. - */ - if (ethif->ethif_wanted.nconf_caps != caps) { - flags = ethif_get_netif(ethif)->chksum_flags; - - if (!(caps & NDEV_CAP_CS_IP4_TX)) - flags |= NETIF_CHECKSUM_GEN_IP; - if (!(caps & NDEV_CAP_CS_IP4_RX)) - flags |= NETIF_CHECKSUM_CHECK_IP; - if (!(caps & NDEV_CAP_CS_UDP_TX)) - flags |= NETIF_CHECKSUM_GEN_UDP; - if (!(caps & NDEV_CAP_CS_UDP_RX)) - flags |= NETIF_CHECKSUM_CHECK_UDP; - if (!(caps & NDEV_CAP_CS_TCP_TX)) - flags |= NETIF_CHECKSUM_GEN_TCP; - if (!(caps & NDEV_CAP_CS_TCP_RX)) - flags |= NETIF_CHECKSUM_CHECK_TCP; - - NETIF_SET_CHECKSUM_CTRL(ethif_get_netif(ethif), flags); - - ethif->ethif_wanted.nconf_caps = caps; - ethif->ethif_wanted.nconf_set |= NDEV_SET_CAPS; - } - - /* The changes will be picked up from the polling function. */ - return OK; -} - -/* - * Retrieve NetBSD-style interface media type (IFM_). Return both the current - * media type selection and the driver-reported active media type. - */ -static void -ethif_get_ifmedia(struct ifdev * ifdev, int * ifcurrent, int * ifactive) -{ - struct ethif *ethif = (struct ethif *)ifdev; - - /* - * For the current select, report back whatever the user gave us, even - * if it has not reached the driver at all yet. - */ - *ifcurrent = (int)ethif->ethif_wanted.nconf_media; - *ifactive = (int)ethif->ethif_media; -} - -/* - * Set current NetBSD-style interface media type (IFM_). - */ -static int -ethif_set_ifmedia(struct ifdev * ifdev, int ifmedia) -{ - struct ethif *ethif = (struct ethif *)ifdev; - - /* - * We currently completely lack the infrastructure to suspend the - * current IOCTL call until the driver replies (or disappears). - * Therefore we have no choice but to return success here, even if the - * driver cannot accept the change. The driver does notify us of media - * changes, so the user may observe the new active media type later. - * Also note that the new media type may not be the requested type, - * which is why we do not perform any checks against the wanted or - * active media types. - */ - ethif->ethif_wanted.nconf_media = (uint32_t)ifmedia; - ethif->ethif_wanted.nconf_set |= NDEV_SET_MEDIA; - - /* The change will be picked up from the polling function. */ - return OK; -} - -/* - * Enable or disable promiscuous mode on the interface. - */ -static void -ethif_set_promisc(struct ifdev * ifdev, int promisc) -{ - struct ethif *ethif = (struct ethif *)ifdev; - - if (ethif->ethif_wanted.nconf_mode != NDEV_MODE_DOWN) { - if (promisc) - ethif->ethif_wanted.nconf_mode |= NDEV_MODE_PROMISC; - else - ethif->ethif_wanted.nconf_mode &= ~NDEV_MODE_PROMISC; - ethif->ethif_wanted.nconf_set |= NDEV_SET_MODE; - } - - /* The change will be picked up from the polling function. */ -} - -/* - * Set the hardware address on the interface. - */ -static int -ethif_set_hwaddr(struct ifdev * ifdev, const uint8_t * hwaddr) -{ - struct ethif *ethif = (struct ethif *)ifdev; - - if (!(ethif->ethif_caps & NDEV_CAP_HWADDR)) - return EINVAL; - - memcpy(ðif->ethif_wanted.nconf_hwaddr.nhwa_addr, hwaddr, - ETHARP_HWADDR_LEN); - ethif->ethif_wanted.nconf_set |= NDEV_SET_HWADDR; - - /* The change will be picked up from the polling function. */ - return OK; -} - -/* - * Set the Maximum Transmission Unit for this interface. Return TRUE if the - * new value is acceptable, in which case the caller will do the rest. Return - * FALSE otherwise. - */ -static int -ethif_set_mtu(struct ifdev * ifdev __unused, unsigned int mtu) -{ - - return (mtu <= ETHIF_MAX_MTU); -} - -static const struct ifdev_ops ethif_ops = { - .iop_init = ethif_init_netif, - .iop_input = netif_input, - .iop_output = ethif_output, - .iop_output_v4 = etharp_output, - .iop_output_v6 = ethip6_output, - .iop_hdrcmplt = ethif_hdrcmplt, - .iop_poll = ethif_poll, - .iop_set_ifflags = ethif_set_ifflags, - .iop_get_ifcap = ethif_get_ifcap, - .iop_set_ifcap = ethif_set_ifcap, - .iop_get_ifmedia = ethif_get_ifmedia, - .iop_set_ifmedia = ethif_set_ifmedia, - .iop_set_promisc = ethif_set_promisc, - .iop_set_hwaddr = ethif_set_hwaddr, - .iop_set_mtu = ethif_set_mtu, -}; diff --git a/minix/net/lwip/ifaddr.c b/minix/net/lwip/ifaddr.c deleted file mode 100644 index 17cb6b58b..000000000 --- a/minix/net/lwip/ifaddr.c +++ /dev/null @@ -1,2224 +0,0 @@ -/* LWIP service - ifaddr.c - network interface address management */ -/* - * This module is an exception to the regular source organization of this - * service, in that it manages part of another module's data structures, namely - * ifdev. As such, it should be seen as logically part of ifdev. It is - * separated only to keep the source code more manageable. Still, this module - * may use direct access only on the address-related fields of the ifdev - * structure, so that those one day may be move into an ifaddr-specific - * substructure within ifdev. - */ -/* - * We manage three types of addresses here: IPv4 addresses (ifaddr_v4), - * IPv6 addresses (ifaddr_v6), and link-layer a.k.a. MAC addresses (ifaddr_dl). - * - * Managing IPv4 addresses is easy. lwIP supports only one IPv4 address per - * netif. While it would be possible to construct a model where one ifdev - * consists of multiple netifs (with one IPv4 address each), we not support - * this--mostly because it is a pain to keep state synchronized between the - * netifs in that case. Such support can still be added later; the IPv4 API - * exposed from here does support multiple IPv4 addresses already just in case, - * as does much of the code using the API. - * - * For IPv4 addresses we maintain only one extra piece of information here, - * which is whether an IPv4 address has been set at all. This is because for - * our userland (DHCP clients in particular), we must allow assigning 0.0.0.0 - * as address to an interface. We do not use the lwIP per-netif IPv4 gateway - * field, nor the concept of a "default netif", in both cases because we - * override all (routing) decisions that would use those settings. lwIP does - * not allow a broadcast address to be set, so support for broadcast addresses - * is botched here: we disregard custom broadcast addresses given to us, and - * instead expose the broadcast address that is used within lwIP. - * - * Managing IPv6 addresses is much more complicated. First of all, even though - * lwIP supports stateless address autoconfiguration (SLAAC) as per RFC 4862, - * we disable that and instead make dhcpcd(8) responsible for all IPv6 address - * configuration. dhcpcd(8) will set addresses and routes as necessary, the - * latter of which are used in lwIP through our routing hooks (in the route - * module). This approach, which is in line with where NetBSD is headed, - * allows us to work around a number of lwIP limitations. As a result we do - * differ in this respect from NetBSD, which may switch between kernel-only, - * dhcpcd-only, and hybrid autoconfiguration, mainly throught the accept_rtadv - * sysctl(7) node. Writing to this node has no real effect on MINIX 3. - * - * All IPv6 addresses have a prefix length, which is almost but not quite the - * same as IPv4's subnet masks (see RFC 5942). We must maintain the per- - * address prefix length ourselves, as lwIP supports IPv6 prefix lengths of 64 - * bits only. Our dhcpcd(8)-based approach allows us to work around that. - * - * All IPv6 addresses also have a state and a lifetime, both of which are - * managed by lwIP. Unlike for IPv4, address-derived routes and routing socket - * messages are only created for addresses that are "valid", which means that - * they are in either PREFERRED or DEPRECATED state. This means that we have - * to be aware of all address state transitions between "valid" and "not - * valid", some of which (namely address duplication detection and lifetime - * expirations) are initiated by lwIP. As such, we need to keep shadow state - * for each address, and use a callback to detect whether state has changed. - * - * For understanding of this module as well as lwIP, it is important to note - * that "valid" is not the opposite of "invalid" in this context: "not valid" - * includes the address states INVALID, DUPLICATED, and TENTATIVE, while - * "invalid"/INVALID simply means that the address slot is free. - * - * Each IPv6 address also has associated flags. We support an AUTOCONF flag - * which indicates that no subnet route should be added for the address; on - * MINIX 3, dhcpcd(8) is modified to pass in that flag when appropriate, thus - * solving a problem that NetBSD suffers from, namely that it does not know - * whether a userland-given route is static (implying a subnet) or auto- - * configured (implying no subnet, again as per RFC 5942), leading to it doing - * the wrong thing in dhcpcd-only autoconfiguration mode. The TEMPORARY flag, - * for privacy addresses (RFC 4941) should be the same as on NetBSD; it is - * currently used only in source address selection (RFC 6724). We override - * lwIP's IPv6 source address selection algorithm to include support for not - * just this flag, but also label and proper longest-common-prefix comparisons. - * Finally, there is an HWBASED flag to make sure that when the link-layer - * address is changed, the IPv6 link-local address is changed accordingly only - * if the previous link-local address was also autogenerated from a link-layer - * address and not set manually by userland. - * - * Finally, we support multiple link-layer addresses per interface, but only - * because NetBSD's ifconfig(8) uses an API that expects such multi-address - * support. At any time, only one of the addresses is marked as "active", - * which means it is used as MAC address in outgoing packets. We support only - * one MAC address per device driver, so the support for additional, inactive - * link-layer addresses is there exclusively for ifconfig(8) interoperability. - * - * All interfaces, including those that do not have MAC addresses at all (e.g., - * loopback interfaces), do have one link-layer address. This is expected in - * particular by getifaddrs(3), which only recognizes interfaces that have a - * link-layer address. - * - * Many features are still missing here, especially for IP addresses. For - * example, we do not yet support destination addresses at all yet, simply - * because there is no interface type that uses them. For IPv6, more work is - * to be done to support proper netif status transitions versus address states, - * fallout from address duplication, and various ND6_IFF_ flags. - */ - -#include "lwip.h" -#include "rtsock.h" -#include "route.h" - -#include "lwip/etharp.h" - -#include -#include - -/* - * Routing flags for local address and local network routing entries. This - * may later have to be refined, for example in order not to set RTF_CLONING - * for routes on interfaces that do not have link-layer addressing. - * - * IMPORTANT: as of NetBSD 8, RTF_CLONING has been renamed to RTF_CONNECTED. - */ -#define IFADDR_HOST_RTFLAGS (RTF_UP | RTF_HOST | RTF_LOCAL) -#define IFADDR_NET_RTFLAGS (RTF_UP | RTF_CLONING) - -/* Address-related sysctl(7) settings. */ -int ifaddr_auto_linklocal = 1; /* different from NetBSD, see its usage */ -int ifaddr_accept_rtadv = 0; /* settable but completely disregarded */ - -/* - * Initialize the local address administration for an interface that is in the - * process of being created. - */ -void -ifaddr_init(struct ifdev * ifdev) -{ - unsigned int i; - - ifdev->ifdev_v4set = FALSE; - - for (i = 0; i < LWIP_IPV6_NUM_ADDRESSES; i++) - ifdev->ifdev_v6state[i] = IP6_ADDR_INVALID; - - for (i = 0; i < __arraycount(ifdev->ifdev_hwlist); i++) - ifdev->ifdev_hwlist[i].ifhwa_flags = 0; -} - -/* - * Find an IPv4 address locally assigned to a interface. The IPv4 address is - * given as 'addr'. The interface is given as 'ifdev'. On success, return OK, - * with the IPv4 address number stored in 'num'. On failure, return a negative - * error code. - */ -int -ifaddr_v4_find(struct ifdev * ifdev, const struct sockaddr_in * addr, - ifaddr_v4_num_t * num) -{ - ip_addr_t ipaddr; - int r; - - if ((r = addr_get_inet((const struct sockaddr *)addr, sizeof(*addr), - IPADDR_TYPE_V4, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK) - return r; - - if (!ifdev->ifdev_v4set || - !ip_addr_cmp(netif_ip_addr4(ifdev_get_netif(ifdev)), &ipaddr)) - return EADDRNOTAVAIL; - - *num = 0; - return OK; -} - -/* - * Enumerate IPv4 addresses locally assigned to the given interface 'ifdev'. - * The caller should set 'nump' to 0 initially, and increase it by one between - * a successful call and the next enumeration call. Return TRUE on success, - * meaning that starting from the given value of 'nump' there is at least one - * IPv4 address, of which the number is stored in 'nump' on return. Return - * FALSE if there are no more IPv4 addresses locally assigned to the interface. - */ -int -ifaddr_v4_enum(struct ifdev * ifdev, ifaddr_v4_num_t * num) -{ - - /* - * For now, we support only up to one IPv4 address per interface. - * set if we are to return it. - */ - return (*num == 0 && ifdev->ifdev_v4set); -} - -/* - * Obtain information about the IPv4 address 'num' assigned to the interface - * 'ifdev'. On success, return OK, with the IPv4 address stored in 'addr', the - * network mask stored in 'mask', the broadcast stored in 'bcast', and the - * destination address stored in 'dest'. Each of these pointers may be NULL. - * The interface may not have a broadcast and/or destination address; in that - * case, their corresponding structures are not filled in at all, and thus must - * be preinitialized by the caller to a default state. The reason for not - * zeroing them is that some callers use the same buffer for both. On failure, - * return a negative error code. - */ -int -ifaddr_v4_get(struct ifdev * ifdev, ifaddr_v4_num_t num, - struct sockaddr_in * addr, struct sockaddr_in * mask, - struct sockaddr_in * bcast, struct sockaddr_in * dest) -{ - const ip_addr_t *ipaddr, *netmask; - struct netif *netif; - ip_addr_t broad; - socklen_t addr_len; - - if (!ifaddr_v4_enum(ifdev, &num)) - return EADDRNOTAVAIL; - - netif = ifdev_get_netif(ifdev); - - if (addr != NULL) { - addr_len = sizeof(*addr); - - addr_put_inet((struct sockaddr *)addr, &addr_len, - netif_ip_addr4(netif), TRUE /*kame*/, 0 /*port*/); - } - - if (mask != NULL) { - addr_len = sizeof(*mask); - - /* - * Do not bother using addr_put_netmask() here, as we would - * then first have to compute the prefix length.. - */ - addr_put_inet((struct sockaddr *)mask, &addr_len, - netif_ip_netmask4(netif), TRUE /*kame*/, 0 /*port*/); - } - - if (bcast != NULL) { - if (netif->flags & NETIF_FLAG_BROADCAST) { - /* Fake a broadcast address. */ - ipaddr = netif_ip_addr4(netif); - netmask = netif_ip_netmask4(netif); - - ip_addr_set_ip4_u32(&broad, - ip_addr_get_ip4_u32(ipaddr) | - ~ip_addr_get_ip4_u32(netmask)); - - addr_len = sizeof(*bcast); - - addr_put_inet((struct sockaddr *)bcast, &addr_len, - &broad, TRUE /*kame*/, 0 /*port*/); - } else { - bcast->sin_len = 0; - bcast->sin_family = AF_UNSPEC; - } - } - - if (dest != NULL) { - /* TODO: dest */ - dest->sin_len = 0; - dest->sin_family = AF_UNSPEC; - } - - return OK; -} - -/* - * Obtain NetBSD-style state flags (IN_IFF_) for the given local IPv4 address. - * The given number must identify an existing address. Return the flags. - */ -int -ifaddr_v4_get_flags(struct ifdev * ifdev, ifaddr_v4_num_t num) -{ - - /* IPv4 per-address flags are not supported yet. */ - return 0; -} - -/* - * Determine whether there should be a local subnet route for the given - * assigned IPv4 address, and if so, compute the subnet mask to add. Return - * TRUE if a local subnet route should be added, and return the network base - * address in 'netbase' and the number of prefix bits in 'prefixp'. Return - * FALSE if no subnet route should be added for the assigned address. - */ -static unsigned int -ifaddr_v4_netroute(struct ifdev * ifdev, ifaddr_v4_num_t num, - ip_addr_t * netbase, unsigned int * prefixp) -{ - const ip_addr_t *ipaddr, *netmask; - unsigned int prefix; - uint32_t val; - - /* Do not add subnet masks for loopback interfaces. */ - if (ifdev_is_loopback(ifdev)) - return FALSE; - - assert(num == 0); - assert(ifdev->ifdev_v4set); - - ipaddr = netif_ip_addr4(ifdev_get_netif(ifdev)); - netmask = netif_ip_netmask4(ifdev_get_netif(ifdev)); - - /* - * If the subnet is a /32, skip adding a local host route: not only - * would it not be useful, it would fail anyway because we currently do - * not support adding a host-type route and a full-width net-type route - * for the same IP address. - */ - if (ip_addr_get_ip4_u32(netmask) == PP_HTONL(0xffffffffUL)) - return FALSE; - - /* Compute the network base address. */ - ip_addr_set_ip4_u32(netbase, - ip_addr_get_ip4_u32(ipaddr) & ip_addr_get_ip4_u32(netmask)); - - /* Find the number of prefix bits of the netmask. TODO: improve.. */ - val = ntohl(ip_addr_get_ip4_u32(netmask)); - - for (prefix = 0; prefix < IP4_BITS; prefix++) - if (!(val & (1 << (IP4_BITS - prefix - 1)))) - break; - - *prefixp = prefix; - return TRUE; -} - -/* - * A local IPv4 address has been added to an interface. The interface is given - * as 'ifdev', and the number of the just-added IPv4 address is given as 'num'. - * Generate a routing socket message and add local routes as appropriate. - */ -static void -ifaddr_v4_added(struct ifdev * ifdev, ifaddr_v4_num_t num) -{ - const ip_addr_t *ipaddr; - ip_addr_t netbase; - unsigned int prefix; - - assert(num == 0); - assert(ifdev->ifdev_v4set); - - /* Report the addition of the interface address. */ - rtsock_msg_addr_v4(ifdev, RTM_NEWADDR, num); - - /* - * Add the local host route. This will always succeed: for addition, - * we just checked with route_can_add(); when updating, we first remove - * the exact same route. For now, we forbid users from messing with - * RTF_LOCAL routes directly, since nothing good (and a whole lot of - * bad) can come out of that, so the routes will not change under us. - * - * Why are we not using lo0 for this route, like the BSDs do? Because - * that approach is not compatible with link-local addresses. Instead, - * we intercept outgoing traffic to the local address, and redirect it - * over lo0, bypassing routing. If we did not do this, we would never - * know the originally intended zone of the outgoing packet. As an - * intended side effect, the traffic does show up on lo0 with BPF, just - * like on BSDs. Similarly, we do not need to set a gateway here. - * - * We currently do not use the routing tables for lookups on local - * addresses - see ifaddr_v6_map() as to why. If we ever do, that adds - * another reason that the interface associated with the route must be - * the interface that owns the address (and not, say, lo0). - */ - ipaddr = netif_ip_addr4(ifdev_get_netif(ifdev)); - - (void)route_add(ipaddr, IP4_BITS, NULL /*gateway*/, ifdev, - IFADDR_HOST_RTFLAGS, NULL /*rtr*/); - - /* - * Add the local network route, if the rules say that we should. Even - * then, adding the route may fail for various reasons, but this route - * is not essential and so we ignore failures here. - */ - if (ifaddr_v4_netroute(ifdev, num, &netbase, &prefix)) - (void)route_add(&netbase, prefix, NULL /*gateway*/, ifdev, - IFADDR_NET_RTFLAGS, NULL /*rtr*/); -} - -/* - * A particular local IPv4 address is being deleted. See if there is another - * local IPv4 address assigned to another interface that should have the same - * local subnet route (but didn't, as such duplicate routes can obviously not - * be added), and if so, readd the route for that other address. - */ -static void -ifaddr_v4_dupcheck(struct ifdev * oifdev, const ip_addr_t * onetbase, - unsigned int oprefix) -{ - struct ifdev *ifdev; - ip_addr_t netbase; - unsigned int prefix; - - for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { - if (ifdev == oifdev || !ifdev->ifdev_v4set) - continue; - - if (ifaddr_v4_netroute(ifdev, (ifaddr_v4_num_t)0, &netbase, - &prefix) && prefix == oprefix && - ip_addr_cmp(&netbase, onetbase)) { - (void)route_add(&netbase, prefix, NULL /*gateway*/, - ifdev, IFADDR_NET_RTFLAGS, NULL /*rtr*/); - - return; - } - } -} - -/* - * A local IPv4 address is about to be deleted from an interface, or the - * interface itself is about to be destroyed. Generate a routing socket - * message about this and delete local routes as appropriate. The interface is - * given as 'ifdev', and the number of the IPv4 address that is about to be - * deleted is given as 'num'. - */ -static void -ifaddr_v4_deleted(struct ifdev * ifdev, ifaddr_v4_num_t num) -{ - struct route_entry *route; - ip_addr_t netbase; - unsigned int prefix; - - assert(num == 0); - assert(ifdev->ifdev_v4set); - - /* Delete the local network route, if we tried adding it at all. */ - if (ifaddr_v4_netroute(ifdev, num, &netbase, &prefix) && - (route = route_find(&netbase, prefix, - FALSE /*is_host*/)) != NULL && - route_get_flags(route) == IFADDR_NET_RTFLAGS) { - route_delete(route, NULL /*rtr*/); - - /* - * Readd the local network route for another interface, if that - * interface has a local address on the very same network. - */ - ifaddr_v4_dupcheck(ifdev, &netbase, prefix); - } - - /* Delete the local host route. */ - if ((route = route_find(netif_ip_addr4(ifdev_get_netif(ifdev)), - IP4_BITS, TRUE /*is_host*/)) != NULL) - route_delete(route, NULL /*rtr*/); - - /* Report the deletion of the interface address. */ - rtsock_msg_addr_v4(ifdev, RTM_DELADDR, num); -} - -/* - * Add or update an IPv4 address on an interface. The interface is given as - * 'ifdev'. The address to add or update is pointed to by 'addr', which must - * always be a pointer to a valid address. For DHCP clients it must be - * possible to add the 'any' address (0.0.0.0). The network mask, broadcast - * address, and destination address parameters 'mask', 'bcast', and 'dest' - * (respectively) may be NULL pointers or pointers to AF_UNSPEC addresses, and - * will be disregarded if they are. If 'mask' and/or 'bcast' are NULL when - * adding an address, default values will be computed for them. The 'flags' - * field may contain NetBSD-style address flags (IN_IFF_). Return OK if the - * address was successfully added or updated, or a negative error code if not. - */ -int -ifaddr_v4_add(struct ifdev * ifdev, const struct sockaddr_in * addr, - const struct sockaddr_in * mask, const struct sockaddr_in * bcast, - const struct sockaddr_in * dest, int flags) -{ - ip_addr_t ipaddr, netmask, broad; - ip4_addr_t ip4zero; - struct netif *netif; - unsigned int dummy; - uint32_t val; - int r; - - assert(addr != NULL); - - if ((r = addr_get_inet((const struct sockaddr *)addr, sizeof(*addr), - IPADDR_TYPE_V4, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK) - return r; - - /* Forbid multicast (class D) and experimental (class E) addresses. */ - val = ntohl(ip_addr_get_ip4_u32(&ipaddr)); - - if (ip_addr_ismulticast(&ipaddr) || IP_EXPERIMENTAL(val)) - return EINVAL; - - if (mask != NULL && mask->sin_family != AF_UNSPEC) { - if ((r = addr_get_netmask((const struct sockaddr *)mask, - sizeof(*mask), IPADDR_TYPE_V4, &dummy, &netmask)) != OK) - return r; - } else { - /* - * Generate a netmask based on IP class. Old, obsolete stuff, - * but we can't have no netmask. - */ - if (IN_CLASSA(val)) - ip_addr_set_ip4_u32(&netmask, PP_HTONL(IN_CLASSA_NET)); - else if (IN_CLASSB(val)) - ip_addr_set_ip4_u32(&netmask, PP_HTONL(IN_CLASSB_NET)); - else if (IN_CLASSC(val)) - ip_addr_set_ip4_u32(&netmask, PP_HTONL(IN_CLASSC_NET)); - else /* should not trigger */ - ip_addr_set_ip4_u32(&netmask, PP_HTONL(IN_CLASSD_NET)); - } - - if (bcast != NULL && bcast->sin_family != AF_UNSPEC) { - if ((r = addr_get_inet((const struct sockaddr *)bcast, - sizeof(*bcast), IPADDR_TYPE_V4, &broad, TRUE /*kame*/, - NULL /*port*/)) != OK) - return r; - - /* - * lwIP does not allow setting the broadcast address, so we - * must ensure that the given address is what lwIP uses anyway. - * No need to perform byte order swaps here. - */ - if (ip_addr_get_ip4_u32(&broad) != - (ip_addr_get_ip4_u32(&ipaddr) | - ~ip_addr_get_ip4_u32(&netmask))) - return EINVAL; - } - - /* TODO: dest (note: may be NULL) */ - - /* - * We currently do not support any IPv4 address flags. Even though - * supporting them would make maintaining dhcpcd(8) easier, lwIP does - * not offers the means to implement them properly. - */ - if (flags != 0) - return EINVAL; - - netif = ifdev_get_netif(ifdev); - - /* Should we add a new address, or update an existing one? */ - if (!ifdev->ifdev_v4set || - !ip_addr_cmp(netif_ip_addr4(netif), &ipaddr)) { - /* - * Add a new address. lwIP supports only one IPv4 address per - * netif. - */ - if (ifdev->ifdev_v4set) - return ENOBUFS; /* TODO: a better error code */ - - /* - * It must be possible to add the address to the routing table, - * so make sure that we can add such a route later on. The - * error code should be accurate for most real-world cases. - */ - if (!route_can_add(&ipaddr, IP4_BITS, TRUE /*is_host*/)) - return EEXIST; - - ip4_addr_set_zero(&ip4zero); - - netif_set_addr(netif, ip_2_ip4(&ipaddr), ip_2_ip4(&netmask), - &ip4zero); - - ifdev->ifdev_v4set = TRUE; - } else { - /* - * Update an existing address. First report the address as - * deleted. Do not actually delete the address in netif, - * because that would cause problems with its changing IP - * addresses on existing sockets. - */ - ifaddr_v4_deleted(ifdev, (ifaddr_v4_num_t)0); - - /* Update the one part that may have actually changed. */ - netif_set_netmask(netif, ip_2_ip4(&netmask)); - } - - /* In both cases, we now need to report the address as added. */ - ifaddr_v4_added(ifdev, (ifaddr_v4_num_t)0); - - return OK; -} - -/* - * Delete an IPv4 address from an interface. The given address number 'num' - * must have been obtained from ifaddr_v4_find() or ifaddr_v4_enum() on the - * same interface just before. This function always succeeds. - */ -void -ifaddr_v4_del(struct ifdev * ifdev, ifaddr_v4_num_t num) -{ - ip4_addr_t ip4zero; - - assert(num == 0); - assert(ifdev->ifdev_v4set); - - /* - * Report the address as deleted. Always do this first, because the - * reporting requires that the address is still there. - */ - ifaddr_v4_deleted(ifdev, num); - - /* Then actually delete the address. */ - ip4_addr_set_zero(&ip4zero); - - netif_set_addr(ifdev_get_netif(ifdev), &ip4zero, &ip4zero, &ip4zero); - - ifdev->ifdev_v4set = FALSE; -} - -/* - * Announce all IPv4 addresses associated with the given interface as deleted, - * Used (only) right before the interface is destroyed. - */ -void -ifaddr_v4_clear(struct ifdev * ifdev) -{ - - if (ifdev->ifdev_v4set) - ifaddr_v4_deleted(ifdev, (ifaddr_v4_num_t)0); -} - -/* - * Return the first interface device that owns the given IPv4 address, or NULL - * if it is not a valid local IPv4 address. - */ -struct ifdev * -ifaddr_v4_map_by_addr(const ip4_addr_t * ip4addr) -{ - struct ifdev *ifdev; - - /* - * It would be nice to be able to do a route lookup on an RTF_LOCAL - * entry here, but we do not do this for IPv6 either - see the comment - * in ifaddr_v6_map() - and it is much less needed here, because each - * interface has at most one IPv4 address. - */ - for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { - if (ifdev->ifdev_v4set && - ip4_addr_cmp(netif_ip4_addr(ifdev_get_netif(ifdev)), - ip4addr)) - return ifdev; - } - - return NULL; -} - -/* - * Return the first interface device for which the given IPv4 address is on a - * configured local subnet, or NULL if no match was found. - */ -static struct ifdev * -ifaddr_v4_map_by_subnet(const ip4_addr_t * ip4addr) -{ - struct ifdev *ifdev; - struct netif *netif; - uint32_t addr1, addr2, mask; - - addr1 = ip4_addr_get_u32(ip4addr); - - /* - * Here, we must never do a route lookup, because this routine is used - * for SO_DONTROUTE/MSG_DONTROUTE. - */ - for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { - if (!ifdev->ifdev_v4set) - continue; - - netif = ifdev_get_netif(ifdev); - - addr2 = ip4_addr_get_u32(netif_ip4_addr(netif)); - mask = ip4_addr_get_u32(netif_ip4_netmask(netif)); - - if ((addr1 & mask) == (addr2 & mask)) - return ifdev; - } - - return NULL; -} - -/* - * Return TRUE if the given local IPv6 interface address is valid (= preferred - * or deprecated), or FALSE if it is not (= tentative or duplicated). The - * address slot must be in use, that is, it must not be free (= invalid). - */ -static int -ifaddr_v6_isvalid(struct ifdev * ifdev, ifaddr_v6_num_t num) -{ - int state; - - state = ifdev->ifdev_v6state[num]; - - /* Note that 'valid' and 'invalid' are not each other's inverse! */ - assert(!ip6_addr_isinvalid(state)); - - return ip6_addr_isvalid(state); -} - -/* - * Find an IPv6 address assigned to the given interface that matches the given - * IPv6 address. Return TRUE if a match was found, with its number stored in - * 'nump'. Return FALSE if the address is not assigned to the interface. - */ -static int -ifaddr_v6_match(struct ifdev * ifdev, const ip_addr_t * ipaddr, - ifaddr_v6_num_t * nump) -{ - int8_t i; - - assert(IP_IS_V6(ipaddr)); - - i = netif_get_ip6_addr_match(ifdev_get_netif(ifdev), ip_2_ip6(ipaddr)); - if (i < 0) - return FALSE; - - *nump = i; - return TRUE; -} - -/* - * Find an IPv6 address locally assigned to a interface. The IPv6 address is - * given as 'addr6', and must use KAME-style embedding for zones. The - * interface is given as 'ifdev'. On success, return OK, with the IPv6 address - * number stored in 'num'. On failure, return a negative error code. This - * function also returns tentative and duplicated addresses. - */ -int -ifaddr_v6_find(struct ifdev * ifdev, const struct sockaddr_in6 * addr6, - ifaddr_v6_num_t * nump) -{ - ip_addr_t ipaddr; - int r; - - if ((r = addr_get_inet((const struct sockaddr *)addr6, sizeof(*addr6), - IPADDR_TYPE_V6, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK) - return r; - - if (ip6_addr_has_zone(ip_2_ip6(&ipaddr)) && - ip6_addr_zone(ip_2_ip6(&ipaddr)) != ifdev_get_index(ifdev)) - return EADDRNOTAVAIL; - - if (!ifaddr_v6_match(ifdev, &ipaddr, nump)) - return EADDRNOTAVAIL; - - return OK; -} - -/* - * Enumerate IPv6 addresses locally assigned to the given interface 'ifdev'. - * The caller should set 'nump' to 0 initially, and increase it by one between - * a successful call and the next enumeration call. Return TRUE on success, - * meaning that starting from the given value of 'nump' there is at least one - * IPv6 address, of which the number is stored in 'nump' on return. Return - * FALSE if there are no more IPv6 addresses locally assigned to the interface. - * This function also returns tentative and duplicated address entries. - */ -int -ifaddr_v6_enum(struct ifdev * ifdev, ifaddr_v6_num_t * nump) -{ - ifaddr_v6_num_t num; - - for (num = *nump; num < LWIP_IPV6_NUM_ADDRESSES; num++) { - if (!ip6_addr_isinvalid(ifdev->ifdev_v6state[num])) { - *nump = num; - return TRUE; - } - } - - return FALSE; -} - -/* - * Obtain information about the IPv6 address 'num' assigned to the interface - * 'ifdev'. Store the IPv6 address in 'addr6', the network mask in 'mask6', - * and the destination address in 'dest6'. Each of these pointers may be NULL. - * The returned addresses use KAME-style embedding for zones. This function - * also returns tentative and duplicated addresses. It always succeeds. - */ -void -ifaddr_v6_get(struct ifdev * ifdev, ifaddr_v6_num_t num, - struct sockaddr_in6 * addr6, struct sockaddr_in6 * mask6, - struct sockaddr_in6 * dest6) -{ - struct netif *netif; - socklen_t addr_len; - - /* - * Due to route message generation upon address addition and deletion, - * either the ifdev_v6state or the netif state may not yet have been - * updated here. - */ - assert(!ip6_addr_isinvalid(ifdev->ifdev_v6state[num]) || - !ip6_addr_isinvalid(netif_ip6_addr_state(ifdev_get_netif(ifdev), - (int)num))); - - netif = ifdev_get_netif(ifdev); - - if (addr6 != NULL) { - addr_len = sizeof(*addr6); - - (void)addr_put_inet((struct sockaddr *)addr6, &addr_len, - netif_ip_addr6(netif, (int)num), TRUE /*kame*/, - 0 /*port*/); - } - - if (mask6 != NULL) { - addr_len = sizeof(*mask6); - - addr_put_netmask((struct sockaddr *)mask6, &addr_len, - IPADDR_TYPE_V6, ifdev->ifdev_v6prefix[num]); - } - - if (dest6 != NULL) { - /* TODO: dest6 */ - dest6->sin6_len = 0; - dest6->sin6_family = AF_UNSPEC; - } -} - -/* - * Obtain NetBSD-style state flags (IN6_IFF_) for the given local IPv6 address. - * The given number must identify an existing address. Return the flags. - */ -int -ifaddr_v6_get_flags(struct ifdev * ifdev, ifaddr_v6_num_t num) -{ - int state, flags; - - state = ifdev->ifdev_v6state[num]; - - assert(!ip6_addr_isinvalid(state)); - - flags = 0; - if (ip6_addr_isduplicated(state)) - flags |= IN6_IFF_DUPLICATED; - if (ip6_addr_istentative(state)) - flags |= IN6_IFF_TENTATIVE; - if (ip6_addr_isdeprecated(state)) - flags |= IN6_IFF_DEPRECATED; - if (ifdev->ifdev_v6flags[num] & IFADDR_V6F_AUTOCONF) - flags |= IN6_IFF_AUTOCONF; - if (ifdev->ifdev_v6flags[num] & IFADDR_V6F_TEMPORARY) - flags |= IN6_IFF_TEMPORARY; - - return flags; -} - -/* - * Obtain lifetime information about the given local IPv6 address. The given - * 'lifetime' structure is filled as a result. This function always succeeds. - */ -void -ifaddr_v6_get_lifetime(struct ifdev * ifdev, ifaddr_v6_num_t num, - struct in6_addrlifetime * lifetime) -{ - struct netif *netif; - uint32_t valid_life, pref_life; - time_t now; - - assert(!ip6_addr_isinvalid(ifdev->ifdev_v6state[num])); - - netif = ifdev_get_netif(ifdev); - - valid_life = netif_ip6_addr_valid_life(netif, (int)num); - pref_life = netif_ip6_addr_pref_life(netif, (int)num); - - /* - * Represent 'static' as 'infinite' to userland. This applies only to - * link-local addresses, which do not have lifetimes at all. - */ - if (ip6_addr_life_isstatic(valid_life)) { - valid_life = IP6_ADDR_LIFE_INFINITE; - pref_life = IP6_ADDR_LIFE_INFINITE; - } - - now = clock_time(NULL); - - /* - * TODO: the _vltime and _pltime values filled in here are not correct. - * They should be set to the originally assigned values rather than the - * current ones. Getting this right would mean we'd have to save the - * original values. So far it does not look like userland needs that.. - */ - memset(lifetime, 0, sizeof(*lifetime)); - lifetime->ia6t_vltime = valid_life; - lifetime->ia6t_pltime = pref_life; - if (!ip6_addr_life_isinfinite(valid_life)) - lifetime->ia6t_expire = now + valid_life; - if (!ip6_addr_life_isinfinite(pref_life)) - lifetime->ia6t_preferred = now + pref_life; -} - -/* - * Determine whether there should be a local subnet route for the given - * assigned IPv6 address, and if so, compute the subnet mask to add. Return - * TRUE if a local subnet route should be added, and return the network base - * address in 'netbase' and the number of prefix bits in 'prefixp'. Return - * FALSE if no subnet route should be added for the assigned address. - */ -static unsigned int -ifaddr_v6_netroute(struct ifdev * ifdev, ifaddr_v6_num_t num, - ip_addr_t * netbase, unsigned int * prefixp) -{ - const ip_addr_t *ipaddr; - - ipaddr = netif_ip_addr6(ifdev_get_netif(ifdev), (int)num); - - /* - * A local network route should be added only if all of the following - * conditions are met: - * - * 1) The address is not auto-configured. Autoconfigured addresses do - * not have an implied subnet, as explained in RFC 5942. - * Consistency with respect to subnet routes is why we do not allow - * changing the AUTOCONF flag after an address has been added. - * 2) The subnet assignment is not a /128 prefix. Not only would such - * a route not be useful, adding it would fail anyway because we - * currently do not support adding a host-type route and a - * full-width net-type route for the same IP address. - * 3) If the interface is a loopback device, the address is not a link- - * local address. This appears to be what NetBSD does, but - * additional loopback-related exceptions may be needed here. - */ - if ((ifdev->ifdev_v6flags[num] & IFADDR_V6F_AUTOCONF) || - ifdev->ifdev_v6prefix[num] == IP6_BITS || - (ifdev_is_loopback(ifdev) && - ip6_addr_islinklocal(ip_2_ip6(ipaddr)))) - return FALSE; - - addr_normalize(netbase, ipaddr, ifdev->ifdev_v6prefix[num]); - - *prefixp = ifdev->ifdev_v6prefix[num]; - return TRUE; -} - -/* - * A local IPv6 has become valid (preferred or deprecated) after previously - * being invalid (tentative, duplicated, or free). Report the addition of the - * now-usable address, and add appropriate routes to the IPv6 routing table. - * - * This function is *not* called immediately when an address is added, but - * rather when the address becomes valid (meaning it is no longer tentative, - * and thus supposedly collision-free). For that reason, unlike for IPv4, this - * function is only ever called indirectly, through the netif status callback. - */ -static void -ifaddr_v6_added(struct ifdev * ifdev, ifaddr_v6_num_t num) -{ - const ip_addr_t *ipaddr; - ip_addr_t base; - ip6_addr_t *base6; - unsigned int prefix; - - /* Check the netif as ifdev_v6state is not yet updated here. */ - assert(!ip6_addr_isinvalid(netif_ip6_addr_state(ifdev_get_netif(ifdev), - (int)num))); - - /* Report the addition of the interface address. */ - rtsock_msg_addr_v6(ifdev, RTM_NEWADDR, num); - - /* - * Add the local host route. This will always succeed. See the IPv4 - * version of this code for more information. - */ - ipaddr = netif_ip_addr6(ifdev_get_netif(ifdev), (int)num); - - (void)route_add(ipaddr, IP6_BITS, NULL /*gateway*/, ifdev, - IFADDR_HOST_RTFLAGS, NULL /*rtr*/); - - /* - * Add the local network route, if the rules say that we should. Even - * then, adding the route may fail for various reasons, but this route - * is not essential and so we ignore failures here. - */ - if (ifaddr_v6_netroute(ifdev, num, &base, &prefix)) - (void)route_add(&base, prefix, NULL /*gateway*/, ifdev, - IFADDR_NET_RTFLAGS, NULL /*rtr*/); - - /* - * Add the node-local and link-local scope multicast routes. These are - * interface-specific rather than address-specific. They are (re)added - * for every address, and never deleted until interface destruction. - */ - ip_addr_set_zero_ip6(&base); - base6 = ip_2_ip6(&base); - - base6->addr[0] = htonl(0xff010000UL | ifdev_get_index(ifdev)); - - (void)route_add(&base, 32, NULL /*gateway*/, ifdev, IFADDR_NET_RTFLAGS, - NULL /*rtr*/); - - base6->addr[0] = htonl(0xff020000UL | ifdev_get_index(ifdev)); - - (void)route_add(&base, 32, NULL /*gateway*/, ifdev, IFADDR_NET_RTFLAGS, - NULL /*rtr*/); -} - -/* - * A particular local IPv6 address is being deleted. See if there is another - * local IPv6 address assigned that should have the same local subnet route - * (but didn't, as such duplicate routes can obviously not be added), and if - * so, readd the route for that other address, possibly for the same interface. - */ -static void -ifaddr_v6_dupcheck(struct ifdev * oifdev, const ip_addr_t * onetbase, - unsigned int oprefix) -{ - struct ifdev *ifdev; - ip_addr_t netbase; - unsigned int prefix; - ifaddr_v6_num_t num; - - for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { - if (ifdev == oifdev) - continue; - - for (num = 0; num < LWIP_IPV6_NUM_ADDRESSES; num++) { - if (ip6_addr_isinvalid(ifdev->ifdev_v6state[num]) || - !ifaddr_v6_isvalid(ifdev, num)) - continue; - - if (!ifaddr_v6_netroute(ifdev, num, &netbase, &prefix)) - continue; - - if (prefix != oprefix || - !ip_addr_cmp(&netbase, onetbase)) - continue; - - (void)route_add(&netbase, prefix, NULL /*gateway*/, - ifdev, IFADDR_NET_RTFLAGS, NULL /*rtr*/); - - return; - } - } -} - -/* - * A local IPv6 has become invalid (tentative, duplicated, or free) after - * previously being valid (preferred or deprecated). Report the deletion of - * the previously-usable address, and remove previously added routes from the - * IPv6 routing table. - * - * This function is not always called for every deleted address: instead, it is - * called only when the address was previously valid, meaning that - * ifaddr_v6_added() was invoked on it before as well. Unlike for IPv4, this - * function is typically called indirectly, through the netif status callback. - */ -static void -ifaddr_v6_deleted(struct ifdev * ifdev, ifaddr_v6_num_t num) -{ - struct route_entry *route; - const ip_addr_t *ipaddr; - ip_addr_t netbase; - unsigned int prefix; - - assert(!ip6_addr_isinvalid(ifdev->ifdev_v6state[num])); - - ipaddr = netif_ip_addr6(ifdev_get_netif(ifdev), (int)num); - - /* Delete the local network route, if we tried adding it at all. */ - if (ifaddr_v6_netroute(ifdev, num, &netbase, &prefix) && - (route = route_find(&netbase, prefix, - FALSE /*is_host*/)) != NULL && - route_get_flags(route) == IFADDR_NET_RTFLAGS) { - route_delete(route, NULL /*rtr*/); - - /* - * Readd the local network route for another interface, if that - * interface has a local address on the very same network. - * Skip scoped (e.g., link-local) addresses, for which the - * routes are unique anyway. - */ - if (!ip6_addr_has_scope(ip_2_ip6(ipaddr), IP6_UNICAST)) - ifaddr_v6_dupcheck(ifdev, &netbase, prefix); - } - - /* Delete the local host route. */ - if ((route = route_find(ipaddr, IP6_BITS, TRUE /*is_host*/)) != NULL) - route_delete(route, NULL /*rtr*/); - - /* Report the deletion of the interface address. */ - rtsock_msg_addr_v6(ifdev, RTM_DELADDR, num); -} - -/* - * Add or update an IPv6 address on an interface. The interface is given as - * 'ifdev'. The IPv6 address to add or update is pointed to by 'addr6', which - * must always be a pointer to a valid address. The network mask is given as - * 'mask6', but may be NULL when updating an address. The same applies to the - * destination address 'dest6'. The given IPv6 address and destination address - * must use KAME-style embedding for zones. The flags field 'flags' contains - * a set of NetBSD-style address flags (IN6_IFF_). The 'lifetime' parameter - * always points to lifetime information to be set or updated. Return OK if - * the address was successfully added or updated, or a negative error code - * otherwise. - */ -int -ifaddr_v6_add(struct ifdev * ifdev, const struct sockaddr_in6 * addr6, - const struct sockaddr_in6 * mask6, const struct sockaddr_in6 * dest6, - int flags, const struct in6_addrlifetime * lifetime) -{ - ip_addr_t ipaddr; - ip6_addr_t *ip6addr; - struct netif *netif; - unsigned int prefix; - ifaddr_v6_num_t num; - uint32_t valid_life; - int r, state; - - netif = ifdev_get_netif(ifdev); - - /* - * Somewhat curiously, NetBSD ignores the zone ID for these requests, - * rather than rejecting requests with a zone ID that does not match - * the associated interface's. We have no reason to be stricter, and - * so we overwrite whatever zone was given.. - */ - if ((r = addr_get_inet((const struct sockaddr *)addr6, sizeof(*addr6), - IPADDR_TYPE_V6, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK) - return r; - - /* - * Forbid locally-assigned multicast addresses. Not only are those - * absolutely disallowed in theory, we also assume all locally assigned - * addresses are unicast in various places in practice. - */ - if (ip_addr_ismulticast(&ipaddr)) - return EINVAL; - - ip6_addr_assign_zone(ip_2_ip6(&ipaddr), IP6_UNICAST, netif); - - /* - * The netmask needs to be there only when adding a new address, but if - * a netmask is given, it must be valid. Note that lwIP itself - * supports only /64 subnets; however, due to our custom routing hooks, - * combined with giving lifetimes to all addresses (except the primary - * link-local address, which is a /64), we control all routing - * decisions that would otherwise be affected by that lwIP limitation. - */ - if (mask6 != NULL && mask6->sin6_family != AF_UNSPEC) { - if ((r = addr_get_netmask((const struct sockaddr *)mask6, - sizeof(*mask6), IPADDR_TYPE_V6, &prefix, - NULL /*ipaddr*/)) != OK) - return r; - } else - prefix = 0; - - /* TODO: dest6 (note: may be NULL) */ - - /* TODO: support for IN6_IFF_ANYCAST and IN6_IFF_DETACHED. */ - if (flags & ~(IN6_IFF_TENTATIVE | IN6_IFF_DEPRECATED | IN6_IFF_NODAD | - IN6_IFF_AUTOCONF | IN6_IFF_TEMPORARY)) - return EINVAL; - - /* Should we add a new address, or update an existing one? */ - ip6addr = ip_2_ip6(&ipaddr); - - if (!ifaddr_v6_match(ifdev, &ipaddr, &num)) { - /* Add a new address. */ - if (prefix == 0) - return EINVAL; - - /* - * It must be possible to add the address to the routing table, - * so make sure that we can add such a route later on. The - * error code should be accurate for most real-world cases. - */ - if (!route_can_add(&ipaddr, IP6_BITS, TRUE /*is_host*/)) - return EEXIST; - - /* - * As an exception, if the given address is a link-local - * address and there is no link-local address in slot 0, use - * slot 0 to store this address. This requires a /64 prefix - * length, because lwIP will use an implied /64 subnet for it. - */ - if (ip6_addr_isinvalid(ifdev->ifdev_v6state[0]) && - ip6_addr_islinklocal(ip6addr) && prefix == 64) { - num = (ifaddr_v6_num_t)0; - - /* - * Such link-local addresses are not considered to be - * autoconfigured, because they always have an implied - * subnet. Therefore, clear that flag. - */ - flags &= ~IN6_IFF_AUTOCONF; - } else { - /* - * Find a free slot. We bypass netif_ip6_addr_add() as - * it makes things more, rather than less, complicated - * for us here. - */ - for (num = 1; num < LWIP_IPV6_NUM_ADDRESSES; num++) { - state = ifdev->ifdev_v6state[num]; - - if (ip6_addr_isinvalid(state)) - break; - } - - if (num == LWIP_IPV6_NUM_ADDRESSES) - return ENOBUFS; /* TODO: a better error code */ - } - - assert(ip6_addr_isinvalid(netif_ip6_addr_state(netif, num))); - - /* - * We bypass the standard netif IPv6 address assignment - * functions here, because we may want to change the state of - * the address to something particular (rather than always - * tentative) and set the state only when we're otherwise done. - */ - netif->ip6_addr[num] = ipaddr; - - ifdev->ifdev_v6prefix[num] = prefix; - - /* - * New addresses are always DAD-tested for collisions first, - * except on loopback interfaces, which will simply get back - * its own DAD request and conclude there is a collision.. - */ - if (flags & IN6_IFF_TENTATIVE) - state = IP6_ADDR_TENTATIVE; - else if (flags & IN6_IFF_DEPRECATED) - state = IP6_ADDR_VALID; - else if (ifdev_is_loopback(ifdev) || (flags & IN6_IFF_NODAD)) - state = IP6_ADDR_PREFERRED; - else - state = IP6_ADDR_TENTATIVE; - - ifdev->ifdev_v6flags[num] = 0; - if (flags & IN6_IFF_AUTOCONF) - ifdev->ifdev_v6flags[num] |= IFADDR_V6F_AUTOCONF; - if (flags & IN6_IFF_TEMPORARY) - ifdev->ifdev_v6flags[num] |= IFADDR_V6F_TEMPORARY; - - /* Precompute the address scope as well. */ - ifdev->ifdev_v6scope[num] = - addrpol_get_scope(&ipaddr, TRUE /*is_src*/); - } else { - /* Update an existing address. */ - /* - * Since no fundamental aspects about the address may change - * we also do not need to delete and readd the address here. - */ - if (prefix != 0 && prefix != ifdev->ifdev_v6prefix[num]) - return EINVAL; - - /* TODO: figure out exactly what userland wants here.. */ - if (flags & IN6_IFF_TENTATIVE) - state = IP6_ADDR_TENTATIVE; - else if (flags & IN6_IFF_DEPRECATED) - state = IP6_ADDR_VALID; - else - state = IP6_ADDR_PREFERRED; - - /* - * Leave the AUTOCONF flag as is, because otherwise we might - * also have to add or delete a subnet route here. - */ - if (flags & IN6_IFF_TEMPORARY) - ifdev->ifdev_v6flags[num] |= IFADDR_V6F_TEMPORARY; - else - ifdev->ifdev_v6flags[num] &= ~IFADDR_V6F_TEMPORARY; - } - - /* - * In our implementation, all addresses except the first link-local - * address (which is always stored in slot 0) have a lifetime and are - * thus not static as far as lwIP is concerned. The result is that all - * those addresses are considered to be /128 assignments, leaving the - * routing decisions entirely to us, which is exactly what we want. As - * such we have to be careful not to assign a valid lifetime of 0 - * ("static"). For preferred lifetimes, 0 is not a special value, - * though. Either value may be 0xffffffff, which denotes "infinite". - * - * As for those routing decisions: we use the AUTOCONF flag as the - * indication whether or not to add a subnet (= on-link prefix) route - * for the address. See also ifaddr_v6_added(). - */ - if (num != 0) { - valid_life = lifetime->ia6t_vltime; - if (ip6_addr_life_isstatic(valid_life)) - valid_life++; - netif_ip6_addr_set_valid_life(netif, (int)num, valid_life); - netif_ip6_addr_set_pref_life(netif, (int)num, - lifetime->ia6t_pltime); - } - - /* - * The lifetime of address slot 0 is initialized to, and remains at all - * times, zero ("static"). All other slots have an actual lifetime. - */ - assert(netif_ip6_addr_isstatic(netif, (int)num) == !num); - - /* - * Change the address state last, as this may immediately trigger - * reports and route addition etc, although usually it will not: - * addresses are typically added as tentative, and ifaddr_v6_added() - * will be called only once the address is valid. - */ - netif_ip6_addr_set_state(netif, (int)num, state); - - return OK; -} - -/* - * Delete an IPv6 address from an interface. The given address number must - * have been obtained through ifaddr_v6_find() or ifaddr_v6_enum(). - * This function always succeeds. - */ -void -ifaddr_v6_del(struct ifdev * ifdev, ifaddr_v6_num_t num) -{ - - assert(num <= LWIP_IPV6_NUM_ADDRESSES); - assert(!ip6_addr_isinvalid(ifdev->ifdev_v6state[num])); - - /* The state change will also trigger ifaddr_v6_deleted() if needed. */ - netif_ip6_addr_set_state(ifdev_get_netif(ifdev), (int)num, - IP6_ADDR_INVALID); -} - -/* - * Announce all IPv6 addresses associated with the given interface as deleted. - * Used (only) right before the interface is destroyed. - */ -void -ifaddr_v6_clear(struct ifdev * ifdev) -{ - ifaddr_v6_num_t num; - - for (num = 0; ifaddr_v6_enum(ifdev, &num); num++) { - if (ifaddr_v6_isvalid(ifdev, num)) - ifaddr_v6_deleted(ifdev, num); - } -} - -/* - * Check state changes on local IPv6 addresses and update shadow state - * accordingly. - */ -void -ifaddr_v6_check(struct ifdev * ifdev) -{ - struct netif *netif; - ifaddr_v6_num_t num; - int old_state, new_state, was_valid, is_valid; - - netif = ifdev_get_netif(ifdev); - - for (num = 0; num < LWIP_IPV6_NUM_ADDRESSES; num++) { - /* - * Since we compile lwIP without support for stateless - * autoconfiguration, there will be no cases where new - * addresses appear out of nowhere. As such, we can rely on - * all necessary fields already being initialized here. - */ - old_state = ifdev->ifdev_v6state[num]; - new_state = netif_ip6_addr_state(netif, num); - - if (old_state == new_state) - continue; - - was_valid = ip6_addr_isvalid(old_state); - is_valid = ip6_addr_isvalid(new_state); - - if (was_valid != is_valid) { - if (is_valid) - ifaddr_v6_added(ifdev, num); - else - ifaddr_v6_deleted(ifdev, num); - } - - ifdev->ifdev_v6state[num] = new_state; - - /* - * TODO: implement the requirements for dealing with duplicated - * addresses, in particular the link-local address, as - * specified by RFC 4862 Sec. 5.4.5. NetBSD uses the - * ND6_IFF_IFDISABLED flag for this, essentially disabling - * the interface completely when that flag is set. - */ - } -} - -/* - * A change in the interface and/or link status has resulted in both now being - * up. Set the link-local address, if any, to tentative state. Exempt - * loopback interfaces, which would just see their own requests as collisions. - * - * TODO: the current implementation is the absolute minimum required for - * dhcpcd(8) to function somewhat properly, but there is much more to be - * decided and done when it comes to dealing with status changes.. - */ -void -ifaddr_v6_set_up(struct ifdev * ifdev) -{ - - if (!ifdev_is_loopback(ifdev) && - !ip6_addr_isinvalid(ifdev->ifdev_v6state[0])) - netif_ip6_addr_set_state(ifdev_get_netif(ifdev), 0, - IP6_ADDR_TENTATIVE); -} - -/* - * Check whether all conditions are met for (re)assigning a link-local IPv6 - * address, and if so, do just that. - */ -void -ifaddr_v6_set_linklocal(struct ifdev * ifdev) -{ - - /* - * A few conditions must be met for link-local address assignment. - * First of all, link-local address assignment must be enabled both - * globally and on the interface. The BSDs use the global setting as - * an initial value for the link-local setting, but if we do this, it - * would basically be impossible to change the global setting and have - * any effect. Thus, we use the global setting as an additional - * requirement, with as reasoning that people will typically disable - * the global setting in order to assign no IPv6 addresses at all. - */ - if (!(ifdev_get_nd6flags(ifdev) & ND6_IFF_AUTO_LINKLOCAL) || - !ifaddr_auto_linklocal) - return; - - /* - * Second, the interface must be up. This is an artificial requirement - * that allows for the above settings to be changed at all: if we - * assigned a link-local address as soon as we could (see below), this - * would leave virtually no opportunity to change the settings. Once - * assigned, a link-local address is never removed automatically. - */ - if (!ifdev_is_up(ifdev)) - return; - - /* - * A proper (48-bit) hardware address must be set. Interfaces without - * hardware addresses (e.g., loopback devices) do not have this kind of - * auto-assignment. It may take a while for the driver to get back to - * us with its initial hardware address, so wait for at least that. - * Also update the link-local address upon subsequent (user-initiated) - * changes to the hardware address, as long as if the IPv6 address has - * not been overridden by userland by then. - */ - if (ifdev_get_hwlen(ifdev) != ETHARP_HWADDR_LEN || - !(ifdev->ifdev_hwlist[0].ifhwa_flags & IFHWAF_VALID)) - return; - - if (!ip6_addr_isinvalid(ifdev->ifdev_v6state[0]) && - (ifdev->ifdev_v6flags[0] & IFADDR_V6F_HWBASED)) - return; - - /* - * All conditions are met. Set or replace the interface's IPv6 - * link-local address. This uses the first IPv6 address slot, which - * will be skipped when adding non-link-local addresses. We first - * delete the old address if any, in order to force invalidation of - * bound sockets, because setting the new address does not (currently) - * consider sockets. - */ - if (!ip6_addr_isinvalid(ifdev->ifdev_v6state[0])) - ifaddr_v6_del(ifdev, (ifaddr_v6_num_t)0); - -#ifdef INET6 - ifdev->ifdev_v6flags[0] = IFADDR_V6F_HWBASED; - ifdev->ifdev_v6prefix[0] = 64; - netif_create_ip6_linklocal_address(ifdev_get_netif(ifdev), - 1 /*from_mac_48bit*/); - assert(!ip6_addr_isinvalid(ifdev->ifdev_v6state[0])); - - ifdev->ifdev_v6scope[0] = - addrpol_get_scope(netif_ip_addr6(ifdev_get_netif(ifdev), 0), - TRUE /*is_src*/); -#endif /* INET6 */ -} - -/* - * Return the first interface device that owns the given (non-any) IPv6 - * address, or NULL if it is not a valid local IPv6 address. Addresses that - * exist but are not usable ("usually assigned" in the RFC4862 sense) are - * considered not valid in this context. - */ -struct ifdev * -ifaddr_v6_map_by_addr(const ip6_addr_t * ip6addr) -{ - struct ifdev *ifdev; - struct netif *netif; - ifaddr_v6_num_t num; - - /* - * It would be nice to be able to do a route lookup on an RTF_LOCAL - * entry here, but this approach would currently have two problems. - * - * 1) link-local addresses would require a lookup with a different - * embedded zone for each possible interface, requiring a loop over - * all interfaces after all; we could do a route lookup for global - * addresses only, but then there's also the issue that.. - * 2) once we get the interface from the route, we still have to check - * check the state of the address, as done below, and that requires - * us to go through all the interface addresses after all; we could - * embed the local address number in the RTF_LOCAL routing entry but - * that would get rather messy API-wise. - * - * Still, if it turns out that this function is a bottleneck, the above - * workarounds should offer a way forward for the common case. - */ - for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { - netif = ifdev_get_netif(ifdev); - - for (num = 0; num < LWIP_IPV6_NUM_ADDRESSES; num++) { - if (ip6_addr_isinvalid(ifdev->ifdev_v6state[num])) - continue; - - /* - * An address may be used as a local address only if it - * is preferred or deprecated, not if it is tentative - * or duplicated. - */ - if (!ifaddr_v6_isvalid(ifdev, num)) - continue; - - /* - * Ignore the zone if the given address does not have - * one set. Otherwise, the zone must match. - */ - if (ip6_addr_cmp_zoneless(netif_ip6_addr(netif, num), - ip6addr) && (!ip6_addr_has_zone(ip6addr) || - ip6_addr_test_zone(ip6addr, netif))) - return ifdev; - } - } - - return NULL; -} - -/* - * Return the first interface device for which the given IPv6 address is on a - * configured local subnet, or NULL if no match was found. - */ -static struct ifdev * -ifaddr_v6_map_by_subnet(const ip_addr_t * ipaddr) -{ - const ip_addr_t *addr; - struct ifdev *ifdev; - struct netif *netif; - ifaddr_v6_num_t num; - unsigned int prefix; - - assert(IP_IS_V6(ipaddr)); - - for (ifdev = NULL; (ifdev = ifdev_enum(ifdev)) != NULL; ) { - netif = ifdev_get_netif(ifdev); - - if (ip6_addr_has_zone(ip_2_ip6(ipaddr)) && - !ip6_addr_test_zone(ip_2_ip6(ipaddr), netif)) - continue; - - for (num = 0; num < LWIP_IPV6_NUM_ADDRESSES; num++) { - if (ip6_addr_isinvalid(ifdev->ifdev_v6state[num])) - continue; - - if (!ifaddr_v6_isvalid(ifdev, num)) - continue; - - addr = netif_ip_addr6(netif, num); - - /* - * For addresses with no implied subnet, check against - * the full address, so as to match only that address. - */ - if (ifdev->ifdev_v6flags[num] & IFADDR_V6F_AUTOCONF) - prefix = IP6_BITS; - else - prefix = ifdev->ifdev_v6prefix[num]; - - if (addr_get_common_bits(ipaddr, addr, prefix) == - prefix) - return ifdev; - } - } - - return NULL; -} - -/* - * Select an IPv6 source address for communication to the given destination - * address on the given interface. Return the selected source address, or NULL - * if no appropriate source address could be found. This function implements - * RFC 6724 Sec. 5, and is very close to a drop-in replacement for lwIP's own - * ip6_select_source_address() function. We can do a slightly better job - * because we have more information (for Rules 6 and 7) and can offer a more - * complete, less lightweight implementation (for Rule 8). - * - * In summary, this is the implementation status of the rules: - * - * - Rules 1, 2, 3: fully implemented - * - Rules 4, 5, 5.5: not applicable - * - Rules 6, 7, 8: fully implemented - * - * Note that for rule 2, scope decisions are left to the addrpol module, which - * makes a deliberate exception from the RFC for Unique-Local Addresses. - * - * The given destination address may not be properly zoned. - */ -static const ip_addr_t * -ifaddr_v6_select(struct ifdev * ifdev, const ip_addr_t * dest_addr) -{ - const ip_addr_t *cand_addr, *best_addr; - int dest_scope, cand_scope, best_scope; - int dest_label, cand_label, best_label = 0 /*gcc*/; - uint8_t cand_pref, best_pref = 0 /*gcc*/; - uint8_t cand_temp, best_temp = 0 /*gcc*/; - int cand_bits, best_bits = 0 /*gcc*/; - ifaddr_v6_num_t num, best_num; - - assert(ifdev != NULL); - assert(IP_IS_V6(dest_addr)); - - dest_scope = addrpol_get_scope(dest_addr, FALSE /*is_src*/); - dest_label = -1; /* obtain only when necessary */ - - best_addr = NULL; - best_num = -1; - - for (num = 0; num < LWIP_IPV6_NUM_ADDRESSES; num++) { - /* Consider only valid (preferred and deprecated) addresses. */ - if (!ip6_addr_isvalid(ifdev->ifdev_v6state[num])) - continue; - - cand_addr = netif_ip_addr6(ifdev_get_netif(ifdev), (int)num); - - /* Rule 1 */ - if (ip6_addr_cmp_zoneless(ip_2_ip6(cand_addr), - ip_2_ip6(dest_addr))) - return cand_addr; - - cand_scope = ifdev->ifdev_v6scope[num]; - cand_pref = ip6_addr_ispreferred(ifdev->ifdev_v6state[num]); - cand_temp = (ifdev->ifdev_v6flags[num] & IFADDR_V6F_TEMPORARY); - cand_label = -1; - cand_bits = -1; - - /* - * The following monster of an if-condition relies on order of - * evaluation to obtain the more expensive-to-compute values - * only when strictly necessary. We use a shortcut for Rule 6: - * labels are computed based on longest matching prefix, so if - * Rule 6 prefers the candidate address, Rule 8 would have - * preferred the candidate address as well. Therefore, skip - * even computing labels when Rule 7 would not prefer either - * address, i.e. the "temporary" state of the candidate and the - * best address are equal. For complete ties (which exist, - * because Rule 8 - longest common prefix - checks up to the - * subnet size), as "policy" we always pick the first address. - */ -#define ADDRPOL_GET_LABEL(addr, label) \ - (label != -1 || (label = addrpol_get_label(addr), 1)) -#define ADDR_GET_COMMON_BITS(addr1, addr2, num, bits) \ - (bits != -1 || (bits = (int) \ - addr_get_common_bits(addr1, addr2, ifdev->ifdev_v6prefix[num]), 1)) - - if (best_addr == NULL || /* no alternative yet */ - /* Rule 2 */ - (cand_scope < best_scope && cand_scope >= dest_scope) || - (cand_scope > best_scope && best_scope < dest_scope) || - (cand_scope == best_scope && - /* Rule 3 */ - (cand_pref > best_pref || (cand_pref == best_pref && - /* Rule 6 */ - ((cand_temp != best_temp && /* shortcut, part 1 */ - ADDRPOL_GET_LABEL(dest_addr, dest_label) && - ADDRPOL_GET_LABEL(cand_addr, cand_label) && - ADDRPOL_GET_LABEL(best_addr, best_label) && - cand_label == dest_label && best_label != dest_label) || - ((cand_temp == best_temp || /* shortcut, part 2 */ - ((cand_label == dest_label) == - (best_label == dest_label))) && - /* Rule 7 */ - (cand_temp > best_temp || (cand_temp == best_temp && - /* Rule 8 */ - ADDR_GET_COMMON_BITS(cand_addr, dest_addr, num, - cand_bits) && - ADDR_GET_COMMON_BITS(best_addr, dest_addr, best_num, - best_bits) && - cand_bits > best_bits)))))))) { - /* We found a new "winning" candidate. */ - best_addr = cand_addr; - best_scope = cand_scope; - best_pref = cand_pref; - best_temp = cand_temp; - best_label = cand_label; - best_bits = cand_bits; - best_num = num; - } - } - - /* Return the best candidate, if any. */ - return best_addr; -} - -/* - * Pick an IPv6 source address locally assigned to the given interface, for use - * with the given IPv6 destination address. See ifaddr_v6_select() on why we - * override lwIP's version of this function. - * - * This is a full replacement of the corresponding lwIP function, which should - * be overridden with weak symbols, using patches against the lwIP source code. - * As such, the lwIP headers should already provide the correct prototype for - * this function. If not, something will have changed in the lwIP - * implementation, and this code must be revised accordingly. - * - * Important: there are currently no tests that will detect that overriding is - * broken, since our test code (necessarily) uses the code path that calls - * ifaddr_v6_select() directly, even though there are other places in the lwIP - * source code that explicitly call this functions. - */ -const ip_addr_t * -ip6_select_source_address(struct netif * netif, const ip6_addr_t * dest_addr) -{ - ip_addr_t ipaddr; - - ip_addr_copy_from_ip6(ipaddr, *dest_addr); - - return ifaddr_v6_select(netif_get_ifdev(netif), &ipaddr); -} - -/* - * Find and return the interface to which the given address is assigned as a - * local (source) address, or NULL if the given address is not a local address - * for any interface. The 'any' address as well as IPv4-mapped IPv6 addresses - * are not supported and will yield NULL. - */ -struct ifdev * -ifaddr_map_by_addr(const ip_addr_t * ipaddr) -{ - - switch (IP_GET_TYPE(ipaddr)) { - case IPADDR_TYPE_V4: - return ifaddr_v4_map_by_addr(ip_2_ip4(ipaddr)); - - case IPADDR_TYPE_V6: - if (ip6_addr_isipv4mappedipv6(ip_2_ip6(ipaddr))) - return NULL; - - return ifaddr_v6_map_by_addr(ip_2_ip6(ipaddr)); - - case IPADDR_TYPE_ANY: - return NULL; - - default: - panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr)); - } -} - -/* - * Find and return an interface that has a local network configured that - * contains the given address, or NULL if there is no match. If there are - * multiple matches, an arbitrary one is returned. The 'any' address as well - * as IPv4-mapped IPv6 addresses are not supported and will yield NULL. - */ -struct ifdev * -ifaddr_map_by_subnet(const ip_addr_t * ipaddr) -{ - - switch (IP_GET_TYPE(ipaddr)) { - case IPADDR_TYPE_V4: - return ifaddr_v4_map_by_subnet(ip_2_ip4(ipaddr)); - - case IPADDR_TYPE_V6: - if (ip6_addr_isipv4mappedipv6(ip_2_ip6(ipaddr))) - return NULL; - - return ifaddr_v6_map_by_subnet(ipaddr); - - case IPADDR_TYPE_ANY: - return NULL; - - default: - panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr)); - } -} - -/* - * Select a local address to use as source address for the given destination - * address. If 'ifdev' is not NULL, it points to the interface from which to - * select a source address. If 'ifdev' is NULL, this function will attempt to - * select an interface as well. On success, return the selected source - * address, and if 'ifdevp' is not NULL, store the selected interface in it. - * On failure, return NULL. - */ -const ip_addr_t * -ifaddr_select(const ip_addr_t * dst_addr, struct ifdev * ifdev, - struct ifdev ** ifdevp) -{ - struct route_entry *route; - const ip6_addr_t *ip6addr; - - /* - * If no interface is provided yet, start by determining the interface. - * If the destination address has a zone, this step is easy. Otherwise - * we have to do a routing query on the destination address. - */ - if (ifdev == NULL) { - ip6addr = ip_2_ip6(dst_addr); - - if (IP_IS_V6(dst_addr) && ip6_addr_has_zone(ip6addr)) { - ifdev = ifdev_get_by_index(ip6_addr_zone(ip6addr)); - - if (ifdev == NULL) - return NULL; - } else { - if ((route = route_lookup(dst_addr)) == NULL) - return NULL; - - ifdev = route_get_ifdev(route); - } - } - - if (ifdevp != NULL) - *ifdevp = ifdev; - - /* - * We have found an interface. Now select an IP address assigned to - * that interface. For IPv4, this is easy: each interface has only one - * local address (if that). For IPv6, we may have to select one of the - * locally assigned addresses: global, link-local, etc. - */ - switch (IP_GET_TYPE(dst_addr)) { - case IPADDR_TYPE_V4: - /* Use the IPv4 source address if one is set at all. */ - if (!ifdev->ifdev_v4set) - return FALSE; - - return netif_ip_addr4(ifdev_get_netif(ifdev)); - - case IPADDR_TYPE_V6: - return ifaddr_v6_select(ifdev, dst_addr); - - default: - panic("unknown IP address type: %u", IP_GET_TYPE(dst_addr)); - } -} - -/* - * Check the given IPv6 address for a zone violation against the given - * interface--that is, a scoped address leaving its original zone if used in - * the context of the interface. Return TRUE if the address is zone- - * incompatible with the interface, and thus must not be used in packets sent - * to that interface. Return FALSE if there is no such zone incompatibility. - */ -int -ifaddr_is_zone_mismatch(const ip6_addr_t * ipaddr, struct ifdev * ifdev) -{ - - /* - * The IPv6 loopback address (::1) has an implicit link-local scope, - * with a zone corresponding to the interface it is assigned to. We - * take a shortcut by assuming that the loopback address is assigned to - * the primary loopback interface. - */ - if (ip6_addr_isloopback(ipaddr)) - return (ifdev != ifdev_get_loopback()); - - /* Zoned addresses must not leave their zone. */ - if (ip6_addr_has_zone(ipaddr)) - return !ip6_addr_test_zone(ipaddr, ifdev_get_netif(ifdev)); - - return FALSE; -} - -/* - * Find a data link (hardware) address locally assigned to a interface. The - * address is given as 'addr', and the length of the memory area that contains - * 'addr' is given as 'addr_len'. The interface is given as 'ifdev'. On - * success, return OK, with the data link address number stored in 'num'. For - * interfaces that do not support hardware addresses, if the given address - * provides a zero-length hardware address, always return successfully with 0 - * stored in 'nump'. On failure, return a negative error code. - */ -int -ifaddr_dl_find(struct ifdev * ifdev, const struct sockaddr_dlx * addr, - socklen_t addr_len, ifaddr_dl_num_t * nump) -{ - uint8_t hwaddr[NETIF_MAX_HWADDR_LEN]; - ifaddr_dl_num_t num; - int r; - - if ((r = addr_get_link((const struct sockaddr *)addr, addr_len, - NULL /*name*/, 0 /*name_max*/, hwaddr, - ifdev_get_hwlen(ifdev))) != OK) - return r; - - /* - * For interfaces without hardware addresses, after passing the above - * sanity checks (which guarantee that the searched-for address is of - * zero length), return the pseudo-entry zero, which yields an entry - * with a zero-sized hardware address once obtained. This is required - * for at least ifconfig(8). - */ - if (ifdev->ifdev_ops->iop_set_hwaddr == NULL) { - *nump = 0; - return OK; - } - - for (num = 0; (size_t)num < __arraycount(ifdev->ifdev_hwlist); num++) { - if ((ifdev->ifdev_hwlist[num].ifhwa_flags & IFHWAF_VALID) && - !memcmp(ifdev->ifdev_hwlist[num].ifhwa_addr, hwaddr, - ifdev_get_hwlen(ifdev))) { - *nump = num; - return OK; - } - } - - return EADDRNOTAVAIL; -} - -/* - * Enumerate data link (hardware) addresses locally assigned to the given - * interface 'ifdev'. The caller should set 'nump' to 0 initially, and - * increase it by one between a successful call and the next enumeration call. - * Return TRUE on success, meaning that starting from the given value of 'nump' - * there is at least one data link address, of which the number is stored in - * 'nump' on return. Return FALSE if there are no more data link addresses - * locally assigned to the interface. - */ -int -ifaddr_dl_enum(struct ifdev * ifdev, ifaddr_dl_num_t * num) -{ - - /* - * If hardware addresses are not supported, or if no hardware address - * has been added to this interface yet (this shouldn't happen but - * still), there is always one entry with a (zero-sized) address. - * That is required for the IFP (name) entry as used by getifaddrs(3). - */ - if (ifdev->ifdev_ops->iop_set_hwaddr == NULL || - !(ifdev->ifdev_hwlist[0].ifhwa_flags & IFHWAF_VALID)) - return (*num == 0); - - for (; (size_t)*num < __arraycount(ifdev->ifdev_hwlist); (*num)++) { - if (ifdev->ifdev_hwlist[*num].ifhwa_flags & IFHWAF_VALID) - return TRUE; - } - - return FALSE; -} - -/* - * Retrieve a data link (hardware) address for an interface. For interfaces - * that support hardware addresses, 'num' must be a number returned by - * ifaddr_dl_find() or ifaddr_dl_enum(). For others, 'num' must be zero, and a - * pseudo-address of zero size will be returned. The address will be stored in - * 'addr'. This function always succeeds. - */ -void -ifaddr_dl_get(struct ifdev * ifdev, ifaddr_dl_num_t num, - struct sockaddr_dlx * addr) -{ - const uint8_t *hwaddr; - size_t hwaddr_len; - socklen_t addr_len; - - if ((hwaddr_len = ifdev_get_hwlen(ifdev)) > 0) { - /* - * Note that if we have no hardware addresses yet (which should - * not happen but still), the first entry may not be marked as - * valid yet. Ignore it, and return an all-zeroes address. - */ - hwaddr = ifdev->ifdev_hwlist[num].ifhwa_addr; - } else - hwaddr = NULL; - - addr_len = sizeof(*addr); - - addr_put_link((struct sockaddr *)addr, &addr_len, - ifdev_get_index(ifdev), ifdev_get_iftype(ifdev), - ifdev_get_name(ifdev), hwaddr, hwaddr_len); -} - -/* - * Obtain NetBSD-style state flags (IFLR_) for the given local data link - * address. The given number may be 0, in which case that slot's state may not - * be valid. Otherwise, the given number must identify an existing address. - * Return the flags, 0 if the slot was not valid. - */ -int -ifaddr_dl_get_flags(struct ifdev * ifdev, ifaddr_dl_num_t num) -{ - int flags; - - assert(num >= 0 && (size_t)num < __arraycount(ifdev->ifdev_hwlist)); - - if (!(ifdev->ifdev_hwlist[num].ifhwa_flags & IFHWAF_VALID)) - return 0; - - flags = (num == 0) ? IFLR_ACTIVE : 0; - - if (ifdev->ifdev_hwlist[num].ifhwa_flags & IFHWAF_FACTORY) - flags |= IFLR_FACTORY; - - return flags; -} - -/* - * Scan the list of hardware addresses of the given interface for a particular - * hardware address, as well as for an available entry. Return the entry found - * or -1 if the given hardware address was not found. Independently, return an - * available entry in 'availp' or -1 if no entries are available. - */ -static ifaddr_dl_num_t -ifaddr_dl_scan(struct ifdev * ifdev, const uint8_t * hwaddr, - ifaddr_dl_num_t * availp) -{ - ifaddr_dl_num_t num, found, avail; - - found = avail = -1; - - for (num = 0; (size_t)num < __arraycount(ifdev->ifdev_hwlist); num++) { - if (!(ifdev->ifdev_hwlist[num].ifhwa_flags & IFHWAF_VALID)) { - if (avail == -1) - avail = num; - } else if (!memcmp(ifdev->ifdev_hwlist[num].ifhwa_addr, hwaddr, - ifdev_get_hwlen(ifdev))) - found = num; - } - - *availp = avail; - return found; -} - -/* - * Set a hardware address entry in the hardware address list of the given - * interface. - */ -static void -ifaddr_dl_set(struct ifdev * ifdev, ifaddr_dl_num_t num, - const uint8_t * hwaddr, int is_factory) -{ - - memcpy(&ifdev->ifdev_hwlist[num].ifhwa_addr, hwaddr, - ifdev_get_hwlen(ifdev)); - - ifdev->ifdev_hwlist[num].ifhwa_flags = IFHWAF_VALID; - if (is_factory) - ifdev->ifdev_hwlist[num].ifhwa_flags |= IFHWAF_FACTORY; - - rtsock_msg_addr_dl(ifdev, RTM_NEWADDR, num); -} - -/* - * Mark a new hardware address as active, after it has already been activated - * on the hardware and in local administration. The active slot is always slot - * zero, so swap slots if needed. - */ -static void -ifaddr_dl_activate(struct ifdev * ifdev, ifaddr_dl_num_t num) -{ - struct ifdev_hwaddr tmp; - struct netif *netif; - size_t sz; - - assert(num != -1); - - /* The given slot may be zero if this is the initial address. */ - if (num != 0) { - sz = sizeof(tmp); - memcpy(&tmp, &ifdev->ifdev_hwlist[0], sz); - memcpy(&ifdev->ifdev_hwlist[0], &ifdev->ifdev_hwlist[num], sz); - memcpy(&ifdev->ifdev_hwlist[num], &tmp, sz); - } - - netif = ifdev_get_netif(ifdev); - - /* Tell lwIP and routing sockets. */ - memcpy(&netif->hwaddr, &ifdev->ifdev_hwlist[0].ifhwa_addr, - ifdev_get_hwlen(ifdev)); - - rtsock_msg_addr_dl(ifdev, RTM_CHGADDR, 0); - - /* See if we can and should generate a link-local IPv6 address now. */ - ifaddr_v6_set_linklocal(ifdev); -} - -/* - * Add a data link (hardware) address to an interface, or if it already exists, - * update its associated flags (IFLR_). - */ -int -ifaddr_dl_add(struct ifdev * ifdev, const struct sockaddr_dlx * addr, - socklen_t addr_len, int flags) -{ - uint8_t hwaddr[NETIF_MAX_HWADDR_LEN]; - ifaddr_dl_num_t found, avail; - int r; - - /* - * If this interface type does not support setting hardware addresses, - * refuse the call. If the interface type supports it but the - * underlying hardware does not, we cannot report failure here, though. - * In that case, attempts to activate an address will fail instead. - */ - if (ifdev->ifdev_ops->iop_set_hwaddr == NULL) - return EINVAL; - - if ((r = addr_get_link((const struct sockaddr *)addr, addr_len, - NULL /*name*/, 0 /*name_max*/, hwaddr, - ifdev_get_hwlen(ifdev))) != OK) - return r; - - /* - * Find the slot for the given hardware address. Also find the slot of - * the active address, and a free slot. All of these may not exist. - */ - found = ifaddr_dl_scan(ifdev, hwaddr, &avail); - - if (found == -1) { - if (avail == -1) - return ENOBUFS; /* TODO: a better error code */ - found = avail; - } - - /* - * If we are asked to activate this address, try that first: this may - * fail if the network device does not support setting addresses, in - * which case we want to fail without causing routing socket noise. - */ - if ((flags & IFLR_ACTIVE) && found != 0 && - (r = ifdev->ifdev_ops->iop_set_hwaddr(ifdev, hwaddr)) != OK) - return r; - - /* - * If this is a new address, add and announce it. Otherwise, just - * update its flags. - */ - if (found == avail) { - ifaddr_dl_set(ifdev, found, hwaddr, - (flags & IFLR_FACTORY)); - } else { - ifdev->ifdev_hwlist[found].ifhwa_flags &= ~IFLR_FACTORY; - if (flags & IFLR_FACTORY) - ifdev->ifdev_hwlist[found].ifhwa_flags |= IFLR_FACTORY; - } - - /* - * Activate the address if requested, swapping slots as needed. It is - * not possible to deactivate the active address by changing its flags. - */ - if ((flags & IFLR_ACTIVE) && found != 0) - ifaddr_dl_activate(ifdev, found); - - return OK; -} - -/* - * Delete a data link (hardware) address from an interface. - */ -int -ifaddr_dl_del(struct ifdev * ifdev, ifaddr_dl_num_t num) -{ - - if (ifdev->ifdev_ops->iop_set_hwaddr == NULL) - return EINVAL; - - assert(num >= 0 && (size_t)num < __arraycount(ifdev->ifdev_hwlist)); - assert(ifdev->ifdev_hwlist[num].ifhwa_flags & IFHWAF_VALID); - - /* It is not possible to delete the active address. */ - if (num == 0) - return EBUSY; - - rtsock_msg_addr_dl(ifdev, RTM_DELADDR, num); - - ifdev->ifdev_hwlist[num].ifhwa_flags = 0; - - return OK; -} - -/* - * Announce all data link (hardware) addresses associated with the given - * interface as deleted, including the active address. Used (only) right - * before the interface is destroyed. - */ -void -ifaddr_dl_clear(struct ifdev * ifdev) -{ - ifaddr_dl_num_t num; - - /* - * Do the active address last, because all announcements carry the - * active address's hardware address as well. - */ - for (num = 1; ifaddr_dl_enum(ifdev, &num); num++) - rtsock_msg_addr_dl(ifdev, RTM_DELADDR, num); - - if (ifdev->ifdev_hwlist[0].ifhwa_flags & IFHWAF_VALID) - rtsock_msg_addr_dl(ifdev, RTM_DELADDR, (ifaddr_dl_num_t)0); -} - -/* - * Update the interface's active hardware address. If the 'is_factory' flag is - * set, the address is the factory (driver-given) address. This function may - * only be called from ifdev_update_hwaddr(). - */ -void -ifaddr_dl_update(struct ifdev * ifdev, const uint8_t * hwaddr, int is_factory) -{ - ifaddr_dl_num_t found, avail; - - /* - * Find the slot for the given hardware address. Also find the slot of - * the active address, and a free slot. All of these may not exist. - */ - found = ifaddr_dl_scan(ifdev, hwaddr, &avail); - - /* If the given address is already the active one, do nothing. */ - if (found == 0) { - /* Factory addresses are always added first! */ - assert(!is_factory); - - return; - } - - if (found == -1) { - /* - * If the given address is not in the list, add it. If the - * list is full, first remove any non-active address. The user - * won't like this, but it preserves correctness without too - * many complications, because this case is unlikely to happen. - */ - if (avail == -1) { - found = 1; - - (void)ifaddr_dl_del(ifdev, found); - } else - found = avail; - - ifaddr_dl_set(ifdev, found, hwaddr, is_factory); - } - - ifaddr_dl_activate(ifdev, found); -} diff --git a/minix/net/lwip/ifconf.c b/minix/net/lwip/ifconf.c deleted file mode 100644 index e1a48c7f8..000000000 --- a/minix/net/lwip/ifconf.c +++ /dev/null @@ -1,930 +0,0 @@ -/* LWIP service - ifconf.c - interface configuration */ - -#include "lwip.h" -#include "ifaddr.h" -#include "lldata.h" - -#include -#include - -#define LOOPBACK_IFNAME "lo0" /* name of the loopback interface */ - -/* - * Initialize the first loopback device, which is present by default. - */ -void -ifconf_init(void) -{ - const struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_addr = { htonl(INADDR_LOOPBACK) } - }; - struct sockaddr_in6 ll_addr6 = { - .sin6_family = AF_INET6, - }; - const struct sockaddr_in6 lo_addr6 = { - .sin6_family = AF_INET6, - .sin6_addr = IN6ADDR_LOOPBACK_INIT - }; - const struct in6_addrlifetime lifetime = { - .ia6t_vltime = ND6_INFINITE_LIFETIME, - .ia6t_pltime = ND6_INFINITE_LIFETIME - }; - struct sockaddr_in6 mask6; - struct ifdev *ifdev; - socklen_t addr_len; - int r; - - if ((r = ifdev_create(LOOPBACK_IFNAME)) != OK) - panic("unable to create loopback interface: %d", r); - - if ((ifdev = ifdev_find_by_name(LOOPBACK_IFNAME)) == NULL) - panic("unable to find loopback interface"); - - if ((r = ifaddr_v4_add(ifdev, &addr, NULL, NULL, NULL, 0)) != OK) - panic("unable to set IPv4 address on loopback interface: %d", - r); - - addr_len = sizeof(mask6); - addr_put_netmask((struct sockaddr *)&mask6, &addr_len, IPADDR_TYPE_V6, - 64 /*prefix*/); - - ll_addr6.sin6_addr.s6_addr[0] = 0xfe; - ll_addr6.sin6_addr.s6_addr[1] = 0x80; - ll_addr6.sin6_addr.s6_addr[15] = ifdev_get_index(ifdev); - - if ((r = ifaddr_v6_add(ifdev, &ll_addr6, &mask6, NULL, 0, - &lifetime)) != OK) - panic("unable to set IPv6 address on loopback interface: %d", - r); - - addr_len = sizeof(mask6); - addr_put_netmask((struct sockaddr *)&mask6, &addr_len, IPADDR_TYPE_V6, - 128 /*prefix*/); - - if ((r = ifaddr_v6_add(ifdev, &lo_addr6, &mask6, NULL, 0, - &lifetime)) != OK) - panic("unable to set IPv6 address on loopback interface: %d", - r); - - if ((r = ifdev_set_ifflags(ifdev, IFF_UP)) != OK) - panic("unable to bring up loopback interface"); -} - -/* - * Process an address family independent IOCTL request with an "ifreq" - * structure. - */ -static int -ifconf_ioctl_ifreq(unsigned long request, const struct sockdriver_data * data) -{ - struct ifdev *ifdev; - struct ifreq ifr; - int r; - - if ((r = sockdriver_copyin(data, 0, &ifr, sizeof(ifr))) != OK) - return r; - - if (request != SIOCIFCREATE) { - ifr.ifr_name[sizeof(ifr.ifr_name) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(ifr.ifr_name)) == NULL) - return ENXIO; - } else - ifdev = NULL; - - switch (request) { - case SIOCGIFFLAGS: - ifr.ifr_flags = ifdev_get_ifflags(ifdev); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - case SIOCSIFFLAGS: - /* - * Unfortunately, ifr_flags is a signed integer and the sign - * bit is in fact used as a flag, so without explicit casting - * we end up setting all upper bits of the (full) integer. If - * NetBSD ever extends the field, this assert should trigger.. - */ - assert(sizeof(ifr.ifr_flags) == sizeof(short)); - - return ifdev_set_ifflags(ifdev, (unsigned short)ifr.ifr_flags); - - case SIOCGIFMETRIC: - ifr.ifr_metric = ifdev_get_metric(ifdev); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - case SIOCSIFMETRIC: - /* The metric is not used within the operating system. */ - ifdev_set_metric(ifdev, ifr.ifr_metric); - - return OK; - - case SIOCSIFMEDIA: - return ifdev_set_ifmedia(ifdev, ifr.ifr_media); - - case SIOCGIFMTU: - ifr.ifr_mtu = ifdev_get_mtu(ifdev); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - case SIOCSIFMTU: - return ifdev_set_mtu(ifdev, ifr.ifr_mtu); - - case SIOCIFCREATE: - if (memchr(ifr.ifr_name, '\0', sizeof(ifr.ifr_name)) == NULL) - return EINVAL; - - return ifdev_create(ifr.ifr_name); - - case SIOCIFDESTROY: - return ifdev_destroy(ifdev); - - case SIOCGIFDLT: - ifr.ifr_dlt = ifdev_get_dlt(ifdev); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - case SIOCGIFINDEX: - ifr.ifr_index = ifdev_get_index(ifdev); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - default: - return ENOTTY; - } -} - -/* - * Process an address family independent IOCTL request with an "ifcapreq" - * structure. - */ -static int -ifconf_ioctl_ifcap(unsigned long request, - const struct sockdriver_data * data) -{ - struct ifdev *ifdev; - struct ifcapreq ifcr; - int r; - - if ((r = sockdriver_copyin(data, 0, &ifcr, sizeof(ifcr))) != OK) - return r; - - ifcr.ifcr_name[sizeof(ifcr.ifcr_name) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(ifcr.ifcr_name)) == NULL) - return ENXIO; - - switch (request) { - case SIOCSIFCAP: - return ifdev_set_ifcap(ifdev, ifcr.ifcr_capenable); - - case SIOCGIFCAP: - ifdev_get_ifcap(ifdev, &ifcr.ifcr_capabilities, - &ifcr.ifcr_capenable); - - return sockdriver_copyout(data, 0, &ifcr, sizeof(ifcr)); - - default: - return ENOTTY; - } -} - -/* - * Process an address family independent IOCTL request with an "ifmediareq" - * structure. - */ -static int -ifconf_ioctl_ifmedia(unsigned long request, - const struct sockdriver_data * data) -{ - struct ifdev *ifdev; - struct ifmediareq ifm; - int r; - - if ((r = sockdriver_copyin(data, 0, &ifm, sizeof(ifm))) != OK) - return r; - - ifm.ifm_name[sizeof(ifm.ifm_name) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(ifm.ifm_name)) == NULL) - return ENXIO; - - switch (request) { - case MINIX_SIOCGIFMEDIA: - if ((r = ifdev_get_ifmedia(ifdev, &ifm.ifm_current, - &ifm.ifm_active)) != OK) - return r; - ifm.ifm_mask = 0; - - switch (ifdev_get_link(ifdev)) { - case LINK_STATE_UP: - ifm.ifm_status = IFM_AVALID | IFM_ACTIVE; - break; - case LINK_STATE_DOWN: - ifm.ifm_status = IFM_AVALID; - break; - default: - ifm.ifm_status = 0; - break; - } - - /* - * TODO: support for the list of supported media types. This - * one is not easy, because we cannot simply suspend the IOCTL - * and query the driver. For now, return only entry (which is - * the minimum for ifconfig(8) not to complain), namely the - * currently selected one. - */ - if (ifm.ifm_ulist != NULL) { - if (ifm.ifm_count < 1) - return ENOMEM; - - /* - * Copy out the 'list', which consists of one entry. - * If we were to produce multiple entries, we would - * have to check against the MINIX_IF_MAXMEDIA limit. - */ - if ((r = sockdriver_copyout(data, - offsetof(struct minix_ifmediareq, mifm_list), - &ifm.ifm_current, sizeof(ifm.ifm_current))) != OK) - return r; - } - ifm.ifm_count = 1; - - return sockdriver_copyout(data, 0, &ifm, sizeof(ifm)); - - default: - return ENOTTY; - } -} - -/* - * Process an address family independent IOCTL request with an "if_clonereq" - * structure. - */ -static int -ifconf_ioctl_ifclone(unsigned long request, - const struct sockdriver_data * data) -{ - struct if_clonereq ifcr; - const char *ptr; - char name[IFNAMSIZ]; - size_t off; - unsigned int num; - int r; - - if ((r = sockdriver_copyin(data, 0, &ifcr, sizeof(ifcr))) != OK) - return r; - - if (ifcr.ifcr_count < 0) - return EINVAL; - - off = offsetof(struct minix_if_clonereq, mifcr_buffer); - - for (num = 0; (ptr = ifdev_enum_vtypes(num)) != NULL; num++) { - /* Prevent overflow in case we ever have over 128 vtypes.. */ - if (num == MINIX_IF_MAXCLONERS) - break; - - if (ifcr.ifcr_buffer == NULL || - num >= (unsigned int)ifcr.ifcr_count) - continue; - - memset(name, 0, sizeof(name)); - strlcpy(name, ptr, sizeof(name)); - - if ((r = sockdriver_copyout(data, off, name, - sizeof(name))) != OK) - return r; - - off += sizeof(name); - } - - ifcr.ifcr_total = num; - - return sockdriver_copyout(data, 0, &ifcr, sizeof(ifcr)); -} - -/* - * Process an address family independent IOCTL request with an "if_addrprefreq" - * structure. - */ -static int -ifconf_ioctl_ifaddrpref(unsigned long request, - const struct sockdriver_data * data) -{ - struct ifdev *ifdev; - struct if_addrprefreq ifap; - int r; - - if ((r = sockdriver_copyin(data, 0, &ifap, sizeof(ifap))) != OK) - return r; - - ifap.ifap_name[sizeof(ifap.ifap_name) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(ifap.ifap_name)) == NULL) - return ENXIO; - - /* - * For now, we simply support only a preference of 0. We do not try to - * look up the given address, nor do we return the looked up address. - */ - switch (request) { - case SIOCSIFADDRPREF: - if (ifap.ifap_preference != 0) - return EINVAL; - - return OK; - - case SIOCGIFADDRPREF: - ifap.ifap_preference = 0; - - return sockdriver_copyout(data, 0, &ifap, sizeof(ifap)); - - default: - return ENOTTY; - } -} - -/* - * Process an IOCTL request for AF_INET with an "ifreq" structure. - */ -static int -ifconf_ioctl_v4_ifreq(unsigned long request, - const struct sockdriver_data * data) -{ - struct sockaddr_in addr, mask, bcast, dest, *sin = NULL /*gcc*/; - struct ifdev *ifdev; - struct ifreq ifr; - ifaddr_v4_num_t num; - int r, flags; - - if ((r = sockdriver_copyin(data, 0, &ifr, sizeof(ifr))) != OK) - return r; - - ifr.ifr_name[sizeof(ifr.ifr_name) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(ifr.ifr_name)) == NULL) - return ENXIO; - - switch (request) { - case SIOCGIFADDR: - case SIOCGIFNETMASK: - case SIOCGIFBRDADDR: - case SIOCGIFDSTADDR: - /* Retrieve all addresses, then copy out the desired one. */ - switch (request) { - case SIOCGIFADDR: sin = &addr; break; - case SIOCGIFNETMASK: sin = &mask; break; - case SIOCGIFBRDADDR: sin = &bcast; break; - case SIOCGIFDSTADDR: sin = &dest; break; - } - - sin->sin_len = 0; - - if ((r = ifaddr_v4_get(ifdev, (ifaddr_v4_num_t)0, &addr, &mask, - &bcast, &dest)) != OK) - return r; - - if (sin->sin_len == 0) /* not filled in */ - return EADDRNOTAVAIL; - - memcpy(&ifr.ifr_addr, sin, sizeof(*sin)); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - case SIOCGIFAFLAG_IN: - if ((r = ifaddr_v4_find(ifdev, - (struct sockaddr_in *)&ifr.ifr_addr, &num)) != OK) - return r; - - ifr.ifr_addrflags = ifaddr_v4_get_flags(ifdev, num); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - case SIOCSIFADDR: - /* - * This one is slightly different from the rest, in that we - * either set or update the primary address: if we set it, we - * must let _add() generate a matching netmask automatically, - * while if we update it, _add() would fail unless we first - * delete the old entry. - */ - sin = (struct sockaddr_in *)&ifr.ifr_addr; - - if ((r = ifaddr_v4_get(ifdev, (ifaddr_v4_num_t)0, &addr, &mask, - &bcast, &dest)) == OK) { - flags = ifaddr_v4_get_flags(ifdev, (ifaddr_v4_num_t)0); - - ifaddr_v4_del(ifdev, (ifaddr_v4_num_t)0); - - /* - * If setting the new address fails, reinstating the - * old address should always work. This is really ugly - * as it generates routing socket noise, but this call - * is deprecated anyway. - */ - if ((r = ifaddr_v4_add(ifdev, sin, &mask, &bcast, - &dest, 0 /*flags*/)) != OK) - (void)ifaddr_v4_add(ifdev, &addr, &mask, - &bcast, &dest, flags); - - return r; - } else - return ifaddr_v4_add(ifdev, sin, NULL /*mask*/, - NULL /*bcast*/, NULL /*dest*/, 0 /*flags*/); - - case SIOCSIFNETMASK: - case SIOCSIFBRDADDR: - case SIOCSIFDSTADDR: - /* These calls only update the existing primary address. */ - if ((r = ifaddr_v4_get(ifdev, (ifaddr_v4_num_t)0, &addr, &mask, - &bcast, &dest)) != OK) - return r; - - sin = (struct sockaddr_in *)&ifr.ifr_addr; - - switch (request) { - case SIOCSIFNETMASK: memcpy(&mask, sin, sizeof(mask)); break; - case SIOCSIFBRDADDR: memcpy(&bcast, sin, sizeof(bcast)); break; - case SIOCSIFDSTADDR: memcpy(&dest, sin, sizeof(dest)); break; - } - - return ifaddr_v4_add(ifdev, &addr, &mask, &bcast, &dest, - ifaddr_v4_get_flags(ifdev, (ifaddr_v4_num_t)0)); - - case SIOCDIFADDR: - if ((r = ifaddr_v4_find(ifdev, - (struct sockaddr_in *)&ifr.ifr_addr, &num)) != OK) - return r; - - ifaddr_v4_del(ifdev, num); - - return OK; - - default: - return ENOTTY; - } -} - -/* - * Process an IOCTL request for AF_INET with an "ifaliasreq" structure. - */ -static int -ifconf_ioctl_v4_ifalias(unsigned long request, - const struct sockdriver_data * data) -{ - struct ifdev *ifdev; - struct ifaliasreq ifra; - struct sockaddr_in dest; - ifaddr_v4_num_t num; - int r; - - if ((r = sockdriver_copyin(data, 0, &ifra, sizeof(ifra))) != OK) - return r; - - ifra.ifra_name[sizeof(ifra.ifra_name) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(ifra.ifra_name)) == NULL) - return ENXIO; - - switch (request) { - case SIOCAIFADDR: - return ifaddr_v4_add(ifdev, - (struct sockaddr_in *)&ifra.ifra_addr, - (struct sockaddr_in *)&ifra.ifra_mask, - (struct sockaddr_in *)&ifra.ifra_broadaddr, - (struct sockaddr_in *)&ifra.ifra_dstaddr, 0 /*flags*/); - - case SIOCGIFALIAS: - if ((r = ifaddr_v4_find(ifdev, - (struct sockaddr_in *)&ifra.ifra_addr, &num)) != OK) - return r; - - /* - * The broadcast and destination address are stored in the same - * ifaliasreq field. We cannot pass a pointer to the same - * field to ifaddr_v4_get(). So, use a temporary variable. - */ - (void)ifaddr_v4_get(ifdev, num, - (struct sockaddr_in *)&ifra.ifra_addr, - (struct sockaddr_in *)&ifra.ifra_mask, - (struct sockaddr_in *)&ifra.ifra_broadaddr, &dest); - - if (ifra.ifra_broadaddr.sa_len == 0) - memcpy(&ifra.ifra_dstaddr, &dest, sizeof(dest)); - - return sockdriver_copyout(data, 0, &ifra, sizeof(ifra)); - - default: - return ENOTTY; - } -} - -/* - * Process an IOCTL request for AF_INET. - */ -static int -ifconf_ioctl_v4(unsigned long request, const struct sockdriver_data * data, - endpoint_t user_endpt) -{ - - switch (request) { - case SIOCSIFADDR: - case SIOCSIFDSTADDR: - case SIOCSIFBRDADDR: - case SIOCSIFNETMASK: - case SIOCDIFADDR: - if (!util_is_root(user_endpt)) - return EPERM; - - /* FALLTHROUGH */ - case SIOCGIFADDR: - case SIOCGIFDSTADDR: - case SIOCGIFBRDADDR: - case SIOCGIFNETMASK: - case SIOCGIFAFLAG_IN: - return ifconf_ioctl_v4_ifreq(request, data); - - case SIOCAIFADDR: - if (!util_is_root(user_endpt)) - return EPERM; - - /* FALLTHROUGH */ - case SIOCGIFALIAS: - return ifconf_ioctl_v4_ifalias(request, data); - - default: - return ENOTTY; - } -} - -#ifdef INET6 -/* - * Process an IOCTL request for AF_INET6 with an "in6_ifreq" structure. - */ -static int -ifconf_ioctl_v6_ifreq(unsigned long request, - const struct sockdriver_data * data) -{ - struct ifdev *ifdev; - struct in6_ifreq ifr; - ifaddr_v6_num_t num; - int r; - - if ((r = sockdriver_copyin(data, 0, &ifr, sizeof(ifr))) != OK) - return r; - - ifr.ifr_name[sizeof(ifr.ifr_name) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(ifr.ifr_name)) == NULL) - return ENXIO; - - if ((r = ifaddr_v6_find(ifdev, &ifr.ifr_addr, &num)) != OK) - return r; - - switch (request) { - case SIOCGIFADDR_IN6: - /* This IOCTL basically checks if the given address exists. */ - ifaddr_v6_get(ifdev, num, &ifr.ifr_addr, NULL, NULL); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - case SIOCDIFADDR_IN6: - ifaddr_v6_del(ifdev, num); - - return OK; - - case SIOCGIFNETMASK_IN6: - ifaddr_v6_get(ifdev, num, NULL, &ifr.ifr_addr, NULL); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - case SIOCGIFAFLAG_IN6: - ifr.ifr_ifru.ifru_flags6 = ifaddr_v6_get_flags(ifdev, num); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - case SIOCGIFALIFETIME_IN6: - ifaddr_v6_get_lifetime(ifdev, num, - &ifr.ifr_ifru.ifru_lifetime); - - return sockdriver_copyout(data, 0, &ifr, sizeof(ifr)); - - default: - return ENOTTY; - } -} - -/* - * Process an IOCTL request for AF_INET6 with an "in6_aliasreq" structure. - */ -static int -ifconf_ioctl_v6_ifalias(unsigned long request, - const struct sockdriver_data * data) -{ - struct ifdev *ifdev; - struct in6_aliasreq ifra; - int r; - - if ((r = sockdriver_copyin(data, 0, &ifra, sizeof(ifra))) != OK) - return r; - - ifra.ifra_name[sizeof(ifra.ifra_name) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(ifra.ifra_name)) == NULL) - return ENXIO; - - switch (request) { - case SIOCAIFADDR_IN6: - return ifaddr_v6_add(ifdev, &ifra.ifra_addr, - &ifra.ifra_prefixmask, &ifra.ifra_dstaddr, - ifra.ifra_flags, &ifra.ifra_lifetime); - - default: - return ENOTTY; - } -} - -/* - * Process an IOCTL request for AF_INET6 with an "in6_ndireq" structure. - */ -static int -ifconf_ioctl_v6_ndireq(unsigned long request, - const struct sockdriver_data * data) -{ - struct ifdev *ifdev; - struct in6_ndireq ndi; - int r; - - if ((r = sockdriver_copyin(data, 0, &ndi, sizeof(ndi))) != OK) - return r; - - ndi.ifname[sizeof(ndi.ifname) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(ndi.ifname)) == NULL) - return ENXIO; - - switch (request) { - case SIOCGIFINFO_IN6: - memset(&ndi.ndi, 0, sizeof(ndi.ndi)); - - ndi.ndi.linkmtu = ifdev_get_mtu(ifdev); - ndi.ndi.flags = ifdev_get_nd6flags(ifdev); - ndi.ndi.initialized = 1; - /* TODO: all the other fields.. */ - - return sockdriver_copyout(data, 0, &ndi, sizeof(ndi)); - - case SIOCSIFINFO_IN6: - /* TODO: all the other fields.. */ - - /* FALLTHROUGH */ - case SIOCSIFINFO_FLAGS: - return ifdev_set_nd6flags(ifdev, ndi.ndi.flags); - - default: - return ENOTTY; - } -} - -/* - * Process an IOCTL request for AF_INET6 with an "in6_nbrinfo" structure. - */ -static int -ifconf_ioctl_v6_nbrinfo(unsigned long request, - const struct sockdriver_data * data) -{ - struct ifdev *ifdev; - struct sockaddr_in6 addr; - struct in6_nbrinfo nbri; - lldata_ndp_num_t num; - int r; - - if ((r = sockdriver_copyin(data, 0, &nbri, sizeof(nbri))) != OK) - return r; - - nbri.ifname[sizeof(nbri.ifname) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(nbri.ifname)) == NULL) - return ENXIO; - - switch (request) { - case SIOCGNBRINFO_IN6: - /* - * Convert the given in6_addr to a full sockaddr_in6, mainly - * for internal consistency. It would have been nice if the - * KAME management API had had any sort of consistency itself. - */ - memset(&addr, 0, sizeof(addr)); - addr.sin6_family = AF_INET6; - memcpy(&addr.sin6_addr.s6_addr, &nbri.addr, - sizeof(addr.sin6_addr.s6_addr)); - - if ((r = lldata_ndp_find(ifdev, &addr, &num)) != OK) - return r; - - lldata_ndp_get_info(num, &nbri.asked, &nbri.isrouter, - &nbri.state, &nbri.expire); - - return sockdriver_copyout(data, 0, &nbri, sizeof(nbri)); - - default: - return ENOTTY; - } -} - -/* - * Process an IOCTL request for AF_INET6. - */ -static int -ifconf_ioctl_v6(unsigned long request, const struct sockdriver_data * data, - endpoint_t user_endpt) -{ - - switch (request) { - case SIOCDIFADDR_IN6: - if (!util_is_root(user_endpt)) - return EPERM; - - /* FALLTHROUGH */ - case SIOCGIFADDR_IN6: - case SIOCGIFNETMASK_IN6: - case SIOCGIFAFLAG_IN6: - case SIOCGIFALIFETIME_IN6: - return ifconf_ioctl_v6_ifreq(request, data); - - case SIOCAIFADDR_IN6: - if (!util_is_root(user_endpt)) - return EPERM; - - return ifconf_ioctl_v6_ifalias(request, data); - - case SIOCSIFINFO_IN6: - case SIOCSIFINFO_FLAGS: - if (!util_is_root(user_endpt)) - return EPERM; - - /* FALLTHROUGH */ - case SIOCGIFINFO_IN6: - return ifconf_ioctl_v6_ndireq(request, data); - - case SIOCGNBRINFO_IN6: - return ifconf_ioctl_v6_nbrinfo(request, data); - - default: - return ENOTTY; - } -} -#endif /* INET6 */ - -/* - * Process an IOCTL request for AF_LINK with an "if_laddrreq" structure. - */ -static int -ifconf_ioctl_dl_lifaddr(unsigned long request, - const struct sockdriver_data * data) -{ - struct ifdev *ifdev; - struct if_laddrreq iflr; - ifaddr_dl_num_t num; - int r; - - if ((r = sockdriver_copyin(data, 0, &iflr, sizeof(iflr))) != OK) - return r; - - iflr.iflr_name[sizeof(iflr.iflr_name) - 1] = '\0'; - - if ((ifdev = ifdev_find_by_name(iflr.iflr_name)) == NULL) - return ENXIO; - - switch (request) { - case SIOCGLIFADDR: - if (iflr.flags & IFLR_PREFIX) { - /* We ignore the prefix length, like NetBSD does. */ - if ((r = ifaddr_dl_find(ifdev, - (struct sockaddr_dlx *)&iflr.addr, - sizeof(iflr.addr), &num)) != OK) - return r; - } else - num = (ifaddr_dl_num_t)0; /* this always works */ - - ifaddr_dl_get(ifdev, num, (struct sockaddr_dlx *)&iflr.addr); - iflr.flags = ifaddr_dl_get_flags(ifdev, num); - memset(&iflr.dstaddr, 0, sizeof(iflr.dstaddr)); - - return sockdriver_copyout(data, 0, &iflr, sizeof(iflr)); - - case SIOCALIFADDR: - return ifaddr_dl_add(ifdev, (struct sockaddr_dlx *)&iflr.addr, - sizeof(iflr.addr), iflr.flags); - - case SIOCDLIFADDR: - if ((r = ifaddr_dl_find(ifdev, - (struct sockaddr_dlx *)&iflr.addr, sizeof(iflr.addr), - &num)) != OK) - return r; - - return ifaddr_dl_del(ifdev, num); - - default: - return ENOTTY; - } -} - -/* - * Process an IOCTL request for AF_LINK. - */ -static int -ifconf_ioctl_dl(unsigned long request, const struct sockdriver_data * data, - endpoint_t user_endpt) -{ - - switch (request) { - case SIOCALIFADDR: - case SIOCDLIFADDR: - if (!util_is_root(user_endpt)) - return EPERM; - - /* FALLTHROUGH */ - case SIOCGLIFADDR: - return ifconf_ioctl_dl_lifaddr(request, data); - - default: - return ENOTTY; - } -} - -/* - * Process an IOCTL request. This routine is shared between TCP, UDP, RAW, and - * link sockets. The given socket may be used to obtain the target domain: - * AF_INET, AF_INET6, or AF_LINK. - */ -int -ifconf_ioctl(struct sock * sock, unsigned long request, - const struct sockdriver_data * data, endpoint_t user_endpt) -{ - int domain; - - domain = sockevent_get_domain(sock); - - switch (request) { - case SIOCSIFFLAGS: - case SIOCSIFMETRIC: - case SIOCSIFMEDIA: - case SIOCSIFMTU: - case SIOCIFCREATE: - case SIOCIFDESTROY: - if (!util_is_root(user_endpt)) - return EPERM; - - /* FALLTHROUGH */ - case SIOCGIFFLAGS: - case SIOCGIFMETRIC: - case SIOCGIFMTU: - case SIOCGIFDLT: - case SIOCGIFINDEX: - return ifconf_ioctl_ifreq(request, data); - - case SIOCSIFCAP: - if (!util_is_root(user_endpt)) - return EPERM; - - /* FALLTHROUGH */ - case SIOCGIFCAP: - return ifconf_ioctl_ifcap(request, data); - - case MINIX_SIOCGIFMEDIA: - return ifconf_ioctl_ifmedia(request, data); - - case MINIX_SIOCIFGCLONERS: - return ifconf_ioctl_ifclone(request, data); - - case SIOCSIFADDRPREF: - if (!util_is_root(user_endpt)) - return EPERM; - - /* FALLTHROUGH */ - case SIOCGIFADDRPREF: - return ifconf_ioctl_ifaddrpref(request, data); - - default: - switch (domain) { - case AF_INET: - return ifconf_ioctl_v4(request, data, user_endpt); - -#ifdef INET6 - case AF_INET6: - return ifconf_ioctl_v6(request, data, user_endpt); -#endif /* INET6 */ - - case AF_LINK: - return ifconf_ioctl_dl(request, data, user_endpt); - - default: - return ENOTTY; - } - } -} diff --git a/minix/net/lwip/ifdev.c b/minix/net/lwip/ifdev.c deleted file mode 100644 index 1808a2a6a..000000000 --- a/minix/net/lwip/ifdev.c +++ /dev/null @@ -1,1064 +0,0 @@ -/* LWIP service - ifdev.c - network interface devices */ - -#include "lwip.h" -#include "mcast.h" -#include "ifaddr.h" -#include "rtsock.h" -#include "route.h" -#include "bpfdev.h" - -#include - -/* - * The highest possible interface index number, plus one. We currently let - * lwIP choose the interface index. lwIP will generate a number between 1 and - * 255 inclusive. For efficiency, we use an array to look up an interface - * device object by its index. Thus, this array must be large enough to be - * indexed by the largest possible index number generated by lwIP. lwIP uses - * an unsigned 8-bit field to store the index number. - */ -#define MAX_IFDEV (UINT8_MAX + 1) - -/* The table is indexed by the interface index minus one. */ -static struct ifdev *ifdev_table[MAX_IFDEV]; /* index-based lookup table */ - -static TAILQ_HEAD(, ifdev) ifdev_list; /* list of active interfaces */ - -static struct ifdev *ifdev_loopback; /* loopback interface */ - -/* - * The maximum number of virtual interface types--that is, interface types for - * which interfaces may be created and destroyed dynamically. The BSDs call - * these "clones". There should be enough slots for all types, which are - * registered by their respective modules through ifdev_register(). Increase - * as necessary. - */ -#define MAX_VTYPE 4 - -static struct { - const char *ifvt_name; /* interface name without digits (e.g. "lo") */ - size_t ifvt_namelen; /* length of the name, excluding null term. */ - int (*ifvt_create)(const char *); /* ifdev create function */ -} ifdev_vtype[MAX_VTYPE]; - -static unsigned int ifdev_vtypes; /* number of in-use vtype slots */ - -#define IFDEV_MIN_MTU 1280 /* minimum interface MTU, required by IPv6 */ - -/* - * Initialize the network interface devices module. This call must be issued - * before any virtual interfaces are initialized, because the virtual types - * array is initialized here. - */ -void -ifdev_init(void) -{ - - memset(ifdev_table, 0, sizeof(ifdev_table)); - - TAILQ_INIT(&ifdev_list); - - memset(ifdev_vtype, 0, sizeof(ifdev_vtype)); - ifdev_vtypes = 0; -} - -/* - * Check all active interfaces to see if any tasks need to be performed. This - * function is called as part of each message loop iteration. - */ -void -ifdev_poll(void) -{ - struct ifdev *ifdev; - - /* - * Call the polling function of the active interfaces. Note that - * interfaces may not remove themselves as a result of polling! - */ - TAILQ_FOREACH(ifdev, &ifdev_list, ifdev_next) { - if (ifdev->ifdev_ops->iop_poll != NULL) - ifdev->ifdev_ops->iop_poll(ifdev); - } -} - -/* - * Handle an incoming packet on an interface. This function assumes ownership - * of the packet buffers: the caller must no longer refer to it afterward. For - * packets looped back for a non-loopback interface, 'ifdev' is the loopback - * interface and 'netif' is the original (non-loopback) interface's netif. For - * other packets, 'ifdev' is the actual interface and 'netif' is NULL. The - * packet is passed to BPF devices only if 'to_bpf' is set. - */ -void -ifdev_input(struct ifdev * ifdev, struct pbuf * pbuf, struct netif * netif, - int to_bpf) -{ - struct bpfdev_link *bpfl; - err_t err; - - /* - * Looped-back packets are captured on the loopback device, not on the - * original interface. Similarly, we account the traffic to the - * loopback interface. This is a policy decision (inspired by NetBSD's - * behavior) and may be changed later. - */ - if (to_bpf) { - TAILQ_FOREACH(bpfl, &ifdev->ifdev_bpf, bpfl_next) - bpfdev_input(bpfl, pbuf); - } - - ifdev->ifdev_data.ifi_ipackets++; - ifdev->ifdev_data.ifi_ibytes += pbuf->tot_len; - - if (pbuf->flags & PBUF_FLAG_LLMCAST) - ifdev->ifdev_data.ifi_imcasts++; - - /* - * For looped-back packets, we must bypass the regular netif input - * function (as that one is for link-layer packet handling) and instead - * pass it directly to the IP-layer packet handling function of lwIP. - */ - if (netif != NULL) - err = ip_input(pbuf, netif); - else - err = ifdev->ifdev_netif.input(pbuf, &ifdev->ifdev_netif); - - if (err != ERR_OK) - pbuf_free(pbuf); -} - -/* - * Handle an outgoing packet on an interface. Return ERR_OK if the packet was - * transmitted or another lwIP ERR_ error code upon failure. Either way, the - * caller is responsible for freeing the packet buffers. If the packet is - * to be looped back to a non-loopback interface (because its destination is a - * local address), 'ifdev' is the loopback interface and 'netif' is set to the - * original interface's netif. In all other cases, 'ifdev' is the packet's - * source interface and 'netif' is NULL. The packet is passed to attached BPF - * devices only if 'to_bpf' is set. If 'hdrcmplt' is set, the source address - * of the data link header is already filled in; otherwise, the source address - * must be set to the device's source address, if applicable. - */ -err_t -ifdev_output(struct ifdev * ifdev, struct pbuf * pbuf, struct netif * netif, - int to_bpf, int hdrcmplt) -{ - struct bpfdev_link *bpfl; - - /* - * If the interface and/or the link is down, discard the packet without - * reporting it to BPF or the actual interface module. - */ - if (!ifdev_is_up(ifdev) || !ifdev_is_link_up(ifdev)) - return ERR_IF; /* this should translate to ENETDOWN */ - - /* - * If the link-layer header is not yet complete, fill in the source - * address now. This exception applies to BPF-generated packets only. - * Complete the header before passing the packet back to BPF, which - * should see the completed version of the packet. - */ - if (!hdrcmplt && ifdev->ifdev_ops->iop_hdrcmplt != NULL) - ifdev->ifdev_ops->iop_hdrcmplt(ifdev, pbuf); - - /* - * As in ifdev_input(), we use the loopback interface for BPF and - * statistics even if the packet originates from a non-loopback device. - */ - if (to_bpf) { - TAILQ_FOREACH(bpfl, &ifdev->ifdev_bpf, bpfl_next) - bpfdev_output(bpfl, pbuf); - } - - ifdev->ifdev_data.ifi_opackets++; - ifdev->ifdev_data.ifi_obytes += pbuf->tot_len; - - /* - * TODO: this is rather imprecise, because it works only when we set - * the pbuf flag explicitly ourselves. That happens only for UDP/RAW - * packets, and not for (e.g.) ND6 multicast traffic. We have reasons - * to set the flags ourselves anyway, namely to support MSG_MCAST and - * MSG_BCAST on loopback interfaces, but they should be complemented by - * additional checks here on, say, the destination ethernet address. - */ - if (pbuf->flags & PBUF_FLAG_LLMCAST) - ifdev->ifdev_data.ifi_omcasts++; - - return ifdev->ifdev_ops->iop_output(ifdev, pbuf, netif); -} - -/* - * Transmit an IPv4 packet on an interface, as requested by lwIP. Pass on the - * packet to the interface's link processor (e.g., etharp), unless the packet - * should be rejected or blackholed according to route information, or it is to - * be looped back into the interface. The latter may occur if the destination - * address belongs to the interface. In that case, we send the packet over a - * loopback interface instead. In addition, if this is a multicast packet that - * should be looped back, send a copy over a loopback interface as well. - * Loopback interfaces themselves are exempt from these special cases. - */ -static err_t -ifdev_output_v4(struct netif * netif, struct pbuf * pbuf, - const ip4_addr_t * ipaddr) -{ - struct ifdev *ifdev = netif_get_ifdev(netif); - err_t err; - - assert(ifdev_loopback != NULL); - - /* Check for reject/blackhole routes. */ - if (!route_output_v4(ifdev, ipaddr, &err)) - return err; - - /* Handle looping of multicast packets on non-loopback interfaces. */ - if (!ifdev_is_loopback(ifdev) && (pbuf->flags & PBUF_FLAG_MCASTLOOP)) - (void)ifdev_output(ifdev_loopback, pbuf, netif, - FALSE /*to_bpf*/, TRUE /*hdrcmplt*/); - - /* Divert packets sent to the local interface address. */ - if (!ifdev_is_loopback(ifdev) && ifdev->ifdev_v4set && - ip4_addr_cmp(netif_ip4_addr(&ifdev->ifdev_netif), ipaddr)) - ifdev = ifdev_loopback; - else - netif = NULL; - - if (ifdev->ifdev_ops->iop_output_v4 != NULL) - return ifdev->ifdev_ops->iop_output_v4(ifdev_get_netif(ifdev), - pbuf, ipaddr); - else - return ifdev_output(ifdev, pbuf, netif, TRUE /*to_bpf*/, - TRUE /*hdrcmplt*/); -} - -/* - * Transmit an IPv6 packet on an interface, as requested by lwIP. As for IPv4. - */ -static err_t -ifdev_output_v6(struct netif * netif, struct pbuf * pbuf, - const ip6_addr_t * ipaddr) -{ - struct ifdev *ifdev = netif_get_ifdev(netif); - err_t err; - - assert(ifdev_loopback != NULL); - - /* Check for reject/blackhole routes. */ - if (!route_output_v6(ifdev, ipaddr, &err)) - return err; - - /* Handle looping of multicast packets on non-loopback interfaces. */ - if (!ifdev_is_loopback(ifdev) && (pbuf->flags & PBUF_FLAG_MCASTLOOP)) - (void)ifdev_output(ifdev_loopback, pbuf, netif, - FALSE /*to_bpf*/, TRUE /*hdrcmplt*/); - - /* Divert packets sent to the local interface address. */ - if (!ifdev_is_loopback(ifdev) && - (netif_get_ip6_addr_match(&ifdev->ifdev_netif, ipaddr) != -1 || - ip6_addr_ismulticast_iflocal(ipaddr))) - ifdev = ifdev_loopback; - else - netif = NULL; - - if (ifdev->ifdev_ops->iop_output_v6 != NULL) - return ifdev->ifdev_ops->iop_output_v6(ifdev_get_netif(ifdev), - pbuf, ipaddr); - else - return ifdev_output(ifdev, pbuf, netif, TRUE /*to_bpf*/, - TRUE /*hdrcmplt*/); -} - -/* - * Status callback function, called by lwIP whenever certain status changes are - * made on the netif. These changes may be initiated either by lwIP itself or - * by us. We use this callback to check lwIP-initiated state changes on local - * IPv6 addresses, using shadow state to filter out self-initiated changes. - * - * One day we might switch to the extended netif callback mechanism offered by - * lwIP. Currently, netif state changes are rare and it takes us little effort - * to find out whether anything changed, so there is no immediate need. - */ -static void -ifdev_status_callback(struct netif * netif) -{ - struct ifdev *ifdev = netif_get_ifdev(netif); - - ifaddr_v6_check(ifdev); -} - -/* - * Initialize the netif structure for a new interface. Most of this is handled - * by the specific interface module. - */ -static err_t -ifdev_init_netif(struct netif * netif) -{ - struct ifdev *ifdev = netif_get_ifdev(netif); - - assert(ifdev != NULL); - - netif->output = ifdev_output_v4; - netif->output_ip6 = ifdev_output_v6; - - netif->hwaddr_len = ifdev->ifdev_data.ifi_addrlen; - netif->mtu = ifdev->ifdev_data.ifi_mtu; - - netif_set_status_callback(netif, ifdev_status_callback); - - return ifdev->ifdev_ops->iop_init(ifdev, netif); -} - -/* - * Retrieve an interface device by its interface index. Return a pointer to - * the interface device if found, or NULL otherwise. If the given interface - * index is zero, this function will always return NULL. - */ -struct ifdev * -ifdev_get_by_index(uint32_t ifindex) -{ - - if (ifindex >= __arraycount(ifdev_table)) - return NULL; - - return ifdev_table[ifindex]; -} - -/* - * Find an interface device by its name. Return a pointer to the interface - * device if found, or NULL otherwise. - */ -struct ifdev * -ifdev_find_by_name(const char * name) -{ - struct ifdev *ifdev; - - TAILQ_FOREACH(ifdev, &ifdev_list, ifdev_next) { - if (!strcmp(ifdev->ifdev_name, name)) - return ifdev; - } - - return NULL; -} - -/* - * Given either NULL or a previously returned interface device object pointer, - * return the first or next interface device object pointer, or NULL if there - * are no more. - */ -struct ifdev * -ifdev_enum(struct ifdev * last) -{ - - if (last == NULL) - return TAILQ_FIRST(&ifdev_list); - else - return TAILQ_NEXT(last, ifdev_next); -} - -/* - * Attach a BPF device as listener to this interface. - */ -void -ifdev_attach_bpf(struct ifdev * ifdev, struct bpfdev_link * bpfl) -{ - - TAILQ_INSERT_TAIL(&ifdev->ifdev_bpf, bpfl, bpfl_next); -} - -/* - * Detach a previously attached BPF device from this interface. - */ -void -ifdev_detach_bpf(struct ifdev * ifdev, struct bpfdev_link * bpfl) -{ - - TAILQ_REMOVE(&ifdev->ifdev_bpf, bpfl, bpfl_next); -} - -/* - * Register the calling party as interested in putting the interface in - * promiscuous mode. There may be multiple such parties, each of which can - * call this function once, after which they must call ifdev_clear_promisc() - * later. If possible, the interface is put in promiscuous mode if there is at - * least one interested party. Return TRUE on success, or FALSE on failure. - */ -int -ifdev_set_promisc(struct ifdev * ifdev) -{ - - /* - * A bit silly, but we want to retain the ability to fail this call for - * other reasons in the future, with BPF handling that case properly. - */ - if (ifdev->ifdev_promisc == UINT_MAX) - return FALSE; - - if (ifdev->ifdev_promisc++ == 0) { - ifdev_update_ifflags(ifdev, - ifdev->ifdev_ifflags | IFF_PROMISC); - - if (ifdev->ifdev_ops->iop_set_promisc != NULL) - ifdev->ifdev_ops->iop_set_promisc(ifdev, TRUE); - } - - return TRUE; -} - -/* - * Deregister a previously registered party interested in putting the interface - * in promiscuous mode. Once the last party deregisters, the device is pulled - * out of promiscuous mode. - */ -void -ifdev_clear_promisc(struct ifdev * ifdev) -{ - - assert(ifdev->ifdev_promisc > 0); - - if (--ifdev->ifdev_promisc == 0) { - if (ifdev->ifdev_ops->iop_set_promisc != NULL) - ifdev->ifdev_ops->iop_set_promisc(ifdev, FALSE); - - ifdev_update_ifflags(ifdev, - ifdev->ifdev_ifflags & ~IFF_PROMISC); - } -} - -/* - * Set NetBSD-style interface flags (IFF_) for an interface. - */ -int -ifdev_set_ifflags(struct ifdev * ifdev, unsigned int ifflags) -{ - int r; - - /* Check and update only the subset of flags that may be changed. */ - ifflags &= ~(IFF_CANTCHANGE | IFF_LOOPBACK); - - /* - * Important: the callback function may call ifdev_update_ifflags() - * itself immediately, to update read-only flags such as IFF_RUNNING - * based on read-write flags such as IFF_UP. So as to make that work.. - * - * 1) this function MUST succeed if the callback function succeeds; - * 2) this function MUST NOT make assumptions about the ifdev_ifflags - * field across the callback invocation. - * - * Conversely, the callback function should be aware that the flags - * field will still be updated with the flags. In this model, it is - * not possible for the callback function to silently change any of the - * given flags. If that is ever necessary, API changes are needed. - */ - if ((r = ifdev->ifdev_ops->iop_set_ifflags(ifdev, ifflags)) != OK) - return r; - - /* - * On success, merge the updated subset with the subset that may not be - * changed. - */ - ifflags |= ifdev->ifdev_ifflags & (IFF_CANTCHANGE | IFF_LOOPBACK); - - ifdev_update_ifflags(ifdev, ifflags); - - return OK; -} - -/* - * Update NetBSD-style interface flags (IFF_) for an interface, and perform any - * required operations as a result of certain flags changing. This function - * bypasses all input checks and directly changes the flags field to exactly - * the given set of flags. - */ -void -ifdev_update_ifflags(struct ifdev * ifdev, unsigned int ifflags) -{ - struct netif *netif; - - /* - * First update the flags field itself. The new value should be - * visible in the routing messages generated below, for example. - */ - ifdev->ifdev_ifflags = ifflags; - - /* - * Then perform operations as a result of the flags field changing. - * For now, this is relevant for IFF_UP only. - */ - netif = ifdev_get_netif(ifdev); - - if ((ifflags & IFF_UP) && !netif_is_up(netif)) { - netif_set_up(netif); - - rtsock_msg_ifinfo(ifdev); - - /* - * Check if all conditions are now met for link-local IPv6 - * address assignment. - */ - ifaddr_v6_set_linklocal(ifdev); - - /* See if we should also reset address states now. */ - if (netif_is_link_up(netif)) - ifaddr_v6_set_up(ifdev); - } else if (!(ifflags & IFF_UP) && netif_is_up(netif)) { - netif_set_down(netif); - - rtsock_msg_ifinfo(ifdev); - } -} - -/* - * Retrieve NetBSD-style interface capabilities (IFCAP_) for an interface: both - * the supported and the enabled capabilities. - */ -void -ifdev_get_ifcap(struct ifdev * ifdev, uint64_t * ifcap, uint64_t * ifena) -{ - - *ifcap = 0; - *ifena = 0; - - if (ifdev->ifdev_ops->iop_get_ifcap != NULL) - ifdev->ifdev_ops->iop_get_ifcap(ifdev, ifcap, ifena); -} - -/* - * Set enabled NetBSD-style interface capabilities (IFCAP_) for an interface. - */ -int -ifdev_set_ifcap(struct ifdev * ifdev, uint64_t ifena) -{ - - if (ifdev->ifdev_ops->iop_set_ifcap != NULL) - return ifdev->ifdev_ops->iop_set_ifcap(ifdev, ifena); - else - return EINVAL; -} - -/* - * Retrieve NetBSD-style media type (IFM_) for an interface. Return OK on - * success, with the current media type selection stored in 'ifcurrent', the - * driver-reported active media type in 'ifactive', and the link status in - * 'ifstatus'. Return a negative error code on failure. - */ -int -ifdev_get_ifmedia(struct ifdev * ifdev, int * ifcurrent, int * ifactive) -{ - - if (ifdev->ifdev_ops->iop_get_ifmedia == NULL) - return ENOTTY; - - ifdev->ifdev_ops->iop_get_ifmedia(ifdev, ifcurrent, ifactive); - - return OK; -} - -/* - * Set NetBSD-style media type (IFM_) for an interface. Return OK on success, - * or a negative error code on failure. - */ -int -ifdev_set_ifmedia(struct ifdev * ifdev, int ifmedia) -{ - - if (ifdev->ifdev_ops->iop_set_ifmedia == NULL) - return ENOTTY; - - if (ifmedia < 0) - return EINVAL; - - return ifdev->ifdev_ops->iop_set_ifmedia(ifdev, ifmedia); -} - -/* - * Set the Maximum Transmission Unit for an interface. Return OK on success, - * or a negative error code on failure. - */ -int -ifdev_set_mtu(struct ifdev * ifdev, unsigned int mtu) -{ - - if (ifdev->ifdev_ops->iop_set_mtu == NULL) - return ENOTTY; - - if (mtu < IFDEV_MIN_MTU || mtu > UINT16_MAX || - !ifdev->ifdev_ops->iop_set_mtu(ifdev, mtu)) - return EINVAL; - - ifdev->ifdev_data.ifi_mtu = mtu; - ifdev->ifdev_netif.mtu = mtu; - - return OK; -} - -/* - * Set IPv6 Neighbor Discovery related flags. - */ -int -ifdev_set_nd6flags(struct ifdev * ifdev, uint32_t nd6flags) -{ - - /* For now, refuse setting any flags that are not even known. */ - if ((nd6flags & ~(ND6_IFF_PERFORMNUD | ND6_IFF_ACCEPT_RTADV | - ND6_IFF_IFDISABLED | ND6_IFF_OVERRIDE_RTADV | - ND6_IFF_AUTO_LINKLOCAL)) != 0) - return EINVAL; - - /* - * Unfortunately, the mismatch between NetBSD and lwIP requires us to - * support but butcher ND6 flags. The current status is as follows: - * - * - ND6_IFF_PERFORMNUD: set by default as lwIP always implements NUD; - * changes are disregarded but possible, for dhcpcd(8). - * - ND6_IFF_ACCEPT_RTADV: disregarded but settable, for dhcpcd(8); in - * our case, lwIP always processes router advertisements but never - * autoconfigures addresses, so this flag has no meaning for us. - * - ND6_IFF_IFDISABLED: not supported; can only be cleared; we could - * probably do detection of link-local address collision and set this - * flag (and disable the interface if set) when that happens; TODO. - * - ND6_IFF_OVERRIDE_RTADV: same as _ACCEPT_ above. - * - ND6_IFF_AUTO_LINKLOCAL: supported, but not initialized based on - * the corresponding sysctl(7) flag for reasons mentioned in ifaddr. - */ - if (nd6flags & ND6_IFF_IFDISABLED) - return EINVAL; - - ifdev->ifdev_nd6flags = nd6flags; - - return OK; -} - -/* - * Report an update to the interface's active hardware address that is *not* - * the result of a user action. If the 'is_factory' flag is set, the address - * is the factory (driver-given) address. This function is for use by - * interface modules, to update the internal state to their current external - * state. - */ -void -ifdev_update_hwaddr(struct ifdev * ifdev, const uint8_t * hwaddr, - int is_factory) -{ - - return ifaddr_dl_update(ifdev, hwaddr, is_factory); -} - -/* - * Insert a new interface device into the list of interface devices, at a - * location determined by policy. - */ -static void -ifdev_insert(struct ifdev * ifdev) -{ - struct ifdev *ifdev2; - const char *p; - unsigned int unit, unit2; - size_t namelen; - int found; - - /* - * While NetBSD can set up all interfaces in the order it wants them to - * appear in, we do not have such luxury: network device drivers come - * up and report to us in no particular predefined order, and we have - * no way to know how many and which will appear. The result is that - * we always have to create the loopback device first, something that - * is explicitly said to be bad in NetBSD. Instead, we create an - * illusion of a reasonable order by performing insertion sort on the - * interface list, using (for now) these rules, ordered by priority: - * - * 1. same-named devices are sorted by their unit number; - * 2. loopback interfaces are inserted after all other interfaces; - * 3. new devices are added at the end of their type category. - * - * In the future, other forms of real-vs-virtual sorting may be added. - */ - - /* First check for same-named devices (#1). */ - for (p = ifdev->ifdev_name; *p != '\0' && (*p < '0' || *p > '9'); p++); - - namelen = (size_t)(p - ifdev->ifdev_name); - - for (unit = 0; *p >= '0' && *p <= '9'; p++) - unit = unit * 10 + *p - '0'; - - found = FALSE; - TAILQ_FOREACH(ifdev2, &ifdev_list, ifdev_next) { - if (!strncmp(ifdev->ifdev_name, ifdev2->ifdev_name, namelen) && - *(p = &ifdev2->ifdev_name[namelen]) >= '0' && *p <= '9') { - for (unit2 = 0; *p >= '0' && *p <= '9'; p++) - unit2 = unit2 * 10 + *p - '0'; - - assert(unit != unit2); - - found = TRUE; - if (unit2 > unit) - break; - } else if (found) - break; - } - - if (found) { - if (ifdev2 != NULL) - TAILQ_INSERT_BEFORE(ifdev2, ifdev, ifdev_next); - else - TAILQ_INSERT_TAIL(&ifdev_list, ifdev, ifdev_next); - - return; - } - - /* - * No same-named device found. Is this a loopback interface? If not, - * insert before the first loopback device, if any. - */ - if (!ifdev_is_loopback(ifdev)) { - TAILQ_FOREACH(ifdev2, &ifdev_list, ifdev_next) { - if (ifdev_is_loopback(ifdev2)) { - TAILQ_INSERT_BEFORE(ifdev2, ifdev, ifdev_next); - - return; - } - } - } - - /* - * The given device is not a loopback device, or there was no loopback - * device in the list, possibly because it was empty. Add to the tail. - */ - TAILQ_INSERT_TAIL(&ifdev_list, ifdev, ifdev_next); -} - -/* - * Add and initialize an interface device. - */ -void -ifdev_add(struct ifdev * ifdev, const char * name, unsigned int ifflags, - unsigned int iftype, size_t hdrlen, size_t addrlen, unsigned int dlt, - unsigned int mtu, uint32_t nd6flags, const struct ifdev_ops * iop) -{ - unsigned int ifindex; - ip4_addr_t ip4addr_any, ip4addr_none; - - /* - * Since the call to netif_add() may end up invoking some of our - * callbacks (the add-multicast-address ones in particular), make sure - * that everything else is set up first. We cannot set up the index - * mapping until netif_add() returns, but this is currently no problem. - */ - strlcpy(ifdev->ifdev_name, name, sizeof(ifdev->ifdev_name)); - ifdev->ifdev_ifflags = 0; /* will be updated below */ - ifdev->ifdev_dlt = dlt; - ifdev->ifdev_nd6flags = nd6flags; - ifdev->ifdev_ops = iop; - - memset(&ifdev->ifdev_data, 0, sizeof(ifdev->ifdev_data)); - - assert(addrlen <= NETIF_MAX_HWADDR_LEN); - assert(mtu >= IFDEV_MIN_MTU && mtu <= UINT16_MAX); - - ifdev->ifdev_data.ifi_type = iftype; - ifdev->ifdev_data.ifi_hdrlen = hdrlen; - ifdev->ifdev_data.ifi_addrlen = addrlen; - ifdev->ifdev_data.ifi_link_state = LINK_STATE_UNKNOWN; - ifdev->ifdev_data.ifi_mtu = mtu; - - TAILQ_INIT(&ifdev->ifdev_bpf); - - ifaddr_init(ifdev); - - /* - * We have to assign an IPv4 address at netif addition time, but we may - * not have one yet, so pass in an "any" address for now. Hopefully - * lwIP will not mistake this for a real IPv4 address if we happen to - * enable the interface with only an IPv6 address later on. - */ - ip4_addr_set_any(&ip4addr_any); - ip4_addr_set_u32(&ip4addr_none, PP_HTONL(INADDR_NONE)); - - /* - * Insert the new interface device into a sensible place in the current - * list of interfaces. - */ - ifdev_insert(ifdev); - - /* - * netif_add() can fail only as a result of the initialization callback - * failing, which is something that should never happen in our case. - */ - if (netif_add(&ifdev->ifdev_netif, &ip4addr_any, &ip4addr_none, - &ip4addr_any, ifdev, ifdev_init_netif, iop->iop_input) == NULL) - panic("unable to add netif"); - - /* - * Set up the index mapping. Since interface index zero never - * generated, table slot zero is always NULL. We could shift all - * elements by one to save four bytes, but there's no real point. - */ - ifindex = netif_get_index(&ifdev->ifdev_netif); - - if (ifindex == 0 || ifindex >= __arraycount(ifdev_table)) - panic("invalid lwIP-generated interface index %u", ifindex); - - ifdev_table[ifindex] = ifdev; - - /* - * Set the initial interface flags. Use the regular procedure for this - * just in case the interface module is crazy enough to set the - * interface up right away (which is never a good idea but still). - */ - ifdev_update_ifflags(ifdev, ifflags); - - /* - * If this is the first loopback interface to be registered, save it as - * the loopback interface that we will use to loop back self-destined - * packets on other interfaces. Do this after setting the interface - * flags, since those are what we use to perform this loopback check. - */ - if (ifdev_loopback == NULL && ifdev_is_loopback(ifdev)) - ifdev_loopback = ifdev; - - /* Finally, announce the new interface. */ - rtsock_msg_ifannounce(ifdev, TRUE /*arrival*/); -} - -/* - * Remove an interface device. Return OK on success, or a negative error code - * on failure. Only loopback interfaces may be refused for removal. - */ -int -ifdev_remove(struct ifdev * ifdev) -{ - struct bpfdev_link *bpfl; - - /* - * If this is the loopback interface used to loop back packets for - * other interfaces (typically lo0), we cannot afford to get rid of it. - */ - if (ifdev == ifdev_loopback) - return EPERM; - - /* - * Take down the interface for the purpose of sending a routing - * message. NetBSD sends a RTM_IFINFO even if the interface was down - * already, and so we do not check whether IFF_UP was set at all here. - */ - ifdev_update_ifflags(ifdev, ifdev->ifdev_ifflags & ~IFF_UP); - - /* - * Report all associated addresses as deleted. It is not necessary to - * actually delete the addresses, nor is that even possible in all - * cases. In particular, the active hardware address cannot be - * deleted. Since the active hardware address is used in all address - * change announcements, delete it at the very end. - */ - ifaddr_v4_clear(ifdev); - ifaddr_v6_clear(ifdev); - ifaddr_dl_clear(ifdev); - - /* - * Delete all remaining routes associated with the interface. These - * are reported as well. We do this after clearing the addresses so as - * not to confuse the route deletion part of clearing addresses. - */ - route_clear(ifdev); - - /* Finally, announce the interface itself as gone. */ - rtsock_msg_ifannounce(ifdev, FALSE /*arrival*/); - - /* - * Free up all per-socket multicast membership structures associated to - * the interface. There is no need to leave the multicast groups. - */ - mcast_clear(ifdev); - - /* - * Also tell attached BPF devices that the interface is now gone. Do - * not bother to reset the list. - */ - TAILQ_FOREACH(bpfl, &ifdev->ifdev_bpf, bpfl_next) - bpfdev_detach(bpfl); - - /* Then perform the actual interface removal. */ - netif_remove(&ifdev->ifdev_netif); - - TAILQ_REMOVE(&ifdev_list, ifdev, ifdev_next); - - assert(ifdev_table[ifdev_get_index(ifdev)] == ifdev); - ifdev_table[ifdev_get_index(ifdev)] = NULL; - - return OK; -} - -/* - * Return the loopback interface. - */ -struct ifdev * -ifdev_get_loopback(void) -{ - - assert(ifdev_loopback != NULL); - - return ifdev_loopback; -} - -/* - * Report an update of the link state of the given interface, to 'unknown', - * 'up', or 'down', using NetBSD's LINK_STATE_ values. The link state is - * changed in the associated lwIP netif, and is reported on monitoring routing - * sockets. This function is for use by interface modules, to update the - * internal state to their current external state. - */ -void -ifdev_update_link(struct ifdev * ifdev, int iflink) -{ - struct netif *netif; - int was_up, is_up; - - ifdev->ifdev_data.ifi_link_state = iflink; - - /* - * For netif, 'up' and 'unknown' are the same link state: we simply try - * to send and receive packets in both cases. Thus, transitions from - * and to the 'down' link state are the ones that matter. - */ - netif = ifdev_get_netif(ifdev); - - was_up = netif_is_link_up(netif); - is_up = (iflink != LINK_STATE_DOWN); - - if (was_up != is_up) { - if (is_up) { - netif_set_link_up(netif); - - /* See if we should also reset address states now. */ - if (ifdev_is_up(ifdev)) - ifaddr_v6_set_up(ifdev); - } else - netif_set_link_down(netif); - - rtsock_msg_ifinfo(ifdev); - } -} - -/* - * Register a virtual interface type, using a name prefix and a function that - * is called when creation of a virtual interface of that type is requested. - */ -void -ifdev_register(const char * name, int (* create)(const char *)) -{ - - if (ifdev_vtypes == __arraycount(ifdev_vtype)) - panic("too few slots for all virtual interface types"); - - ifdev_vtype[ifdev_vtypes].ifvt_name = name; - ifdev_vtype[ifdev_vtypes].ifvt_namelen = strlen(name); - ifdev_vtype[ifdev_vtypes].ifvt_create = create; - ifdev_vtypes++; -} - -/* - * Verify that the given name is a valid interface name that can be used for - * creating a new interface. In particular, check that the given name is a - * valid interface name, consisting of an alphabetic string (the interface type - * or driver name) followed by a number string (the unit or instance number). - * Furthermore, make sure that the name does not already exist. Finally, see - * if the name prefix is reserved for a virtual interface type. If the given - * 'vtype_slot' pointer is not NULL, the prefix must be, and the virtual type - * slot number is returned in 'vtype_slot' on success. If 'vtype_slot' is - * NULL, the name must not have a virtual interface prefix, and an error is - * returned if it is. Since vtype slot numbers are meaningless outside of this - * module, external callers must always pass in NULL. This function returns OK - * on succes or a negative error code on error. - */ -int -ifdev_check_name(const char * name, unsigned int * vtype_slot) -{ - const char *p; - size_t namelen; - unsigned int slot; - - /* - * First see if the name is valid at all. TODO: decide if we want to - * allow uppercase letters, dashes, and/or underscores. - */ - for (p = name; *p >= 'a' && *p <= 'z'; p++); - - if (p == name || *p == '\0') - return EINVAL; - - namelen = (size_t)(p - name); - - for (; *p >= '0' && *p <= '9'; p++); - - if (*p != '\0') - return EINVAL; - - /* Then make sure that it does not already exist. */ - if (ifdev_find_by_name(name) != NULL) - return EEXIST; - - /* See if there is a matching virtual interface type for the name. */ - for (slot = 0; slot < ifdev_vtypes; slot++) { - if (ifdev_vtype[slot].ifvt_namelen == namelen && - !strncmp(ifdev_vtype[slot].ifvt_name, name, namelen)) - break; - } - - /* The interpretation of the result depends on 'vtype_slot'. */ - if (vtype_slot != NULL) { - if (slot == ifdev_vtypes) - return EINVAL; - - *vtype_slot = slot; - } else if (slot != ifdev_vtypes) - return EINVAL; - - return OK; -} - -/* - * Create a new virtual interface. The virtual interface type is based on the - * given name (without unit number). Return OK if the virtual interface has - * been successfully created, or a negative error code otherwise. This - * function is used both for the SIOCIFCREATE ioctl and internally. - */ -int -ifdev_create(const char * name) -{ - unsigned int slot; - int r; - - /* Verify that the given name is an acceptable interface name. */ - if ((r = ifdev_check_name(name, &slot)) != OK) - return EINVAL; - - /* Let the virtual interface implementation handle the rest. */ - return ifdev_vtype[slot].ifvt_create(name); -} - -/* - * Destroy an interface, if possible. - */ -int -ifdev_destroy(struct ifdev * ifdev) -{ - - if (ifdev->ifdev_ops->iop_destroy == NULL) - return EINVAL; - - return ifdev->ifdev_ops->iop_destroy(ifdev); -} - -/* - * Enumerate the names of currently supported virtual interface types. Return - * a pointer to the null-terminated name prefix of the Nth virtual interface - * type if the (zero-based) N value is within range, or NULL otherwise. - */ -const char * -ifdev_enum_vtypes(unsigned int num) -{ - - if (num < ifdev_vtypes) - return ifdev_vtype[num].ifvt_name; - else - return NULL; -} diff --git a/minix/net/lwip/ifdev.h b/minix/net/lwip/ifdev.h deleted file mode 100644 index 16206f906..000000000 --- a/minix/net/lwip/ifdev.h +++ /dev/null @@ -1,155 +0,0 @@ -#ifndef MINIX_NET_LWIP_IFDEV_H -#define MINIX_NET_LWIP_IFDEV_H - -#include -#include -#include -#include - -/* - * NetBSD makes setting a hardware address through ifconfig(8) a whole lot - * harder than it needs to be, namely by keeping a list of possible hardware - * addresses and marking one of them as active. For us, that level of extra - * flexibility is completely useless. In order to shield individual interface - * modules from having to deal with the rather extended interface for the list - * management, we maintain the list in ifdev and simply use a iop_set_hwaddr() - * call to the modules when the active address changes. This setting is the - * maximum number of hardware addresses in the list maintained by ifdev. It - * should be at least 2, or changing hardware addresses will not be possible. - */ -#define IFDEV_NUM_HWADDRS 3 - -struct ifdev; -struct bpfdev_link; -struct sockaddr_dlx; - -/* Interface operations table. */ -struct ifdev_ops { - err_t (* iop_init)(struct ifdev * ifdev, struct netif * netif); - err_t (* iop_input)(struct pbuf * pbuf, struct netif * netif); - err_t (* iop_output)(struct ifdev * ifdev, struct pbuf * pbuf, - struct netif * netif); - err_t (* iop_output_v4)(struct netif * netif, struct pbuf * pbuf, - const ip4_addr_t * ipaddr); - err_t (* iop_output_v6)(struct netif * netif, struct pbuf * pbuf, - const ip6_addr_t * ipaddr); - void (* iop_hdrcmplt)(struct ifdev * ifdev, struct pbuf * pbuf); - void (* iop_poll)(struct ifdev * ifdev); - int (* iop_set_ifflags)(struct ifdev * ifdev, unsigned int ifflags); - void (* iop_get_ifcap)(struct ifdev * ifdev, uint64_t * ifcap, - uint64_t * ifena); - int (* iop_set_ifcap)(struct ifdev * ifdev, uint64_t ifcap); - void (* iop_get_ifmedia)(struct ifdev * ifdev, int * ifcurrent, - int * ifactive); - int (* iop_set_ifmedia)(struct ifdev * ifdev, int ifmedia); - void (* iop_set_promisc)(struct ifdev * ifdev, int promisc); - int (* iop_set_hwaddr)(struct ifdev * ifdev, const uint8_t * hwaddr); - int (* iop_set_mtu)(struct ifdev * ifdev, unsigned int mtu); - int (* iop_destroy)(struct ifdev * ifdev); -}; - -/* Hardware address list entry. The first entry, if any, is the active one. */ -struct ifdev_hwaddr { - uint8_t ifhwa_addr[NETIF_MAX_HWADDR_LEN]; - uint8_t ifhwa_flags; -}; -#define IFHWAF_VALID 0x01 /* entry contains an address */ -#define IFHWAF_FACTORY 0x02 /* factory (device-given) address */ - -/* Interface structure. */ -struct ifdev { - TAILQ_ENTRY(ifdev) ifdev_next; /* list of active interfaces */ - char ifdev_name[IFNAMSIZ]; /* interface name, null terminated */ - unsigned int ifdev_ifflags; /* NetBSD-style interface flags */ - unsigned int ifdev_dlt; /* data link type (DLT_) */ - unsigned int ifdev_promisc; /* number of promiscuity requestors */ - struct netif ifdev_netif; /* lwIP interface structure */ - struct if_data ifdev_data; /* NetBSD-style interface data */ - char ifdev_v4set; /* interface has an IPv4 address? */ - uint8_t ifdev_v6prefix[LWIP_IPV6_NUM_ADDRESSES]; /* IPv6 prefixes */ - uint8_t ifdev_v6flags[LWIP_IPV6_NUM_ADDRESSES]; /* v6 address flags */ - uint8_t ifdev_v6state[LWIP_IPV6_NUM_ADDRESSES]; /* v6 shadow states */ - uint8_t ifdev_v6scope[LWIP_IPV6_NUM_ADDRESSES]; /* cached v6 scopes */ - struct ifdev_hwaddr ifdev_hwlist[IFDEV_NUM_HWADDRS]; /* HW addr's */ - uint32_t ifdev_nd6flags; /* ND6-related flags (ND6_IFF_) */ - const struct ifdev_ops *ifdev_ops; /* interface operations table */ - TAILQ_HEAD(, bpfdev_link) ifdev_bpf; /* list of attached BPF devices */ -}; - -#define ifdev_get_name(ifdev) ((ifdev)->ifdev_name) -#define ifdev_get_ifflags(ifdev) ((ifdev)->ifdev_ifflags) -#define ifdev_get_dlt(ifdev) ((ifdev)->ifdev_dlt) -#define ifdev_is_promisc(ifdev) ((ifdev)->ifdev_promisc != 0) -#define ifdev_get_netif(ifdev) (&(ifdev)->ifdev_netif) -#define ifdev_get_nd6flags(ifdev) ((ifdev)->ifdev_nd6flags) -#define ifdev_get_iftype(ifdev) ((ifdev)->ifdev_data.ifi_type) -#define ifdev_get_hwlen(ifdev) ((ifdev)->ifdev_data.ifi_addrlen) -#define ifdev_get_hdrlen(ifdev) ((ifdev)->ifdev_data.ifi_hdrlen) -#define ifdev_get_link(ifdev) ((ifdev)->ifdev_data.ifi_link_state) -#define ifdev_get_mtu(ifdev) ((ifdev)->ifdev_data.ifi_mtu) -#define ifdev_get_metric(ifdev) ((ifdev)->ifdev_data.ifi_metric) -#define ifdev_get_ifdata(ifdev) (&(ifdev)->ifdev_data) -#define ifdev_is_loopback(ifdev) ((ifdev)->ifdev_ifflags & IFF_LOOPBACK) -#define ifdev_is_up(ifdev) ((ifdev)->ifdev_ifflags & IFF_UP) -#define ifdev_is_link_up(ifdev) (netif_is_link_up(&(ifdev)->ifdev_netif)) -#define ifdev_set_metric(ifdev, metric) \ - ((void)((ifdev)->ifdev_data.ifi_metric = (metric))) -#define ifdev_get_index(ifdev) \ - ((uint32_t)(netif_get_index(ifdev_get_netif(ifdev)))) - -#define ifdev_output_drop(ifdev) ((ifdev)->ifdev_data.ifi_oerrors++) - -#define netif_get_ifdev(netif) ((struct ifdev *)(netif)->state) - -void ifdev_init(void); -void ifdev_poll(void); - -void ifdev_register(const char * name, int (* create)(const char *)); - -void ifdev_input(struct ifdev * ifdev, struct pbuf * pbuf, - struct netif * netif, int to_bpf); -err_t ifdev_output(struct ifdev * ifdev, struct pbuf * pbuf, - struct netif * netif, int to_bpf, int hdrcmplt); - -void ifdev_attach_bpf(struct ifdev * ifdev, struct bpfdev_link * bpfl); -void ifdev_detach_bpf(struct ifdev * ifdev, struct bpfdev_link * bpfl); - -struct ifdev *ifdev_get_by_index(uint32_t ifindex); -struct ifdev *ifdev_find_by_name(const char * name); -struct ifdev *ifdev_enum(struct ifdev * last); - -int ifdev_check_name(const char * name, unsigned int * vtype_slot); - -int ifdev_set_promisc(struct ifdev * ifdev); -void ifdev_clear_promisc(struct ifdev * ifdev); - -int ifdev_set_ifflags(struct ifdev * ifdev, unsigned int ifflags); -void ifdev_update_ifflags(struct ifdev * ifdev, unsigned int ifflags); - -void ifdev_get_ifcap(struct ifdev * ifdev, uint64_t * ifcap, - uint64_t * ifena); -int ifdev_set_ifcap(struct ifdev * ifdev, uint64_t ifena); - -int ifdev_get_ifmedia(struct ifdev * ifdev, int * ifcurrent, int * ifactive); -int ifdev_set_ifmedia(struct ifdev * ifdev, int ifmedia); - -int ifdev_set_mtu(struct ifdev * ifdev, unsigned int mtu); - -int ifdev_set_nd6flags(struct ifdev * ifdev, uint32_t nd6flags); - -void ifdev_add(struct ifdev * ifdev, const char * name, unsigned int ifflags, - unsigned int iftype, size_t hdrlen, size_t addrlen, unsigned int dlt, - unsigned int mtu, uint32_t nd6flags, const struct ifdev_ops * iop); -int ifdev_remove(struct ifdev * ifdev); - -struct ifdev *ifdev_get_loopback(void); - -void ifdev_update_link(struct ifdev * ifdev, int link); -void ifdev_update_hwaddr(struct ifdev * ifdev, const uint8_t * hwaddr, - int is_factory); - -int ifdev_create(const char * name); -int ifdev_destroy(struct ifdev * ifdev); -const char *ifdev_enum_vtypes(unsigned int num); - -#endif /* !MINIX_NET_LWIP_IFDEV_H */ diff --git a/minix/net/lwip/lldata.c b/minix/net/lwip/lldata.c deleted file mode 100644 index 3867c3985..000000000 --- a/minix/net/lwip/lldata.c +++ /dev/null @@ -1,584 +0,0 @@ -/* LWIP service - lldata.c - link-layer (ARP, NDP) data related routines */ -/* - * This module is largely isolated from the regular routing code. There are - * two reasons for that. First, mixing link-layer routes with regular routes - * would not work well due to the fact that lwIP keeps these data structures - * entirely separate. Second, as of version 8, NetBSD keeps the IP-layer and - * link-layer routing separate as well. - * - * Unfortunately, lwIP does not provide much in the way of implementing the - * functionality that would be expected for this module. As such, the current - * implementation is very restricted and simple. - * - * For ARP table entries, lwIP only allows for adding and deleting static - * entries. Non-static entries cannot be deleted. Incomplete (pending) - * entries cannot even be enumerated, nor can (e.g.) expiry information be - * obtained. The lwIP ARP datastructures are completely hidden, so there is no - * way to overcome these limitations without changing lwIP itself. As a - * result, not all functionality of the arp(8) userland utility is supported. - * - * For NDP table entries, lwIP offers no API at all. However, since the data - * structures are exposed directly, we can use those to implement full support - * for exposing information in a read-only way. However, manipulating data - * structures directly from here is too risky, nor does lwIP currently support - * the concept of static NDP table entries. Therefore, adding, changing, and - * deleting NDP entries is currently not supported, and will also first require - * changes to lwIP itself. - * - * The ndp(8) userland utility is also able to show and manipulate various - * other neighbor discovery related tables and settings. We support only a - * small subset of them. The main reason for this is that the other tables, - * in particular the prefix and default router lists, are not relevant: on - * MINIX 3, these are always managed fully in userland (usually dhcpcd(8)), and - * we even hardcode lwIP not to parse Router Advertisement messages at all, so - * even though those tables are still part of lwIP, they are always empty. - * Other ndp(8) functionality are unsupported for similar reasons. - */ - -#include "lwip.h" -#include "lldata.h" -#include "route.h" -#include "rtsock.h" - -#include "lwip/etharp.h" -#include "lwip/nd6.h" -#include "lwip/priv/nd6_priv.h" /* for neighbor_cache */ - -/* - * Process a routing command specifically for an ARP table entry. Return OK if - * the routing command has been processed successfully and a routing socket - * reply message has already been generated. Return a negative error code on - * failure, in which case the caller will generate a reply message instead. - */ -static int -lldata_arp_process(unsigned int type, const ip_addr_t * dst_addr, - const struct eth_addr * gw_addr, struct ifdev * ifdev, - unsigned int flags, const struct rtsock_request * rtr) -{ - const ip4_addr_t *ip4addr; - struct eth_addr ethaddr, *ethptr; - struct netif *netif; - lldata_arp_num_t num; - err_t err; - - netif = (ifdev != NULL) ? ifdev_get_netif(ifdev) : NULL; - - num = etharp_find_addr(netif, ip_2_ip4(dst_addr), ðptr, &ip4addr); - - if (type != RTM_ADD && num < 0) - return ESRCH; - else if (type == RTM_ADD && num >= 0) - return EEXIST; - - switch (type) { - case RTM_CHANGE: - /* - * This request is not used by arp(8), so keep things simple. - * For RTM_ADD we support only static entries; we support only - * those too here, and thus we can use delete-and-readd. If - * the ethernet address is not being changed, try readding the - * entry with the previous ethernet address. - */ - if (gw_addr == NULL) - gw_addr = ethptr; - - if (etharp_remove_static_entry(ip_2_ip4(dst_addr)) != ERR_OK) - return EPERM; - - /* FALLTHROUGH */ - case RTM_ADD: - assert(gw_addr != NULL); - - memcpy(ðaddr, gw_addr, sizeof(ethaddr)); - - /* - * Adding static, permanent, unpublished, non-proxy entries is - * all that lwIP supports right now. We also do not get to - * specify the interface, and the way lwIP picks the interface - * may in fact result in a different one. - */ - if ((err = etharp_add_static_entry(ip_2_ip4(dst_addr), - ðaddr)) != ERR_OK) - return util_convert_err(err); - - if ((num = etharp_find_addr(NULL /*netif*/, ip_2_ip4(dst_addr), - ðptr, &ip4addr)) < 0) - panic("unable to find just-added static ARP entry"); - - /* FALLTHROUGH */ - case RTM_LOCK: - case RTM_GET: - rtsock_msg_arp(num, type, rtr); - - return OK; - - case RTM_DELETE: - memcpy(ðaddr, ethptr, sizeof(ethaddr)); - - if (etharp_remove_static_entry(ip_2_ip4(dst_addr)) != ERR_OK) - return EPERM; - - /* - * FIXME: the following block is a hack, because we cannot - * predict whether the above removal will succeed, while at the - * same time we need the entry to be present in order to report - * the deleted address to the routing socket. We temporarily - * readd and then remove the entry just for the purpose of - * generating the routing socket reply. There are other ways - * to resolve this, but only a better lwIP etharp API would - * allow us to resolve this problem cleanly. - */ - (void)etharp_add_static_entry(ip_2_ip4(dst_addr), ðaddr); - - num = etharp_find_addr(NULL /*netif*/, ip_2_ip4(dst_addr), - ðptr, &ip4addr); - assert(num >= 0); - - rtsock_msg_arp(num, type, rtr); - - (void)etharp_remove_static_entry(ip_2_ip4(dst_addr)); - - return OK; - - default: - return EINVAL; - } -} - -/* - * Enumerate ARP table entries. Return TRUE if there is at least one more ARP - * table entry, of which the number is stored in 'num'. The caller should set - * 'num' to 0 initially, and increase it by one between a successful call and - * the next call. Return FALSE if there are no more ARP table entries. - */ -int -lldata_arp_enum(lldata_arp_num_t * num) -{ - ip4_addr_t *ip4addr; - struct netif *netif; - struct eth_addr *ethaddr; - - for (; *num < ARP_TABLE_SIZE; ++*num) { - if (etharp_get_entry(*num, &ip4addr, &netif, ðaddr)) - return TRUE; - } - - return FALSE; -} - -/* - * Obtain information about the ARP table entry identified by 'num'. The IPv4 - * address of the entry is stored in 'addr'. Its ethernet address is stored in - * 'gateway'. The associated interface is stored in 'ifdevp', and the entry's - * routing flags (RTF_) are stored in 'flagsp'. - */ -void -lldata_arp_get(lldata_arp_num_t num, struct sockaddr_in * addr, - struct sockaddr_dlx * gateway, struct ifdev ** ifdevp, - unsigned int * flagsp) -{ - ip_addr_t ipaddr; - ip4_addr_t *ip4addr; - struct netif *netif; - struct ifdev *ifdev; - struct eth_addr *ethaddr; - socklen_t addr_len; - - if (!etharp_get_entry(num, &ip4addr, &netif, ðaddr)) - panic("request for invalid ARP entry"); - - ip_addr_copy_from_ip4(ipaddr, *ip4addr); - - assert(netif != NULL); - ifdev = netif_get_ifdev(netif); - - addr_len = sizeof(*addr); - - addr_put_inet((struct sockaddr *)addr, &addr_len, &ipaddr, - TRUE /*kame*/, 0 /*port*/); - - addr_len = sizeof(*gateway); - - addr_put_link((struct sockaddr *)gateway, &addr_len, - ifdev_get_index(ifdev), ifdev_get_iftype(ifdev), NULL /*name*/, - ethaddr->addr, sizeof(ethaddr->addr)); - - *ifdevp = ifdev; - - /* - * TODO: this is not necessarily accurate, but lwIP does not provide us - * with information as to whether this is a static entry or not.. - */ - *flagsp = RTF_HOST | RTF_LLINFO | RTF_LLDATA | RTF_STATIC | RTF_CLONED; -} - -/* - * Obtain information about the ND6 neighbor cache entry 'i', which must be a - * number between 0 (inclusive) and LWIP_ND6_NUM_NEIGHBORS (exclusive). If an - * entry with this number exists, return a pointer to its IPv6 address, and - * additional information in each of the given pointers if not NULL. The - * associated interface is stored in 'netif'. If the entry has an associated - * link-layer address, a pointer to it is stored in 'lladdr'. The entry's - * state (ND6_{INCOMPLETE,REACHABLE,STALE,DELAY,PROBE}) is stored in 'state'. - * The 'isrouter' parameter is filled with a boolean value indicating whether - * the entry is for a router. For ND6_INCOMPLETE and ND6_PROBE, the number of - * probes sent so far is stored in 'probes_sent'; for other states, the value - * is set to zero. For ND6_REACHABLE and ND6_DELAY, the time until expiration - * in ND6_TMR_INTERVAL-millisecond units is stored in 'expire_time'; for other - * states, the value is set to zero. If an entry with number 'i' does not - * exist, NULL is returned. - * - * TODO: upstream this function to lwIP. - */ -static const ip6_addr_t * -nd6_get_neighbor_cache_entry(int8_t i, struct netif ** netif, - const uint8_t ** lladdr, uint8_t * state, uint8_t * isrouter, - uint32_t * probes_sent, uint32_t * expire_time) -{ - - if (i < 0 || i >= LWIP_ND6_NUM_NEIGHBORS || - neighbor_cache[i].state == ND6_NO_ENTRY) - return NULL; - - if (netif != NULL) - *netif = neighbor_cache[i].netif; - - if (lladdr != NULL) { - if (neighbor_cache[i].state != ND6_INCOMPLETE) - *lladdr = neighbor_cache[i].lladdr; - else - *lladdr = NULL; - } - - if (state != NULL) - *state = neighbor_cache[i].state; - - if (isrouter != NULL) - *isrouter = neighbor_cache[i].isrouter; - - if (probes_sent != NULL) { - if (neighbor_cache[i].state == ND6_INCOMPLETE || - neighbor_cache[i].state == ND6_PROBE) - *probes_sent = neighbor_cache[i].counter.probes_sent; - else - *probes_sent = 0; - } - - if (expire_time != NULL) { - switch (neighbor_cache[i].state) { - case ND6_REACHABLE: - *expire_time = - neighbor_cache[i].counter.reachable_time / - ND6_TMR_INTERVAL; - break; - case ND6_DELAY: - *expire_time = neighbor_cache[i].counter.delay_time; - break; - case ND6_INCOMPLETE: - case ND6_PROBE: - /* Probes are sent once per timer tick. */ - *expire_time = (LWIP_ND6_MAX_MULTICAST_SOLICIT + 1 - - neighbor_cache[i].counter.probes_sent) * - (ND6_TMR_INTERVAL / 1000); - break; - default: - /* Stale entries do not expire; they get replaced. */ - *expire_time = 0; - break; - } - } - - return &neighbor_cache[i].next_hop_address; -} - -/* - * Find a neighbor cache entry by IPv6 address. Return its index number if - * found, or -1 if not. This is a reimplementation of the exact same function - * internal to lwIP. - * - * TODO: make this function public in lwIP. - */ -static int8_t -nd6_find_neighbor_cache_entry(const ip6_addr_t * addr) -{ - int8_t i; - - for (i = 0; i < LWIP_ND6_NUM_NEIGHBORS; i++) { - if (ip6_addr_cmp(addr, &neighbor_cache[i].next_hop_address)) - return i; - } - - return -1; -} - -/* - * Find an NDP table entry based on the given interface and IPv6 address. On - * success, return OK, with the entry's index number stored in 'nump'. On - * failure, return an appropriate error code. - */ -int -lldata_ndp_find(struct ifdev * ifdev, const struct sockaddr_in6 * addr, - lldata_ndp_num_t * nump) -{ - ip_addr_t ipaddr; - int8_t i; - int r; - - if ((r = addr_get_inet((const struct sockaddr *)addr, sizeof(*addr), - IPADDR_TYPE_V6, &ipaddr, TRUE /*kame*/, NULL /*port*/)) != OK) - return r; - - /* - * For given link-local addresses, no zone may be provided in the - * address at all. In such cases, add the zone ourselves, using the - * given interface. - */ - if (ip6_addr_lacks_zone(ip_2_ip6(&ipaddr), IP6_UNKNOWN)) - ip6_addr_assign_zone(ip_2_ip6(&ipaddr), IP6_UNKNOWN, - ifdev_get_netif(ifdev)); - - i = nd6_find_neighbor_cache_entry(ip_2_ip6(&ipaddr)); - if (i < 0) - return ESRCH; - - /* - * We should compare the neighbor cache entry's associated netif to - * the given ifdev, but since the lwIP neighbor cache is currently not - * keyed by netif anyway (i.e. the internal lookups are purely by IPv6 - * address as well), doing so makes little sense in practice. - */ - - *nump = (lldata_ndp_num_t)i; - return OK; -} - -/* - * Process a routing command specifically for an NDP table entry. Return OK if - * the routing command has been processed successfully and a routing socket - * reply message has already been generated. Return a negative error code on - * failure, in which case the caller will generate a reply message instead. - */ -static int -lldata_ndp_process(unsigned int type, const ip_addr_t * dst_addr, - const struct eth_addr * gw_addr, - struct ifdev * ifdev, unsigned int flags, - const struct rtsock_request * rtr) -{ - lldata_ndp_num_t num; - - num = (lldata_ndp_num_t) - nd6_find_neighbor_cache_entry(ip_2_ip6(dst_addr)); - - if (type != RTM_ADD && num < 0) - return ESRCH; - else if (type == RTM_ADD && num >= 0) - return EEXIST; - - switch (type) { - case RTM_LOCK: - case RTM_GET: - rtsock_msg_arp(num, type, rtr); - - return OK; - - case RTM_ADD: - case RTM_CHANGE: - case RTM_DELETE: - /* TODO: add lwIP support to implement these commands. */ - return ENOSYS; - - default: - return EINVAL; - } -} - -/* - * Enumerate NDP table entries. Return TRUE if there is at least one more NDP - * table entry, of which the number is stored in 'num'. The caller should set - * 'num' to 0 initially, and increase it by one between a successful call and - * the next call. Return FALSE if there are no more NDP table entries. - */ -int -lldata_ndp_enum(lldata_ndp_num_t * num) -{ - - for (; *num < LWIP_ND6_NUM_NEIGHBORS; ++*num) { - if (nd6_get_neighbor_cache_entry(*num, NULL /*netif*/, - NULL /*lladdr*/, NULL /*state*/, NULL /*isrouter*/, - NULL /*probes_sent*/, NULL /*expire_time*/) != NULL) - return TRUE; - } - - return FALSE; -} - -/* - * Obtain information about the NDP table entry identified by 'num'. The IPv6 - * address of the entry is stored in 'addr'. Its ethernet address is stored in - * 'gateway'. The associated interface is stored in 'ifdevp', and the entry's - * routing flags (RTF_) are stored in 'flagsp'. - */ -void -lldata_ndp_get(lldata_ndp_num_t num, struct sockaddr_in6 * addr, - struct sockaddr_dlx * gateway, struct ifdev ** ifdevp, - unsigned int * flagsp) -{ - const ip6_addr_t *ip6addr; - ip_addr_t ipaddr; - struct netif *netif = NULL /*gcc*/; - struct ifdev *ifdev; - const uint8_t *lladdr = NULL /*gcc*/; - socklen_t addr_len; - - ip6addr = nd6_get_neighbor_cache_entry(num, &netif, &lladdr, - NULL /*state*/, NULL /*isrouter*/, NULL /*probes_sent*/, - NULL /*expire_time*/); - assert(ip6addr != NULL); - - ip_addr_copy_from_ip6(ipaddr, *ip6addr); - - ifdev = netif_get_ifdev(netif); - assert(ifdev != NULL); - - addr_len = sizeof(*addr); - - addr_put_inet((struct sockaddr *)addr, &addr_len, &ipaddr, - TRUE /*kame*/, 0 /*port*/); - - addr_len = sizeof(*gateway); - - addr_put_link((struct sockaddr *)gateway, &addr_len, - ifdev_get_index(ifdev), ifdev_get_iftype(ifdev), NULL /*name*/, - lladdr, ifdev_get_hwlen(ifdev)); - - *ifdevp = ifdev; - *flagsp = RTF_HOST | RTF_LLINFO | RTF_LLDATA | RTF_CLONED; -} - -/* - * Obtain information about the NDP table entry with the number 'num', which - * must be obtained through a previous call to lldata_ndp_find(). On return, - * 'asked' is filled with the number of probes sent so far (0 if inapplicable), - * 'isrouter' is set to 1 or 0 depending on whether the entry is for a router, - * 'state' is set to the entry's state (ND6_LLINFO_), and 'expire' is set to - * either the UNIX timestamp of expiry for the entry; 0 for permanent entries. - * None of the given pointers must be NULL. This function always succeeds. - */ -void -lldata_ndp_get_info(lldata_ndp_num_t num, long * asked, int * isrouter, - int * state, int * expire) -{ - uint32_t nd6_probes_sent = 0 /*gcc*/, nd6_expire_time = 0 /*gcc*/; - uint8_t nd6_state = 0 /*gcc*/, nd6_isrouter = 0 /*gcc*/; - - (void)nd6_get_neighbor_cache_entry(num, NULL /*netif*/, - NULL /*lladdr*/, &nd6_state, &nd6_isrouter, &nd6_probes_sent, - &nd6_expire_time); - - *asked = (long)nd6_probes_sent; - - *isrouter = !!nd6_isrouter; - - switch (nd6_state) { - case ND6_INCOMPLETE: *state = ND6_LLINFO_INCOMPLETE; break; - case ND6_REACHABLE: *state = ND6_LLINFO_REACHABLE; break; - case ND6_STALE: *state = ND6_LLINFO_STALE; break; - case ND6_DELAY: *state = ND6_LLINFO_DELAY; break; - case ND6_PROBE: *state = ND6_LLINFO_PROBE; break; - default: panic("unknown ND6 state %u", nd6_state); - } - - if (nd6_expire_time != 0) - *expire = clock_time(NULL) + - (int)nd6_expire_time * (ND6_TMR_INTERVAL / 1000); - else - *expire = 0; -} - -/* - * Process a routing command specifically for a link-layer route, as one of the - * specific continuations of processing started by route_process(). The RTM_ - * routing command is given as 'type'. The route destination is given as - * 'dst_addr'; its address type determines whether the operation is for ARP or - * NDP. The sockaddr structure for 'gateway' is passed on as is and may have - * to be parsed here if not NULL. 'ifdev' is the interface to be associated - * with the route; it is non-NULL only if an interface name (IFP) or address - * (IFA) was given. The RTF_ flags field has been checked against the globally - * supported flags, but may have to be checked for flags that do not apply to - * ARP/NDP routes. Return OK or a negative error code, following the same - * semantics as route_process(). - */ -int -lldata_process(unsigned int type, const ip_addr_t * dst_addr, - const struct sockaddr * gateway, struct ifdev * ifdev, - unsigned int flags, const struct rtsock_request * rtr) -{ - const struct route_entry *route; - struct eth_addr ethaddr, *gw_addr; - int r; - - assert(flags & RTF_LLDATA); - - /* - * It seems that RTF_UP does not apply to link-layer routing entries. - * We basically accept any flags that we can return, but we do not - * actually check most of them anywhere. - */ - if ((flags & ~(RTF_HOST | RTF_LLINFO | RTF_LLDATA | RTF_STATIC | - RTF_CLONED | RTF_ANNOUNCE)) != 0) - return EINVAL; - - gw_addr = NULL; - - if (type == RTM_ADD || type == RTM_CHANGE) { - /* - * Link-layer entries are always host entries. Not all - * requests pass in this flag though, so check only when the - * flags are supposed to be set. - */ - if ((type == RTM_ADD || type == RTM_CHANGE) && - !(flags & RTF_HOST)) - return EINVAL; - - /* lwIP does not support publishing custom entries. */ - if (flags & RTF_ANNOUNCE) - return ENOSYS; - - /* RTF_GATEWAY is always cleared for link-layer entries. */ - if (gateway != NULL) { - if ((r = addr_get_link(gateway, gateway->sa_len, - NULL /*name*/, 0 /*name_max*/, ethaddr.addr, - sizeof(ethaddr.addr))) != OK) - return r; - - gw_addr = ðaddr; - } - - if (type == RTM_ADD) { - if (gateway == NULL) - return EINVAL; - - /* - * If no interface has been specified, see if the - * destination address is on a locally connected - * network. If so, use that network's interface. - * Otherwise reject the request altogether: we must - * have an interface to which to associate the entry. - */ - if (ifdev == NULL) { - if ((route = route_lookup(dst_addr)) != NULL && - !(route_get_flags(route) & RTF_GATEWAY)) - ifdev = route_get_ifdev(route); - else - return ENETUNREACH; - } - } - } - - if (IP_IS_V4(dst_addr)) - return lldata_arp_process(type, dst_addr, gw_addr, ifdev, - flags, rtr); - else - return lldata_ndp_process(type, dst_addr, gw_addr, ifdev, - flags, rtr); -} diff --git a/minix/net/lwip/loopif.c b/minix/net/lwip/loopif.c deleted file mode 100644 index db21db09a..000000000 --- a/minix/net/lwip/loopif.c +++ /dev/null @@ -1,420 +0,0 @@ -/* LWIP service - loopif.c - loopback interfaces */ -/* - * There is always at least one loopback device. This device is used also to - * loop back packets sent on other interfaces to the local interface address. - * Therefore, not all packets on the loopback device have a source or - * destination address corresponding to the loopback device. - */ - -#include "lwip.h" - -/* - * As a safety measure, if lwIP somehow gets stuck in a loop replying to its - * own packets on a loopback interface, stop with immediately feeding packets - * back into lwIP after this many packets. The remaining packets will still be - * delivered, but not before the main message loop has had a chance to run. - */ -#define LOOPIF_LIMIT 65536 - -/* - * The MTU is restricted to 65531 bytes, because we need space for a 4-byte - * header to identify the original interface of the packet. - */ -#define LOOPIF_MAX_MTU (UINT16_MAX - sizeof(uint32_t)) /* maximum MTU */ -#define LOOPIF_DEF_MTU LOOPIF_MAX_MTU /* default MTU */ - -#define NR_LOOPIF 2 /* number of loopback devices */ - -struct loopif { - struct ifdev loopif_ifdev; /* interface device, MUST be first */ - struct pbuf *loopif_head; /* head of pending loopback packets */ - struct pbuf **loopif_tailp; /* tail ptr-ptr of pending packets */ - TAILQ_ENTRY(loopif) loopif_next; /* next in free list */ -} loopif_array[NR_LOOPIF]; - -static TAILQ_HEAD(, loopif) loopif_freelist; /* free loop interfaces list */ -static TAILQ_HEAD(, loopif) loopif_activelist; /* active loop interfaces */ - -#define loopif_get_netif(loopif) (ifdev_get_netif(&(loopif)->loopif_ifdev)) - -static unsigned int loopif_cksum_flags; - -static int loopif_create(const char *name); - -static const struct ifdev_ops loopif_ops; - -/* - * Initialize the loopback interface module. - */ -void -loopif_init(void) -{ - unsigned int slot; - - /* Initialize the lists of loopback interfaces. */ - TAILQ_INIT(&loopif_freelist); - TAILQ_INIT(&loopif_activelist); - - for (slot = 0; slot < __arraycount(loopif_array); slot++) - TAILQ_INSERT_TAIL(&loopif_freelist, &loopif_array[slot], - loopif_next); - - /* - * The default is to perform no checksumming on loopback interfaces, - * except for ICMP messages because otherwise we would need additional - * changes in the code receiving those. In fact, for future - * compatibility, disable only those flags that we manage ourselves. - */ - loopif_cksum_flags = NETIF_CHECKSUM_ENABLE_ALL & - ~(NETIF_CHECKSUM_GEN_IP | NETIF_CHECKSUM_CHECK_IP | - NETIF_CHECKSUM_GEN_UDP | NETIF_CHECKSUM_CHECK_UDP | - NETIF_CHECKSUM_GEN_TCP | NETIF_CHECKSUM_CHECK_TCP); - - /* Tell the ifdev module that users may create more loopif devices. */ - ifdev_register("lo", loopif_create); -} - -/* - * Polling function, invoked after each message loop iteration. Forward any - * packets received on the output side of the loopback device during this - * loop iteration, to the input side of the device. - */ -static void -loopif_poll(struct ifdev * ifdev) -{ - struct loopif *loopif = (struct loopif *)ifdev; - struct pbuf *pbuf, **pnext; - struct ifdev *oifdev; - struct netif *netif; - uint32_t oifindex; - unsigned int count; - static int warned = FALSE; - - count = 0; - - while ((pbuf = loopif->loopif_head) != NULL) { - /* - * Prevent endless loops. Keep in mind that packets may be - * added to the queue as part of processing packets from the - * queue here, so the queue itself will never reach this - * length. As such the limit can (and must) be fairly high. - * - * In any case, if this warning is shown, that basically means - * that a bug in lwIP has been triggered. There should be no - * such bugs, so if there are, they should be fixed in lwIP. - */ - if (count++ == LOOPIF_LIMIT) { - if (!warned) { - printf("LWIP: excess loopback traffic, " - "throttling output\n"); - warned = TRUE; - } - - break; - } - - pnext = pchain_end(pbuf); - - if ((loopif->loopif_head = *pnext) == NULL) - loopif->loopif_tailp = &loopif->loopif_head; - *pnext = NULL; - - /* - * Get the original interface for the packet, which if non-zero - * must also be used to pass the packet back to. The interface - * should still exist in all cases, but better safe than sorry. - */ - memcpy(&oifindex, pbuf->payload, sizeof(oifindex)); - - util_pbuf_header(pbuf, -(int)sizeof(oifindex)); - - if (oifindex != 0 && - (oifdev = ifdev_get_by_index(oifindex)) != NULL) - netif = ifdev_get_netif(oifdev); - else - netif = NULL; - - /* - * Loopback devices hand packets to BPF on output only. Doing - * so on input as well would duplicate all captured packets. - */ - ifdev_input(ifdev, pbuf, netif, FALSE /*to_bpf*/); - } -} - -/* - * Process a packet as output on a loopback interface. Packets cannot be - * passed back into lwIP right away, nor can the original packets be passed - * back into lwIP. Therefore, make a copy of the packet, and pass it back to - * lwIP at the end of the current message loop iteration. - */ -static err_t -loopif_output(struct ifdev * ifdev, struct pbuf * pbuf, struct netif * netif) -{ - struct loopif *loopif = (struct loopif *)ifdev; - struct ifdev *oifdev; - struct pbuf *pcopy; - uint32_t oifindex; - - /* Reject oversized packets immediately. This should not happen. */ - if (pbuf->tot_len > UINT16_MAX - sizeof(oifindex)) { - printf("LWIP: attempt to send oversized loopback packet\n"); - - return ERR_MEM; - } - - /* - * If the service is low on memory, this is a likely place where - * allocation failures will occur. Thus, do not print anything here. - * The user can diagnose such problems with interface statistics. - */ - pcopy = pchain_alloc(PBUF_RAW, sizeof(oifindex) + pbuf->tot_len); - if (pcopy == NULL) { - ifdev_output_drop(ifdev); - - return ERR_MEM; - } - - /* - * If the packet was purposely diverted from a non-loopback interface - * to this interface, we have to remember the original interface, so - * that we can pass back the packet to that interface as well. If we - * don't, packets to link-local addresses assigned to non-loopback - * interfaces will not be processed correctly. - */ - if (netif != NULL) { - oifdev = netif_get_ifdev(netif); - oifindex = ifdev_get_index(oifdev); - } else - oifindex = 0; - - assert(pcopy->len >= sizeof(oifindex)); - - memcpy(pcopy->payload, &oifindex, sizeof(oifindex)); - - util_pbuf_header(pcopy, -(int)sizeof(oifindex)); - - if (pbuf_copy(pcopy, pbuf) != ERR_OK) - panic("unexpected pbuf copy failure"); - - pcopy->flags |= pbuf->flags & (PBUF_FLAG_LLMCAST | PBUF_FLAG_LLBCAST); - - util_pbuf_header(pcopy, sizeof(oifindex)); - - *loopif->loopif_tailp = pcopy; - loopif->loopif_tailp = pchain_end(pcopy); - - return ERR_OK; -} - -/* - * Initialization function for a loopback-type netif interface, called from - * lwIP at interface creation time. - */ -static err_t -loopif_init_netif(struct ifdev * ifdev, struct netif * netif) -{ - - netif->name[0] = 'l'; - netif->name[1] = 'o'; - - /* - * FIXME: unfortunately, lwIP does not allow one to enable multicast on - * an interface without also enabling multicast management traffic - * (that is, IGMP and MLD). Thus, for now, joining multicast groups - * and assigning local IPv6 addresses will incur such traffic even on - * loopback interfaces. For now this is preferable over not supporting - * multicast on loopback interfaces at all. - */ - netif->flags |= NETIF_FLAG_IGMP | NETIF_FLAG_MLD6; - - NETIF_SET_CHECKSUM_CTRL(netif, loopif_cksum_flags); - - return ERR_OK; -} - -/* - * Create a new loopback device. - */ -static int -loopif_create(const char * name) -{ - struct loopif *loopif; - - /* Find a free loopback interface slot, if available. */ - if (TAILQ_EMPTY(&loopif_freelist)) - return ENOBUFS; - - loopif = TAILQ_FIRST(&loopif_freelist); - TAILQ_REMOVE(&loopif_freelist, loopif, loopif_next); - - /* Initialize the loopif structure. */ - TAILQ_INSERT_HEAD(&loopif_activelist, loopif, loopif_next); - - loopif->loopif_head = NULL; - loopif->loopif_tailp = &loopif->loopif_head; - - /* - * For simplicity and efficiency, we do not prepend the address family - * (IPv4/IPv6) to the packet for BPF, which means our loopback devices - * are of type DLT_RAW rather than (NetBSD's) DLT_NULL. - */ - ifdev_add(&loopif->loopif_ifdev, name, IFF_LOOPBACK | IFF_MULTICAST, - IFT_LOOP, 0 /*hdrlen*/, 0 /*addrlen*/, DLT_RAW, LOOPIF_MAX_MTU, - 0 /*nd6flags*/, &loopif_ops); - - ifdev_update_link(&loopif->loopif_ifdev, LINK_STATE_UP); - - return OK; -} - -/* - * Destroy an existing loopback device. - */ -static int -loopif_destroy(struct ifdev * ifdev) -{ - struct loopif *loopif = (struct loopif *)ifdev; - struct pbuf *pbuf, **pnext; - int r; - - /* - * The ifdev module may refuse to remove this interface if it is the - * loopback interface used to loop back packets for other interfaces. - */ - if ((r = ifdev_remove(&loopif->loopif_ifdev)) != OK) - return r; - - /* - * Clean up. The loopback queue can be non-empty only if we have been - * throttling in case of a feedback loop. - */ - while ((pbuf = loopif->loopif_head) != NULL) { - pnext = pchain_end(pbuf); - - if ((loopif->loopif_head = *pnext) == NULL) - loopif->loopif_tailp = &loopif->loopif_head; - *pnext = NULL; - - pbuf_free(pbuf); - } - - TAILQ_REMOVE(&loopif_activelist, loopif, loopif_next); - - TAILQ_INSERT_HEAD(&loopif_freelist, loopif, loopif_next); - - return OK; -} - -/* - * Set NetBSD-style interface flags (IFF_) for a loopback interface. - */ -static int -loopif_set_ifflags(struct ifdev * ifdev, unsigned int ifflags) -{ - struct loopif *loopif = (struct loopif *)ifdev; - - /* - * Only the IFF_UP flag may be set and cleared. We adjust the - * IFF_RUNNING flag immediately based on this flag. This is a bit - * dangerous, but the caller takes this possibility into account. - */ - if ((ifflags & ~IFF_UP) != 0) - return EINVAL; - - if (ifflags & IFF_UP) - ifdev_update_ifflags(&loopif->loopif_ifdev, - ifdev_get_ifflags(&loopif->loopif_ifdev) | IFF_RUNNING); - else - ifdev_update_ifflags(&loopif->loopif_ifdev, - ifdev_get_ifflags(&loopif->loopif_ifdev) & ~IFF_RUNNING); - - return OK; -} - -/* - * Set the Maximum Transmission Unit for this interface. Return TRUE if the - * new value is acceptable, in which case the caller will do the rest. Return - * FALSE otherwise. - */ -static int -loopif_set_mtu(struct ifdev * ifdev __unused, unsigned int mtu) -{ - - return (mtu <= LOOPIF_MAX_MTU); -} - -static const struct ifdev_ops loopif_ops = { - .iop_init = loopif_init_netif, - .iop_input = ip_input, - .iop_output = loopif_output, - .iop_poll = loopif_poll, - .iop_set_ifflags = loopif_set_ifflags, - .iop_set_mtu = loopif_set_mtu, - .iop_destroy = loopif_destroy, -}; - -/* - * Set and/or retrieve a per-protocol loopback checksumming option through - * sysctl(7). - */ -ssize_t -loopif_cksum(struct rmib_call * call, struct rmib_node * node __unused, - struct rmib_oldp * oldp, struct rmib_newp * newp) -{ - struct loopif *loopif; - unsigned int flags; - int r, val; - - /* - * The third name field is the protocol. We ignore the domain (the - * second field), thus sharing settings between PF_INET and PF_INET6. - * This is necessary because lwIP does not support TCP/UDP checksumming - * flags on a per-domain basis. - */ - switch (call->call_oname[2]) { - case IPPROTO_IP: - flags = NETIF_CHECKSUM_GEN_IP | NETIF_CHECKSUM_CHECK_IP; - break; - case IPPROTO_UDP: - flags = NETIF_CHECKSUM_GEN_UDP | NETIF_CHECKSUM_CHECK_UDP; - break; - case IPPROTO_TCP: - flags = NETIF_CHECKSUM_GEN_TCP | NETIF_CHECKSUM_CHECK_TCP; - break; - default: - return EINVAL; - } - - /* Copy out the old (current) checksumming option. */ - if (oldp != NULL) { - val = !!(loopif_cksum_flags & flags); - - if ((r = rmib_copyout(oldp, 0, &val, sizeof(val))) < 0) - return r; - } - - if (newp != NULL) { - if ((r = rmib_copyin(newp, &val, sizeof(val))) != OK) - return r; - - if (val) - loopif_cksum_flags |= flags; - else - loopif_cksum_flags &= ~flags; - - /* - * Apply the new checksum flags to all loopback interfaces. - * Technically, this may result in dropped packets when - * enabling checksumming on a throttled loopif, but that is a - * case so rare and unimportant that we ignore it. - */ - TAILQ_FOREACH(loopif, &loopif_activelist, loopif_next) { - NETIF_SET_CHECKSUM_CTRL(loopif_get_netif(loopif), - loopif_cksum_flags); - } - } - - /* Return the length of the node. */ - return sizeof(val); -} diff --git a/minix/net/lwip/lwip.h b/minix/net/lwip/lwip.h deleted file mode 100644 index 2f65ab8ee..000000000 --- a/minix/net/lwip/lwip.h +++ /dev/null @@ -1,130 +0,0 @@ -#ifndef MINIX_NET_LWIP_LWIP_H -#define MINIX_NET_LWIP_LWIP_H - -#include -#include -#include -#include -#include -#include - -#include "lwip/ip.h" -#include "lwiphooks.h" - -#include "addr.h" -#include "ipsock.h" -#include "ifdev.h" -#include "util.h" - -/* - * The standard sockaddr_dl is an absolute pain, because the actual structure - * is dynamically sized, while the standard definition is neither the minimum - * nor the maximum size. We use our own version, which uses the maximum size - * that we will ever produce and accept. This greatly simplifies dealing with - * this structure while also limiting stack usage a bit. - */ -struct sockaddr_dlx { - uint8_t sdlx_len; /* actual length of this structure */ - sa_family_t sdlx_family; /* address family, always AF_LINK */ - uint16_t sdlx_index; /* interface index */ - uint8_t sdlx_type; /* interface type (IFT_) */ - uint8_t sdlx_nlen; /* interface name length, w/o nul */ - uint8_t sdlx_alen; /* link-layer address length */ - uint8_t sdlx_slen; /* selector length, always 0 */ - uint8_t sdlx_data[IFNAMSIZ + NETIF_MAX_HWADDR_LEN]; -}; - -STATIC_SOCKADDR_MAX_ASSERT(sockaddr_in); -STATIC_SOCKADDR_MAX_ASSERT(sockaddr_in6); -STATIC_SOCKADDR_MAX_ASSERT(sockaddr_dlx); - -/* This is our own, much smaller internal version of sockaddr_storage. */ -union sockaddr_any { - struct sockaddr sa; - struct sockaddr_in sin; - struct sockaddr_in6 sin6; - struct sockaddr_dlx sdlx; -}; - -/* Number of bits in each of the types of IP addresses. */ -#define IP4_BITS 32 /* number of bits in an IPv4 address */ -#define IP6_BITS 128 /* number of bits in an IPv6 address */ - -/* - * Each socket module maintains its own set of sockets, but all sockets must be - * given globally unique identifiers. Therefore, we use these modifier masks, - * which are bitwise OR'ed with the per-module socket identifiers. - */ -#define SOCKID_TCP 0x00000000 -#define SOCKID_UDP 0x00100000 -#define SOCKID_RAW 0x00200000 -#define SOCKID_RT 0x00400000 -#define SOCKID_LNK 0x00800000 - -/* - * Static remote MIB node identifiers for nodes that are dynamically numbered - * on NetBSD, because they do not have a corresponding protocol family number. - */ -#define NET_INTERFACES (PF_MAX) /* net.interfaces (TODO) */ -#define NET_BPF (PF_MAX + 1) /* net.bpf */ - -#define ROOT_EUID 0 /* effective user ID of superuser */ - -/* - * Function declarations. Modules with more extended interfaces have their own - * header files. - */ - -/* mempool.c */ -void mempool_init(void); -unsigned int mempool_cur_buffers(void); -unsigned int mempool_max_buffers(void); - -/* pchain.c */ -struct pbuf **pchain_end(struct pbuf * pbuf); -size_t pchain_size(struct pbuf * pbuf); - -/* addrpol.c */ -int addrpol_get_label(const ip_addr_t * ipaddr); -int addrpol_get_scope(const ip_addr_t * ipaddr, int is_src); - -/* tcpsock.c */ -void tcpsock_init(void); -sockid_t tcpsock_socket(int domain, int protocol, struct sock ** sock, - const struct sockevent_ops ** ops); - -/* udpsock.c */ -void udpsock_init(void); -sockid_t udpsock_socket(int domain, int protocol, struct sock ** sock, - const struct sockevent_ops ** ops); - -/* rawsock.c */ -void rawsock_init(void); -sockid_t rawsock_socket(int domain, int protocol, struct sock ** sock, - const struct sockevent_ops ** ops); - -/* loopif.c */ -void loopif_init(void); -ssize_t loopif_cksum(struct rmib_call * call, struct rmib_node * node, - struct rmib_oldp * oldp, struct rmib_newp * newp); - -/* lnksock.c */ -void lnksock_init(void); -sockid_t lnksock_socket(int type, int protocol, struct sock ** sock, - const struct sockevent_ops ** ops); - -/* mibtree.c */ -void mibtree_init(void); -void mibtree_register_inet(int domain, int protocol, struct rmib_node * node); -void mibtree_register_lwip(struct rmib_node * node); - -/* ifconf.c */ -void ifconf_init(void); -int ifconf_ioctl(struct sock * sock, unsigned long request, - const struct sockdriver_data * data, endpoint_t user_endpt); - -/* bpf_filter.c */ -u_int bpf_filter_ext(const struct bpf_insn * pc, const struct pbuf * pbuf, - const u_char * packet, u_int total, u_int len); - -#endif /* !MINIX_NET_LWIP_LWIP_H */ diff --git a/minix/net/lwip/pktsock.c b/minix/net/lwip/pktsock.c deleted file mode 100644 index 5ddb9b55a..000000000 --- a/minix/net/lwip/pktsock.c +++ /dev/null @@ -1,1236 +0,0 @@ -/* LWIP service - pktsock.c - packet code shared between UDP and RAW */ - -#include "lwip.h" -#include "pktsock.h" -#include "ifaddr.h" - -/* - * This buffer should be much bigger (at least 10KB, according to RFC 3542), - * but we do not support the ancillary options that take so much space anyway. - */ -#define PKTSOCK_CTLBUF_SIZE 256 - -static char pktsock_ctlbuf[PKTSOCK_CTLBUF_SIZE]; - -/* - * Header structures with ancillary data for received packets. The reason that - * we do not simply use a generic pkthdr structure with ip_addr_t source and - * destination addresses, is that for UDP packets, we put this structure in - * place of the received (ethernet and IP headers), and such a full structure - * (including IPv6-size addresses) would not fit in the header space for IPv4 - * packets. So instead we use two address structures, one for IPv4 and one for - * IPv6, and a generic header structure on top of it, which also identifies - * which address structure is underneath. The combination of the address - * structure and the header structure must fit in the IP header. The IPv6 - * packet header is already so close to the limit here that we have to use - * packed addresses. For IPv4 we use the regular addresses for simplicity. - */ -struct pkthdr { - uint16_t port; /* source port number (UDP only) */ - uint8_t dstif; /* interface that received the pkt */ - uint8_t addrif; /* interface that accepted the pkt */ - uint8_t tos; /* TOS/TC value from the IP header */ - uint8_t ttl; /* TTL/HL value from the IP header */ - uint8_t flags; /* packet flags (PKTHF_) */ - uint8_t _unused; /* all that is still available.. */ -}; - -#define PKTHF_IPV6 0x01 /* packet has IPv6 header */ -#define PKTHF_MCAST 0x02 /* packet has multicast destination */ -#define PKTHF_BCAST 0x04 /* packet has broadcast destination */ - -struct pktaddr4 { - ip4_addr_t srcaddr; - ip4_addr_t dstaddr; -}; - -struct pktaddr6 { - ip6_addr_p_t srcaddr; - ip6_addr_p_t dstaddr; -}; - -/* - * Create a packet socket. Relay parameters and return values to and from the - * IP module's socket creation function. This function must not allocate any - * resources in any form, as socket creation may still fail later, in which - * case no destruction function is called. - */ -int -pktsock_socket(struct pktsock * pkt, int domain, size_t sndbuf, size_t rcvbuf, - struct sock ** sockp) -{ - - pkt->pkt_rcvhead = NULL; - pkt->pkt_rcvtailp = &pkt->pkt_rcvhead; - pkt->pkt_rcvlen = 0; - - mcast_reset(&pkt->pkt_mcast); - - memset(&pkt->pkt_srcaddr, 0, sizeof(pkt->pkt_srcaddr)); - pkt->pkt_ifindex = 0; - - /* - * Any PKTF_ type flags should be initialized on the socket only after - * the following call, as this call will clear the flags field. For - * now, no PKTF_ flags need to be set by default, though. - */ - return ipsock_socket(&pkt->pkt_ipsock, domain, sndbuf, rcvbuf, sockp); -} - -/* - * Return TRUE if the given packet can and should be received on the given - * socket, or FALSE if there is a reason not to receive the packet. - */ -static int -pktsock_may_recv(struct pktsock * pkt, struct pbuf * pbuf) -{ - - /* - * By policy, multicast packets should not be received on sockets of - * which the owning application is not multicast aware. - */ - if (ip_addr_ismulticast(ip_current_dest_addr()) && - !(ipsock_get_flag(&pkt->pkt_ipsock, PKTF_MCAWARE))) - return FALSE; - - /* - * Due to fragment reassembly, we might end up with packets that take - * up more buffer space than their byte size, even after rounding up - * the latter. The user probably does not want packets to get dropped - * for that reason, e.g. when they set a 64K limit and the packet ends - * up being estimated as 65K and dropped. So, we test against - * 'pbuf->tot_len' rather than the rounded-up packet size. However, - * 'pkt->pkt_rcvlen' itself is increased by the rounded-up packet size - * when enqueuing the packet, so that we still count the memory - * consumption (generally) conservatively, which is what we want. - */ - return (pkt->pkt_rcvlen + pbuf->tot_len <= - ipsock_get_rcvbuf(&pkt->pkt_ipsock)); -} - -/* - * Check whether the given packet can and should be received on the given - * socket. If so, return the amount of space for ancillary information that - * will be necessary for the packet. If not, return a negative value. - */ -int -pktsock_test_input(struct pktsock * pkt, struct pbuf * pbuf) -{ - - /* - * This check will be done again in pktsock_input(), but this function - * is called for raw packets only (not for UDP packets) and, if this - * (cheap) check fails, we can avoid a (rather expensive) packet copy. - */ - if (!pktsock_may_recv(pkt, pbuf)) - return -1; - - if (ip_current_is_v6()) - return (int)(sizeof(struct pktaddr6) + sizeof(struct pkthdr)); - else - return (int)(sizeof(struct pktaddr4) + sizeof(struct pkthdr)); -} - -/* - * A packet has arrived on a packet socket. We own the given packet buffer, - * and so we must free it if we do not want to keep it. - */ -void -pktsock_input(struct pktsock * pkt, struct pbuf * pbuf, - const ip_addr_t * srcaddr, uint16_t port) -{ - struct pktaddr4 pktaddr4; - struct pktaddr6 pktaddr6; - struct pkthdr pkthdr; - void *pktaddr; - struct ifdev *ifdev; - size_t pktaddrlen; - - /* - * We are going to mess with the packet's header and contents, so we - * must be the exclusive owner of the packet. For UDP packets, lwIP - * must have made a copy for us in case of non-exclusive delivery - * (e.g., multicast packets). For raw packets, we have made a copy of - * the packet ourselves just before the call to this function. - */ - if (pbuf->ref != 1) - panic("input packet has multiple references!"); - - /* If the packet should not be received on this socket, drop it. */ - if (!pktsock_may_recv(pkt, pbuf)) { - pbuf_free(pbuf); - - return; - } - - /* - * Enqueue the packet. Overwrite the leading IP header with packet - * information that is used at the time of receipt by userland. The - * data structures are such that the information always fits in what - * was the IP header. The reference count check earlier ensures that - * we never overwrite part of a packet that is still in use elsewhere. - */ - if (ip_current_is_v6()) { - assert(IP_IS_V6(srcaddr)); - assert(ip6_current_dest_addr() != NULL); - - ip6_addr_copy_to_packed(pktaddr6.srcaddr, *ip_2_ip6(srcaddr)); - ip6_addr_copy_to_packed(pktaddr6.dstaddr, - *ip6_current_dest_addr()); - pktaddr = &pktaddr6; - pktaddrlen = sizeof(pktaddr6); - - assert(pktaddrlen + sizeof(pkthdr) <= IP6_HLEN); - - pkthdr.tos = IP6H_TC(ip6_current_header()); - pkthdr.ttl = IP6H_HOPLIM(ip6_current_header()); - pkthdr.flags = PKTHF_IPV6; - } else { - assert(IP_IS_V4(srcaddr)); - assert(ip4_current_dest_addr() != NULL); - - memcpy(&pktaddr4.srcaddr, ip_2_ip4(srcaddr), - sizeof(pktaddr4.srcaddr)); - memcpy(&pktaddr4.dstaddr, ip4_current_dest_addr(), - sizeof(pktaddr4.srcaddr)); - pktaddr = &pktaddr4; - pktaddrlen = sizeof(pktaddr4); - - assert(pktaddrlen + sizeof(pkthdr) <= IP_HLEN); - - pkthdr.tos = IPH_TOS(ip4_current_header()); - pkthdr.ttl = IPH_TTL(ip4_current_header()); - pkthdr.flags = 0; - } - - /* - * Save both the interface on which the packet was received (for - * PKTINFO) and the interface that owns the destination address of the - * packet (for the source address's zone ID). - */ - assert(ip_current_input_netif() != NULL); - ifdev = netif_get_ifdev(ip_current_input_netif()); - pkthdr.dstif = (uint16_t)ifdev_get_index(ifdev); - - assert(ip_current_netif() != NULL); - ifdev = netif_get_ifdev(ip_current_netif()); - pkthdr.addrif = (uint16_t)ifdev_get_index(ifdev); - - if ((pbuf->flags & PBUF_FLAG_LLMCAST) || - ip_addr_ismulticast(ip_current_dest_addr())) - pkthdr.flags |= PKTHF_MCAST; - else if ((pbuf->flags & PBUF_FLAG_LLBCAST) || - ip_addr_isbroadcast(ip_current_dest_addr(), ip_current_netif())) - pkthdr.flags |= PKTHF_BCAST; - - pkthdr.port = port; - - util_pbuf_header(pbuf, sizeof(pkthdr)); - - memcpy(pbuf->payload, &pkthdr, sizeof(pkthdr)); - - util_pbuf_header(pbuf, pktaddrlen); - - memcpy(pbuf->payload, pktaddr, pktaddrlen); - - util_pbuf_header(pbuf, -(int)(sizeof(pkthdr) + pktaddrlen)); - - *pkt->pkt_rcvtailp = pbuf; - pkt->pkt_rcvtailp = pchain_end(pbuf); - pkt->pkt_rcvlen += pchain_size(pbuf); - - sockevent_raise(ipsock_get_sock(&pkt->pkt_ipsock), SEV_RECV); -} - -/* - * Obtain interface and source address information for an outgoing packet. In - * particular, parse any IPV6_PKTINFO options provided as either sticky options - * on the socket 'pkt' or as ancillary options in the packet options 'pkto'. - * On success, return OK, with 'ifdevp' set to either the outgoing interface to - * use for the packet, or NULL if no outgoing interface was specified using - * either of the aforementioned options. If, and only if, 'ifdevp' is set to - * an actual interface (i.e., not NULL), then 'src_addrp' is filled with either - * a locally owned, validated, unicast address to use as source of the packet, - * or the unspecified ('any') address if no source address was specified using - * the options. On failure, return a negative error code. - */ -int -pktsock_get_pktinfo(struct pktsock * pkt, struct pktopt * pkto, - struct ifdev ** ifdevp, ip_addr_t * src_addrp) -{ - struct ifdev *ifdev, *ifdev2; - ip_addr_t ipaddr; - uint32_t ifindex; - int r; - - /* We support only IPV6_PKTINFO. IP_PKTINFO is not supported. */ - if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) { - *ifdevp = NULL; - return OK; - } - - /* - * TODO: we are spending a lot of effort on initializing and copying - * stuff around, even just to find out whether there is anything to do - * at all here. See if this can be optimized. - */ - ip_addr_set_zero_ip6(&ipaddr); - - /* - * Ancillary data takes precedence over sticky options. We treat the - * source address and interface index fields as separate, overriding - * each earlier value only if non-zero. TODO: is that correct? - */ - if (pkto->pkto_flags & PKTOF_PKTINFO) { - memcpy(ip_2_ip6(&ipaddr)->addr, &pkto->pkto_srcaddr.addr, - sizeof(ip_2_ip6(&ipaddr)->addr)); - ifindex = pkto->pkto_ifindex; - } else - ifindex = 0; - - if (ip6_addr_isany(ip_2_ip6(&ipaddr))) - memcpy(ip_2_ip6(&ipaddr)->addr, &pkt->pkt_srcaddr.addr, - sizeof(ip_2_ip6(&ipaddr)->addr)); - if (ifindex == 0) - ifindex = pkt->pkt_ifindex; - - /* If both fields are blank, there is nothing more to do. */ - if (ip6_addr_isany(ip_2_ip6(&ipaddr)) && ifindex == 0) { - *ifdevp = NULL; - return OK; - } - - /* If an interface index is specified, it must be valid. */ - ifdev = NULL; - - if (ifindex != 0 && (ifdev = ifdev_get_by_index(ifindex)) == NULL) - return ENXIO; - - /* - * Use the interface index to set a zone on the source address, if the - * source address has a scope. - */ - if (ip6_addr_has_scope(ip_2_ip6(&ipaddr), IP6_UNKNOWN)) { - if (ifindex == 0) - return EADDRNOTAVAIL; - - ip6_addr_set_zone(ip_2_ip6(&ipaddr), ifindex); - } - - /* - * We need to validate the given address just as thoroughly as an - * address given through bind(). If we don't, we could allow forged - * source addresses etcetera. To be sure: this call may change the - * address to an IPv4 type address if needed. - */ - if ((r = ipsock_check_src_addr(pktsock_get_ipsock(pkt), &ipaddr, - FALSE /*allow_mcast*/, &ifdev2)) != OK) - return r; - - if (ifdev2 != NULL) { - if (ifdev == NULL) - ifdev = ifdev2; - else if (ifdev != ifdev2) - return EADDRNOTAVAIL; - } else { - /* - * There should be no cases where the (non-multicast) address - * successfully parsed, is not unspecified, and yet did not map - * to an interface. Eliminate the possibility anyway by - * throwing an error for this case. As a result, we are left - * with one of two cases: - * - * 1) ifdevp is not NULL, and src_addrp is unspecified; - * 2) ifdevp is not NULL, and src_addrp is a locally assigned - * (unicast) address. - * - * This is why we need not fill src_addrp when ifdevp is NULL. - */ - if (!ip_addr_isany(&ipaddr)) - return EADDRNOTAVAIL; - } - - *ifdevp = ifdev; - if (ifdev != NULL) - *src_addrp = ipaddr; - return OK; -} - -/* - * Parse a chunk of user-provided control data, on an IPv4 socket provided as - * 'pkt'. The control chunk is given as 'cmsg', and the length of the data - * following the control header (possibly zero) is given as 'len'. On success, - * return OK, with any parsed options merged into the set of packet options - * 'pkto'. On failure, return a negative error code. - */ -static int -pktsock_parse_ctl_v4(struct pktsock * pkt __unused, struct cmsghdr * cmsg, - socklen_t len, struct pktopt * pkto) -{ - uint8_t byte; - int val; - - if (cmsg->cmsg_level != IPPROTO_IP) - return EAFNOSUPPORT; - - switch (cmsg->cmsg_type) { - case IP_TOS: - /* - * Some userland code (bind's libisc in particular) supplies - * a single byte instead of a full integer for this option. - * We go out of our way to accept that format, too. - */ - if (len != sizeof(val) && len != sizeof(byte)) - return EINVAL; - - if (len == sizeof(byte)) { - memcpy(&byte, CMSG_DATA(cmsg), sizeof(byte)); - val = (int)byte; - } else - memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); - - if (val < 0 || val > UINT8_MAX) - return EINVAL; - - pkto->pkto_flags |= PKTOF_TOS; - pkto->pkto_tos = (uint8_t)val; - - return OK; - - case IP_TTL: - if (len != sizeof(val)) - return EINVAL; - - memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); - - if (val < 0 || val > UINT8_MAX) - return EINVAL; - - pkto->pkto_flags |= PKTOF_TTL; - pkto->pkto_ttl = (uint8_t)val; - - return OK; - - /* - * Implementing IP_PKTINFO might be a bit harder than its IPV6_PKTINFO - * sibling, because it would require the use of zone IDs (interface - * indices) for IPv4, which is not supported yet. - */ - } - - return EINVAL; -} - -/* - * Parse a chunk of user-provided control data, on an IPv6 socket provided as - * 'pkt'. The control chunk is given as 'cmsg', and the length of the data - * following the control header (possibly zero) is given as 'len'. On success, - * return OK, with any parsed options merged into the set of packet options - * 'pkto'. On failure, return a negative error code. - */ -static int -pktsock_parse_ctl_v6(struct pktsock * pkt, struct cmsghdr * cmsg, - socklen_t len, struct pktopt * pkto) -{ - struct in6_pktinfo ipi6; - int val; - - if (cmsg->cmsg_level != IPPROTO_IPV6) - return EAFNOSUPPORT; - - switch (cmsg->cmsg_type) { - case IPV6_TCLASS: - if (len != sizeof(val)) - return EINVAL; - - memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); - - if (val < -1 || val > UINT8_MAX) - return EINVAL; - - if (val == -1) - val = 0; - - pkto->pkto_flags |= PKTOF_TOS; - pkto->pkto_tos = (uint8_t)val; - - return OK; - - case IPV6_HOPLIMIT: - if (len != sizeof(val)) - return EINVAL; - - memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); - - if (val < -1 || val > UINT8_MAX) - return EINVAL; - - if (val == -1) - val = IP_DEFAULT_TTL; - - pkto->pkto_flags |= PKTOF_TTL; - pkto->pkto_ttl = (uint8_t)val; - - return OK; - - case IPV6_PKTINFO: - if (len != sizeof(ipi6)) - return EINVAL; - - memcpy(&ipi6, CMSG_DATA(cmsg), sizeof(ipi6)); - - pkto->pkto_flags |= PKTOF_PKTINFO; - memcpy(&pkto->pkto_srcaddr.addr, &ipi6.ipi6_addr, - sizeof(pkto->pkto_srcaddr.addr)); - pkto->pkto_ifindex = ipi6.ipi6_ifindex; - - return OK; - - case IPV6_USE_MIN_MTU: - if (len != sizeof(int)) - return EINVAL; - - memcpy(&val, CMSG_DATA(cmsg), sizeof(val)); - - if (val < -1 || val > 1) - return EINVAL; - - /* TODO: not supported by lwIP, but needed by applications. */ - return OK; - } - - return EINVAL; -} - -/* - * Copy in and parse control data, as part of sending a packet on socket 'pkt'. - * The control data is accessible through 'ctl', with a user-provided length of - * 'ctl_len'. On success, return OK, with any parsed packet options stored in - * 'pkto'. On failure, return a negative error code. - */ -int -pktsock_get_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl, - socklen_t ctl_len, struct pktopt * pkto) -{ - struct msghdr msghdr; - struct cmsghdr *cmsg; - socklen_t left, len; - int r; - - /* The default: no packet options are being overridden. */ - assert(pkto->pkto_flags == 0); - - /* If no control length is given, we are done here. */ - if (ctl_len == 0) - return OK; - - /* - * For now, we put a rather aggressive limit on the size of the control - * data. We copy in and parse the whole thing in a single buffer. - */ - if (ctl_len > sizeof(pktsock_ctlbuf)) { - printf("LWIP: too much control data given (%u bytes)\n", - ctl_len); - - return ENOBUFS; - } - - if ((r = sockdriver_copyin(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK) - return r; - - memset(&msghdr, 0, sizeof(msghdr)); - msghdr.msg_control = pktsock_ctlbuf; - msghdr.msg_controllen = ctl_len; - - for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL; - cmsg = CMSG_NXTHDR(&msghdr, cmsg)) { - /* Check for bogus lengths. */ - assert((socklen_t)((char *)cmsg - pktsock_ctlbuf) <= ctl_len); - left = ctl_len - (socklen_t)((char *)cmsg - pktsock_ctlbuf); - assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */ - - if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) { - printf("LWIP: malformed control data rejected\n"); - - return EINVAL; - } - - len = cmsg->cmsg_len - CMSG_LEN(0); - - if (ipsock_is_ipv6(&pkt->pkt_ipsock)) - r = pktsock_parse_ctl_v6(pkt, cmsg, len, pkto); - else - r = pktsock_parse_ctl_v4(pkt, cmsg, len, pkto); - - if (r != OK) - return r; - } - - return OK; -} - -/* - * Copy in the packet data from the calling user process, and store it in the - * buffer 'pbuf' that must already have been allocated with the appropriate - * size. - */ -int -pktsock_get_data(struct pktsock * pkt, const struct sockdriver_data * data, - size_t len, struct pbuf * pbuf) - -{ - - return util_copy_data(data, len, 0, pbuf, 0, TRUE /*copy_in*/); -} - -/* - * Dequeue and free the head of the receive queue of a packet socket. - */ -static void -pktsock_dequeue(struct pktsock * pkt) -{ - struct pbuf *pbuf, **pnext; - size_t size; - - pbuf = pkt->pkt_rcvhead; - assert(pbuf != NULL); - - pnext = pchain_end(pbuf); - size = pchain_size(pbuf); - - if ((pkt->pkt_rcvhead = *pnext) == NULL) - pkt->pkt_rcvtailp = &pkt->pkt_rcvhead; - - assert(pkt->pkt_rcvlen >= size); - pkt->pkt_rcvlen -= size; - - *pnext = NULL; - pbuf_free(pbuf); -} - -/* - * Perform preliminary checks on a receive request. - */ -int -pktsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, - int flags) -{ - - /* - * We accept the same flags across all socket types in LWIP, and then - * simply ignore the ones we do not support for packet sockets. - */ - if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0) - return EOPNOTSUPP; - - return OK; -} - -/* - * Add a chunk of control data to the global control buffer, starting from - * offset 'off'. The chunk has the given level and type, and its data is given - * in the buffer 'ptr' with size 'len'. Return the (padded) size of the chunk - * that was generated as a result. - */ -static size_t -pktsock_add_ctl(int level, int type, void * ptr, socklen_t len, size_t off) -{ - struct cmsghdr cmsg; - size_t size; - - size = CMSG_SPACE(len); - - /* - * The global control buffer must be large enough to store one chunk - * of each of the supported options. If this panic triggers, increase - * PKTSOCK_CTLBUF_SIZE by as much as needed. - */ - if (off + size > sizeof(pktsock_ctlbuf)) - panic("control buffer too small, increase " - "PKTSOCK_CTLBUF_SIZE"); - - memset(&cmsg, 0, sizeof(cmsg)); - cmsg.cmsg_len = CMSG_LEN(len); - cmsg.cmsg_level = level; - cmsg.cmsg_type = type; - - /* - * Clear any padding space. This can be optimized, but in any case we - * must be careful not to copy out any bytes that have not been - * initialized at all. - */ - memset(&pktsock_ctlbuf[off], 0, size); - - memcpy(&pktsock_ctlbuf[off], &cmsg, sizeof(cmsg)); - memcpy(CMSG_DATA((struct cmsghdr *)&pktsock_ctlbuf[off]), ptr, len); - - return size; -} - -/* - * Generate and copy out control data, as part of delivering a packet from - * socket 'pkt' to userland. The control data buffer is given as 'ctl', with - * a user-given length of 'ctl_len' bytes. The packet's header information is - * provided as 'pkthdr', and its source and destination addresses as 'pktaddr', - * which maybe a pktaddr4 or pktaddr6 structure depending on the value of the - * PKTHF_IPV6 flag in the 'flags' field in 'pkthdr'. Note that we support - * dual-stack sockets, and as such it is possible that the socket is of domain - * AF_INET6 while the received packet is an IPv4 packet. On success, return - * the size of the control data copied out (possibly zero). If more control - * data were generated than copied out, also merge the MSG_CTRUNC flag into - * 'rflags'. On failure, return a negative error code. - */ -static int -pktsock_put_ctl(struct pktsock * pkt, const struct sockdriver_data * ctl, - socklen_t ctl_len, struct pkthdr * pkthdr, void * pktaddr, - int * rflags) -{ - struct pktaddr6 *pktaddr6; - struct pktaddr4 *pktaddr4; - struct in_pktinfo ipi; - struct in6_pktinfo ipi6; - ip_addr_t ipaddr; - unsigned int flags; - uint8_t byte; - size_t off; - int r, val; - - flags = ipsock_get_flags(&pkt->pkt_ipsock); - - if (!(flags & (PKTF_RECVINFO | PKTF_RECVTOS | PKTF_RECVTTL))) - return 0; - - /* - * Important: all generated control chunks must fit in the global - * control buffer together. When adding more options here, ensure that - * the control buffer remains large enough to receive all options at - * once. See also the panic in pktsock_add_ctl(). - */ - off = 0; - - /* - * IPv6 sockets may receive IPv4 packets. The ancillary data is in the - * format corresponding to the socket, which means we may have to - * convert any IPv4 addresses from the packet to IPv4-mapped IPv6 - * addresses for the ancillary data, just like the source address. - */ - if (ipsock_is_ipv6(&pkt->pkt_ipsock)) { - if (flags & PKTF_RECVTTL) { - val = pkthdr->ttl; - - off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_HOPLIMIT, - &val, sizeof(val), off); - } - - if (flags & PKTF_RECVTOS) { - val = pkthdr->tos; - - off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_TCLASS, &val, - sizeof(val), off); - } - - if (flags & PKTF_RECVINFO) { - memset(&ipi6, 0, sizeof(ipi6)); - - if (pkthdr->flags & PKTHF_IPV6) { - pktaddr6 = (struct pktaddr6 *)pktaddr; - memcpy(&ipi6.ipi6_addr, &pktaddr6->dstaddr, - sizeof(ipi6.ipi6_addr)); - } else { - pktaddr4 = (struct pktaddr4 *)pktaddr; - - addr_make_v4mapped_v6(&ipaddr, - &pktaddr4->dstaddr); - - memcpy(&ipi6.ipi6_addr, - ip_2_ip6(&ipaddr)->addr, - sizeof(ipi6.ipi6_addr)); - } - ipi6.ipi6_ifindex = pkthdr->dstif; - - off += pktsock_add_ctl(IPPROTO_IPV6, IPV6_PKTINFO, - &ipi6, sizeof(ipi6), off); - } - } else { - if (flags & PKTF_RECVTTL) { - byte = pkthdr->ttl; - - off += pktsock_add_ctl(IPPROTO_IP, IP_TTL, &byte, - sizeof(byte), off); - } - - if (flags & PKTF_RECVINFO) { - assert(!(pkthdr->flags & PKTHF_IPV6)); - pktaddr4 = (struct pktaddr4 *)pktaddr; - - memset(&ipi, 0, sizeof(ipi)); - memcpy(&ipi.ipi_addr, &pktaddr4->dstaddr, - sizeof(ipi.ipi_addr)); - ipi.ipi_ifindex = pkthdr->dstif; - - off += pktsock_add_ctl(IPPROTO_IP, IP_PKTINFO, &ipi, - sizeof(ipi), off); - } - } - - assert(off > 0); - - if (ctl_len >= off) - ctl_len = off; - else - *rflags |= MSG_CTRUNC; - - if (ctl_len > 0 && - (r = sockdriver_copyout(ctl, 0, pktsock_ctlbuf, ctl_len)) != OK) - return r; - - return ctl_len; -} - -/* - * Receive data on a packet socket. - */ -int -pktsock_recv(struct sock * sock, const struct sockdriver_data * data, - size_t len, size_t * off, const struct sockdriver_data * ctl, - socklen_t ctl_len, socklen_t * ctl_off, struct sockaddr * addr, - socklen_t * addr_len, endpoint_t user_endpt __unused, int flags, - size_t min __unused, int * rflags) -{ - struct pktsock *pkt = (struct pktsock *)sock; - struct pktaddr4 pktaddr4; - struct pktaddr6 pktaddr6; - struct pkthdr pkthdr; - void *pktaddr; - struct pbuf *pbuf; - ip_addr_t srcaddr; - int r; - - if ((pbuf = pkt->pkt_rcvhead) == NULL) - return SUSPEND; - - /* - * Get the ancillary data for the packet. The format of the ancillary - * data depends on the received packet type, which may be different - * from the socket type. - */ - util_pbuf_header(pbuf, sizeof(pkthdr)); - - memcpy(&pkthdr, pbuf->payload, sizeof(pkthdr)); - - if (pkthdr.flags & PKTHF_IPV6) { - util_pbuf_header(pbuf, sizeof(pktaddr6)); - - memcpy(&pktaddr6, pbuf->payload, sizeof(pktaddr6)); - pktaddr = &pktaddr6; - - ip_addr_copy_from_ip6_packed(srcaddr, pktaddr6.srcaddr); - if (ip6_addr_has_scope(ip_2_ip6(&srcaddr), IP6_UNICAST)) - ip6_addr_set_zone(ip_2_ip6(&srcaddr), pkthdr.addrif); - - util_pbuf_header(pbuf, - -(int)(sizeof(pkthdr) + sizeof(pktaddr6))); - } else { - util_pbuf_header(pbuf, sizeof(pktaddr4)); - - memcpy(&pktaddr4, pbuf->payload, sizeof(pktaddr4)); - pktaddr = &pktaddr4; - - ip_addr_copy_from_ip4(srcaddr, pktaddr4.srcaddr); - - util_pbuf_header(pbuf, - -(int)(sizeof(pkthdr) + sizeof(pktaddr4))); - } - - /* Copy out the packet data to the calling user process. */ - if (len >= pbuf->tot_len) - len = pbuf->tot_len; - else - *rflags |= MSG_TRUNC; - - r = util_copy_data(data, len, 0, pbuf, 0, FALSE /*copy_in*/); - - if (r != OK) - return r; - - /* Generate and copy out ancillary (control) data, if requested. */ - if ((r = pktsock_put_ctl(pkt, ctl, ctl_len, &pkthdr, pktaddr, - rflags)) < 0) - return r; - - /* Store the source IP address. */ - ipsock_put_addr(&pkt->pkt_ipsock, addr, addr_len, &srcaddr, - pkthdr.port); - - /* Set multicast or broadcast message flag, if applicable. */ - if (pkthdr.flags & PKTHF_MCAST) - *rflags |= MSG_MCAST; - else if (pkthdr.flags & PKTHF_BCAST) - *rflags |= MSG_BCAST; - - /* Discard the packet now, unless we were instructed to peek only. */ - if (!(flags & MSG_PEEK)) - pktsock_dequeue(pkt); - - /* Return the received part of the packet length. */ - *off = len; - *ctl_off = r; - return OK; -} - -/* - * Test whether data can be received on a packet socket, and if so, how many - * bytes of data. - */ -int -pktsock_test_recv(struct sock * sock, size_t min __unused, size_t * size) -{ - struct pktsock *pkt = (struct pktsock *)sock; - - if (pkt->pkt_rcvhead == NULL) - return SUSPEND; - - if (size != NULL) - *size = pkt->pkt_rcvhead->tot_len; - return OK; -} - -/* - * The caller has performed a multicast operation on the given socket. Thus, - * the caller is multicast aware. Remember this, because that means the socket - * may also receive traffic to multicast destinations. - */ -void -pktsock_set_mcaware(struct pktsock * pkt) -{ - - ipsock_set_flag(&pkt->pkt_ipsock, PKTF_MCAWARE); -} - -/* - * Set socket options on a packet socket. - */ -int -pktsock_setsockopt(struct pktsock * pkt, int level, int name, - const struct sockdriver_data * data, socklen_t len, - struct ipopts * ipopts) -{ - struct ip_mreq imr; - struct ipv6_mreq ipv6mr; - struct in6_pktinfo ipi6; - ip_addr_t ipaddr, ifaddr; - struct ifdev *ifdev; - unsigned int flag; - uint32_t ifindex; - int r, val, has_scope; - - switch (level) { - case IPPROTO_IP: - if (ipsock_is_ipv6(&pkt->pkt_ipsock)) - break; - - switch (name) { - case IP_ADD_MEMBERSHIP: - case IP_DROP_MEMBERSHIP: - pktsock_set_mcaware(pkt); - - if ((r = sockdriver_copyin_opt(data, &imr, sizeof(imr), - len)) != OK) - return r; - - ip_addr_set_ip4_u32(&ipaddr, imr.imr_multiaddr.s_addr); - ip_addr_set_ip4_u32(&ifaddr, imr.imr_interface.s_addr); - - if (!ip_addr_isany(&ifaddr)) { - ifdev = ifaddr_map_by_addr(&ifaddr); - - if (ifdev == NULL) - return EADDRNOTAVAIL; - } else - ifdev = NULL; - - if (name == IP_ADD_MEMBERSHIP) - r = mcast_join(&pkt->pkt_mcast, &ipaddr, - ifdev); - else - r = mcast_leave(&pkt->pkt_mcast, &ipaddr, - ifdev); - - return r; - - case IP_RECVTTL: - case IP_RECVPKTINFO: - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - switch (name) { - case IP_RECVTTL: flag = PKTF_RECVTTL; break; - case IP_RECVPKTINFO: flag = PKTF_RECVINFO; break; - default: flag = 0; assert(0); break; - } - - if (val) - ipsock_set_flag(&pkt->pkt_ipsock, flag); - else - ipsock_clear_flag(&pkt->pkt_ipsock, flag); - - return OK; - } - - break; - - case IPPROTO_IPV6: - if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) - break; - - switch (name) { - case IPV6_JOIN_GROUP: - case IPV6_LEAVE_GROUP: - pktsock_set_mcaware(pkt); - - if ((r = sockdriver_copyin_opt(data, &ipv6mr, - sizeof(ipv6mr), len)) != OK) - return r; - - ip_addr_set_zero_ip6(&ipaddr); - memcpy(ip_2_ip6(&ipaddr)->addr, - &ipv6mr.ipv6mr_multiaddr, - sizeof(ip_2_ip6(&ipaddr)->addr)); - - /* - * We currently do not support joining IPv4 multicast - * groups on IPv6 sockets. The reason for this is that - * this would require decisions on what to do if the - * socket is set to V6ONLY later, as well as various - * additional exceptions for a case that hopefully - * doesn't occur in practice anyway. - */ - if (ip6_addr_isipv4mappedipv6(ip_2_ip6(&ipaddr))) - return EADDRNOTAVAIL; - - has_scope = ip6_addr_has_scope(ip_2_ip6(&ipaddr), - IP6_UNKNOWN); - - if ((ifindex = ipv6mr.ipv6mr_interface) != 0) { - ifdev = ifdev_get_by_index(ifindex); - - if (ifdev == NULL) - return ENXIO; - - if (has_scope) - ip6_addr_set_zone(ip_2_ip6(&ipaddr), - ifindex); - } else { - if (has_scope) - return EADDRNOTAVAIL; - - ifdev = NULL; - } - - if (name == IPV6_JOIN_GROUP) - r = mcast_join(&pkt->pkt_mcast, &ipaddr, - ifdev); - else - r = mcast_leave(&pkt->pkt_mcast, &ipaddr, - ifdev); - - return r; - - case IPV6_USE_MIN_MTU: - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val < -1 || val > 1) - return EINVAL; - - /* - * lwIP does not support path MTU discovery, so do - * nothing. TODO: see if this is actually good enough. - */ - return OK; - - case IPV6_PKTINFO: - if ((r = sockdriver_copyin_opt(data, &ipi6, - sizeof(ipi6), len)) != OK) - return r; - - /* - * Simply copy in what is given. The values will be - * parsed only once a packet is sent, in - * pktsock_get_pktinfo(). Otherwise, if we perform - * checks here, they may be outdated by the time the - * values are actually used. - */ - memcpy(&pkt->pkt_srcaddr.addr, &ipi6.ipi6_addr, - sizeof(pkt->pkt_srcaddr.addr)); - pkt->pkt_ifindex = ipi6.ipi6_ifindex; - - return OK; - - case IPV6_RECVPKTINFO: - case IPV6_RECVHOPLIMIT: - case IPV6_RECVTCLASS: - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - switch (name) { - case IPV6_RECVPKTINFO: flag = PKTF_RECVINFO; break; - case IPV6_RECVHOPLIMIT: flag = PKTF_RECVTTL; break; - case IPV6_RECVTCLASS: flag = PKTF_RECVTOS; break; - default: flag = 0; assert(0); break; - } - - if (val) - ipsock_set_flag(&pkt->pkt_ipsock, flag); - else - ipsock_clear_flag(&pkt->pkt_ipsock, flag); - - return OK; - } - - break; - } - - return ipsock_setsockopt(&pkt->pkt_ipsock, level, name, data, len, - ipopts); -} - -/* - * Retrieve socket options on a packet socket. - */ -int -pktsock_getsockopt(struct pktsock * pkt, int level, int name, - const struct sockdriver_data * data, socklen_t * len, - struct ipopts * ipopts) -{ - struct in6_pktinfo ipi6; - unsigned int flag; - int val; - - switch (level) { - case IPPROTO_IP: - if (ipsock_is_ipv6(&pkt->pkt_ipsock)) - break; - - switch (name) { - case IP_RECVTTL: - case IP_RECVPKTINFO: - switch (name) { - case IP_RECVTTL: flag = PKTF_RECVTTL; break; - case IP_RECVPKTINFO: flag = PKTF_RECVINFO; break; - default: flag = 0; assert(0); break; - } - - val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag)); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - } - - break; - - case IPPROTO_IPV6: - if (!ipsock_is_ipv6(&pkt->pkt_ipsock)) - break; - - switch (name) { - case IPV6_USE_MIN_MTU: - /* - * TODO: sort out exactly what lwIP actually supports - * in the way of path MTU discovery. Value 1 means - * that path MTU discovery is disabled and packets are - * sent at the minimum MTU (RFC 3542). - */ - val = 1; - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case IPV6_PKTINFO: - memset(&ipi6, 0, sizeof(ipi6)); - - /* - * Simply copy out whatever was given before. These - * fields are initialized to zero on socket creation. - */ - memcpy(&ipi6.ipi6_addr, &pkt->pkt_srcaddr.addr, - sizeof(ipi6.ipi6_addr)); - ipi6.ipi6_ifindex = pkt->pkt_ifindex; - - return sockdriver_copyout_opt(data, &ipi6, - sizeof(ipi6), len); - - case IPV6_RECVPKTINFO: - case IPV6_RECVHOPLIMIT: - case IPV6_RECVTCLASS: - switch (name) { - case IPV6_RECVPKTINFO: flag = PKTF_RECVINFO; break; - case IPV6_RECVHOPLIMIT: flag = PKTF_RECVTTL; break; - case IPV6_RECVTCLASS: flag = PKTF_RECVTOS; break; - default: flag = 0; assert(0); break; - } - - val = !!(ipsock_get_flag(&pkt->pkt_ipsock, flag)); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - } - - break; - } - - return ipsock_getsockopt(&pkt->pkt_ipsock, level, name, data, len, - ipopts); -} - -/* - * Drain the receive queue of a packet socket. - */ -static void -pktsock_drain(struct pktsock * pkt) -{ - - while (pkt->pkt_rcvhead != NULL) - pktsock_dequeue(pkt); - - assert(pkt->pkt_rcvlen == 0); - assert(pkt->pkt_rcvtailp == &pkt->pkt_rcvhead); -} - -/* - * Shut down a packet socket for reading and/or writing. - */ -void -pktsock_shutdown(struct pktsock * pkt, unsigned int mask) -{ - - if (mask & SFL_SHUT_RD) - pktsock_drain(pkt); -} - -/* - * Close a packet socket. - */ -void -pktsock_close(struct pktsock * pkt) -{ - - pktsock_drain(pkt); - - mcast_leave_all(&pkt->pkt_mcast); -} - -/* - * Return the rounded-up number of bytes in the packet socket's receive queue, - * for sysctl(7). NetBSD returns the used portion of each buffer, but that - * would be quite some extra effort for us (TODO). - */ -size_t -pktsock_get_recvlen(struct pktsock * pkt) -{ - - return pkt->pkt_rcvlen; -} diff --git a/minix/net/lwip/rawsock.c b/minix/net/lwip/rawsock.c deleted file mode 100644 index d00df01e4..000000000 --- a/minix/net/lwip/rawsock.c +++ /dev/null @@ -1,1341 +0,0 @@ -/* LWIP service - rawsock.c - RAW sockets */ -/* - * For IPv6 sockets, this module attempts to implement a part of RFC 3542, but - * currently not more than what is supported by lwIP and/or what is expected by - * a handful of standard utilities (dhcpcd, ping6, traceroute6..). - * - * For general understanding, be aware that IPv4 raw sockets always receive - * packets including the IP header, and may be used to send packets including - * the IP header if IP_HDRINCL is set, while IPv6 raw sockets always send and - * receive actual payloads only, using ancillary (control) data to set and - * retrieve per-packet IP header fields. - * - * For packet headers we follow general BSD semantics. For example, some IPv4 - * header fields are swapped both when sending and when receiving. Also, like - * on NetBSD, IPPROTO_RAW is not a special value in any way. - */ - -#include "lwip.h" -#include "ifaddr.h" -#include "pktsock.h" - -#include "lwip/raw.h" -#include "lwip/inet_chksum.h" - -#include -#include -#include -#include - -/* The number of RAW sockets. Inherited from the lwIP configuration. */ -#define NR_RAWSOCK MEMP_NUM_RAW_PCB - -/* - * Outgoing packets are not getting buffered, so the send buffer size simply - * determines the maximum size for sent packets. The send buffer maximum is - * therefore limited to the maximum size of a single packet (64K-1 bytes), - * which is already enforced by lwIP's 16-bit length parameter to pbuf_alloc(). - * - * The actual transmission may enforce a lower limit, though. The full packet - * size must not exceed the same 64K-1 limit, and that includes any headers - * that still have to be prepended to the given packet. The size of those - * headers depends on the socket type (IPv4/IPv6) and the IP_HDRINCL setting. - * - * The default is equal to the maximum here, because if a (by definition, - * privileged) application wishes to send large raw packets, it probably has a - * good reason, and we do not want to get in its way. - */ -#define RAW_MAX_PAYLOAD (UINT16_MAX) - -#define RAW_SNDBUF_MIN 1 /* minimum RAW send buffer size */ -#define RAW_SNDBUF_DEF RAW_MAX_PAYLOAD /* default RAW send buffer size */ -#define RAW_SNDBUF_MAX RAW_MAX_PAYLOAD /* maximum RAW send buffer size */ -#define RAW_RCVBUF_MIN MEMPOOL_BUFSIZE /* minimum RAW receive buffer size */ -#define RAW_RCVBUF_DEF 32768 /* default RAW receive buffer size */ -#define RAW_RCVBUF_MAX 65536 /* maximum RAW receive buffer size */ - -static struct rawsock { - struct pktsock raw_pktsock; /* packet socket object */ - struct raw_pcb *raw_pcb; /* lwIP RAW control block */ - TAILQ_ENTRY(rawsock) raw_next; /* next in active/free list */ - struct icmp6_filter raw_icmp6filter; /* ICMPv6 type filter */ -} raw_array[NR_RAWSOCK]; - -static TAILQ_HEAD(, rawsock) raw_freelist; /* list of free RAW sockets */ -static TAILQ_HEAD(, rawsock) raw_activelist; /* list, in-use RAW sockets */ - -static const struct sockevent_ops rawsock_ops; - -#define rawsock_get_sock(raw) (ipsock_get_sock(rawsock_get_ipsock(raw))) -#define rawsock_get_ipsock(raw) (pktsock_get_ipsock(&(raw)->raw_pktsock)) -#define rawsock_is_ipv6(raw) (ipsock_is_ipv6(rawsock_get_ipsock(raw))) -#define rawsock_is_v6only(raw) (ipsock_is_v6only(rawsock_get_ipsock(raw))) -#define rawsock_is_conn(raw) \ - (raw_flags((raw)->raw_pcb) & RAW_FLAGS_CONNECTED) -#define rawsock_is_hdrincl(raw) \ - (raw_flags((raw)->raw_pcb) & RAW_FLAGS_HDRINCL) - -static ssize_t rawsock_pcblist(struct rmib_call *, struct rmib_node *, - struct rmib_oldp *, struct rmib_newp *); - -/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_RAW subtree. */ -/* All dynamically numbered; the sendspace/recvspace entries are ours. */ -static struct rmib_node net_inet_raw_table[] = { - RMIB_INT(RMIB_RO, RAW_SNDBUF_DEF, "sendspace", - "Default RAW send buffer size"), - RMIB_INT(RMIB_RO, RAW_RCVBUF_DEF, "recvspace", - "Default RAW receive buffer size"), - RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, rawsock_pcblist, "pcblist", - "RAW IP protocol control block list"), -}; - -static struct rmib_node net_inet_raw_node = - RMIB_NODE(RMIB_RO, net_inet_raw_table, "raw", "RAW IPv4 settings"); -static struct rmib_node net_inet6_raw6_node = - RMIB_NODE(RMIB_RO, net_inet_raw_table, "raw6", "RAW IPv6 settings"); - -/* - * Initialize the raw sockets module. - */ -void -rawsock_init(void) -{ - unsigned int slot; - - /* Initialize the list of free RAW sockets. */ - TAILQ_INIT(&raw_freelist); - - for (slot = 0; slot < __arraycount(raw_array); slot++) - TAILQ_INSERT_TAIL(&raw_freelist, &raw_array[slot], raw_next); - - /* Initialize the list of active RAW sockets. */ - TAILQ_INIT(&raw_activelist); - - /* Register the net.inet.raw and net.inet6.raw6 RMIB subtrees. */ - mibtree_register_inet(PF_INET, IPPROTO_RAW, &net_inet_raw_node); - mibtree_register_inet(PF_INET6, IPPROTO_RAW, &net_inet6_raw6_node); -} - -/* - * Check whether the given arrived IPv6 packet is fit to be received on the - * given raw socket. - */ -static int -rawsock_check_v6(struct rawsock * raw, struct pbuf * pbuf) -{ - uint8_t type; - - assert(rawsock_is_ipv6(raw)); - - /* - * For ICMPv6 packets, test against the configured type filter. - */ - if (raw->raw_pcb->protocol == IPPROTO_ICMPV6) { - if (pbuf->len < offsetof(struct icmp6_hdr, icmp6_dataun)) - return FALSE; - - memcpy(&type, &((struct icmp6_hdr *)pbuf->payload)->icmp6_type, - sizeof(type)); - - if (!ICMP6_FILTER_WILLPASS((int)type, &raw->raw_icmp6filter)) - return FALSE; - } - - /* - * For ICMPv6 packets, or if IPV6_CHECKSUM is enabled, we have to - * verify the checksum of the packet before passing it to the user. - * This is costly, but it needs to be done and lwIP is not doing it for - * us (as of writing, anyway), even though it maintains the offset.. - */ - if (raw->raw_pcb->chksum_reqd && - (pbuf->tot_len < raw->raw_pcb->chksum_offset + sizeof(uint16_t) || - ip6_chksum_pseudo(pbuf, raw->raw_pcb->protocol, pbuf->tot_len, - ip6_current_src_addr(), ip6_current_dest_addr()) != 0)) { - return FALSE; - } - - /* No reason to filter out this packet. */ - return TRUE; -} - -/* - * Adjust the given arrived IPv4 packet by changing the length and offset - * fields to host-byte order, as is done by the BSDs. This effectively mirrors - * the swapping part of the preparation done on IPv4 packets being sent if the - * IP_HDRINCL socket option is enabled. - */ -static void -rawsock_adjust_v4(struct pbuf * pbuf) -{ - struct ip_hdr *iphdr; - - if (pbuf->len < sizeof(struct ip_hdr)) - return; - - iphdr = (struct ip_hdr *)pbuf->payload; - - /* - * W. Richard Stevens also mentions ip_id, but at least on NetBSD that - * field seems to be swapped neither when sending nor when receiving.. - */ - IPH_LEN(iphdr) = htons(IPH_LEN(iphdr)); - IPH_OFFSET(iphdr) = htons(IPH_OFFSET(iphdr)); -} - -/* - * A packet has arrived on a raw socket. Since the same packet may have to be - * delivered to multiple raw sockets, we always return 0 (= not consumed) from - * this function. As such, we must make a copy of the given packet if we want - * to keep it, and never free it. - */ -static uint8_t -rawsock_input(void * arg, struct raw_pcb * pcb __unused, struct pbuf * psrc, - const ip_addr_t * srcaddr) -{ - struct rawsock *raw = (struct rawsock *)arg; - struct pbuf *pbuf; - int off, hdrlen; - - assert(raw->raw_pcb == pcb); - - /* - * If adding this packet would cause the receive buffer to go beyond - * the current limit, drop the new packet. This is just an estimation, - * because the copy we are about to make may not take the exact same - * amount of memory, due to the fact that 1) the pbuf we're given has - * an unknown set of headers in front of it, and 2) we need to store - * extra information in our copy. The return value of this call, if - * not -1, is the number of bytes we need to reserve to store that - * extra information. - */ - if ((hdrlen = pktsock_test_input(&raw->raw_pktsock, psrc)) < 0) - return 0; - - /* - * Raw IPv6 sockets receive only the actual packet data, whereas raw - * IPv4 sockets receive the IP header as well. - */ - if (ip_current_is_v6()) { - off = ip_current_header_tot_len(); - - util_pbuf_header(psrc, -off); - - if (!rawsock_check_v6(raw, psrc)) { - util_pbuf_header(psrc, off); - - return 0; - } - } else { - /* - * For IPv6 sockets, drop the packet if it was sent as an IPv4 - * packet and checksumming is enabled (this includes ICMPv6). - * Otherwise, the packet would bypass the above checks that we - * perform on IPv6 packets. Applications that want to use a - * dual-stack protocol with checksumming will have to do the - * checksum verification part themselves. Presumably the two - * different pseudoheaders would result in different checksums - * anyhow, so it would be useless to try to support that. - * - * Beyond that, for IPv4 packets on IPv6 sockets, hide the IPv4 - * header. - */ - if (rawsock_is_ipv6(raw)) { - if (raw->raw_pcb->chksum_reqd) - return 0; - - off = IP_HLEN; - - util_pbuf_header(psrc, -off); - } else - off = 0; - } - - /* - * We need to make a copy of the incoming packet. If we eat the one - * given to us, this will 1) stop any other raw sockets from getting - * the same packet, 2) allow a single raw socket to discard all TCP/UDP - * traffic, and 3) present us with a problem on how to store ancillary - * data. Raw sockets are not that performance critical so the extra - * copy -even when not always necessary- is not that big of a deal. - */ - if ((pbuf = pchain_alloc(PBUF_RAW, hdrlen + psrc->tot_len)) == NULL) { - if (off > 0) - util_pbuf_header(psrc, off); - - return 0; - } - - util_pbuf_header(pbuf, -hdrlen); - - if (pbuf_copy(pbuf, psrc) != ERR_OK) - panic("unexpected pbuf copy failure"); - - pbuf->flags |= psrc->flags & (PBUF_FLAG_LLMCAST | PBUF_FLAG_LLBCAST); - - if (off > 0) - util_pbuf_header(psrc, off); - - if (!rawsock_is_ipv6(raw)) - rawsock_adjust_v4(pbuf); - - pktsock_input(&raw->raw_pktsock, pbuf, srcaddr, 0); - - return 0; -} - -/* - * Create a raw socket. - */ -sockid_t -rawsock_socket(int domain, int protocol, struct sock ** sockp, - const struct sockevent_ops ** ops) -{ - struct rawsock *raw; - unsigned int flags; - uint8_t ip_type; - - if (protocol < 0 || protocol > UINT8_MAX) - return EPROTONOSUPPORT; - - if (TAILQ_EMPTY(&raw_freelist)) - return ENOBUFS; - - raw = TAILQ_FIRST(&raw_freelist); - - /* - * Initialize the structure. Do not memset it to zero, as it is still - * part of the linked free list. Initialization may still fail. - */ - - ip_type = pktsock_socket(&raw->raw_pktsock, domain, RAW_SNDBUF_DEF, - RAW_RCVBUF_DEF, sockp); - - /* We should have enough PCBs so this call should not fail.. */ - if ((raw->raw_pcb = raw_new_ip_type(ip_type, protocol)) == NULL) - return ENOBUFS; - raw_recv(raw->raw_pcb, rawsock_input, (void *)raw); - - /* By default, the multicast TTL is 1 and looping is enabled. */ - raw_set_multicast_ttl(raw->raw_pcb, 1); - - flags = raw_flags(raw->raw_pcb); - raw_setflags(raw->raw_pcb, flags | RAW_FLAGS_MULTICAST_LOOP); - - /* - * For ICMPv6, checksum generation and verification is mandatory and - * type filtering of incoming packets is supported (RFC 3542). For all - * other IPv6 protocols, checksumming may be turned on by the user. - */ - if (rawsock_is_ipv6(raw) && protocol == IPPROTO_ICMPV6) { - raw->raw_pcb->chksum_reqd = 1; - raw->raw_pcb->chksum_offset = - offsetof(struct icmp6_hdr, icmp6_cksum); - - ICMP6_FILTER_SETPASSALL(&raw->raw_icmp6filter); - } else - raw->raw_pcb->chksum_reqd = 0; - - TAILQ_REMOVE(&raw_freelist, raw, raw_next); - - TAILQ_INSERT_TAIL(&raw_activelist, raw, raw_next); - - *ops = &rawsock_ops; - return SOCKID_RAW | (sockid_t)(raw - raw_array); -} - -/* - * Bind a raw socket to a local address. - */ -static int -rawsock_bind(struct sock * sock, const struct sockaddr * addr, - socklen_t addr_len, endpoint_t user_endpt) -{ - struct rawsock *raw = (struct rawsock *)sock; - ip_addr_t ipaddr; - err_t err; - int r; - - /* - * Raw sockets may be rebound even if that is not too useful. However, - * we do not allow (re)binding when the socket is connected, so as to - * eliminate any problems with source and destination type mismatches: - * such mismatches are detected at connect time, and rebinding would - * avoid those, possibly triggering lwIP asserts as a result. - */ - if (rawsock_is_conn(raw)) - return EINVAL; - - if ((r = ipsock_get_src_addr(rawsock_get_ipsock(raw), addr, addr_len, - user_endpt, &raw->raw_pcb->local_ip, 0 /*local_port*/, - TRUE /*allow_mcast*/, &ipaddr, NULL /*portp*/)) != OK) - return r; - - err = raw_bind(raw->raw_pcb, &ipaddr); - - return util_convert_err(err); -} - -/* - * Connect a raw socket to a remote address. - */ -static int -rawsock_connect(struct sock * sock, const struct sockaddr * addr, - socklen_t addr_len, endpoint_t user_endpt __unused) -{ - struct rawsock *raw = (struct rawsock *)sock; - const ip_addr_t *src_addr; - ip_addr_t dst_addr; - struct ifdev *ifdev; - uint32_t ifindex, ifindex2; - err_t err; - int r; - - /* - * One may "unconnect" socket by providing an address with family - * AF_UNSPEC. - */ - if (addr_is_unspec(addr, addr_len)) { - raw_disconnect(raw->raw_pcb); - - return OK; - } - - if ((r = ipsock_get_dst_addr(rawsock_get_ipsock(raw), addr, addr_len, - &raw->raw_pcb->local_ip, &dst_addr, NULL /*dst_port*/)) != OK) - return r; - - /* - * Bind explicitly to a source address if the PCB is not bound to one - * yet. This is expected in the BSD socket API, but lwIP does not do - * it for us. - */ - if (ip_addr_isany(&raw->raw_pcb->local_ip)) { - /* Help the multicast case a bit, if possible. */ - ifdev = NULL; - if (ip_addr_ismulticast(&dst_addr)) { - ifindex = pktsock_get_ifindex(&raw->raw_pktsock); - ifindex2 = raw_get_multicast_netif_index(raw->raw_pcb); - if (ifindex == 0) - ifindex = ifindex2; - - if (ifindex != 0) { - ifdev = ifdev_get_by_index(ifindex); - - if (ifdev == NULL) - return ENXIO; - } - } - - src_addr = ifaddr_select(&dst_addr, ifdev, NULL /*ifdevp*/); - - if (src_addr == NULL) - return EHOSTUNREACH; - - err = raw_bind(raw->raw_pcb, src_addr); - - if (err != ERR_OK) - return util_convert_err(err); - } - - /* - * Connecting a raw socket serves two main purposes: 1) the socket uses - * the address as destination when sending, and 2) the socket receives - * packets from only the connected address. - */ - err = raw_connect(raw->raw_pcb, &dst_addr); - - if (err != ERR_OK) - return util_convert_err(err); - - return OK; -} - -/* - * Perform preliminary checks on a send request. - */ -static int -rawsock_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused, - const struct sockaddr * addr, socklen_t addr_len __unused, - endpoint_t user_endpt __unused, int flags) -{ - struct rawsock *raw = (struct rawsock *)sock; - - if ((flags & ~MSG_DONTROUTE) != 0) - return EOPNOTSUPP; - - if (!rawsock_is_conn(raw) && addr == NULL) - return EDESTADDRREQ; - - /* - * This is only one part of the length check. The rest is done from - * rawsock_send(), once we have more information. - */ - if (len > ipsock_get_sndbuf(rawsock_get_ipsock(raw))) - return EMSGSIZE; - - return OK; -} - -/* - * Swap IP-level options between the RAW PCB and the packet options structure, - * for all options that have their flag set in the packet options structure. - * This function is called twice when sending a packet. The result is that the - * flagged options are overridden for only the packet being sent. - */ -static void -rawsock_swap_opt(struct rawsock * raw, struct pktopt * pkto) -{ - uint8_t tos, ttl, mcast_ttl; - - if (pkto->pkto_flags & PKTOF_TOS) { - tos = raw->raw_pcb->tos; - raw->raw_pcb->tos = pkto->pkto_tos; - pkto->pkto_tos = tos; - } - - if (pkto->pkto_flags & PKTOF_TTL) { - ttl = raw->raw_pcb->ttl; - mcast_ttl = raw_get_multicast_ttl(raw->raw_pcb); - raw->raw_pcb->ttl = pkto->pkto_ttl; - raw_set_multicast_ttl(raw->raw_pcb, pkto->pkto_ttl); - pkto->pkto_ttl = ttl; - pkto->pkto_mcast_ttl = mcast_ttl; - } -} - -/* - * We are about to send the given packet that already includes an IPv4 header, - * because the IP_HDRINCL option is enabled on a raw IPv4 socket. Prepare the - * IPv4 header for sending, by modifying a few fields in it, as expected by - * userland. - */ -static int -rawsock_prepare_hdrincl(struct rawsock * raw, struct pbuf * pbuf, - const ip_addr_t * src_addr) -{ - struct ip_hdr *iphdr; - size_t hlen; - - /* - * lwIP obtains the destination address from the IP packet header in - * this case, so make sure the packet has a full-sized header. - */ - if (pbuf->len < sizeof(struct ip_hdr)) - return EINVAL; - - iphdr = (struct ip_hdr *)pbuf->payload; - - /* - * Fill in the source address if it is not set, and do the byte - * swapping and checksum computation common for the BSDs, without which - * ping(8) and traceroute(8) do not work properly. We consider this a - * convenience feature, so malformed packets are simply sent as is. - * TODO: deal with type punning.. - */ - hlen = (size_t)IPH_HL(iphdr) << 2; - - if (pbuf->len >= hlen) { - /* Fill in the source address if it is blank. */ - if (iphdr->src.addr == PP_HTONL(INADDR_ANY)) { - assert(IP_IS_V4(src_addr)); - - iphdr->src.addr = ip_addr_get_ip4_u32(src_addr); - } - - IPH_LEN(iphdr) = htons(IPH_LEN(iphdr)); - IPH_OFFSET(iphdr) = htons(IPH_OFFSET(iphdr)); - IPH_CHKSUM(iphdr) = 0; - - IPH_CHKSUM(iphdr) = inet_chksum(iphdr, hlen); - } - - return OK; -} - -/* - * Send a packet on a raw socket. - */ -static int -rawsock_send(struct sock * sock, const struct sockdriver_data * data, - size_t len, size_t * off, const struct sockdriver_data * ctl __unused, - socklen_t ctl_len __unused, socklen_t * ctl_off __unused, - const struct sockaddr * addr, socklen_t addr_len, - endpoint_t user_endpt __unused, int flags, size_t min __unused) -{ - struct rawsock *raw = (struct rawsock *)sock; - struct pktopt pktopt; - struct pbuf *pbuf; - struct ifdev *ifdev; - struct netif *netif; - const ip_addr_t *dst_addrp, *src_addrp; - ip_addr_t src_addr, dst_addr; /* for storage only; not always used! */ - size_t hdrlen; - uint32_t ifindex; - err_t err; - int r; - - /* Copy in and parse any packet options. */ - pktopt.pkto_flags = 0; - - if ((r = pktsock_get_ctl(&raw->raw_pktsock, ctl, ctl_len, - &pktopt)) != OK) - return r; - - /* - * For a more in-depth explanation of what is going on here, see the - * udpsock module, which has largely the same code but with more - * elaborate comments. - */ - - /* - * Start by checking whether the source address and/or the outgoing - * interface are overridden using sticky and/or ancillary options. - */ - if ((r = pktsock_get_pktinfo(&raw->raw_pktsock, &pktopt, &ifdev, - &src_addr)) != OK) - return r; - - if (ifdev != NULL && !ip_addr_isany(&src_addr)) { - /* This is guaranteed to be a proper local unicast address. */ - src_addrp = &src_addr; - } else { - src_addrp = &raw->raw_pcb->local_ip; - - /* - * If the socket is bound to a multicast address, use the - * unspecified ('any') address as source address instead. A - * real source address will then be selected further below. - */ - if (ip_addr_ismulticast(src_addrp)) - src_addrp = IP46_ADDR_ANY(IP_GET_TYPE(src_addrp)); - } - - /* - * Determine the destination address to use. If the socket is - * connected, always ignore any address provided in the send call. - */ - if (!rawsock_is_conn(raw)) { - assert(addr != NULL); /* already checked in pre_send */ - - if ((r = ipsock_get_dst_addr(rawsock_get_ipsock(raw), addr, - addr_len, src_addrp, &dst_addr, NULL /*dst_port*/)) != OK) - return r; - - dst_addrp = &dst_addr; - } else - dst_addrp = &raw->raw_pcb->remote_ip; - - /* - * If the destination is a multicast address, select the outgoing - * interface based on the multicast interface index, if one is set. - * This must however *not* override an interface index already - * specified using IPV6_PKTINFO, as per RFC 3542 Sec. 6.7. - */ - if (ifdev == NULL && ip_addr_ismulticast(dst_addrp)) { - ifindex = raw_get_multicast_netif_index(raw->raw_pcb); - - if (ifindex != NETIF_NO_INDEX) - ifdev = ifdev_get_by_index(ifindex); /* (may fail) */ - } - - /* - * If an interface has been determined already now, the send operation - * will bypass routing. In that case, we must perform our own checks - * on address zone violations, because those will not be made anywhere - * else. Subsequent steps below will never introduce violations. - */ - if (ifdev != NULL && IP_IS_V6(dst_addrp)) { - if (ifaddr_is_zone_mismatch(ip_2_ip6(dst_addrp), ifdev)) - return EHOSTUNREACH; - - if (IP_IS_V6(src_addrp) && - ifaddr_is_zone_mismatch(ip_2_ip6(src_addrp), ifdev)) - return EHOSTUNREACH; - } - - /* - * If we do not yet have an interface at this point, perform a route - * lookup to determine the outgoing interface, unless MSG_DONTROUTE is - * set. - */ - if (ifdev == NULL) { - if (!(flags & MSG_DONTROUTE)) { - /* - * ip_route() should never be called with an - * IPADDR_TYPE_ANY type address. This is a lwIP- - * internal requirement; while we override both routing - * functions, we do not deviate from it. - */ - if (IP_IS_ANY_TYPE_VAL(*src_addrp)) - src_addrp = - IP46_ADDR_ANY(IP_GET_TYPE(dst_addrp)); - - /* Perform the route lookup. */ - if ((netif = ip_route(src_addrp, dst_addrp)) == NULL) - return EHOSTUNREACH; - - ifdev = netif_get_ifdev(netif); - } else { - if ((ifdev = ifaddr_map_by_subnet(dst_addrp)) == NULL) - return EHOSTUNREACH; - } - } - - /* - * At this point we have an outgoing interface. If we do not have a - * source address yet, pick one now. As a sidenote, if the destination - * address is scoped but has no zone, we could also fill in the zone - * now. We let lwIP handle that instead, though. - */ - assert(ifdev != NULL); - - if (ip_addr_isany(src_addrp)) { - src_addrp = ifaddr_select(dst_addrp, ifdev, NULL /*ifdevp*/); - - if (src_addrp == NULL) - return EHOSTUNREACH; - } - - /* - * Now that we know the full conditions of what we are about to send, - * check whether the packet size leaves enough room for lwIP to prepend - * headers. If so, allocate a chain of pbufs for the packet. - */ - assert(len <= RAW_MAX_PAYLOAD); - - if (rawsock_is_hdrincl(raw)) - hdrlen = 0; - else if (IP_IS_V6(dst_addrp)) - hdrlen = IP6_HLEN; - else - hdrlen = IP_HLEN; - - if (hdrlen + len > RAW_MAX_PAYLOAD) - return EMSGSIZE; - - if ((pbuf = pchain_alloc(PBUF_IP, len)) == NULL) - return ENOBUFS; - - /* Copy in the packet data. */ - if ((r = pktsock_get_data(&raw->raw_pktsock, data, len, pbuf)) != OK) { - pbuf_free(pbuf); - - return r; - } - - /* - * If the user has turned on IPV6_CHECKSUM, ensure that the packet is - * not only large enough to have the checksum stored at the configured - * place, but also that the checksum fits within the first pbuf: if we - * do not test this here, an assert will trigger in lwIP later. Also - * zero out the checksum field first, because lwIP does not do that. - */ - if (raw->raw_pcb->chksum_reqd) { - if (pbuf->len < raw->raw_pcb->chksum_offset + - sizeof(uint16_t)) { - pbuf_free(pbuf); - - return EINVAL; - } - - memset((char *)pbuf->payload + raw->raw_pcb->chksum_offset, 0, - sizeof(uint16_t)); - } - - /* - * For sockets where an IPv4 header is already included in the packet, - * we need to alter a few header fields to be compatible with BSD. - */ - if (rawsock_is_hdrincl(raw) && - (r = rawsock_prepare_hdrincl(raw, pbuf, src_addrp)) != OK) { - pbuf_free(pbuf); - - return r; - } - - /* Set broadcast/multicast flags for accounting purposes. */ - if (ip_addr_ismulticast(dst_addrp)) - pbuf->flags |= PBUF_FLAG_LLMCAST; - else if (ip_addr_isbroadcast(dst_addrp, ifdev_get_netif(ifdev))) - pbuf->flags |= PBUF_FLAG_LLBCAST; - - /* Send the packet. */ - rawsock_swap_opt(raw, &pktopt); - - assert(!ip_addr_isany(src_addrp)); - assert(!ip_addr_ismulticast(src_addrp)); - - err = raw_sendto_if_src(raw->raw_pcb, pbuf, dst_addrp, - ifdev_get_netif(ifdev), src_addrp); - - rawsock_swap_opt(raw, &pktopt); - - /* Free the pbuf again. */ - pbuf_free(pbuf); - - /* - * On success, make sure to return the size of the sent packet as well. - * As an aside: ctl_off need not be updated, as it is not returned. - */ - if ((r = util_convert_err(err)) == OK) - *off = len; - return r; -} - -/* - * Update the set of flag-type socket options on a raw socket. - */ -static void -rawsock_setsockmask(struct sock * sock, unsigned int mask) -{ - struct rawsock *raw = (struct rawsock *)sock; - - /* - * FIXME: raw sockets are not supposed to have a broardcast check, so - * perhaps just remove this and instead always set SOF_BROADCAST? - */ - if (mask & SO_BROADCAST) - ip_set_option(raw->raw_pcb, SOF_BROADCAST); - else - ip_reset_option(raw->raw_pcb, SOF_BROADCAST); -} - -/* - * Prepare a helper structure for IP-level option processing. - */ -static void -rawsock_get_ipopts(struct rawsock * raw, struct ipopts * ipopts) -{ - - ipopts->local_ip = &raw->raw_pcb->local_ip; - ipopts->remote_ip = &raw->raw_pcb->remote_ip; - ipopts->tos = &raw->raw_pcb->tos; - ipopts->ttl = &raw->raw_pcb->ttl; - ipopts->sndmin = RAW_SNDBUF_MIN; - ipopts->sndmax = RAW_SNDBUF_MAX; - ipopts->rcvmin = RAW_RCVBUF_MIN; - ipopts->rcvmax = RAW_RCVBUF_MAX; -} - -/* - * Set socket options on a raw socket. - */ -static int -rawsock_setsockopt(struct sock * sock, int level, int name, - const struct sockdriver_data * data, socklen_t len) -{ - struct rawsock *raw = (struct rawsock *)sock; - struct ipopts ipopts; - struct icmp6_filter filter; - ip_addr_t ipaddr; - struct in_addr in_addr; - struct ifdev *ifdev; - unsigned int flags; - uint32_t ifindex; - uint8_t byte; - int r, val; - - /* - * Unfortunately, we have to duplicate most of the multicast options - * rather than sharing them with udpsock at the pktsock level. The - * reason is that each of the PCBs have their own multicast abstraction - * functions and so we cannot merge the rest. Same for getsockopt. - */ - - switch (level) { - case IPPROTO_IP: - if (rawsock_is_ipv6(raw)) - break; - - switch (name) { - case IP_HDRINCL: - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val) { - raw_setflags(raw->raw_pcb, - raw_flags(raw->raw_pcb) | - RAW_FLAGS_HDRINCL); - } else { - raw_setflags(raw->raw_pcb, - raw_flags(raw->raw_pcb) & - ~RAW_FLAGS_HDRINCL); - } - - return OK; - - case IP_MULTICAST_IF: - pktsock_set_mcaware(&raw->raw_pktsock); - - if ((r = sockdriver_copyin_opt(data, &in_addr, - sizeof(in_addr), len)) != OK) - return r; - - ip_addr_set_ip4_u32(&ipaddr, in_addr.s_addr); - - if ((ifdev = ifaddr_map_by_addr(&ipaddr)) == NULL) - return EADDRNOTAVAIL; - - raw_set_multicast_netif_index(raw->raw_pcb, - ifdev_get_index(ifdev)); - - return OK; - - case IP_MULTICAST_LOOP: - pktsock_set_mcaware(&raw->raw_pktsock); - - if ((r = sockdriver_copyin_opt(data, &byte, - sizeof(byte), len)) != OK) - return r; - - flags = raw_flags(raw->raw_pcb); - - if (byte) - flags |= RAW_FLAGS_MULTICAST_LOOP; - else - flags &= ~RAW_FLAGS_MULTICAST_LOOP; - - raw_setflags(raw->raw_pcb, flags); - - return OK; - - case IP_MULTICAST_TTL: - pktsock_set_mcaware(&raw->raw_pktsock); - - if ((r = sockdriver_copyin_opt(data, &byte, - sizeof(byte), len)) != OK) - return r; - - raw_set_multicast_ttl(raw->raw_pcb, byte); - - return OK; - } - - break; - - case IPPROTO_IPV6: - if (!rawsock_is_ipv6(raw)) - break; - - switch (name) { - case IPV6_CHECKSUM: - /* ICMPv6 checksums are always computed. */ - if (raw->raw_pcb->protocol == IPPROTO_ICMPV6) - return EINVAL; - - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val == -1) { - raw->raw_pcb->chksum_reqd = 0; - - return OK; - } else if (val >= 0 && !(val & 1)) { - raw->raw_pcb->chksum_reqd = 1; - raw->raw_pcb->chksum_offset = val; - - return OK; - } else - return EINVAL; - - case IPV6_MULTICAST_IF: - pktsock_set_mcaware(&raw->raw_pktsock); - - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val != 0) { - ifindex = (uint32_t)val; - - ifdev = ifdev_get_by_index(ifindex); - - if (ifdev == NULL) - return ENXIO; - } else - ifindex = NETIF_NO_INDEX; - - raw_set_multicast_netif_index(raw->raw_pcb, ifindex); - - return OK; - - case IPV6_MULTICAST_LOOP: - pktsock_set_mcaware(&raw->raw_pktsock); - - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val < 0 || val > 1) - return EINVAL; - - flags = raw_flags(raw->raw_pcb); - - if (val) - flags |= RAW_FLAGS_MULTICAST_LOOP; - else - flags &= ~RAW_FLAGS_MULTICAST_LOOP; - - /* - * lwIP's IPv6 functionality does not actually check - * this flag at all yet. We set it in the hope that - * one day this will magically start working. - */ - raw_setflags(raw->raw_pcb, flags); - - return OK; - - case IPV6_MULTICAST_HOPS: - pktsock_set_mcaware(&raw->raw_pktsock); - - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val < -1 || val > UINT8_MAX) - return EINVAL; - - if (val == -1) - val = 1; - - raw_set_multicast_ttl(raw->raw_pcb, val); - - return OK; - } - - break; - - case IPPROTO_ICMPV6: - if (!rawsock_is_ipv6(raw) || - raw->raw_pcb->protocol != IPPROTO_ICMPV6) - break; - - switch (name) { - case ICMP6_FILTER: - /* Who comes up with these stupid exceptions? */ - if (len == 0) { - ICMP6_FILTER_SETPASSALL(&raw->raw_icmp6filter); - - return OK; - } - - if ((r = sockdriver_copyin_opt(data, &filter, - sizeof(filter), len)) != OK) - return r; - - /* - * As always, never copy in the data into the actual - * destination, as any copy may run into a copy fault - * halfway through, potentially leaving the destination - * in a half-updated and thus corrupted state. - */ - memcpy(&raw->raw_icmp6filter, &filter, sizeof(filter)); - - return OK; - } - } - - rawsock_get_ipopts(raw, &ipopts); - - return pktsock_setsockopt(&raw->raw_pktsock, level, name, data, len, - &ipopts); -} - -/* - * Retrieve socket options on a raw socket. - */ -static int -rawsock_getsockopt(struct sock * sock, int level, int name, - const struct sockdriver_data * data, socklen_t * len) -{ - struct rawsock *raw = (struct rawsock *)sock; - struct ipopts ipopts; - const ip4_addr_t *ip4addr; - struct in_addr in_addr; - struct ifdev *ifdev; - unsigned int flags; - uint32_t ifindex; - uint8_t byte; - int val; - - switch (level) { - case IPPROTO_IP: - if (rawsock_is_ipv6(raw)) - break; - - switch (name) { - case IP_HDRINCL: - val = !!rawsock_is_hdrincl(raw); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case IP_MULTICAST_IF: - ifindex = raw_get_multicast_netif_index(raw->raw_pcb); - - /* - * Map back from the interface index to the IPv4 - * address assigned to the corresponding interface. - * Should this not work out, return the 'any' address. - */ - if (ifindex != NETIF_NO_INDEX && - (ifdev = ifdev_get_by_index(ifindex)) != NULL) { - ip4addr = - netif_ip4_addr(ifdev_get_netif(ifdev)); - - in_addr.s_addr = ip4_addr_get_u32(ip4addr); - } else - in_addr.s_addr = PP_HTONL(INADDR_ANY); - - return sockdriver_copyout_opt(data, &in_addr, - sizeof(in_addr), len); - - case IP_MULTICAST_LOOP: - flags = raw_flags(raw->raw_pcb); - - byte = !!(flags & RAW_FLAGS_MULTICAST_LOOP); - - return sockdriver_copyout_opt(data, &byte, - sizeof(byte), len); - - case IP_MULTICAST_TTL: - byte = raw_get_multicast_ttl(raw->raw_pcb); - - return sockdriver_copyout_opt(data, &byte, - sizeof(byte), len); - } - - break; - - case IPPROTO_IPV6: - if (!rawsock_is_ipv6(raw)) - break; - - switch (name) { - case IPV6_CHECKSUM: - if (raw->raw_pcb->chksum_reqd) - val = raw->raw_pcb->chksum_offset; - else - val = -1; - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case IPV6_MULTICAST_IF: - ifindex = raw_get_multicast_netif_index(raw->raw_pcb); - - val = (int)ifindex; - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case IPV6_MULTICAST_LOOP: - flags = raw_flags(raw->raw_pcb); - - val = !!(flags & RAW_FLAGS_MULTICAST_LOOP); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case IPV6_MULTICAST_HOPS: - val = raw_get_multicast_ttl(raw->raw_pcb); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - } - - break; - - case IPPROTO_ICMPV6: - if (!rawsock_is_ipv6(raw) || - raw->raw_pcb->protocol != IPPROTO_ICMPV6) - break; - - switch (name) { - case ICMP6_FILTER: - return sockdriver_copyout_opt(data, - &raw->raw_icmp6filter, - sizeof(raw->raw_icmp6filter), len); - } - - break; - } - - rawsock_get_ipopts(raw, &ipopts); - - return pktsock_getsockopt(&raw->raw_pktsock, level, name, data, len, - &ipopts); -} - -/* - * Retrieve the local socket address of a raw socket. - */ -static int -rawsock_getsockname(struct sock * sock, struct sockaddr * addr, - socklen_t * addr_len) -{ - struct rawsock *raw = (struct rawsock *)sock; - - ipsock_put_addr(rawsock_get_ipsock(raw), addr, addr_len, - &raw->raw_pcb->local_ip, 0 /*port*/); - - return OK; -} - -/* - * Retrieve the remote socket address of a raw socket. - */ -static int -rawsock_getpeername(struct sock * sock, struct sockaddr * addr, - socklen_t * addr_len) -{ - struct rawsock *raw = (struct rawsock *)sock; - - if (!rawsock_is_conn(raw)) - return ENOTCONN; - - ipsock_put_addr(rawsock_get_ipsock(raw), addr, addr_len, - &raw->raw_pcb->remote_ip, 0 /*port*/); - - return OK; -} - -/* - * Shut down a raw socket for reading and/or writing. - */ -static int -rawsock_shutdown(struct sock * sock, unsigned int mask) -{ - struct rawsock *raw = (struct rawsock *)sock; - - if (mask & SFL_SHUT_RD) - raw_recv(raw->raw_pcb, NULL, NULL); - - pktsock_shutdown(&raw->raw_pktsock, mask); - - return OK; -} - -/* - * Close a raw socket. - */ -static int -rawsock_close(struct sock * sock, int force __unused) -{ - struct rawsock *raw = (struct rawsock *)sock; - - raw_recv(raw->raw_pcb, NULL, NULL); - - raw_remove(raw->raw_pcb); - raw->raw_pcb = NULL; - - pktsock_close(&raw->raw_pktsock); - - return OK; -} - -/* - * Free up a closed raw socket. - */ -static void -rawsock_free(struct sock * sock) -{ - struct rawsock *raw = (struct rawsock *)sock; - - assert(raw->raw_pcb == NULL); - - TAILQ_REMOVE(&raw_activelist, raw, raw_next); - - TAILQ_INSERT_HEAD(&raw_freelist, raw, raw_next); -} - -/* - * Fill the given kinfo_pcb sysctl(7) structure with information about the RAW - * PCB identified by the given pointer. - */ -static void -rawsock_get_info(struct kinfo_pcb * ki, const void * ptr) -{ - const struct raw_pcb *pcb = (const struct raw_pcb *)ptr; - struct rawsock *raw; - - /* We iterate our own list so we can't find "strange" PCBs. */ - raw = (struct rawsock *)pcb->recv_arg; - assert(raw >= raw_array && - raw < &raw_array[__arraycount(raw_array)]); - - ki->ki_type = SOCK_RAW; - ki->ki_protocol = pcb->protocol; - - ipsock_get_info(ki, &pcb->local_ip, 0 /*local_port*/, - &raw->raw_pcb->remote_ip, 0 /*remote_port*/); - - /* TODO: change this so that sockstat(1) may work one day. */ - ki->ki_sockaddr = (uint64_t)(uintptr_t)rawsock_get_sock(raw); - - ki->ki_rcvq = pktsock_get_recvlen(&raw->raw_pktsock); - - if (rawsock_is_hdrincl(raw)) - ki->ki_pflags |= INP_HDRINCL; -} - -/* - * Given either NULL or a previously returned RAW PCB pointer, return the first - * or next RAW PCB pointer, or NULL if there are no more. lwIP does not expose - * 'raw_pcbs', but other modules in this service may also use RAW PCBs (which - * should then stay hidden), so we iterate through our own list instead. - */ -static const void * -rawsock_enum(const void * last) -{ - const struct raw_pcb *pcb; - struct rawsock *raw; - - if (last != NULL) { - pcb = (const struct raw_pcb *)last; - - raw = (struct rawsock *)pcb->recv_arg; - assert(raw >= raw_array && - raw < &raw_array[__arraycount(raw_array)]); - - raw = TAILQ_NEXT(raw, raw_next); - } else - raw = TAILQ_FIRST(&raw_activelist); - - if (raw != NULL) - return raw->raw_pcb; - else - return NULL; -} - -/* - * Obtain the list of RAW protocol control blocks, for sysctl(7). - */ -static ssize_t -rawsock_pcblist(struct rmib_call * call, struct rmib_node * node, - struct rmib_oldp * oldp, struct rmib_newp * newp __unused) -{ - - return util_pcblist(call, oldp, rawsock_enum, rawsock_get_info); -} - -static const struct sockevent_ops rawsock_ops = { - .sop_bind = rawsock_bind, - .sop_connect = rawsock_connect, - .sop_pre_send = rawsock_pre_send, - .sop_send = rawsock_send, - .sop_pre_recv = pktsock_pre_recv, - .sop_recv = pktsock_recv, - .sop_test_recv = pktsock_test_recv, - .sop_ioctl = ifconf_ioctl, - .sop_setsockmask = rawsock_setsockmask, - .sop_setsockopt = rawsock_setsockopt, - .sop_getsockopt = rawsock_getsockopt, - .sop_getsockname = rawsock_getsockname, - .sop_getpeername = rawsock_getpeername, - .sop_shutdown = rawsock_shutdown, - .sop_close = rawsock_close, - .sop_free = rawsock_free -}; diff --git a/minix/net/lwip/route.c b/minix/net/lwip/route.c deleted file mode 100644 index 11a77fb33..000000000 --- a/minix/net/lwip/route.c +++ /dev/null @@ -1,1654 +0,0 @@ -/* LWIP service - route.c - route management */ -/* - * This module provides a destination-based routing implementation, roughly - * matching the routing as done traditionally by the BSDs and by current NetBSD - * in particular. As such, this implementation almost completely replaces - * lwIP's own more limited (and less rigid) routing algorithms. It does this - * using a combination of overriding lwIP functions (ip4_route, ip6_route) with - * weak-symbol patching, and lwIP-provided gateway hooks. Especially the - * former gives us a level of control that lwIP's routing hooks do not provide: - * not only does such overriding give us the ability to flag that no route was - * found at all, we also bypass a number of default decisions taken by lwIP - * where the routing hooks are not called at all. - * - * As a result, the routing tables as visible to the user are an almost - * completely accurate reflection of the routing decisions taken by this TCP/IP - * stack in practice. There is currently only one exception: for IPv4 gateway - * selection, lwIP will bypass the gateway hook if the given address is on the - * local subnet according to the locally assigned IP address and subnet mask. - * This exception should practically affect noone, though. - * - * Our routing implementation differs from NetBSD's in various aspects, though. - * Perhaps the most important one, also noted elsewhere, is that we do not - * support the coexistence of an all-bits-set network route and a host route - * for the same IP address. If necessary, this issue can be resolved. - * - * We use a custom concept of "immutable" routes for local addresses, which are - * a somewhat special case as explained in the ifaddr module. Since those - * RTF_LOCAL routes cannot be deleted, a small change is made to the route(8) - * flush-all command to skip them. Packets directed at local addresses on - * non-loopback interfaces are handled in a way that differs from NetBSD's, - * too. This is explained in the ifdev module. - * - * The BSDs support special routes that reject or blackhole packets, based on - * routing flags. We support such routes as well, but implement them somewhat - * differently from the BSDs: such packets always get routed over a loopback - * interface (regardless of their associated interface), in order to save on - * routing lookups for packets in the common case. - * - * As general rules of thumb: if there is no route to a destination, assignment - * of a local address will already fail with a "no route to host" error. If - * there is an RTF_REJECT route, a local address will be assigned, but actual - * packets will be routed to a loopback interface and result in a "no route to - * host" error upon reception there - this is what NetBSD seems to do too, even - * though the documentation says that RTF_REJECT routes generate ICMP messages - * instead. RTF_BLACKHOLE behaves similarly to RTF_REJECT, except that the - * packet is simply discarded upon receipt by the loopback interface. - * - * In various places, both here and elsewhere, we check to make sure that on - * routing and output, scoped IPv6 source and destination addresses never leave - * their zone. For example, a packet must not be sent to an outgoing interface - * if its source address is a link-local address with a zone for another - * interface. lwIP does not check for such violations, and so we must make - * sure that this does not happen ourselves. - * - * Normally, one would tell lwIP to use a particular default IPv4 gateway by - * associating the gateway address to a particular interface, and then setting - * that interface as default interface (netif_default). We explicitly do - * neither of these things. Instead, the routing hooks should return the - * default route whenever applicable, and the gateway hooks should return the - * default route's gateway IP address whenever needed. - * - * Due to lwIP's limited set of error codes, we do not properly distinguish - * between cases where EHOSTUNREACH or ENETUNREACH should be thrown, and throw - * the former in most cases. - */ - -#include "lwip.h" -#include "ifaddr.h" -#include "rttree.h" -#include "rtsock.h" -#include "route.h" -#include "lldata.h" - -#include "lwip/nd6.h" - -/* - * The maximum number of uint8_t bytes needed to represent a routing address. - * This value is the maximum of 4 (for IPv4) and 16 (for IPv6). - */ -#define ROUTE_ADDR_MAX (MAX(IP4_BITS, IP6_BITS) / NBBY) - -/* - * We use a shared routing entry data structure for IPv4 and IPv6 routing - * entries. The result is cleaner code at the cost of (currently) about 2.3KB - * of memory wasted (costing 12 bytes per address for three addresses for 64 of - * the 128 routing entries that would be for IPv4), although with the benefit - * that either address family may use more than half of the routing entries. - * From that 2.3KB, 1KB can be reclaimed by moving the destination address and - * mask into the rttree_entry data structure, at the cost of its generality. - */ -struct route_entry { - struct rttree_entry re_entry; /* routing tree entry */ - union pxfer_re_pu { - struct ifdev *repu_ifdev; /* associated interface */ - SIMPLEQ_ENTRY(route_entry) repu_next; /* next free pointer */ - } re_pu; - unsigned int re_flags; /* routing flags (RTF_) */ - unsigned int re_use; /* number of times used */ - uint8_t re_addr[ROUTE_ADDR_MAX]; /* destination address */ - uint8_t re_mask[ROUTE_ADDR_MAX]; /* destination mask */ - union ixfer_re_gu { - ip4_addr_p_t regu_gw4; /* gateway (IPv4) */ - ip6_addr_p_t regu_gw6; /* gateway (IPv6) */ - } re_gu; -}; -#define re_ifdev re_pu.repu_ifdev -#define re_next re_pu.repu_next -#define re_gw4 re_gu.regu_gw4 -#define re_gw6 re_gu.regu_gw6 - -/* Routes for local addresses are immutable, for reasons explained in ifdev. */ -#define route_is_immutable(route) ((route)->re_flags & RTF_LOCAL) - -/* - * We override a subset of the BSD routing flags in order to store our own - * local settings. In particular, we have to have a way to store whether a - * route is for an IPv4 or IPv6 destination address. We override BSD's - * RTF_DONE flag for this: RTF_DONE is only used with routing sockets, and - * never associated with actual routes. In contrast, RTF_IPV6 is only used - * with actual routes, and never sent across routing sockets. In general, - * overriding flags is preferable to adding new ones, as BSD might later add - * more flags itself as well, while it can never remove existing flags. - */ -#define RTF_IPV6 RTF_DONE /* route is for an IPv6 destination */ - -/* The total number of routing entries (IPv4 and IPv6 combined). */ -#define NR_ROUTE_ENTRY 128 - -static struct route_entry route_array[NR_ROUTE_ENTRY]; /* routing entries */ - -static SIMPLEQ_HEAD(, route_entry) route_freelist; /* free entry list */ - -/* The routing trees. There are two: one for IPv4 and one for IPv6. */ -#define ROUTE_TREE_V4 0 -#define ROUTE_TREE_V6 1 -#define NR_ROUTE_TREE 2 - -static struct rttree route_tree[NR_ROUTE_TREE]; - -/* We support a single cached routing entry per address family (IPv4, IPv6). */ -static int rtcache_v4set; -static ip4_addr_t rtcache_v4addr; -static struct route_entry *rtcache_v4route; - -static int rtcache_v6set; -static ip6_addr_t rtcache_v6addr; -static struct route_entry *rtcache_v6route; - -/* - * Initialize the routing cache. There are a lot of trivial functions here, - * but this is designed to be extended in the future. - */ -static void -rtcache_init(void) -{ - - rtcache_v4set = FALSE; - rtcache_v6set = FALSE; -} - -/* - * Look up the given IPv4 address in the routing cache. If there is a match, - * return TRUE with the associated route in 'route', possibly NULL if a - * negative result was cached. Return FALSE if the routing cache does not - * cache the given IPv4 address. - */ -static inline int -rtcache_lookup_v4(const ip4_addr_t * ipaddr, struct route_entry ** route) -{ - - if (rtcache_v4set && ip4_addr_cmp(&rtcache_v4addr, ipaddr)) { - *route = rtcache_v4route; - - return TRUE; - } else - return FALSE; -} - -/* - * Add the given IPv4 address and the given routing entry (NULL for negative - * caching) to the routing cache. - */ -static inline void -rtcache_add_v4(const ip4_addr_t * ipaddr, struct route_entry * route) -{ - - rtcache_v4addr = *ipaddr; - rtcache_v4route = route; - rtcache_v4set = TRUE; -} - -/* - * Reset the IPv4 routing cache. - */ -static void -rtcache_reset_v4(void) -{ - - rtcache_v4set = FALSE; -} - -/* - * Look up the given IPv6 address in the routing cache. If there is a match, - * return TRUE with the associated route in 'route', possibly NULL if a - * negative result was cached. Return FALSE if the routing cache does not - * cache the given IPv6 address. - */ -static inline int -rtcache_lookup_v6(const ip6_addr_t * ipaddr, struct route_entry ** route) -{ - - if (rtcache_v6set && ip6_addr_cmp(&rtcache_v6addr, ipaddr)) { - *route = rtcache_v6route; - - return TRUE; - } else - return FALSE; -} - -/* - * Add the given IPv6 address and the given routing entry (NULL for negative - * caching) to the routing cache. Caching of scoped addresses without zones is - * not supported. - */ -static inline void -rtcache_add_v6(const ip6_addr_t * ipaddr, struct route_entry * route) -{ - - rtcache_v6addr = *ipaddr; - rtcache_v6route = route; - rtcache_v6set = TRUE; -} - -/* - * Reset the IPv6 routing cache. - */ -static void -rtcache_reset_v6(void) -{ - - rtcache_v6set = FALSE; -} - -/* - * Initialize the routing module. - */ -void -route_init(void) -{ - unsigned int slot; - - /* Initialize the routing trees. */ - rttree_init(&route_tree[ROUTE_TREE_V4], IP4_BITS); - rttree_init(&route_tree[ROUTE_TREE_V6], IP6_BITS); - - /* Initialize the list of free routing entries. */ - SIMPLEQ_INIT(&route_freelist); - - for (slot = 0; slot < __arraycount(route_array); slot++) - SIMPLEQ_INSERT_TAIL(&route_freelist, &route_array[slot], - re_next); - - /* Reset the routing cache. */ - rtcache_init(); -} - -/* - * Prepare for a routing tree operation by converting the given IPv4 address - * into a raw address that can be used in that routing tree operation. - */ -static inline void -route_prepare_v4(const ip4_addr_t * ip4addr, uint8_t rtaddr[ROUTE_ADDR_MAX]) -{ - uint32_t val; - - val = ip4_addr_get_u32(ip4addr); - - memcpy(rtaddr, &val, sizeof(val)); -} - -/* - * Prepare for a routing tree operation by converting the given IPv6 address - * into a raw address that can be used in that routing tree operation. If the - * given prefix length allows for it, also incorporate the address zone. - */ -static inline void -route_prepare_v6(const ip6_addr_t * ip6addr, unsigned int prefix, - uint8_t rtaddr[ROUTE_ADDR_MAX]) -{ - - assert(sizeof(ip6addr->addr) == IP6_BITS / NBBY); - - /* - * TODO: in most cases, we could actually return a pointer to the - * address contained in the given lwIP IP address structure. However, - * doing so would make a lot things quite a bit messier around here, - * but the small performance gain may still make it worth it. - */ - memcpy(rtaddr, ip6addr->addr, sizeof(ip6addr->addr)); - - /* - * Embed the zone ID into the address, KAME style. This is the - * easiest way to have link-local addresses for multiple interfaces - * coexist in a single routing tree. Do this only if the full zone ID - * would be included in the prefix though, or we might de-normalize the - * address. - */ - if (ip6_addr_has_zone(ip6addr) && prefix >= 32) - rtaddr[3] = ip6_addr_zone(ip6addr); -} - -/* - * Prepare for a routing tree operation by converting the given IP address into - * a raw address that can be used in that routing tree operation. The given - * address's zone ID is embedded "KAME-style" into the raw (IPv6) address when - * applicable and if the given prefix length allows for it. Return the index - * of the routing tree to use (ROUTE_TREE_V4 or ROUTE_TREE_V6). - */ -static unsigned int -route_prepare(const ip_addr_t * ipaddr, unsigned int prefix, - uint8_t rtaddr[ROUTE_ADDR_MAX]) -{ - - switch (IP_GET_TYPE(ipaddr)) { - case IPADDR_TYPE_V4: - route_prepare_v4(ip_2_ip4(ipaddr), rtaddr); - - return ROUTE_TREE_V4; - - case IPADDR_TYPE_V6: - route_prepare_v6(ip_2_ip6(ipaddr), prefix, rtaddr); - - return ROUTE_TREE_V6; - - default: - panic("unknown IP address type: %u", IP_GET_TYPE(ipaddr)); - } -} - -/* - * The given routing tree (ROUTE_TREE_V4 or ROUTE_TREE_V6) has been updated. - * Invalidate any cache entries that may now have become stale, both locally - * and in lwIP. - */ -static void -route_updated(unsigned int tree) -{ - - if (tree == ROUTE_TREE_V6) { - rtcache_reset_v6(); - - /* - * Also clear the lwIP ND6 destination cache, which may now - * contain entries for the wrong gateway. - */ - nd6_clear_destination_cache(); - } else - rtcache_reset_v4(); -} - -/* - * Add a route to the appropriate routing table. The address, address zone, - * prefix, and RTF_HOST flag in the flags field make up the identity of the - * route. If the flags field contains RTF_GATEWAY, a gateway must be given; - * otherwise, it must be NULL. The route is associated with the given - * interface, which may not be NULL. The caller must ensure that the flags - * field does not contain unsupported flags. On success, return OK, and also - * also announce the addition. On failure, return a negative error code. - */ -int -route_add(const ip_addr_t * addr, unsigned int prefix, - const ip_addr_t * gateway, struct ifdev * ifdev, unsigned int flags, - const struct rtsock_request * rtr) -{ - struct route_entry *route; - unsigned int tree, byte; - int r; - - assert(flags & RTF_UP); - assert(!!(flags & RTF_GATEWAY) == (gateway != NULL)); - assert(ifdev != NULL); - - /* Get a routing entry, if any are available. */ - if (SIMPLEQ_EMPTY(&route_freelist)) - return ENOBUFS; - - route = SIMPLEQ_FIRST(&route_freelist); - - /* - * Perform sanity checks on the input, and fill in enough of the - * routing entry to be able to try and add it to the routing tree. - */ - memset(route->re_addr, 0, sizeof(route->re_addr)); - - tree = route_prepare(addr, prefix, route->re_addr); - - switch (tree) { - case ROUTE_TREE_V4: - if (prefix > IP4_BITS || - (prefix != IP4_BITS && (flags & RTF_HOST))) - return EINVAL; - - flags &= ~RTF_IPV6; - - break; - - case ROUTE_TREE_V6: - if (prefix > IP6_BITS || - (prefix != IP6_BITS && (flags & RTF_HOST))) - return EINVAL; - - flags |= RTF_IPV6; - - break; - - default: - return EINVAL; - } - - /* Generate the (raw) network mask. This is protocol agnostic! */ - addr_make_netmask(route->re_mask, sizeof(route->re_mask), prefix); - - /* The given address must be normalized to its mask. */ - for (byte = 0; byte < __arraycount(route->re_addr); byte++) - if ((route->re_addr[byte] & ~route->re_mask[byte]) != 0) - return EINVAL; - - /* - * Attempt to add the routing entry. Host-type entries do not have an - * associated mask, enabling ever-so-slightly faster matching. - */ - if ((r = rttree_add(&route_tree[tree], &route->re_entry, - route->re_addr, (flags & RTF_HOST) ? NULL : route->re_mask, - prefix)) != OK) - return r; - - /* - * Success. Finish the routing entry. Remove the entry from the free - * list before assigning re_ifdev, as these two use the same memory. - */ - SIMPLEQ_REMOVE_HEAD(&route_freelist, re_next); - - route->re_ifdev = ifdev; - route->re_flags = flags; - - /* - * Store the gateway if one is given. Store the address in lwIP format - * because that is the easiest way use it later again. Store it as a - * union to keep the route entry structure as small as possible. Store - * the address without its zone, because the gateway's address zone is - * implied by its associated ifdev. - * - * If no gateway is given, this is a link-type route, i.e., a route for - * a local network, with all nodes directly connected and reachable. - */ - if (flags & RTF_GATEWAY) { - if (flags & RTF_IPV6) - ip6_addr_copy_to_packed(route->re_gw6, - *ip_2_ip6(gateway)); - else - ip4_addr_copy(route->re_gw4, *ip_2_ip4(gateway)); - } - - /* We have made routing changes. */ - route_updated(tree); - - /* Announce the route addition. */ - rtsock_msg_route(route, RTM_ADD, rtr); - - return OK; -} - -/* - * Check whether it is possible to add a route for the given destination to the - * corresponding routing table, that is, a subsequent route_add() call for this - * destination address is guaranteed to succeed (if all its parameters are - * valid). Return TRUE if adding the route is guaranteed to succeed, or FALSE - * if creating a route for the given destination would fail. - */ -int -route_can_add(const ip_addr_t * addr, unsigned int prefix, - int is_host __unused) -{ - uint8_t rtaddr[ROUTE_ADDR_MAX]; - unsigned int tree; - - tree = route_prepare(addr, prefix, rtaddr); - - /* - * The corresponding routing tree must not already contain an exact - * match for the destination. If the routing tree implementation is - * ever extended with support for coexisting host and net entries with - * the same prefix, we should also pass in 'is_host' here. - */ - if (rttree_lookup_exact(&route_tree[tree], rtaddr, prefix) != NULL) - return FALSE; - - /* There must be a routing entry on the free list as well. */ - return !SIMPLEQ_EMPTY(&route_freelist); -} - -/* - * Find a route with the exact given route identity. Return the route if - * found, or NULL if no route exists with this identity. - */ -struct route_entry * -route_find(const ip_addr_t * addr, unsigned int prefix, int is_host) -{ - struct rttree_entry *entry; - struct route_entry *route; - uint8_t rtaddr[ROUTE_ADDR_MAX]; - unsigned int tree; - - tree = route_prepare(addr, prefix, rtaddr); - - entry = rttree_lookup_exact(&route_tree[tree], rtaddr, prefix); - if (entry == NULL) - return NULL; - - route = (struct route_entry *)entry; - - /* - * As long as the routing tree code does not support coexisting host - * and net entries with the same prefix, we have to check the type. - */ - if (!!(route->re_flags & RTF_HOST) != is_host) - return NULL; - - return route; -} - -/* - * A route lookup failed for the given IP address. Generate an RTM_MISS - * message on routing sockets. - */ -static void -route_miss(const ip_addr_t * ipaddr) -{ - union sockaddr_any addr; - socklen_t addr_len; - - addr_len = sizeof(addr); - - addr_put_inet(&addr.sa, &addr_len, ipaddr, TRUE /*kame*/, 0 /*port*/); - - rtsock_msg_miss(&addr.sa); -} - -/* - * A route lookup failed for the given IPv4 address. Generate an RTM_MISS - * message on routing sockets. - */ -static void -route_miss_v4(const ip4_addr_t * ip4addr) -{ - ip_addr_t ipaddr; - - ip_addr_copy_from_ip4(ipaddr, *ip4addr); - - route_miss(&ipaddr); -} - -/* - * A route lookup failed for the given IPv6 address. Generate an RTM_MISS - * message on routing sockets. - */ -static void -route_miss_v6(const ip6_addr_t * ip6addr) -{ - ip_addr_t ipaddr; - - ip_addr_copy_from_ip6(ipaddr, *ip6addr); - - route_miss(&ipaddr); -} - -/* - * Look up the most narrow matching routing entry for the given IPv4 address. - * Return the routing entry if one exists at all, or NULL otherwise. This - * function performs caching. - */ -static inline struct route_entry * -route_lookup_v4(const ip4_addr_t * ip4addr) -{ - uint8_t rtaddr[ROUTE_ADDR_MAX]; - struct route_entry *route; - - /* - * Look up the route for the destination IP address, unless we have a - * cached route entry. We cache negatives in order to avoid generating - * lots of RTM_MISS messages for the same destination in a row. - */ - if (rtcache_lookup_v4(ip4addr, &route)) - return route; - - route_prepare_v4(ip4addr, rtaddr); - - route = (struct route_entry *) - rttree_lookup_match(&route_tree[ROUTE_TREE_V4], rtaddr); - - /* Cache the result, even if we found no route. */ - rtcache_add_v4(ip4addr, route); - - return route; -} - -/* - * Look up the most narrow matching routing entry for the given IPv6 address, - * taking into account its zone ID if applicable. Return the routing entry if - * one exists at all, or NULL otherwise. This function performs caching. - */ -static inline struct route_entry * -route_lookup_v6(const ip6_addr_t * ip6addr) -{ - uint8_t rtaddr[ROUTE_ADDR_MAX]; - struct route_entry *route; - int use_cache; - - /* - * We do not support caching of addresses that should have a zone but - * do not: in different contexts, such addresses could yield different - * routes. - */ - use_cache = !ip6_addr_lacks_zone(ip6addr, IP6_UNKNOWN); - - if (use_cache && rtcache_lookup_v6(ip6addr, &route)) - return route; - - route_prepare_v6(ip6addr, IP6_BITS, rtaddr); - - route = (struct route_entry *) - rttree_lookup_match(&route_tree[ROUTE_TREE_V6], rtaddr); - - /* Cache the result, even if no route was found. */ - if (use_cache) - rtcache_add_v6(ip6addr, route); - - return route; -} - -/* - * Look up the most narrow matching routing entry for the given IP address, - * taking into account its zone ID if applicable. Return the routing entry if - * one exists at all, or NULL otherwise. This function performs caching. - */ -struct route_entry * -route_lookup(const ip_addr_t * addr) -{ - - if (IP_IS_V4(addr)) - return route_lookup_v4(ip_2_ip4(addr)); - else - return route_lookup_v6(ip_2_ip6(addr)); -} - -/* - * Change an existing routing entry. Its flags are always updated to the new - * set of given flags, although certain flags are always preserved. If the - * new flags set has RTF_GATEWAY set and 'gateway' is not NULL, update the - * gateway associated with the route. If 'ifdev' is not NULL, reassociate the - * route with the given interface; this will not affect the zone of the - * route's destination address. On success, return OK, and also announce the - * change. On failure, return a negative error code. - */ -static int -route_change(struct route_entry * route, const ip_addr_t * gateway, - struct ifdev * ifdev, unsigned int flags, - const struct rtsock_request * rtr) -{ - unsigned int tree, preserve; - - tree = (route->re_flags & RTF_IPV6) ? ROUTE_TREE_V6 : ROUTE_TREE_V4; - - /* Update the associated interface (only) if a new one is given. */ - if (ifdev != NULL) - route->re_ifdev = ifdev; - - /* - * These flags may not be changed. RTF_UP should always be set anyway. - * RTF_HOST and RTF_IPV6 are part of the route's identity. RTF_LOCAL - * should be preserved as well, although we will not get here if either - * the old or the new flags have it set anyway. - */ - preserve = RTF_UP | RTF_HOST | RTF_IPV6 | RTF_LOCAL; - - /* Always update the flags. There is no way not to. */ - route->re_flags = (route->re_flags & preserve) | (flags & ~preserve); - - /* - * If a new gateway is given *and* RTF_GATEWAY is set, update the - * gateway. If RTF_GATEWAY is not set, this is a link-type route with - * no gateway. If no new gateway is given, we keep the gateway as is. - */ - if (gateway != NULL && (flags & RTF_GATEWAY)) { - if (flags & RTF_IPV6) - ip6_addr_copy_to_packed(route->re_gw6, - *ip_2_ip6(gateway)); - else - ip4_addr_copy(route->re_gw4, *ip_2_ip4(gateway)); - } - - /* We have made routing changes. */ - route_updated(tree); - - /* Announce the route change. */ - rtsock_msg_route(route, RTM_CHANGE, rtr); - - return OK; -} - -/* - * Delete the given route, and announce its deletion. - */ -void -route_delete(struct route_entry * route, const struct rtsock_request * rtr) -{ - unsigned int tree; - - /* First announce the deletion, while the route is still around. */ - tree = (route->re_flags & RTF_IPV6) ? ROUTE_TREE_V6 : ROUTE_TREE_V4; - - rtsock_msg_route(route, RTM_DELETE, rtr); - - /* Then actually delete the route. */ - rttree_delete(&route_tree[tree], &route->re_entry); - - SIMPLEQ_INSERT_HEAD(&route_freelist, route, re_next); - - /* We have made routing changes. */ - route_updated(tree); -} - -/* - * Delete all routes associated with the given interface, typically as part of - * destroying the interface. - */ -void -route_clear(struct ifdev * ifdev) -{ - struct rttree_entry *entry, *parent; - struct route_entry *route; - unsigned int tree; - - /* - * Delete all routes associated with the given interface. Fortunately, - * we need not also delete addresses zoned to the given interface, - * because no route can be created with a zone ID that does not match - * the associated interface. That is the main reason why we ignore - * zone IDs for gateways when adding or changing routes.. - */ - for (tree = 0; tree < NR_ROUTE_TREE; tree++) { - parent = NULL; - - while ((entry = rttree_enum(&route_tree[tree], - parent)) != NULL) { - route = (struct route_entry *)entry; - - if (route->re_ifdev == ifdev) - route_delete(route, NULL /*request*/); - else - parent = entry; - } - } -} - -/* - * Process a routing command specifically for an IPv4 or IPv6 route, as one of - * the specific continuations of processing started by route_process(). The - * RTM_ routing command is given as 'type'. The route destination is given as - * 'dst_addr'; its address type determines whether the operation is for IPv4 or - * IPv6. The sockaddr structures for 'mask' and 'gateway' are passed on as is - * and may have to be parsed here if not NULL. 'ifdev' is the interface to be - * associated with a route; it is non-NULL only if an interface name (IFP) or - * address (IFA) was given. The RTF_ flags field 'flags' has been checked - * against the globally supported flags, but may have to be checked for flags - * that do not apply to IPv4/IPv6 routes. Return OK or a negative error code, - * following the same semantics as route_process(). - */ -static int -route_process_inet(unsigned int type, const ip_addr_t * dst_addr, - const struct sockaddr * mask, const struct sockaddr * gateway, - struct ifdev * ifdev, unsigned int flags, - const struct rtsock_request * rtr) -{ - struct route_entry *route; - ip_addr_t gw_storage, *gw_addr; - struct ifdev *ifdev2; - uint32_t zone; - unsigned int prefix; - int r; - - assert(!(flags & RTF_LLDATA)); - - if ((flags & (RTF_DYNAMIC | RTF_MODIFIED | RTF_DONE | RTF_XRESOLVE | - RTF_LLINFO | RTF_CLONED | RTF_SRC | RTF_ANNOUNCE | - RTF_BROADCAST)) != 0) - return EINVAL; - - /* - * For network entries, a network mask must be provided in all cases. - * For host entries, the network mask is ignored, and we use a prefix - * with all bits set. - */ - if (!(flags & RTF_HOST)) { - if (mask == NULL) - return EINVAL; - - if ((r = addr_get_netmask(mask, mask->sa_len, - IP_GET_TYPE(dst_addr), &prefix, NULL /*ipaddr*/)) != OK) - return r; - } else { - if (IP_IS_V4(dst_addr)) - prefix = IP4_BITS; - else - prefix = IP6_BITS; - } - - gw_addr = NULL; - - /* - * Determine the gateway and interface for the routing entry, if - * applicable. - */ - if (type == RTM_ADD || type == RTM_CHANGE) { - /* - * The RTF_UP flag must always be set, but only if the flags - * field is used at all. - */ - if (!(flags & RTF_UP)) - return EINVAL; - - if ((flags & RTF_GATEWAY) && gateway != NULL) { - if ((r = addr_get_inet(gateway, gateway->sa_len, - IP_GET_TYPE(dst_addr), &gw_storage, TRUE /*kame*/, - NULL /*port*/)) != OK) - return r; - - gw_addr = &gw_storage; - - /* - * We use the zone of the gateway to help determine the - * interface, but we do not reject a mismatching zone - * here. The reason for this is that we do not want - * routes that have zones for an interface other than - * the one associated with the route, as that could - * create a world of trouble: packets leaving their - * zone, complications with cleaning up interfaces.. - */ - if (IP_IS_V6(gw_addr) && - ip6_addr_has_zone(ip_2_ip6(gw_addr))) { - zone = ip6_addr_zone(ip_2_ip6(gw_addr)); - - ifdev2 = ifdev_get_by_index(zone); - - if (ifdev != NULL && ifdev != ifdev2) - return EINVAL; - else - ifdev = ifdev2; - } - - /* - * If we still have no interface at this point, see if - * we can find one based on just the gateway address. - * See if a locally attached network owns the address. - * That may not succeed, leaving ifdev set to NULL. - */ - if (ifdev == NULL) - ifdev = ifaddr_map_by_subnet(gw_addr); - } - - /* - * When adding routes, all necessary information must be given. - * When changing routes, we can leave some settings as is. - */ - if (type == RTM_ADD) { - if ((flags & RTF_GATEWAY) && gw_addr == NULL) - return EINVAL; - - /* TODO: try harder to find a matching interface.. */ - if (ifdev == NULL) - return ENETUNREACH; - } - } - - /* - * All route commands except RTM_ADD require that a route exists for - * the given identity, although RTM_GET, when requesting a host entry, - * may return a wider (network) route based on just the destination - * address. - */ - if (type != RTM_ADD) { - /* For RTM_GET (only), a host query may return a net route. */ - if (type == RTM_GET && (flags & RTF_HOST)) - route = route_lookup(dst_addr); - else - route = route_find(dst_addr, prefix, - !!(flags & RTF_HOST)); - - if (route == NULL) - return ESRCH; - } else - route = NULL; - - /* Process the actual routing command. */ - switch (type) { - case RTM_ADD: - return route_add(dst_addr, prefix, gw_addr, ifdev, flags, rtr); - - case RTM_CHANGE: - /* Routes for local addresses are immutable. */ - if (route_is_immutable(route)) - return EPERM; - - return route_change(route, gw_addr, ifdev, flags, rtr); - - case RTM_DELETE: - /* Routes for local addresses are immutable. */ - if (route_is_immutable(route)) - return EPERM; - - route_delete(route, rtr); - - return OK; - - case RTM_LOCK: - /* - * TODO: implement even the suggestion that we support this. - * For now, we do not keep per-route metrics, let alone change - * them dynamically ourselves, so "locking" metrics is really - * not a concept that applies to us. We may however have to - * save the lock mask and return it in queries.. - */ - /* FALLTHROUGH */ - case RTM_GET: - /* Simply generate a message for the route we just found. */ - rtsock_msg_route(route, type, rtr); - - return OK; - - default: - return EINVAL; - } -} - -/* - * Process a routing command from a routing socket. The RTM_ type of command - * is given as 'type', and is one of RTM_ADD, RTM_CHANGE, RTM_DELETE, RTM_GET, - * RTM_LOCK. In addition, the function takes a set of sockaddr pointers as - * provided by the routing command. Each of these sockaddr pointers may be - * NULL; if not NULL, the structure is at least large enough to contain the - * address length (sa_len) and family (sa_family), and the length never exceeds - * the amount of memory used to store the sockaddr structure. However, the - * length itself has not yet been checked against the expected protocol - * structure and could even be zero. The command's RTF_ routing flags and - * metrics are provided as well. On success, return OK, in which case the - * caller assumes that a routing socket announcement for the processed command - * has been sent already (passing on 'rtr' to the announcement function as is). - * On failure, return a negative error code; in that case, the caller will send - * a failure response on the original routing socket itself. - */ -int -route_process(unsigned int type, const struct sockaddr * dst, - const struct sockaddr * mask, const struct sockaddr * gateway, - const struct sockaddr * ifp, const struct sockaddr * ifa, - unsigned int flags, unsigned long inits, - const struct rt_metrics * rmx, const struct rtsock_request * rtr) -{ - struct ifdev *ifdev, *ifdev2; - char name[IFNAMSIZ]; - ip_addr_t dst_addr, if_addr; - uint32_t zone; - uint8_t addr_type; - int r; - - /* - * The identity of a route is determined by its destination address, - * destination zone, prefix length, and whether it is a host entry - * or not. If it is a host entry (RTF_HOST is set), the prefix length - * is implied by the protocol; otherwise it should be obtained from the - * given netmask if necessary. For link-local addresses, the zone ID - * must be embedded KAME-style in the destination address. A - * destination address must always be given. The destination address - * also determines the overall address family. - */ - if (dst == NULL) - return EINVAL; - - switch (dst->sa_family) { - case AF_INET: - addr_type = IPADDR_TYPE_V4; - break; -#ifdef INET6 - case AF_INET6: - addr_type = IPADDR_TYPE_V6; - break; -#endif /* INET6 */ - default: - return EAFNOSUPPORT; - } - - if ((r = addr_get_inet(dst, dst->sa_len, addr_type, &dst_addr, - TRUE /*kame*/, NULL /*port*/)) != OK) - return r; - - /* - * Perform a generic test on the given flags. This covers everything - * we support at all, plus a few flags we ignore. Specific route types - * may have further restrictions; those tests are performed later. - */ - if ((flags & ~(RTF_UP | RTF_GATEWAY | RTF_HOST | RTF_REJECT | - RTF_CLONING | RTF_LLINFO | RTF_LLDATA | RTF_STATIC | - RTF_BLACKHOLE | RTF_CLONED | RTF_PROTO2 | RTF_PROTO1)) != 0) - return EINVAL; - - ifdev = NULL; - - if (type == RTM_ADD || type == RTM_CHANGE) { - /* - * If an interface address or name is given, use that to - * identify the target interface. If both are given, make sure - * that both identify the same interface--a hopefully helpful - * feature to detect wrong route(8) usage (NetBSD simply takes - * IFP over IFA). An empty interface name is ignored on the - * basis that libc link_addr(3) is broken. - */ - if (ifp != NULL) { - if ((r = addr_get_link(ifp, ifp->sa_len, name, - sizeof(name), NULL /*hwaddr*/, - 0 /*hwaddr_len*/)) != OK) - return r; - - if (name[0] != '\0' && - (ifdev = ifdev_find_by_name(name)) == NULL) - return ENXIO; - } - - if (ifa != NULL) { - /* - * This is similar to retrieval of source addresses in - * ipsock, with the difference that we do not impose - * that a zone ID be given for link-local addresses. - */ - if ((r = addr_get_inet(ifa, ifa->sa_len, addr_type, - &if_addr, TRUE /*kame*/, NULL /*port*/)) != OK) - return r; - - if ((ifdev2 = ifaddr_map_by_addr(&if_addr)) == NULL) - return EADDRNOTAVAIL; - - if (ifdev != NULL && ifdev != ifdev2) - return EINVAL; - else - ifdev = ifdev2; - } - - /* - * If the destination address has a zone, then it must not - * conflict with the interface, if one was given. If not, we - * may use it to decide the interface to use for the route. - */ - if (IP_IS_V6(&dst_addr) && - ip6_addr_has_zone(ip_2_ip6(&dst_addr))) { - if (ifdev == NULL) { - zone = ip6_addr_zone(ip_2_ip6(&dst_addr)); - - ifdev = ifdev_get_by_index(zone); - } else { - if (!ip6_addr_test_zone(ip_2_ip6(&dst_addr), - ifdev_get_netif(ifdev))) - return EADDRNOTAVAIL; - } - } - } - - /* - * For now, no initializers are supported by any of the sub-processing - * routines, so outright reject requests that set any initializers. - * Most importantly, we do not support per-route MTU settings (RTV_MTU) - * because lwIP would not use them, and we do not support non-zero - * expiry (RTV_EXPIRE) because for IPv4/IPv6 routes it is not a widely - * used feature and for ARP/NDP we would have to change lwIP. - * dhcpcd(8) does supply RTV_MTU, we have to ignore that option rather - * than reject it, unfortunately. arp(8) always sets RTV_EXPIRE, so we - * reject only non-zero expiry there. - */ - if ((inits & ~(RTV_EXPIRE | RTV_MTU)) != 0 || - ((inits & RTV_EXPIRE) != 0 && rmx->rmx_expire != 0)) - return ENOSYS; - - /* - * From here on, the processing differs for ARP, NDP, and IP routes. - * As of writing, our userland is from NetBSD 7, which puts link-local - * route entries in its main route tables. This means we would have to - * search for existing routes before we can determine whether, say, a - * RTM_GET request is for an IP or an ARP route entry. As of NetBSD 8, - * the link-local administration is separated, and all requests use the - * RTF_LLDATA flag to indicate that they are for ARP/NDP routes rather - * than IP routes. Since that change makes things much cleaner for us, - * we borrow from the future, patching arp(8) and ndp(8) to add the - * RTF_LLDATA flag now, so that we can implement a clean split here. - */ - if (!(flags & RTF_LLDATA)) - return route_process_inet(type, &dst_addr, mask, gateway, - ifdev, flags, rtr); - else - return lldata_process(type, &dst_addr, gateway, ifdev, flags, - rtr); -} - -/* - * Return the routing flags (RTF_) for the given routing entry. Strip out any - * internal flags. - */ -unsigned int -route_get_flags(const struct route_entry * route) -{ - - return route->re_flags & ~RTF_IPV6; -} - -/* - * Return TRUE if the given routing entry is for the IPv6 address family, or - * FALSE if it is for IPv4. - */ -int -route_is_ipv6(const struct route_entry * route) -{ - - return !!(route->re_flags & RTF_IPV6); -} - -/* - * Return the interface associated with the given routing entry. The resulting - * interface is never NULL. - */ -struct ifdev * -route_get_ifdev(const struct route_entry * route) -{ - - return route->re_ifdev; -} - -/* - * Convert the given raw routing address pointed to by 'rtaddr' into a - * lwIP-style IP address 'ipaddr' of type 'type', which must by IPADDR_TYPE_V4 - * or IPADDR_TYPE_V6. - */ -static void -route_get_addr(ip_addr_t * ipaddr, const uint8_t * rtaddr, uint8_t type) -{ - ip6_addr_t *ip6addr; - uint32_t val, zone; - - /* - * Convert the routing address to a lwIP-type IP address. Take out the - * KAME-style embedded zone, if needed. - */ - memset(ipaddr, 0, sizeof(*ipaddr)); - IP_SET_TYPE(ipaddr, type); - - switch (type) { - case IPADDR_TYPE_V4: - memcpy(&val, rtaddr, sizeof(val)); - - ip_addr_set_ip4_u32(ipaddr, val); - - break; - - case IPADDR_TYPE_V6: - ip6addr = ip_2_ip6(ipaddr); - - memcpy(ip6addr->addr, rtaddr, sizeof(ip6addr->addr)); - - if (ip6_addr_has_scope(ip6addr, IP6_UNKNOWN)) { - zone = ntohl(ip6addr->addr[0]) & 0x0000ffffU; - - ip6addr->addr[0] &= PP_HTONL(0xffff0000U); - - ip6_addr_set_zone(ip6addr, zone); - } - - break; - - default: - panic("unknown IP address type: %u", type); - } -} - -/* - * Obtain information about an IPv4 or IPv6 routing entry, by filling 'addr', - * 'mask', 'gateway', and optionally (if not NULL) 'ifp' and 'ifa' with - * sockaddr-type data for each of those fields. Also store the associated - * interface in 'ifdevp', the routing entry's flags in 'flags', and the route's - * usage count in 'use'. - */ -void -route_get(const struct route_entry * route, union sockaddr_any * addr, - union sockaddr_any * mask, union sockaddr_any * gateway, - union sockaddr_any * ifp, union sockaddr_any * ifa, - struct ifdev ** ifdevp, unsigned int * flags, unsigned int * use) -{ - const ip_addr_t *src_addr; - ip_addr_t dst_addr, gw_addr; - struct ifdev *ifdev; - socklen_t addr_len; - uint8_t type; - - type = (route->re_flags & RTF_IPV6) ? IPADDR_TYPE_V6 : IPADDR_TYPE_V4; - - /* Get the destination address. */ - route_get_addr(&dst_addr, route->re_addr, type); - - addr_len = sizeof(*addr); - - addr_put_inet(&addr->sa, &addr_len, &dst_addr, TRUE /*kame*/, - 0 /*port*/); - - /* Get the network mask, if applicable. */ - if (!(route->re_flags & RTF_HOST)) { - addr_len = sizeof(*mask); - - addr_put_netmask(&mask->sa, &addr_len, type, - rttree_get_prefix(&route->re_entry)); - } else - mask->sa.sa_len = 0; - - /* Get the gateway, which may be an IP address or a local link. */ - addr_len = sizeof(*gateway); - - ifdev = route->re_ifdev; - - if (route->re_flags & RTF_GATEWAY) { - if (type == IPADDR_TYPE_V4) - ip_addr_copy_from_ip4(gw_addr, route->re_gw4); - else - ip_addr_copy_from_ip6_packed(gw_addr, route->re_gw6); - - addr_put_inet(&gateway->sa, &addr_len, &gw_addr, TRUE /*kame*/, - 0 /*port*/); - } else { - addr_put_link(&gateway->sa, &addr_len, ifdev_get_index(ifdev), - ifdev_get_iftype(ifdev), NULL /*name*/, NULL /*hwaddr*/, - 0 /*hwaddr_len*/); - } - - /* Get the associated interface name. */ - if (ifp != NULL) { - addr_len = sizeof(*ifp); - - addr_put_link(&ifp->sa, &addr_len, ifdev_get_index(ifdev), - ifdev_get_iftype(ifdev), ifdev_get_name(ifdev), - NULL /*hwaddr*/, 0 /*hwaddr_len*/); - } - - /* Get the associated source address, if we can determine one. */ - if (ifa != NULL) { - src_addr = ifaddr_select(&dst_addr, ifdev, NULL /*ifdevp*/); - - if (src_addr != NULL) { - addr_len = sizeof(*ifa); - - addr_put_inet(&ifa->sa, &addr_len, src_addr, - TRUE /*kame*/, 0 /*port*/); - } else - ifa->sa.sa_len = 0; - } - - /* Get other fields. */ - *flags = route_get_flags(route); /* strip any internal flags */ - *ifdevp = ifdev; - *use = route->re_use; -} - -/* - * Enumerate IPv4 routing entries. Return the first IPv4 routing entry if - * 'last' is NULL, or the next routing entry after 'last' if it is not NULL. - * In both cases, the return value may be NULL if there are no more routes. - */ -struct route_entry * -route_enum_v4(struct route_entry * last) -{ - - assert(last == NULL || !(last->re_flags & RTF_IPV6)); - - return (struct route_entry *)rttree_enum(&route_tree[ROUTE_TREE_V4], - (last != NULL) ? &last->re_entry : NULL); -} - -/* - * Enumerate IPv6 routing entries. Return the first IPv6 routing entry if - * 'last' is NULL, or the next routing entry after 'last' if it is not NULL. - * In both cases, the return value may be NULL if there are no more routes. - */ -struct route_entry * -route_enum_v6(struct route_entry * last) -{ - - assert(last == NULL || (last->re_flags & RTF_IPV6)); - - return (struct route_entry *)rttree_enum(&route_tree[ROUTE_TREE_V6], - (last != NULL) ? &last->re_entry : NULL); -} - -/* - * lwIP IPv4 routing function. Given an IPv4 destination address, look up and - * return the target interface, or NULL if there is no route to the address. - * - * This is a full replacement of the corresponding lwIP function, which should - * be overridden with weak symbols, using patches against the lwIP source code. - * As such, the lwIP headers should already provide the correct prototype for - * this function. If not, something will have changed in the lwIP - * implementation, and this code must be revised accordingly. - */ -struct netif * -ip4_route(const ip4_addr_t * dst) -{ - struct route_entry *route; - struct ifdev *ifdev; - - /* - * Look up the route for the destination IPv4 address. If no route is - * found at all, return NULL to the caller. - */ - if ((route = route_lookup_v4(dst)) == NULL) { - route_miss_v4(dst); - - return NULL; - } - - /* - * For now, we increase the use counter only for actual route lookups, - * and not for gateway lookups or user queries. As of writing, - * route(8) does not print this number anyway.. - */ - route->re_use++; - - /* - * For all packets that are supposed to be rejected or blackholed, use - * a loopback interface, regardless of the interface to which the route - * is associated (even though it will typically be lo0 anyway). The - * reason for this is that on packet output, we perform another route - * route lookup just to check for rejection/blackholing, but for - * efficiency reasons, we limit such checks to loopback interfaces: - * loopback traffic will typically use only one IP address anyway, thus - * limiting route misses from such rejection/blackhole route lookups as - * much as we can. The lookup is implemented in route_output_v4(). We - * divert only if the target interface is not a loopback interface - * already, mainly to allow userland tests to create blackhole routes - * to a specific loopback interface for testing purposes. - * - * It is not correct to return NULL for RTF_REJECT routes here, because - * this could cause e.g. connect() calls to fail immediately, which is - * not how rejection should work. Related: a previous incarnation of - * support for these flags used a dedicated netif to eliminate the - * extra route lookup on regular output altogether, but in the current - * situation, that netif would have to be assigned (IPv4 and IPv6) - * addresses in order not to break e.g. connect() in the same way. - */ - if ((route->re_flags & (RTF_REJECT | RTF_BLACKHOLE)) && - !ifdev_is_loopback(route->re_ifdev)) - ifdev = ifdev_get_loopback(); - else - ifdev = route->re_ifdev; - - return ifdev_get_netif(ifdev); -} - -/* - * lwIP IPv4 routing hook. Since this hook is called only from lwIP's own - * ip4_route() implementation, this hook must never fire. If it does, either - * something is wrong with overriding ip4_route(), or lwIP added other places - * from which this hook is called. Both cases are highly problematic and must - * be resolved somehow, which is why we simply call panic() here. - */ -struct netif * -lwip_hook_ip4_route(const ip4_addr_t * dst) -{ - - panic("IPv4 routing hook called - this should not happen!"); -} - -/* - * lwIP IPv4 ARP gateway hook. - */ -const ip4_addr_t * -lwip_hook_etharp_get_gw(struct netif * netif, const ip4_addr_t * ip4addr) -{ - static ip4_addr_t gw_addr; /* may be returned to the caller */ - struct route_entry *route; - - /* Look up the route for the destination IP address. */ - if ((route = route_lookup_v4(ip4addr)) == NULL) - return NULL; - - /* - * This case could only ever trigger as a result of lwIP taking its own - * routing decisions instead of calling the IPv4 routing hook. While - * not impossible, such cases should be extremely rare. We cannot - * provide a meaningful gateway address in this case either, though. - */ - if (route->re_ifdev != netif_get_ifdev(netif)) { - printf("LWIP: unexpected interface for gateway lookup\n"); - - return NULL; - } - - /* - * If this route has a gateway, return the IP address of the gateway. - * Otherwise, the route is for a local network, and we would typically - * not get here because lwIP performs the local-network check itself. - * It is possible that the local network consists of more than one IP - * range, and the user has configured a route for the other range. In - * that case, return the IP address of the actual destination. - * - * We store a packed version of the IPv4 address, so reconstruct the - * unpacked version to a static variable first - for consistency with - * the IPv6 code. - */ - if (route->re_flags & RTF_GATEWAY) { - ip4_addr_copy(gw_addr, route->re_gw4); - - return &gw_addr; - } else - return ip4addr; -} - -/* - * lwIP IPv6 routing function. Given an IPv6 source and destination address, - * look up and return the target interface, or NULL if there is no route to the - * address. Our routing algorithm is destination-based, meaning that the - * source address must be considered only to resolve zone ambiguity. - * - * This is a full replacement of the corresponding lwIP function, which should - * be overridden with weak symbols, using patches against the lwIP source code. - * As such, the lwIP headers should already provide the correct prototype for - * this function. If not, something will have changed in the lwIP - * implementation, and this code must be revised accordingly. - */ -struct netif * -ip6_route(const ip6_addr_t * src, const ip6_addr_t * dst) -{ - struct route_entry *route; - struct ifdev *ifdev; - ip6_addr_t dst_addr; - uint32_t zone; - - assert(src != NULL); - assert(dst != NULL); - - /* - * If the destination address is scoped but has no zone, use the source - * address to determine a zone, which we then set on the destination - * address to find the route, if successful. Obviously, the interface - * is not going to be different from the zone, but we do need to check - * other aspects of the route (e.g., one might want to null-route all - * multicast traffic). In the case that no source address is given at - * all, first see if the destination address happens to be a locally - * assigned address. In theory this could yield multiple matches, so - * pick the first one. If not even that helps, we have absolutely - * nothing we can use to refine route selection. We could pick an - * arbitrary interface in that case, but we currently don't. - */ - zone = IP6_NO_ZONE; - - if (ip6_addr_lacks_zone(dst, IP6_UNKNOWN)) { - if (ip6_addr_has_zone(src)) - zone = ip6_addr_zone(src); - else if (!ip6_addr_isany(src)) { - if ((ifdev = ifaddr_v6_map_by_addr(src)) == NULL) - return NULL; /* should never happen */ - zone = ifdev_get_index(ifdev); - } else { - if ((ifdev = ifaddr_v6_map_by_addr(dst)) != NULL) - zone = ifdev_get_index(ifdev); - else - return NULL; /* TODO: try harder */ - } - - if (zone != IP6_NO_ZONE) { - dst_addr = *dst; - - ip6_addr_set_zone(&dst_addr, zone); - - dst = &dst_addr; - } - } - - route = route_lookup_v6(dst); - - /* - * Look up the route for the destination IPv6 address. If no route is - * found at all, return NULL to the caller. - */ - if (route == NULL) { - /* - * Since we rely on userland to create routes for on-link - * prefixes and default routers, we do not have to call lwIP's - * nd6_find_route() here. - */ - - /* Generate an RTM_MISS message. */ - route_miss_v6(dst); - - return NULL; - } - - /* - * We have found a route based on the destination address. If we did - * not pick the destination address zone based on the source address, - * we should now check for source address zone violations. Note that - * if even the destination address zone violates its target interface, - * this case will be caught by route_lookup_v6(). - */ - if (zone == IP6_NO_ZONE && - ifaddr_is_zone_mismatch(src, route->re_ifdev)) - return NULL; - - route->re_use++; - - /* - * See ip4_route() for an explanation of the use of loopback here. For - * the IPv6 case, the matching logic is in route_output_v6(). - */ - if ((route->re_flags & (RTF_REJECT | RTF_BLACKHOLE)) && - !ifdev_is_loopback(route->re_ifdev)) - ifdev = ifdev_get_loopback(); - else - ifdev = route->re_ifdev; - - /* - * If the selected interface would cause the destination address to - * leave its zone, fail route selection altogether. This case may - * trigger especially for reject routes, for which the interface change - * to loopback may introduce a zone violation. - */ - if (ip6_addr_has_zone(dst) && - !ip6_addr_test_zone(dst, ifdev_get_netif(ifdev))) - return NULL; - - return ifdev_get_netif(ifdev); -} - -/* - * lwIP IPv6 (source) routing hook. Since this hook is called only from lwIP's - * own ip6_route() implementation, this hook must never fire. If it does, - * either something is wrong with overriding ip6_route(), or lwIP added other - * places from which this hook is called. Both cases are highly problematic - * and must be resolved somehow, which is why we simply call panic() here. - */ -struct netif * -lwip_hook_ip6_route(const ip6_addr_t * src, const ip6_addr_t * dst) -{ - - panic("IPv6 routing hook called - this should not happen!"); -} - -/* - * lwIP IPv6 ND6 gateway hook. - */ -const ip6_addr_t * -lwip_hook_nd6_get_gw(struct netif * netif, const ip6_addr_t * ip6addr) -{ - static ip6_addr_t gw_addr; /* may be returned to the caller */ - struct route_entry *route; - struct ifdev *ifdev; - - ifdev = netif_get_ifdev(netif); - assert(ifdev != NULL); - - /* Look up the route for the destination IP address. */ - if ((route = route_lookup_v6(ip6addr)) == NULL) - return NULL; - - /* As for IPv4. */ - if (route->re_ifdev != ifdev) { - printf("LWIP: unexpected interface for gateway lookup\n"); - - return NULL; - } - - /* - * We save memory by storing a packed (zoneless) version of the IPv6 - * gateway address. That means we cannot return a pointer to it here. - * Instead, we have to resort to expanding the address into a static - * variable. The caller will immediately make a copy anyway, though. - */ - if (route->re_flags & RTF_GATEWAY) { - ip6_addr_copy_from_packed(gw_addr, route->re_gw6); - ip6_addr_assign_zone(&gw_addr, IP6_UNKNOWN, netif); - - return &gw_addr; - } else - return ip6addr; -} - -/* - * Check whether a packet is allowed to be sent to the given destination IPv4 - * address 'ipaddr' on the interface 'ifdev', according to route information. - * Return TRUE if the packet should be sent. Return FALSE if the packet should - * be rejected or discarded, with 'err' set to the error to return to lwIP. - */ -int -route_output_v4(struct ifdev * ifdev, const ip4_addr_t * ipaddr, err_t * err) -{ - const struct route_entry *route; - - /* See if we should reject/blackhole packets to this destination. */ - if (ifdev_is_loopback(ifdev) && - (route = route_lookup_v4(ipaddr)) != NULL && - (route->re_flags & (RTF_REJECT | RTF_BLACKHOLE))) { - if (route->re_flags & RTF_REJECT) - *err = ERR_RTE; - else - *err = ERR_OK; - - return FALSE; - } - - return TRUE; -} - -/* - * Check whether a packet is allowed to be sent to the given destination IPv6 - * address 'ipaddr' on the interface 'ifdev', according to route information. - * Return TRUE if the packet should be sent. Return FALSE if the packet should - * be rejected or discarded, with 'err' set to the error to return to lwIP. - */ -int -route_output_v6(struct ifdev * ifdev, const ip6_addr_t * ipaddr, err_t * err) -{ - const struct route_entry *route; - - /* Do one more zone violation test, just in case. It's cheap. */ - if (ip6_addr_has_zone(ipaddr) && - !ip6_addr_test_zone(ipaddr, ifdev_get_netif(ifdev))) { - *err = ERR_RTE; - - return FALSE; - } - - /* See if we should reject/blackhole packets to this destination. */ - if (ifdev_is_loopback(ifdev) && - (route = route_lookup_v6(ipaddr)) != NULL && - (route->re_flags & (RTF_REJECT | RTF_BLACKHOLE))) { - if (route->re_flags & RTF_REJECT) - *err = ERR_RTE; - else - *err = ERR_OK; - - return FALSE; - } - - return TRUE; -} diff --git a/minix/net/lwip/rtsock.c b/minix/net/lwip/rtsock.c deleted file mode 100644 index 7af8bb296..000000000 --- a/minix/net/lwip/rtsock.c +++ /dev/null @@ -1,1912 +0,0 @@ -/* LWIP service - rtsock.c - routing sockets and route sysctl support */ -/* - * In a nutshell, the intended abstraction is that only this module deals with - * route messages, message headers, and RTA arrays, whereas other modules - * (ifaddr, route) are responsible for parsing and providing sockaddr_* type - * addresses, with the exception of compression and expansion which is - * particular to routing sockets. Concretely, there should be no reference to - * (e.g.) rt_msghdr outside this module, and no mention of ip_addr_t inside it. - */ - -#include "lwip.h" -#include "ifaddr.h" -#include "rtsock.h" -#include "route.h" -#include "lldata.h" - -/* The number of routing sockets. */ -#define NR_RTSOCK 8 - -/* - * The send buffer maximum determines the maximum size of requests. The - * maximum possible request size is the size of the routing message header plus - * RTAX_MAX times the maximum socket address size, including alignment. That - * currently works out to a number in the low 400s, so 512 should be fine for - * now. At this time we do not support changing the send buffer size, because - * there really is no point in doing so. Hence also no RT_SNDBUF_{MIN,DEF}. - */ -#define RT_SNDBUF_MAX 512 /* maximum RT send buffer size */ - -#define RT_RCVBUF_MIN 0 /* minimum RT receive buffer size */ -#define RT_RCVBUF_DEF 16384 /* default RT receive buffer size */ -#define RT_RCVBUF_MAX 65536 /* maximum RT receive buffer size */ - -/* Address length of routing socket address structures; two bytes only. */ -#define RTSOCK_ADDR_LEN offsetof(struct sockaddr, sa_data) - -struct rtsock_rta { - const void *rta_ptr[RTAX_MAX]; - socklen_t rta_len[RTAX_MAX]; -}; - -static const char rtsock_padbuf[RT_ROUNDUP(0)]; - -static struct rtsock { - struct sock rt_sock; /* socket object, MUST be first */ - int rt_family; /* address family filter if not zero */ - unsigned int rt_flags; /* routing socket flags (RTF_) */ - struct pbuf *rt_rcvhead; /* receive buffer, first packet */ - struct pbuf **rt_rcvtailp; /* receive buffer, last ptr-ptr */ - size_t rt_rcvlen; /* receive buffer, length in bytes */ - size_t rt_rcvbuf; /* receive buffer, maximum size */ - TAILQ_ENTRY(rtsock) rt_next; /* next in active or free list */ -} rt_array[NR_RTSOCK]; - -#define RTF_NOLOOPBACK 0x1 /* suppress reply messages */ - -static TAILQ_HEAD(, rtsock) rt_freelist; /* free routing sockets */ -static TAILQ_HEAD(, rtsock) rt_activelist; /* active routing sockets */ - -struct rtsock_request { - struct rtsock *rtr_src; /* source socket of the request */ - pid_t rtr_pid; /* process ID of requesting process */ - int rtr_seq; /* sequence number from the request */ - int rtr_getif; /* RTM_GET only: get interface info */ -}; - -static const struct sockevent_ops rtsock_ops; - -static ssize_t rtsock_info(struct rmib_call *, struct rmib_node *, - struct rmib_oldp *, struct rmib_newp *); - -/* The CTL_NET PF_ROUTE subtree. */ -static struct rmib_node net_route_table[] = { - [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, rtsock_info, - "rtable", "Routing table information"), -}; - -/* The CTL_NET PF_ROUTE node. */ -static struct rmib_node net_route_node = - RMIB_NODE(RMIB_RO, net_route_table, "route", "PF_ROUTE information"); - -/* - * Initialize the routing sockets module. - */ -void -rtsock_init(void) -{ - const int mib[] = { CTL_NET, PF_ROUTE }; - unsigned int slot; - int r; - - /* Initialize the list of free routing sockets. */ - TAILQ_INIT(&rt_freelist); - - for (slot = 0; slot < __arraycount(rt_array); slot++) - TAILQ_INSERT_TAIL(&rt_freelist, &rt_array[slot], rt_next); - - /* Initialize the list of acive routing sockets. */ - TAILQ_INIT(&rt_activelist); - - /* Register the "net.route" subtree with the MIB service. */ - if ((r = rmib_register(mib, __arraycount(mib), &net_route_node)) != OK) - panic("unable to register net.route RMIB tree: %d", r); -} - -/* - * Allocate a pbuf suitable for storing a routing message of 'size' bytes. - * Return the allocated pbuf on success, or NULL on memory allocation failure. - */ -static struct pbuf * -rtsock_alloc(size_t size) -{ - struct pbuf *pbuf; - - /* - * The data will currently always fit in a single pool buffer. Just in - * case this changes in the future, warn and fail cleanly. The rest of - * the code is not able to deal with buffer chains as it is, although - * that can be changed if necessary. - */ - if (size > MEMPOOL_BUFSIZE) { - printf("LWIP: routing socket packet too large (%zu)\n", size); - - return NULL; - } - - pbuf = pbuf_alloc(PBUF_RAW, size, PBUF_RAM); - - assert(pbuf == NULL || pbuf->tot_len == pbuf->len); - - return pbuf; -} - -/* - * Initialize a routing addresses map. - */ -static void -rtsock_rta_init(struct rtsock_rta * rta) -{ - - memset(rta, 0, sizeof(*rta)); -} - -/* - * Set an entry in a routing addresses map. When computing sizes, 'ptr' may be - * NULL. - */ -static void -rtsock_rta_set(struct rtsock_rta * rta, unsigned int rtax, const void * ptr, - socklen_t len) -{ - - assert(rtax < RTAX_MAX); - - rta->rta_ptr[rtax] = ptr; - rta->rta_len[rtax] = len; -} - -/* - * Copy out a message with a header and any entries in a routing addresses map, - * either into a pbuf allocated for this purpose, or to a RMIB (sysctl) caller, - * at the given offset. If no destination is given ('pbuf ' and 'oldp' are - * both NULL), compute just the size of the resulting data. Otherwise, set the - * length and address mask fields in the header as a side effect. Return the - * number of bytes copied on success, and if 'pbuf' is not NULL, it is filled - * with a pointer to the newly allocated pbuf. Return a negative error code on - * failure. Note that when computing the size only, any actual data pointers - * ('hdr', 'msglen', 'addrs', and the pointers in 'rta') may be NULL or even - * invalid, even though the corresponding sizes should still be supplied. - */ -static ssize_t -rtsock_rta_finalize(void * hdr, size_t hdrlen, u_short * msglen, int * addrs, - const struct rtsock_rta * rta, struct pbuf ** pbuf, - struct rmib_oldp * oldp, ssize_t off) -{ - iovec_t iov[1 + RTAX_MAX * 2]; - size_t len, padlen, totallen; - unsigned int i, iovcnt; - int mask; - - assert(pbuf == NULL || oldp == NULL); - assert(pbuf == NULL || off == 0); - assert(RT_ROUNDUP(hdrlen) == hdrlen); - - iov[0].iov_addr = (vir_bytes)hdr; - iov[0].iov_size = hdrlen; - iovcnt = 1; - - totallen = hdrlen; - mask = 0; - - /* - * The addresses in the given RTA map, as present, should be stored in - * the numbering order of the map. - */ - for (i = 0; i < RTAX_MAX; i++) { - if (rta->rta_ptr[i] == NULL) - continue; - - if ((len = rta->rta_len[i]) > 0) { - assert(iovcnt < __arraycount(iov)); - iov[iovcnt].iov_addr = (vir_bytes)rta->rta_ptr[i]; - iov[iovcnt++].iov_size = len; - } - - /* Note that RT_ROUNDUP(0) is not 0.. */ - if ((padlen = RT_ROUNDUP(len) - len) > 0) { - assert(iovcnt < __arraycount(iov)); - iov[iovcnt].iov_addr = (vir_bytes)rtsock_padbuf; - iov[iovcnt++].iov_size = padlen; - } - - totallen += len + padlen; - mask |= (1 << i); /* convert RTAX_ to RTA_ */ - } - - /* If only the length was requested, return it now. */ - if (pbuf == NULL && oldp == NULL) - return totallen; - - /* - * Casting 'hdr' would violate C99 strict aliasing rules, but the - * address mask is not always at the same location anyway. - */ - *msglen = totallen; - *addrs = mask; - - if (pbuf != NULL) { - if ((*pbuf = rtsock_alloc(totallen)) == NULL) - return ENOMEM; - - return util_coalesce((char *)(*pbuf)->payload, totallen, iov, - iovcnt); - } else - return rmib_vcopyout(oldp, off, iov, iovcnt); -} - -/* - * Reduce the size of a network mask to the bytes actually used. It is highly - * doubtful that this extra complexity pays off in any form, but it is what the - * BSDs historically do. We currently implement compression for IPv4 only. - */ -static void -rtsock_compress_netmask(struct sockaddr * sa) -{ - struct sockaddr_in sin; - uint32_t addr; - - if (sa->sa_family != AF_INET) - return; /* nothing to do */ - - memcpy(&sin, sa, sizeof(sin)); /* no type punning.. (sigh) */ - - addr = htonl(sin.sin_addr.s_addr); - - if (addr & 0x000000ff) - sa->sa_len = 8; - else if (addr & 0x0000ffff) - sa->sa_len = 7; - else if (addr & 0x00ffffff) - sa->sa_len = 6; - else if (addr != 0) - sa->sa_len = 5; - else - sa->sa_len = 0; -} - -/* - * Expand a possibly compressed IPv4 or IPv6 network mask, given as 'sa', into - * 'mask'. Return TRUE if expansion succeeded. In that case, the resulting - * mask must have sa.sa_len and sa.sa_family filled in correctly, and have the - * appropriate size for its address family. Return FALSE if expansion failed - * and an error should be returned to the caller. - */ -static int -rtsock_expand_netmask(union sockaddr_any * mask, const struct sockaddr * sa) -{ - - if (sa->sa_len > sizeof(*mask)) - return FALSE; - - memset(mask, 0, sizeof(*mask)); - memcpy(mask, sa, sa->sa_len); - - /* - * Amazingly, even the address family may be chopped off, in which case - * an IPv4 address is implied. - */ - if (sa->sa_len >= offsetof(struct sockaddr, sa_data) && - sa->sa_family == AF_INET6) { - if (sa->sa_len > sizeof(struct sockaddr_in6)) - return FALSE; - - mask->sa.sa_len = sizeof(struct sockaddr_in6); - mask->sa.sa_family = AF_INET6; - } else { - if (sa->sa_len > sizeof(struct sockaddr_in)) - return FALSE; - - mask->sa.sa_len = sizeof(struct sockaddr_in); - mask->sa.sa_family = AF_INET; - } - - return TRUE; -} - -/* - * Create a routing socket. - */ -sockid_t -rtsock_socket(int type, int protocol, struct sock ** sockp, - const struct sockevent_ops ** ops) -{ - struct rtsock *rt; - - /* - * There is no superuser check here: regular users are allowed to issue - * (only) RTM_GET requests on routing sockets. - */ - if (type != SOCK_RAW) - return EPROTOTYPE; - - /* We could accept only the protocols we know, but this is fine too. */ - if (protocol < 0 || protocol >= AF_MAX) - return EPROTONOSUPPORT; - - if (TAILQ_EMPTY(&rt_freelist)) - return ENOBUFS; - - rt = TAILQ_FIRST(&rt_freelist); - TAILQ_REMOVE(&rt_freelist, rt, rt_next); - - rt->rt_flags = 0; - rt->rt_family = protocol; - rt->rt_rcvhead = NULL; - rt->rt_rcvtailp = &rt->rt_rcvhead; - rt->rt_rcvlen = 0; - rt->rt_rcvbuf = RT_RCVBUF_DEF; - - TAILQ_INSERT_HEAD(&rt_activelist, rt, rt_next); - - *sockp = &rt->rt_sock; - *ops = &rtsock_ops; - return SOCKID_RT | (sockid_t)(rt - rt_array); -} - -/* - * Enqueue data on the receive queue of a routing socket. The caller must have - * checked whether the receive buffer size allows for the receipt of the data. - */ -static void -rtsock_enqueue(struct rtsock * rt, struct pbuf * pbuf) -{ - - *rt->rt_rcvtailp = pbuf; - rt->rt_rcvtailp = pchain_end(pbuf); - rt->rt_rcvlen += pchain_size(pbuf); - - sockevent_raise(&rt->rt_sock, SEV_RECV); -} - -/* - * Determine whether a routing message for address family 'family', originated - * from routing socket 'rtsrc' if not NULL, should be sent to routing socket - * 'rt'. Return TRUE if the message should be sent to this socket, or FALSE - * if it should not. - */ -static int -rtsock_can_send(struct rtsock *rt, struct rtsock *rtsrc, int family) -{ - - /* Do not send anything on sockets shut down for reading. */ - if (sockevent_is_shutdown(&rt->rt_sock, SFL_SHUT_RD)) - return FALSE; - - /* - * Do not send a reply message to the source of the request if the - * source is not interested in replies to its own requests. - */ - if (rt == rtsrc && (rt->rt_flags & RTF_NOLOOPBACK)) - return FALSE; - - /* - * For address family specific messages, make sure the routing socket - * is interested in that family. Make an exception if the socket was - * the source of the request, though: we currently do not prevent user - * processes from issuing commands for the "wrong" family. - */ - if (rt->rt_family != AF_UNSPEC && family != AF_UNSPEC && - rt->rt_family != family && rt != rtsrc) - return FALSE; - - /* - * See whether the receive queue of the socket is already full. We do - * not consider the size of the current request, in order to not drop - * larger messages and then enqueue smaller ones. - */ - if (rt->rt_rcvlen >= rt->rt_rcvbuf) - return FALSE; - - /* All is well: go on and deliver the message. */ - return TRUE; -} - -/* - * Send the routing message in 'pbuf' to the given routing socket if possible, - * or check whether such a message could be sent to that socket if 'pbuf' is - * NULL. In the former case, the function takes ownership of 'pbuf'. The - * given routing socket is assumed to be the source of the routing request that - * generated this message. In the latter case, the function returns TRUE if - * the socket would take the message or FALSE if not. If 'family' is not - * AF_UNSPEC, it is to be the address family of the message. - */ -static int -rtsock_msg_one(struct rtsock * rt, int family, struct pbuf * pbuf) -{ - - if (rtsock_can_send(rt, rt, family)) { - if (pbuf != NULL) - rtsock_enqueue(rt, pbuf); - - return TRUE; - } else { - if (pbuf != NULL) - pbuf_free(pbuf); - - return FALSE; - } -} - -/* - * Send the routing message in 'pbuf' to all matching routing sockets, or check - * whether there are any such matching routing sockets if 'pbuf' is NULL. In - * the former case, the function takes ownership of 'pbuf'. In the latter - * case, the function returns TRUE if there are any matching sockets or FALSE - * if there are none. If 'rtsrc' is not NULL, it is to be the routing socket - * that is the source of the message. If 'family' is not AF_UNSPEC, it is to - * be the address family of the message. - */ -static int -rtsock_msg_match(struct rtsock * rtsrc, int family, struct pbuf * pbuf) -{ - struct rtsock *rt, *rtprev; - struct pbuf *pcopy; - - rtprev = NULL; - - TAILQ_FOREACH(rt, &rt_activelist, rt_next) { - if (!rtsock_can_send(rt, rtsrc, family)) - continue; - - /* - * There is at least one routing socket that is interested in - * receiving this message, and able to receive it. - */ - if (pbuf == NULL) - return TRUE; - - /* - * We need to make copies of the generated message for all but - * the last matching socket, which gets the original. If we're - * out of memory, free the original and stop: there are more - * important things to spend memory on than routing sockets. - */ - if (rtprev != NULL) { - if ((pcopy = rtsock_alloc(pbuf->tot_len)) == NULL) { - pbuf_free(pbuf); - - return TRUE; - } - - if (pbuf_copy(pcopy, pbuf) != ERR_OK) - panic("unexpected pbuf copy failure"); - - rtsock_enqueue(rtprev, pcopy); - } - - rtprev = rt; - } - - if (rtprev != NULL) - rtsock_enqueue(rtprev, pbuf); - else if (pbuf != NULL) - pbuf_free(pbuf); - - return (rtprev != NULL); -} - -/* - * Dequeue and free the head of the receive queue of a routing socket. - */ -static void -rtsock_dequeue(struct rtsock * rt) -{ - struct pbuf *pbuf, **pnext; - size_t size; - - pbuf = rt->rt_rcvhead; - assert(pbuf != NULL); - - pnext = pchain_end(pbuf); - size = pchain_size(pbuf); - - if ((rt->rt_rcvhead = *pnext) == NULL) - rt->rt_rcvtailp = &rt->rt_rcvhead; - - assert(rt->rt_rcvlen >= size); - rt->rt_rcvlen -= size; - - *pnext = NULL; - pbuf_free(pbuf); -} - -/* - * Process a routing message sent on a socket. Return OK on success, in which - * case the caller assumes that the processing routine has sent a reply to the - * user and possibly other routing sockets. Return a negative error code on - * failure, in which case the caller will send the reply to the user instead. - */ -static int -rtsock_process(struct rtsock *rt, struct rt_msghdr * rtm, char * buf, - size_t len, int is_root) -{ - struct rtsock_request rtr; - struct rtsock_rta rta; - const struct sockaddr *netmask; - struct sockaddr sa; - union sockaddr_any mask; - size_t off; - int i; - - if (rtm->rtm_msglen != len) - return EINVAL; - - if (rtm->rtm_version != RTM_VERSION) { - printf("LWIP: PID %d uses routing sockets version %u\n", - rtm->rtm_pid, rtm->rtm_version); - - return EPROTONOSUPPORT; - } - - /* - * Make sure that we won't misinterpret the rest of the message. While - * looking at the message type, also make sure non-root users can only - * ever issue RTM_GET requests. - */ - switch (rtm->rtm_type) { - case RTM_ADD: - case RTM_DELETE: - case RTM_CHANGE: - case RTM_LOCK: - if (!is_root) - return EPERM; - - /* FALLTHROUGH */ - case RTM_GET: - break; - - default: - return EOPNOTSUPP; - } - - /* - * Extract all given addresses. We do not actually support all types - * of entries, but we cannot skip the ones we do not need either. - */ - rtsock_rta_init(&rta); - - off = sizeof(*rtm); - assert(off == RT_ROUNDUP(off)); - - for (i = 0; i < RTAX_MAX; i++) { - if (!(rtm->rtm_addrs & (1 << i))) - continue; - - if (off + offsetof(struct sockaddr, sa_data) > len) - return EINVAL; - - /* - * It is safe to access sa_len and even sa_family in all cases, - * in particular even when the structure is of size zero. - */ - assert(offsetof(struct sockaddr, sa_data) <= RT_ROUNDUP(0)); - - memcpy(&sa, &buf[off], offsetof(struct sockaddr, sa_data)); - - if (off + sa.sa_len > len) - return EINVAL; - - rtsock_rta_set(&rta, i, &buf[off], sa.sa_len); - - off += RT_ROUNDUP((size_t)sa.sa_len); - } - - /* - * Expand the given netmask if it is in compressed IPv4 form. We do - * this here because it is particular to routing sockets; we also do - * the compression in this module. Note how the compression may even - * strip off the address family; really, who came up with this ****? - */ - netmask = (const struct sockaddr *)rta.rta_ptr[RTAX_NETMASK]; - - if (netmask != NULL) { - if (!rtsock_expand_netmask(&mask, netmask)) - return EINVAL; - - rtsock_rta_set(&rta, RTAX_NETMASK, &mask, mask.sa.sa_len); - } - - /* - * Actually process the command. Pass on enough information so that a - * reply can be generated on success. The abstraction as sketched at - * the top of the file imposes that we pass quite a few parameters. - */ - rtr.rtr_src = rt; - rtr.rtr_pid = rtm->rtm_pid; - rtr.rtr_seq = rtm->rtm_seq; - rtr.rtr_getif = (rtm->rtm_type == RTM_GET && - (rta.rta_ptr[RTAX_IFP] != NULL || rta.rta_ptr[RTAX_IFA] != NULL)); - - return route_process(rtm->rtm_type, - (const struct sockaddr *)rta.rta_ptr[RTAX_DST], - (const struct sockaddr *)rta.rta_ptr[RTAX_NETMASK], - (const struct sockaddr *)rta.rta_ptr[RTAX_GATEWAY], - (const struct sockaddr *)rta.rta_ptr[RTAX_IFP], - (const struct sockaddr *)rta.rta_ptr[RTAX_IFA], - rtm->rtm_flags, rtm->rtm_inits, &rtm->rtm_rmx, &rtr); -} - -/* - * Perform preliminary checks on a send request. - */ -static int -rtsock_pre_send(struct sock * sock __unused, size_t len, - socklen_t ctl_len __unused, const struct sockaddr * addr, - socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags) -{ - - if (flags != 0) - return EOPNOTSUPP; - - if (addr != NULL) - return EISCONN; - - /* - * For the most basic failures - that is, we cannot even manage to - * receive the request - we do not generate a reply message. - */ - if (len < sizeof(struct rt_msghdr)) - return ENOBUFS; - if (len > RT_SNDBUF_MAX) - return EMSGSIZE; - - return OK; -} - -/* - * Send data on a routing socket. - */ -static int -rtsock_send(struct sock * sock, const struct sockdriver_data * data, - size_t len, size_t * offp, const struct sockdriver_data * ctl __unused, - socklen_t ctl_len __unused, socklen_t * ctl_off __unused, - const struct sockaddr * addr __unused, socklen_t addr_len __unused, - endpoint_t user_endpt, int flags __unused, size_t min __unused) -{ - struct rtsock *rt = (struct rtsock *)sock; - char buf[RT_SNDBUF_MAX] __aligned(4); - struct rt_msghdr rtm; - struct pbuf *pbuf; - uid_t euid; - int r, is_root; - - /* Copy in the request, and adjust some fields right away. */ - assert(len >= sizeof(rtm)); - assert(len <= sizeof(buf)); - - if ((r = sockdriver_copyin(data, 0, buf, len)) != OK) - return r; - - memcpy(&rtm, buf, sizeof(rtm)); - rtm.rtm_errno = 0; - rtm.rtm_flags &= ~RTF_DONE; - rtm.rtm_pid = getepinfo(user_endpt, &euid, NULL /*gid*/); - - is_root = (euid == ROOT_EUID); - - /* Process the request. */ - r = rtsock_process(rt, &rtm, buf, len, is_root); - - /* - * If the request has been processed successfully, a reply has been - * sent already, possibly also to other routing sockets. Here, we - * handle the case that the request has resulted in failure, in which - * case we send a reply to the caller only. This behavior is different - * from the traditional BSD behavior, which also sends failure replies - * to other sockets. Our motivation is that while other parties are - * never going to be interested in failures anyway, it is in fact easy - * for an unprivileged user process to abuse the failure-reply system - * in order to fake other types of routing messages (e.g., RTM_IFINFO) - * to other parties. By sending failure replies only to the requestor, - * we eliminate the need for security-sensitive request validation. - */ - if (r != OK && rtsock_can_send(rt, rt, AF_UNSPEC)) { - rtm.rtm_errno = -r; - - if ((pbuf = rtsock_alloc(len)) == NULL) - return ENOMEM; - - /* For the reply, reuse the request message largely as is. */ - memcpy(pbuf->payload, &rtm, sizeof(rtm)); - if (len > sizeof(rtm)) - memcpy((uint8_t *)pbuf->payload + sizeof(rtm), - buf + sizeof(rtm), len - sizeof(rtm)); - - rtsock_enqueue(rt, pbuf); - } else if (r == OK) - *offp = len; - - return r; -} - -/* - * Perform preliminary checks on a receive request. - */ -static int -rtsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, - int flags) -{ - - /* - * We accept the same flags across all socket types in LWIP, and then - * simply ignore the ones we do not support for routing sockets. - */ - if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0) - return EOPNOTSUPP; - - return OK; -} - -/* - * Receive data on a routing socket. - */ -static int -rtsock_recv(struct sock * sock, const struct sockdriver_data * data, - size_t len, size_t * off, const struct sockdriver_data * ctl __unused, - socklen_t ctl_len __unused, socklen_t * ctl_off __unused, - struct sockaddr * addr, socklen_t * addr_len, - endpoint_t user_endpt __unused, int flags, size_t min __unused, - int * rflags) -{ - struct rtsock *rt = (struct rtsock *)sock; - struct pbuf *pbuf; - int r; - - if ((pbuf = rt->rt_rcvhead) == NULL) - return SUSPEND; - - /* Copy out the data to the calling user process. */ - if (len >= pbuf->tot_len) - len = pbuf->tot_len; - else - *rflags |= MSG_TRUNC; - - r = util_copy_data(data, len, 0, pbuf, 0, FALSE /*copy_in*/); - - if (r != OK) - return r; - - /* Generate a dummy source address. */ - addr->sa_len = RTSOCK_ADDR_LEN; - addr->sa_family = AF_ROUTE; - *addr_len = RTSOCK_ADDR_LEN; - - /* Discard the data now, unless we were instructed to peek only. */ - if (!(flags & MSG_PEEK)) - rtsock_dequeue(rt); - - /* Return the received part of the data length. */ - *off = len; - return OK; -} - -/* - * Test whether data can be received on a routing socket, and if so, how many - * bytes of data. - */ -static int -rtsock_test_recv(struct sock * sock, size_t min __unused, size_t * size) -{ - struct rtsock *rt = (struct rtsock *)sock; - - if (rt->rt_rcvhead == NULL) - return SUSPEND; - - if (size != NULL) - *size = rt->rt_rcvhead->tot_len; - return OK; -} - -/* - * Set socket options on a routing socket. - */ -static int -rtsock_setsockopt(struct sock * sock, int level, int name, - const struct sockdriver_data * data, socklen_t len) -{ - struct rtsock *rt = (struct rtsock *)sock; - int r, val; - - if (level == SOL_SOCKET) { - switch (name) { - case SO_USELOOPBACK: - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (!val) - rt->rt_flags |= RTF_NOLOOPBACK; - else - rt->rt_flags &= ~RTF_NOLOOPBACK; - - return OK; - - case SO_RCVBUF: - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val < RT_RCVBUF_MIN || val > RT_RCVBUF_MAX) - return EINVAL; - - rt->rt_rcvbuf = (size_t)val; - - return OK; - } - } - - return ENOPROTOOPT; -} - -/* - * Retrieve socket options on a routing socket. - */ -static int -rtsock_getsockopt(struct sock * sock, int level, int name, - const struct sockdriver_data * data, socklen_t * len) -{ - struct rtsock *rt = (struct rtsock *)sock; - int val; - - if (level == SOL_SOCKET) { - switch (name) { - case SO_USELOOPBACK: - val = !(rt->rt_flags & RTF_NOLOOPBACK); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case SO_RCVBUF: - val = rt->rt_rcvbuf; - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - } - } - - return ENOPROTOOPT; -} - -/* - * Retrieve the local or remote socket address of a routing socket. - */ -static int -rtsock_getname(struct sock * sock __unused, struct sockaddr * addr, - socklen_t * addr_len) -{ - - /* This is entirely useless but apparently common between OSes. */ - addr->sa_len = RTSOCK_ADDR_LEN; - addr->sa_family = AF_ROUTE; - *addr_len = RTSOCK_ADDR_LEN; - - return OK; -} - -/* - * Drain the receive queue of a routing socket. - */ -static void -rtsock_drain(struct rtsock * rt) -{ - - while (rt->rt_rcvhead != NULL) - rtsock_dequeue(rt); -} - -/* - * Shut down a routing socket for reading and/or writing. - */ -static int -rtsock_shutdown(struct sock * sock, unsigned int mask) -{ - struct rtsock *rt = (struct rtsock *)sock; - - if (mask & SFL_SHUT_RD) - rtsock_drain(rt); - - return OK; -} - -/* - * Close a routing socket. - */ -static int -rtsock_close(struct sock * sock, int force __unused) -{ - struct rtsock *rt = (struct rtsock *)sock; - - rtsock_drain(rt); - - return OK; -} - -/* - * Free up a closed routing socket. - */ -static void -rtsock_free(struct sock * sock) -{ - struct rtsock *rt = (struct rtsock *)sock; - - TAILQ_REMOVE(&rt_activelist, rt, rt_next); - - TAILQ_INSERT_HEAD(&rt_freelist, rt, rt_next); -} - -static const struct sockevent_ops rtsock_ops = { - .sop_pre_send = rtsock_pre_send, - .sop_send = rtsock_send, - .sop_pre_recv = rtsock_pre_recv, - .sop_recv = rtsock_recv, - .sop_test_recv = rtsock_test_recv, - .sop_setsockopt = rtsock_setsockopt, - .sop_getsockopt = rtsock_getsockopt, - .sop_getsockname = rtsock_getname, - .sop_getpeername = rtsock_getname, - .sop_shutdown = rtsock_shutdown, - .sop_close = rtsock_close, - .sop_free = rtsock_free -}; - -/* - * Send an interface announcement message about the given interface. If - * 'arrival' is set, the interface has just been created; otherwise, the - * interface is about to be destroyed. - */ -void -rtsock_msg_ifannounce(struct ifdev * ifdev, int arrival) -{ - struct if_announcemsghdr ifan; - struct pbuf *pbuf; - - if (!rtsock_msg_match(NULL /*rtsrc*/, AF_UNSPEC, NULL /*pbuf*/)) - return; - - memset(&ifan, 0, sizeof(ifan)); - ifan.ifan_msglen = sizeof(ifan); - ifan.ifan_version = RTM_VERSION; - ifan.ifan_type = RTM_IFANNOUNCE; - ifan.ifan_index = ifdev_get_index(ifdev); - strlcpy(ifan.ifan_name, ifdev_get_name(ifdev), sizeof(ifan.ifan_name)); - ifan.ifan_what = (arrival) ? IFAN_ARRIVAL : IFAN_DEPARTURE; - - if ((pbuf = rtsock_alloc(sizeof(ifan))) == NULL) - return; - memcpy(pbuf->payload, &ifan, sizeof(ifan)); - - rtsock_msg_match(NULL /*rtsrc*/, AF_UNSPEC, pbuf); -} - -/* - * Send an interface information routing message. - */ -void -rtsock_msg_ifinfo(struct ifdev * ifdev) -{ - struct if_msghdr ifm; - struct pbuf *pbuf; - - if (!rtsock_msg_match(NULL /*rtsrc*/, AF_UNSPEC, NULL /*pbuf*/)) - return; - - memset(&ifm, 0, sizeof(ifm)); - ifm.ifm_msglen = sizeof(ifm); - ifm.ifm_version = RTM_VERSION; - ifm.ifm_type = RTM_IFINFO; - ifm.ifm_addrs = 0; - ifm.ifm_flags = ifdev_get_ifflags(ifdev); - ifm.ifm_index = ifdev_get_index(ifdev); - memcpy(&ifm.ifm_data, ifdev_get_ifdata(ifdev), sizeof(ifm.ifm_data)); - - if ((pbuf = rtsock_alloc(sizeof(ifm))) == NULL) - return; - memcpy(pbuf->payload, &ifm, sizeof(ifm)); - - rtsock_msg_match(NULL /*rtsrc*/, AF_UNSPEC, pbuf); -} - -/* - * Set up a RTA map and an interface address structure for use in a RTM_xxxADDR - * routing message. - */ -static void -rtsock_rta_init_ifam(struct rtsock_rta * rta, struct ifa_msghdr * ifam, - struct ifdev * ifdev, unsigned int type, struct sockaddr_dlx * sdlx) -{ - - memset(ifam, 0, sizeof(*ifam)); - ifam->ifam_version = RTM_VERSION; - ifam->ifam_type = type; - ifam->ifam_flags = 0; - ifam->ifam_index = ifdev_get_index(ifdev); - ifam->ifam_metric = ifdev_get_metric(ifdev); - - rtsock_rta_init(rta); - - ifaddr_dl_get(ifdev, (ifaddr_dl_num_t)0, sdlx); - - rtsock_rta_set(rta, RTAX_IFP, sdlx, sdlx->sdlx_len); -} - -/* - * Add a specific link-layer address for an interface to the given RTA map. - */ -static void -rtsock_rta_add_dl(struct rtsock_rta * rta, struct ifdev * ifdev, - ifaddr_dl_num_t num, struct sockaddr_dlx * sdlx) -{ - - /* Obtain the address data. */ - ifaddr_dl_get(ifdev, num, sdlx); - - /* Add the interface address. */ - rtsock_rta_set(rta, RTAX_IFA, sdlx, sdlx->sdlx_len); - - /* - * NetBSD also adds a RTAX_NETMASK entry here. At this moment it is - * not clear to me why, and it is a pain to make, so for now we do not. - */ -} - -/* - * Send a routing message about a new, changed, or deleted datalink address for - * the given interface. - */ -void -rtsock_msg_addr_dl(struct ifdev * ifdev, unsigned int type, - ifaddr_dl_num_t num) -{ - struct rtsock_rta rta; - struct ifa_msghdr ifam; - struct sockaddr_dlx name, addr; - struct pbuf *pbuf; - - if (!rtsock_msg_match(NULL /*rtsrc*/, AF_LINK, NULL /*pbuf*/)) - return; - - rtsock_rta_init_ifam(&rta, &ifam, ifdev, type, &name); - - rtsock_rta_add_dl(&rta, ifdev, num, &addr); - - if (rtsock_rta_finalize(&ifam, sizeof(ifam), &ifam.ifam_msglen, - &ifam.ifam_addrs, &rta, &pbuf, NULL, 0) > 0) - rtsock_msg_match(NULL /*rtsrc*/, AF_LINK, pbuf); -} - -/* - * Add a specific IPv4 address for an interface to the given RTA map. - */ -static void -rtsock_rta_add_v4(struct rtsock_rta * rta, struct ifdev * ifdev, - ifaddr_v4_num_t num, struct sockaddr_in sin[4]) -{ - - /* Obtain the address data. */ - (void)ifaddr_v4_get(ifdev, num, &sin[0], &sin[1], &sin[2], &sin[3]); - - /* Add the interface address. */ - rtsock_rta_set(rta, RTAX_IFA, &sin[0], sin[0].sin_len); - - /* Add the netmask, after compressing it. */ - rtsock_compress_netmask((struct sockaddr *)&sin[1]); - - rtsock_rta_set(rta, RTAX_NETMASK, &sin[1], sin[1].sin_len); - - /* Possibly add a broadcast or destination address. */ - if (sin[2].sin_len != 0) - rtsock_rta_set(rta, RTAX_BRD, &sin[2], sin[2].sin_len); - else if (sin[3].sin_len != 0) - rtsock_rta_set(rta, RTAX_DST, &sin[3], sin[3].sin_len); -} - -/* - * Send a routing message about a new or deleted IPv4 address for the given - * interface. - */ -void -rtsock_msg_addr_v4(struct ifdev * ifdev, unsigned int type, - ifaddr_v4_num_t num) -{ - struct rtsock_rta rta; - struct ifa_msghdr ifam; - struct sockaddr_dlx name; - struct sockaddr_in sin[4]; - struct pbuf *pbuf; - - if (!rtsock_msg_match(NULL /*rtsrc*/, AF_INET, NULL /*pbuf*/)) - return; - - rtsock_rta_init_ifam(&rta, &ifam, ifdev, type, &name); - - rtsock_rta_add_v4(&rta, ifdev, num, sin); - - if (rtsock_rta_finalize(&ifam, sizeof(ifam), &ifam.ifam_msglen, - &ifam.ifam_addrs, &rta, &pbuf, NULL, 0) > 0) - rtsock_msg_match(NULL /*rtsrc*/, AF_INET, pbuf); -} - -/* - * Add a specific IPv6 address for an interface to the given RTA map. - */ -static void -rtsock_rta_add_v6(struct rtsock_rta * rta, struct ifdev * ifdev, - ifaddr_v6_num_t num, struct sockaddr_in6 sin6[3]) -{ - - /* Obtain the address data. */ - ifaddr_v6_get(ifdev, num, &sin6[0], &sin6[1], &sin6[2]); - - /* Add the interface address. */ - rtsock_rta_set(rta, RTAX_IFA, &sin6[0], sin6[0].sin6_len); - - /* Add the netmask, after compressing it (a no-op at the moment). */ - rtsock_compress_netmask((struct sockaddr *)&sin6[1]); - - rtsock_rta_set(rta, RTAX_NETMASK, &sin6[1], sin6[1].sin6_len); - - /* Possibly add a destination address. */ - if (sin6[2].sin6_len != 0) - rtsock_rta_set(rta, RTAX_DST, &sin6[2], sin6[2].sin6_len); -} - -/* - * Send a routing message about a new or deleted IPv6 address for the given - * interface. - */ -void -rtsock_msg_addr_v6(struct ifdev * ifdev, unsigned int type, - ifaddr_v6_num_t num) -{ - struct rtsock_rta rta; - struct ifa_msghdr ifam; - struct sockaddr_dlx name; - struct sockaddr_in6 sin6[3]; - struct pbuf *pbuf; - - if (!rtsock_msg_match(NULL /*rtsrc*/, AF_INET6, NULL /*pbuf*/)) - return; - - rtsock_rta_init_ifam(&rta, &ifam, ifdev, type, &name); - - rtsock_rta_add_v6(&rta, ifdev, num, sin6); - - if (rtsock_rta_finalize(&ifam, sizeof(ifam), &ifam.ifam_msglen, - &ifam.ifam_addrs, &rta, &pbuf, NULL, 0) > 0) - rtsock_msg_match(NULL /*rtsrc*/, AF_INET6, pbuf); -} - -/* - * Send an RTM_MISS routing message about an address for which no route was - * found. The caller must provide the address in the appropriate form and - * perform any per-address rate limiting. - */ -void -rtsock_msg_miss(const struct sockaddr * addr) -{ - struct rt_msghdr rtm; - struct rtsock_rta rta; - struct pbuf *pbuf; - - /* - * Unfortunately the destination address has already been generated (as - * 'addr'), which is a big part of the work. Still, skip the rest if - * there is no routing socket to deliver the message to. - */ - if (!rtsock_msg_match(NULL /*rtsrc*/, addr->sa_family, NULL /*pbuf*/)) - return; - - memset(&rtm, 0, sizeof(rtm)); - rtm.rtm_version = RTM_VERSION; - rtm.rtm_type = RTM_MISS; - - rtsock_rta_init(&rta); - - rtsock_rta_set(&rta, RTAX_DST, addr, addr->sa_len); - - if (rtsock_rta_finalize(&rtm, sizeof(rtm), &rtm.rtm_msglen, - &rtm.rtm_addrs, &rta, &pbuf, NULL, 0) > 0) - rtsock_msg_match(NULL /*rtsrc*/, addr->sa_family, pbuf); -} - -/* - * Generate routing socket data for a route, for either routing socket - * broadcasting or a sysctl(7) request. The route is given as 'route'. The - * type of the message (RTM_) is given as 'type'. The resulting routing - * message header is stored in 'rtm' and an address vector is stored in 'rta'. - * The latter may point to addresses generated in 'addr', 'mask', 'gateway', - * and optionally (if not NULL) 'ifp' and 'ifa'. The caller is responsible for - * combining the results into an appropriate routing message. - */ -static void -rtsock_get_route(struct rt_msghdr * rtm, struct rtsock_rta * rta, - union sockaddr_any * addr, union sockaddr_any * mask, - union sockaddr_any * gateway, union sockaddr_any * ifp, - union sockaddr_any * ifa, const struct route_entry * route, - unsigned int type) -{ - struct ifdev *ifdev; - unsigned int flags, use; - - route_get(route, addr, mask, gateway, ifp, ifa, &ifdev, &flags, &use); - - memset(rtm, 0, sizeof(*rtm)); - rtm->rtm_version = RTM_VERSION; - rtm->rtm_type = type; - rtm->rtm_flags = flags; - rtm->rtm_index = ifdev_get_index(ifdev); - rtm->rtm_use = use; - - rtsock_rta_init(rta); - - rtsock_rta_set(rta, RTAX_DST, addr, addr->sa.sa_len); - - if (!(flags & RTF_HOST)) { - rtsock_compress_netmask(&mask->sa); - - rtsock_rta_set(rta, RTAX_NETMASK, mask, mask->sa.sa_len); - } - - rtsock_rta_set(rta, RTAX_GATEWAY, gateway, gateway->sa.sa_len); - - if (ifp != NULL) - rtsock_rta_set(rta, RTAX_IFP, ifp, ifp->sa.sa_len); - - if (ifa != NULL) - rtsock_rta_set(rta, RTAX_IFA, ifa, ifa->sa.sa_len); -} - -/* - * Send a routing message about a route, with the given type which may be one - * of RTM_ADD, RTM_CHANGE, RTM_DELETE, RTM_LOCK, and RTM_GET. The routing - * socket request information 'rtr', if not NULL, provides additional - * information about the routing socket that was the source of the request (if - * any), various fields that should be echoed, and (for RTM_GET) whether to - * add interface information to the output. - */ -void -rtsock_msg_route(const struct route_entry * route, unsigned int type, - const struct rtsock_request * rtr) -{ - union sockaddr_any addr, mask, gateway, ifp, ifa; - struct rt_msghdr rtm; - struct rtsock_rta rta; - struct rtsock *rtsrc; - struct pbuf *pbuf; - int family, getif; - - rtsrc = (rtr != NULL) ? rtr->rtr_src : NULL; - family = (route_is_ipv6(route)) ? AF_INET6 : AF_INET; - - if (!rtsock_msg_match(rtsrc, family, NULL /*pbuf*/)) - return; - - getif = (rtr != NULL && rtr->rtr_getif); - - rtsock_get_route(&rtm, &rta, &addr, &mask, &gateway, - (getif) ? &ifp : NULL, (getif) ? &ifa : NULL, route, type); - - if (rtr != NULL) { - rtm.rtm_flags |= RTF_DONE; - rtm.rtm_pid = rtr->rtr_pid; - rtm.rtm_seq = rtr->rtr_seq; - } - - if (rtsock_rta_finalize(&rtm, sizeof(rtm), &rtm.rtm_msglen, - &rtm.rtm_addrs, &rta, &pbuf, NULL, 0) > 0) - rtsock_msg_match(rtsrc, family, pbuf); -} - -/* - * Generate sysctl(7) output or length for the given routing table entry - * 'route', provided that the route passes the flags filter 'filter'. The - * address length 'addr_len' is used to compute a cheap length estimate. On - * success, return the byte size of the output. If the route was not a match - * for the filter, return zero. On failure, return a negative error code. - */ -static ssize_t -rtsock_info_rtable_entry(const struct route_entry * route, unsigned int filter, - socklen_t addr_len, struct rmib_oldp * oldp, size_t off) -{ - union sockaddr_any addr, mask, gateway; - struct rt_msghdr rtm; - struct rtsock_rta rta; - unsigned int flags; - ssize_t len; - - flags = route_get_flags(route); - - /* Apparently, matching any of the flags (if given) is sufficient. */ - if (filter != 0 && (filter & flags) != 0) - return 0; - - /* Size (over)estimation shortcut. */ - if (oldp == NULL) { - len = sizeof(rtm) + RT_ROUNDUP(addr_len) + - RT_ROUNDUP(sizeof(gateway)); - - if (!(flags & RTF_HOST)) - len += RT_ROUNDUP(addr_len); - - return len; - } - - rtsock_get_route(&rtm, &rta, &addr, &mask, &gateway, NULL /*ifp*/, - NULL /*ifa*/, route, RTM_GET); - - return rtsock_rta_finalize(&rtm, sizeof(rtm), &rtm.rtm_msglen, - &rtm.rtm_addrs, &rta, NULL /*pbuf*/, oldp, off); -} - -/* - * Obtain routing table entries. - */ -static ssize_t -rtsock_info_rtable(struct rmib_oldp * oldp, int family, int filter) -{ - struct route_entry *route; - ssize_t r, off; - - off = 0; - - if (family == AF_UNSPEC || family == AF_INET) { - for (route = NULL; (route = route_enum_v4(route)) != NULL; ) { - if ((r = rtsock_info_rtable_entry(route, - (unsigned int)filter, sizeof(struct sockaddr_in), - oldp, off)) < 0) - return r; - off += r; - } - } - - if (family == AF_UNSPEC || family == AF_INET6) { - for (route = NULL; (route = route_enum_v6(route)) != NULL; ) { - if ((r = rtsock_info_rtable_entry(route, - (unsigned int)filter, sizeof(struct sockaddr_in6), - oldp, off)) < 0) - return r; - off += r; - } - } - - /* TODO: should we add slack here? */ - return off; -} - -/* - * Generate routing socket data for an ARP table entry, for either routing - * socket broadcasting or a sysctl(7) request. The ARP table entry number is - * given as 'num'. The type of the message (RTM_) is given as 'type'. The - * resulting routing message header is stored in 'rtm' and an address vector is - * stored in 'rta'. The latter may point to addresses generated in 'addr' and - * 'gateway'. The caller is responsible for combining the results into an - * appropriate routing message. - */ -static void -rtsock_get_arp(struct rt_msghdr * rtm, struct rtsock_rta * rta, - struct sockaddr_in * addr, struct sockaddr_dlx * gateway, - lldata_arp_num_t num, unsigned int type) -{ - struct ifdev *ifdev; - unsigned int flags; - - lldata_arp_get(num, addr, gateway, &ifdev, &flags); - - memset(rtm, 0, sizeof(*rtm)); - rtm->rtm_version = RTM_VERSION; - rtm->rtm_type = type; - rtm->rtm_flags = flags; - rtm->rtm_index = ifdev_get_index(ifdev); - - /* TODO: obtaining and reporting the proper expiry time, if any. */ - if (!(flags & RTF_STATIC)) - rtm->rtm_rmx.rmx_expire = (time_t)-1; - - rtsock_rta_init(rta); - - rtsock_rta_set(rta, RTAX_DST, addr, addr->sin_len); - - rtsock_rta_set(rta, RTAX_GATEWAY, gateway, gateway->sdlx_len); -} - -/* - * Send a routing message about an ARP table entry, with the given type which - * may be one of RTM_ADD, RTM_CHANGE, RTM_DELETE, RTM_LOCK, and RTM_GET. The - * routing socket request information 'rtr', if not NULL, provides additional - * information about the routing socket that was the source of the request (if - * any) and various fields that should be echoed. - */ -void -rtsock_msg_arp(lldata_arp_num_t num, unsigned int type, - const struct rtsock_request * rtr) -{ - struct sockaddr_in addr; - struct sockaddr_dlx gateway; - struct rt_msghdr rtm; - struct rtsock_rta rta; - struct pbuf *pbuf; - - assert(rtr != NULL); - - /* - * We do not maintain the link-local tables ourselves, and thus, we do - * not have a complete view of modifications to them. In order not to - * confuse userland with inconsistent updates (e.g., deletion of - * previously unreported entries), send these routing messages to the - * source of the routing request only. - */ - if (!rtsock_msg_one(rtr->rtr_src, AF_INET, NULL /*pbuf*/)) - return; - - rtsock_get_arp(&rtm, &rta, &addr, &gateway, num, type); - - if (rtr != NULL) { - rtm.rtm_flags |= RTF_DONE; - rtm.rtm_pid = rtr->rtr_pid; - rtm.rtm_seq = rtr->rtr_seq; - } - - if (rtsock_rta_finalize(&rtm, sizeof(rtm), &rtm.rtm_msglen, - &rtm.rtm_addrs, &rta, &pbuf, NULL, 0) > 0) - rtsock_msg_one(rtr->rtr_src, AF_INET, pbuf); -} - -/* - * Obtain ARP table entries. - */ -static ssize_t -rtsock_info_lltable_arp(struct rmib_oldp * oldp) -{ - struct sockaddr_in addr; - struct sockaddr_dlx gateway; - struct rt_msghdr rtm; - struct rtsock_rta rta; - lldata_arp_num_t num; - ssize_t r, off; - - off = 0; - - for (num = 0; lldata_arp_enum(&num); num++) { - /* Size (over)estimation shortcut. */ - if (oldp == NULL) { - off += sizeof(struct rt_msghdr) + - RT_ROUNDUP(sizeof(addr)) + - RT_ROUNDUP(sizeof(gateway)); - - continue; - } - - rtsock_get_arp(&rtm, &rta, &addr, &gateway, num, RTM_GET); - - if ((r = rtsock_rta_finalize(&rtm, sizeof(rtm), - &rtm.rtm_msglen, &rtm.rtm_addrs, &rta, NULL /*pbuf*/, oldp, - off)) < 0) - return r; - off += r; - } - - /* TODO: should we add slack here? */ - return off; -} - -/* - * Generate routing socket data for an NDP table entry, for either routing - * socket broadcasting or a sysctl(7) request. The NDP table entry number is - * given as 'num'. The type of the message (RTM_) is given as 'type'. The - * resulting routing message header is stored in 'rtm' and an address vector is - * stored in 'rta'. The latter may point to addresses generated in 'addr' and - * 'gateway'. The caller is responsible for combining the results into an - * appropriate routing message. - */ -static void -rtsock_get_ndp(struct rt_msghdr * rtm, struct rtsock_rta * rta, - struct sockaddr_in6 * addr, struct sockaddr_dlx * gateway, - lldata_ndp_num_t num, unsigned int type) -{ - struct ifdev *ifdev; - unsigned int flags; - - lldata_ndp_get(num, addr, gateway, &ifdev, &flags); - - memset(rtm, 0, sizeof(*rtm)); - rtm->rtm_version = RTM_VERSION; - rtm->rtm_type = type; - rtm->rtm_flags = flags; - rtm->rtm_index = ifdev_get_index(ifdev); - - rtsock_rta_init(rta); - - rtsock_rta_set(rta, RTAX_DST, addr, addr->sin6_len); - - rtsock_rta_set(rta, RTAX_GATEWAY, gateway, gateway->sdlx_len); -} - -/* - * Send a routing message about an NDP table entry, with the given type which - * may be one of RTM_ADD, RTM_CHANGE, RTM_DELETE, RTM_LOCK, and RTM_GET. The - * routing socket request information 'rtr', if not NULL, provides additional - * information about the routing socket that was the source of the request (if - * any) and various fields that should be echoed. - */ -void -rtsock_msg_ndp(lldata_ndp_num_t num, unsigned int type, - const struct rtsock_request * rtr) -{ - struct sockaddr_in6 addr; - struct sockaddr_dlx gateway; - struct rt_msghdr rtm; - struct rtsock_rta rta; - struct pbuf *pbuf; - - assert(rtr != NULL); - - /* - * We do not maintain the link-local tables ourselves, and thus, we do - * not have a complete view of modifications to them. In order not to - * confuse userland with inconsistent updates (e.g., deletion of - * previously unreported entries), send these routing messages to the - * source of the routing request only. - */ - if (!rtsock_msg_one(rtr->rtr_src, AF_INET6, NULL /*pbuf*/)) - return; - - rtsock_get_ndp(&rtm, &rta, &addr, &gateway, num, type); - - if (rtr != NULL) { - rtm.rtm_flags |= RTF_DONE; - rtm.rtm_pid = rtr->rtr_pid; - rtm.rtm_seq = rtr->rtr_seq; - } - - if (rtsock_rta_finalize(&rtm, sizeof(rtm), &rtm.rtm_msglen, - &rtm.rtm_addrs, &rta, &pbuf, NULL, 0) > 0) - rtsock_msg_one(rtr->rtr_src, AF_INET6, pbuf); -} - -/* - * Obtain NDP table entries. - */ -static ssize_t -rtsock_info_lltable_ndp(struct rmib_oldp * oldp) -{ - struct rt_msghdr rtm; - struct rtsock_rta rta; - struct sockaddr_in6 addr; - struct sockaddr_dlx gateway; - lldata_ndp_num_t num; - ssize_t r, off; - - off = 0; - - for (num = 0; lldata_ndp_enum(&num); num++) { - /* Size (over)estimation shortcut. */ - if (oldp == NULL) { - off += sizeof(struct rt_msghdr) + - RT_ROUNDUP(sizeof(addr)) + - RT_ROUNDUP(sizeof(gateway)); - - continue; - } - - rtsock_get_ndp(&rtm, &rta, &addr, &gateway, num, RTM_GET); - - if ((r = rtsock_rta_finalize(&rtm, sizeof(rtm), - &rtm.rtm_msglen, &rtm.rtm_addrs, &rta, NULL /*pbuf*/, oldp, - off)) < 0) - return r; - off += r; - } - - /* TODO: should we add slack here? */ - return off; -} - -/* - * Obtain link-layer (ARP, NDP) table entries. - */ -static ssize_t -rtsock_info_lltable(struct rmib_oldp * oldp, int family) -{ - - switch (family) { - case AF_INET: - return rtsock_info_lltable_arp(oldp); - - case AF_INET6: - return rtsock_info_lltable_ndp(oldp); - - default: - return 0; - } -} - -/* - * Obtain link-layer address information for one specific interface. - */ -static ssize_t -rtsock_info_if_dl(struct ifdev * ifdev, struct ifa_msghdr * ifam, - struct rmib_oldp * oldp, ssize_t off) -{ - struct rtsock_rta rta; - struct sockaddr_dlx sdlx; - ifaddr_dl_num_t num; - ssize_t r, len; - - len = 0; - - for (num = 0; ifaddr_dl_enum(ifdev, &num); num++) { - if (oldp == NULL) { - len += sizeof(*ifam) + RT_ROUNDUP(sizeof(sdlx)); - - continue; - } - - rtsock_rta_init(&rta); - - rtsock_rta_add_dl(&rta, ifdev, num, &sdlx); - - if ((r = rtsock_rta_finalize(ifam, sizeof(*ifam), - &ifam->ifam_msglen, &ifam->ifam_addrs, &rta, NULL /*pbuf*/, - oldp, off + len)) < 0) - return r; - len += r; - } - - return len; -} - -/* - * Obtain IPv4 address information for one specific interface. - */ -static ssize_t -rtsock_info_if_v4(struct ifdev * ifdev, struct ifa_msghdr * ifam, - struct rmib_oldp * oldp, ssize_t off) -{ - struct sockaddr_in sin[4]; - struct rtsock_rta rta; - ifaddr_v4_num_t num; - ssize_t r, len; - - len = 0; - - /* - * Mostly for future compatibility, we support multiple IPv4 interface - * addresses here. Every interface has an interface address and a - * netmask. In addition, an interface may have either a broadcast or a - * destination address. - */ - for (num = 0; ifaddr_v4_enum(ifdev, &num); num++) { - /* Size (over)estimation shortcut. */ - if (oldp == NULL) { - len += sizeof(*ifam) + RT_ROUNDUP(sizeof(sin[0])) * 3; - - continue; - } - - rtsock_rta_init(&rta); - - rtsock_rta_add_v4(&rta, ifdev, num, sin); - - if ((r = rtsock_rta_finalize(ifam, sizeof(*ifam), - &ifam->ifam_msglen, &ifam->ifam_addrs, &rta, NULL /*pbuf*/, - oldp, off + len)) < 0) - return r; - len += r; - } - - return len; -} - -/* - * Obtain IPv6 address information for one specific interface. - */ -static ssize_t -rtsock_info_if_v6(struct ifdev * ifdev, struct ifa_msghdr * ifam, - struct rmib_oldp * oldp, ssize_t off) -{ - struct sockaddr_in6 sin6[3]; - struct rtsock_rta rta; - ifaddr_v6_num_t num; - ssize_t r, len; - - len = 0; - - /* As with IPv4, except that IPv6 has no broadcast addresses. */ - for (num = 0; ifaddr_v6_enum(ifdev, &num); num++) { - /* Size (over)estimation shortcut. */ - if (oldp == NULL) { - len += sizeof(*ifam) + RT_ROUNDUP(sizeof(sin6[0])) * 3; - - continue; - } - - rtsock_rta_init(&rta); - - rtsock_rta_add_v6(&rta, ifdev, num, sin6); - - if ((r = rtsock_rta_finalize(ifam, sizeof(*ifam), - &ifam->ifam_msglen, &ifam->ifam_addrs, &rta, NULL /*pbuf*/, - oldp, off + len)) < 0) - return r; - len += r; - } - - return len; -} - -/* - * Obtain information for one specific interface. - */ -static ssize_t -rtsock_info_if(struct ifdev * ifdev, struct rmib_oldp * oldp, ssize_t off, - int family) -{ - struct rtsock_rta rta; - struct sockaddr_dlx sdlx; - struct if_msghdr ifm; - struct ifa_msghdr ifam; - unsigned int ifflags; - ssize_t r, len, sdlxsize; - - len = 0; - - ifflags = ifdev_get_ifflags(ifdev); - - /* Create an interface information entry. */ - rtsock_rta_init(&rta); - - if (oldp != NULL) { - memset(&ifm, 0, sizeof(ifm)); - ifm.ifm_version = RTM_VERSION; - ifm.ifm_type = RTM_IFINFO; - ifm.ifm_flags = ifflags; - ifm.ifm_index = ifdev_get_index(ifdev); - memcpy(&ifm.ifm_data, ifdev_get_ifdata(ifdev), - sizeof(ifm.ifm_data)); - } - - /* - * Generate a datalink socket address structure. TODO: see if it is - * worth obtaining just the length for the (oldp == NULL) case here. - */ - memset(&sdlx, 0, sizeof(sdlx)); - - ifaddr_dl_get(ifdev, 0, &sdlx); - - sdlxsize = RT_ROUNDUP(sdlx.sdlx_len); - - rtsock_rta_set(&rta, RTAX_IFP, &sdlx, sdlxsize); - - if ((r = rtsock_rta_finalize(&ifm, sizeof(ifm), &ifm.ifm_msglen, - &ifm.ifm_addrs, &rta, NULL /*pbuf*/, oldp, off + len)) < 0) - return r; - len += r; - - /* Generate a header for all addresses once. */ - if (oldp != NULL) { - memset(&ifam, 0, sizeof(ifam)); - ifam.ifam_version = RTM_VERSION; - ifam.ifam_type = RTM_NEWADDR; - ifam.ifam_flags = 0; - ifam.ifam_index = ifdev_get_index(ifdev); - ifam.ifam_metric = ifdev_get_metric(ifdev); - } - - /* If requested and applicable, add any datalink addresses. */ - if (family == AF_UNSPEC || family == AF_LINK) { - if ((r = rtsock_info_if_dl(ifdev, &ifam, oldp, off + len)) < 0) - return r; - len += r; - } - - /* If requested and applicable, add any IPv4 addresses. */ - if (family == AF_UNSPEC || family == AF_INET) { - if ((r = rtsock_info_if_v4(ifdev, &ifam, oldp, off + len)) < 0) - return r; - len += r; - } - - /* If requested and applicable, add any IPv6 addresses. */ - if (family == AF_UNSPEC || family == AF_INET6) { - if ((r = rtsock_info_if_v6(ifdev, &ifam, oldp, off + len)) < 0) - return r; - len += r; - } - - return len; -} - -/* - * Obtain interface information. - */ -static ssize_t -rtsock_info_iflist(struct rmib_oldp * oldp, int family, uint32_t ifindex) -{ - struct ifdev *ifdev; - ssize_t r, off; - - /* - * If information about a specific interface index is requested, then - * return information for just that interface. - */ - if (ifindex != 0) { - if ((ifdev = ifdev_get_by_index(ifindex)) != NULL) - return rtsock_info_if(ifdev, oldp, 0, family); - else - return 0; - } - - /* Otherwise, iterate through the list of all interfaces. */ - off = 0; - - for (ifdev = ifdev_enum(NULL); ifdev != NULL; - ifdev = ifdev_enum(ifdev)) { - - /* Avoid generating results that are never copied out. */ - if (oldp != NULL && !rmib_inrange(oldp, off)) - oldp = NULL; - - if ((r = rtsock_info_if(ifdev, oldp, off, family)) < 0) - return r; - - off += r; - } - - /* TODO: should we add slack here? */ - return off; -} - -/* - * Obtain routing table, ARP cache, and interface information through - * sysctl(7). Return the (produced, or if oldp is NULL, estimated) byte size - * of the output on success, or a negative error code on failure. - */ -static ssize_t -rtsock_info(struct rmib_call * call, struct rmib_node * node __unused, - struct rmib_oldp * oldp, struct rmib_newp * newp __unused) -{ - int family, filter; - - if (call->call_namelen != 3) - return EINVAL; - - family = call->call_name[0]; - filter = call->call_name[2]; - - switch (call->call_name[1]) { - case NET_RT_FLAGS: - /* - * Preliminary support for changes as of NetBSD 8, where by - * default, the use of this subcall implies an ARP/NDP-only - * request. - */ - if (filter == 0) - filter |= RTF_LLDATA; - - if (filter & RTF_LLDATA) { - if (family == AF_UNSPEC) - return EINVAL; - - /* - * Split off ARP/NDP handling from the normal routing - * table listing, as done since NetBSD 8. We generate - * the ARP/NDP listing from here, and keep those - * entries out of the routing table dump below. Since - * the filter is of a match-any type, and we have just - * matched a flag, no further filtering is needed here. - */ - return rtsock_info_lltable(oldp, family); - } - - /* FALLTHROUGH */ - case NET_RT_DUMP: - return rtsock_info_rtable(oldp, family, filter); - - case NET_RT_IFLIST: - return rtsock_info_iflist(oldp, family, filter); - - default: - return EINVAL; - } -} diff --git a/minix/net/lwip/tcpsock.c b/minix/net/lwip/tcpsock.c deleted file mode 100644 index 8266a05c3..000000000 --- a/minix/net/lwip/tcpsock.c +++ /dev/null @@ -1,2793 +0,0 @@ -/* LWIP service - tcpsock.c - TCP sockets */ -/* - * This module implements support for TCP sockets based on lwIP's core TCP PCB - * module, which is largely but not fully cooperative with exactly what we want - * to achieve, with as a result that this module is rather complicated. - * - * Each socket has a send queue and a receive queue. Both are using lwIP's own - * (pbuf) buffers, which largely come out of the main 512-byte buffer pool. - * The buffers on the send queue are allocated and freed by us--the latter only - * once they are no longer in use by lwIP as well. A bit counterintuitively, - * we deliberately use a smaller lwIP per-PCB TCP send buffer limit - * (TCP_SND_BUF) in the lwIP send configuration (lwipopts.h) in order to more - * easily trigger conditions where we cannot enqueue data (or the final FIN) - * right away. This way, we get to test the internal logic of this module a - * lot more easily. The small lwIP send queue size should not have any impact - * on performance, as our own per-socket send queues can be much larger and we - * enqueue more of that on the lwIP PCB as soon as we can in all cases. - * - * The receive queue consists of whatever buffers were given to us by lwIP, but - * since those may be many buffers with small amounts of data each, we perform - * fairly aggressive merging of consecutive buffers. The intended result is - * that we waste no more than 50% of memory within the receive queue. Merging - * requires memory copies, which makes it expensive, but we do not configure - * lwIP with enough buffers to make running out of buffers a non-issue, so this - * trade-off is necessary. Practical experience and measurements of the merge - * policy will have to show whether and how the current policy may be improved. - * - * As can be expected, the connection close semantics are by far the most - * complicated part of this module. We attempt to get rid of the lwIP PCB as - * soon as we can, letting lwIP take care of the TIME_WAIT state for example. - * However, there are various conditions that have to be met before we can - * forget about the PCB here--most importantly, that none of our sent data - * blocks are still referenced by lwIP because they have not yet been sent or - * acknowledged. We can only free the data blocks once lwIP is done with them. - * - * We do consider the TCP state of lwIP's PCB, in order to avoid duplicating - * full state tracking here. However, we do not look at a socket's TCP state - * while in a lwIP-generated event for that socket, because the state may not - * necessarily reflect the (correct or new) TCP state of the connection, nor - * may the PCB be available--this is the case for error events. For these - * reasons we use a few internal TCPF_ flags to perform partial state tracking. - * - * More generally, we tend to access lwIP PCB fields directly only when lwIP's - * own BSD API implementation does that too and there is no better alternative. - * One example of this is the check to see if our FIN was acknowledged, for - * SO_LINGER support. In terms of maintenance, our hope is that if lwIP's API - * changes later, we can change our code to imitate whatever lwIP's BSD API - * implementation does at that point. - */ - -#include -#include -#include -#include -#include -#include -#include - -/* - * Unfortunately, NetBSD and lwIP have different definitions of a few relevant - * preprocessor variables. Make sure we do not attempt to use the NetBSD one - * where it matters. We do need one of the NetBSD definitions though. - */ -static const unsigned int NETBSD_TF_NODELAY = TF_NODELAY; -#undef TF_NODELAY -#undef TCP_MSS - -#include "lwip.h" -#include "tcpisn.h" - -#include "lwip/tcp.h" -#include "lwip/priv/tcp_priv.h" /* for tcp_pcb_lists */ - -/* - * The number of TCP sockets (NR_TCPSOCK) is defined in the lwIP configuration. - */ - -/* - * We fully control the send buffer, so we can let its size be set to whatever - * we want. The receive buffer is different: if it is smaller than the window - * size, we may have to refuse data that lwIP hands us, at which point more - * incoming data will cause lwIP to abort the TCP connection--even aside from - * performance issues. Therefore, we must make sure the receive buffer is - * larger than the TCP window at all times. - */ -#define TCP_SNDBUF_MIN 1 /* minimum TCP send buffer size */ -#define TCP_SNDBUF_DEF 32768 /* default TCP send buffer size */ -#define TCP_SNDBUF_MAX 131072 /* maximum TCP send buffer size */ -#define TCP_RCVBUF_MIN TCP_WND /* minimum TCP receive buffer size */ -#define TCP_RCVBUF_DEF MAX(TCP_WND, 32768) /* default TCP recv buffer size */ -#define TCP_RCVBUF_MAX MAX(TCP_WND, 131072) /* maximum TCP recv buffer size */ - -/* - * The total number of buffers that may in use for TCP socket send queues. The - * goal is to allow at least some progress to be made on receiving from TCP - * sockets and on differently-typed sockets, at least as long as the LWIP - * service can manage to allocate the memory it wants. For the case that it - * does not, we can only reactively kill off TCP sockets and/or free enqueued - * ethernet packets, neither of which is currently implemented (TODO). - */ -#define TCP_MAX_SENDBUFS (mempool_max_buffers() * 3 / 4) - -/* Polling intervals, in 500-millsecond units. */ -#define TCP_POLL_REG_INTERVAL 10 /* interval for reattempting sends */ -#define TCP_POLL_CLOSE_INTERVAL 1 /* interval while closing connection */ - -static struct tcpsock { - struct ipsock tcp_ipsock; /* IP socket, MUST be first */ - struct tcp_pcb *tcp_pcb; /* lwIP TCP control block */ - union pxfer_tcp_queue { /* free/accept queue */ - TAILQ_ENTRY(tcpsock) tq_next; /* next in queue */ - TAILQ_HEAD(, tcpsock) tq_head; /* head of queue */ - } tcp_queue; - struct tcpsock *tcp_listener; /* listener if on accept q. */ - struct { /* send queue */ - struct pbuf *ts_head; /* first pbuf w/unacked data */ - struct pbuf *ts_unsent; /* first pbuf w/unsent data */ - struct pbuf *ts_tail; /* most recently added data */ - size_t ts_len; /* total sent + unsent */ - unsigned short ts_head_off; /* offset into head pbuf */ - unsigned short ts_unsent_off; /* offset into unsent pbuf */ - } tcp_snd; - struct { /* receive queue */ - struct pbuf *tr_head; /* first pbuf w/unrecvd data */ - struct pbuf **tr_pre_tailp; /* ptr-ptr to newest pbuf */ - size_t tr_len; /* bytes on receive queue */ - unsigned short tr_head_off; /* offset into head pbuf */ - unsigned short tr_unacked; /* current window reduction */ - } tcp_rcv; -} tcp_array[NR_TCPSOCK]; - -static TAILQ_HEAD(, tcpsock) tcp_freelist; /* list of free TCP sockets */ - -static const struct sockevent_ops tcpsock_ops; - -static unsigned int tcpsock_sendbufs; /* # send buffers in use */ -static unsigned int tcpsock_recvbufs; /* # receive buffers in use */ - -/* A bunch of macros that are just for convenience. */ -#define tcpsock_get_id(tcp) (SOCKID_TCP | (sockid_t)((tcp) - tcp_array)) -#define tcpsock_get_ipsock(tcp) (&(tcp)->tcp_ipsock) -#define tcpsock_get_sock(tcp) (ipsock_get_sock(tcpsock_get_ipsock(tcp))) -#define tcpsock_get_sndbuf(tcp) (ipsock_get_sndbuf(tcpsock_get_ipsock(tcp))) -#define tcpsock_get_rcvbuf(tcp) (ipsock_get_rcvbuf(tcpsock_get_ipsock(tcp))) -#define tcpsock_is_ipv6(tcp) (ipsock_is_ipv6(tcpsock_get_ipsock(tcp))) -#define tcpsock_is_shutdown(tcp,fl) \ - (sockevent_is_shutdown(tcpsock_get_sock(tcp), fl)) -#define tcpsock_is_listening(tcp) \ - (sockevent_is_listening(tcpsock_get_sock(tcp))) -#define tcpsock_get_flags(tcp) (ipsock_get_flags(tcpsock_get_ipsock(tcp))) -#define tcpsock_set_flag(tcp,fl) \ - (ipsock_set_flag(tcpsock_get_ipsock(tcp), fl)) -#define tcpsock_clear_flag(tcp,fl) \ - (ipsock_clear_flag(tcpsock_get_ipsock(tcp), fl)) - -static ssize_t tcpsock_pcblist(struct rmib_call *, struct rmib_node *, - struct rmib_oldp *, struct rmib_newp *); - -/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_TCP subtree. */ -/* TODO: add many more and make some of them writable.. */ -static struct rmib_node net_inet_tcp_table[] = { -/* 2*/ [TCPCTL_SENDSPACE] = RMIB_INT(RMIB_RO, TCP_SNDBUF_DEF, - "sendspace", - "Default TCP send buffer size"), -/* 3*/ [TCPCTL_RECVSPACE] = RMIB_INT(RMIB_RO, TCP_RCVBUF_DEF, - "recvspace", - "Default TCP receive buffer size"), -/*29*/ [TCPCTL_LOOPBACKCKSUM] = RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int), - loopif_cksum, "do_loopback_cksum", - "Perform TCP checksum on loopback"), -/*+0*/ [TCPCTL_MAXID] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, - tcpsock_pcblist, "pcblist", - "TCP protocol control block list"), -/*+1*/ [TCPCTL_MAXID + 1] = RMIB_FUNC(RMIB_RW | CTLFLAG_PRIVATE | - CTLFLAG_HIDDEN | CTLTYPE_STRING, - TCPISN_SECRET_HEX_LENGTH, tcpisn_secret, - "isn_secret", - "TCP ISN secret (MINIX 3 specific)") -}; - -static struct rmib_node net_inet_tcp_node = - RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp", "TCP related settings"); -static struct rmib_node net_inet6_tcp6_node = - RMIB_NODE(RMIB_RO, net_inet_tcp_table, "tcp6", "TCP related settings"); - -/* - * Initialize the TCP sockets module. - */ -void -tcpsock_init(void) -{ - unsigned int slot; - - /* Initialize the list of free TCP sockets. */ - TAILQ_INIT(&tcp_freelist); - - for (slot = 0; slot < __arraycount(tcp_array); slot++) - TAILQ_INSERT_TAIL(&tcp_freelist, &tcp_array[slot], - tcp_queue.tq_next); - - /* Initialize other variables. */ - tcpsock_sendbufs = 0; - - /* Register the net.inet.tcp and net.inet6.tcp6 RMIB subtrees. */ - mibtree_register_inet(PF_INET, IPPROTO_TCP, &net_inet_tcp_node); - mibtree_register_inet(PF_INET6, IPPROTO_TCP, &net_inet6_tcp6_node); -} - -/* - * Initialize the state of a TCP socket's send queue. - */ -static void -tcpsock_reset_send(struct tcpsock * tcp) -{ - - tcp->tcp_snd.ts_tail = NULL; - tcp->tcp_snd.ts_unsent = NULL; - tcp->tcp_snd.ts_head = NULL; - tcp->tcp_snd.ts_len = 0; - tcp->tcp_snd.ts_unsent_off = 0; - tcp->tcp_snd.ts_head_off = 0; -} - -/* - * Initialize the state of a TCP socket's receive queue. - */ -static void -tcpsock_reset_recv(struct tcpsock * tcp) -{ - - tcp->tcp_rcv.tr_pre_tailp = NULL; - tcp->tcp_rcv.tr_head = NULL; - tcp->tcp_rcv.tr_len = 0; - tcp->tcp_rcv.tr_head_off = 0; - tcp->tcp_rcv.tr_unacked = 0; -} - -/* - * Create a TCP socket. - */ -sockid_t -tcpsock_socket(int domain, int protocol, struct sock ** sockp, - const struct sockevent_ops ** ops) -{ - struct tcpsock *tcp; - uint8_t ip_type; - - switch (protocol) { - case 0: - case IPPROTO_TCP: - break; - - default: - return EPROTONOSUPPORT; - } - - if (TAILQ_EMPTY(&tcp_freelist)) - return ENOBUFS; - - tcp = TAILQ_FIRST(&tcp_freelist); - - /* - * Initialize the structure. Do not memset it to zero, as it is still - * part of the linked free list. Initialization may still fail. When - * adding new fields, make sure to change tcpsock_clone() accordingly. - */ - - ip_type = ipsock_socket(tcpsock_get_ipsock(tcp), domain, - TCP_SNDBUF_DEF, TCP_RCVBUF_DEF, sockp); - - if ((tcp->tcp_pcb = tcp_new_ip_type(ip_type)) == NULL) - return ENOBUFS; - tcp_arg(tcp->tcp_pcb, tcp); - - tcp->tcp_listener = NULL; - - tcpsock_reset_send(tcp); - tcpsock_reset_recv(tcp); - - TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next); - - *ops = &tcpsock_ops; - return tcpsock_get_id(tcp); -} - -/* - * Create a TCP socket for the TCP PCB 'pcb' which identifies a new connection - * incoming on listening socket 'listener'. The new socket is essentially a - * "clone" of the listening TCP socket, in that it should inherit any settings - * from the listening socket. The socket has not yet been accepted by userland - * so add it to the queue of connetions pending for the listening socket. On - * success, return OK. On failure, return a negative error code. - */ -static int -tcpsock_clone(struct tcpsock * listener, struct tcp_pcb * pcb) -{ - struct tcpsock *tcp; - - if (TAILQ_EMPTY(&tcp_freelist)) - return ENOBUFS; - - tcp = TAILQ_FIRST(&tcp_freelist); - - /* - * Initialize the structure. Do not memset it to zero, as it is still - * part of the linked free list. Initialization may still fail. Most - * settings should be inherited from the listening socket here, rather - * than being initialized to their default state. - */ - - ipsock_clone(tcpsock_get_ipsock(listener), tcpsock_get_ipsock(tcp), - tcpsock_get_id(tcp)); - - tcp->tcp_pcb = pcb; - tcp_arg(pcb, tcp); - - tcpsock_reset_send(tcp); - tcpsock_reset_recv(tcp); - - /* - * Remove the new socket from the free list, and add it to the queue of - * the listening socket--in this order, because the same next pointer - * is used for both. - */ - TAILQ_REMOVE(&tcp_freelist, tcp, tcp_queue.tq_next); - - TAILQ_INSERT_TAIL(&listener->tcp_queue.tq_head, tcp, - tcp_queue.tq_next); - tcp->tcp_listener = listener; - - return OK; -} - -/* - * Allocate a buffer from the pool, using the standard pool size. The returned - * buffer is a single element--never a chain. - */ -static struct pbuf * -tcpsock_alloc_buf(void) -{ - struct pbuf *pbuf; - - pbuf = pbuf_alloc(PBUF_RAW, MEMPOOL_BUFSIZE, PBUF_RAM); - - assert(pbuf == NULL || pbuf->len == pbuf->tot_len); - - return pbuf; -} - -/* - * Free the given buffer. Ensure that pbuf_free() will not attempt to free the - * next buffer(s) in the chain as well. This may be called for pbufs other - * than those allocated with tcpsock_alloc_buf(). - */ -static void -tcpsock_free_buf(struct pbuf * pbuf) -{ - - /* - * Resetting the length is currently not necessary, but better safe - * than sorry.. - */ - pbuf->len = pbuf->tot_len; - pbuf->next = NULL; - - pbuf_free(pbuf); -} - -/* - * Clear the send queue of a TCP socket. The caller must ensure that lwIP will - * no longer access any of data on the send queue. - */ -static void -tcpsock_clear_send(struct tcpsock * tcp) -{ - struct pbuf *phead; - - assert(tcp->tcp_pcb == NULL); - - while ((phead = tcp->tcp_snd.ts_head) != NULL) { - tcp->tcp_snd.ts_head = phead->next; - - assert(tcpsock_sendbufs > 0); - tcpsock_sendbufs--; - - tcpsock_free_buf(phead); - } - - tcpsock_reset_send(tcp); -} - -/* - * Clear the receive queue of a TCP socket. If 'ack_data' is set, also - * acknowledge the previous contents of the receive queue to lwIP. - */ -static size_t -tcpsock_clear_recv(struct tcpsock * tcp, int ack_data) -{ - struct pbuf *phead; - size_t rlen; - - rlen = tcp->tcp_rcv.tr_len; - - while ((phead = tcp->tcp_rcv.tr_head) != NULL) { - tcp->tcp_rcv.tr_head = phead->next; - - assert(tcpsock_recvbufs > 0); - tcpsock_recvbufs--; - - tcpsock_free_buf(phead); - } - - /* - * From now on, we will basically be discarding incoming data as fast - * as possible, to keep the full window open at all times. - */ - if (ack_data && tcp->tcp_pcb != NULL && tcp->tcp_rcv.tr_unacked > 0) - tcp_recved(tcp->tcp_pcb, tcp->tcp_rcv.tr_unacked); - - tcpsock_reset_recv(tcp); - - return rlen; -} - -/* - * The TCP socket's PCB has been detached from the socket, typically because - * the connection was aborted, either by us or by lwIP. Either way, any TCP - * connection is gone. Clear the socket's send queue, remove the socket from - * a listening socket's queue, and if the socket itself is ready and allowed to - * be freed, free it now. The socket is ready to be freed if it was either on - * a listening queue or being closed already. The socket is allowed to be - * freed only if 'may_free' is TRUE. If the socket is not freed, its receive - * queue is left as is, as it may still have data to be received by userland. - */ -static int -tcpsock_cleanup(struct tcpsock * tcp, int may_free) -{ - int destroy; - - assert(tcp->tcp_pcb == NULL); - - /* - * Free any data on the send queue. This is safe to do right now, - * because the PCB has been aborted (or was already gone). We must be - * very careful about clearing the send queue in all other situations. - */ - tcpsock_clear_send(tcp); - - /* - * If this was a socket pending acceptance, remove it from the - * corresponding listener socket's queue, and free it. Otherwise, free - * the socket only if it suspended a graceful close operation. - */ - if (tcp->tcp_listener != NULL) { - TAILQ_REMOVE(&tcp->tcp_listener->tcp_queue.tq_head, tcp, - tcp_queue.tq_next); - tcp->tcp_listener = NULL; - - /* - * The listener socket's backlog count should be adjusted by - * lwIP whenever the PCB is freed up, so we need (and must) not - * attempt to do that here. - */ - - destroy = TRUE; - } else - destroy = sockevent_is_closing(tcpsock_get_sock(tcp)); - - /* - * Do not free the socket if 'may_free' is FALSE. That flag may be set - * if we are currently in the second tcpsock_close() call on the - * socket, in which case sockevent_is_closing() is TRUE but we must - * still not free the socket now: doing so would derail libsockevent. - */ - if (destroy && may_free) { - (void)tcpsock_clear_recv(tcp, FALSE /*ack_data*/); - - sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE); - } - - return destroy; -} - -/* - * Abort the lwIP PCB for the given socket, using tcp_abort(). If the PCB is - * connected, this will cause the connection to be reset. The PCB, which must - * have still been present before the call, will be gone after the call. - */ -static void -tcpsock_pcb_abort(struct tcpsock * tcp) -{ - - assert(tcp->tcp_pcb != NULL); - assert(!tcpsock_is_listening(tcp)); - - tcp_recv(tcp->tcp_pcb, NULL); - tcp_sent(tcp->tcp_pcb, NULL); - tcp_err(tcp->tcp_pcb, NULL); - tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL); - - tcp_arg(tcp->tcp_pcb, NULL); - - tcp_abort(tcp->tcp_pcb); - - tcp->tcp_pcb = NULL; -} - -/* - * Close the lwIP PCB for the given socket, using tcp_close(). If the PCB is - * connected, its graceful close will be finished by lwIP in the background. - * The PCB, which must have still been present before the call, will be gone - * after the call. - */ -static void -tcpsock_pcb_close(struct tcpsock * tcp) -{ - err_t err; - - assert(tcp->tcp_pcb != NULL); - assert(tcp->tcp_snd.ts_len == 0); - - if (!tcpsock_is_listening(tcp)) { - tcp_recv(tcp->tcp_pcb, NULL); - tcp_sent(tcp->tcp_pcb, NULL); - tcp_err(tcp->tcp_pcb, NULL); - tcp_poll(tcp->tcp_pcb, NULL, TCP_POLL_REG_INTERVAL); - } - - tcp_arg(tcp->tcp_pcb, NULL); - - if ((err = tcp_close(tcp->tcp_pcb)) != ERR_OK) - panic("unexpected TCP close failure: %d", err); - - tcp->tcp_pcb = NULL; -} - -/* - * Return TRUE if all conditions are met for closing the TCP socket's PCB, or - * FALSE if they are not. Upon calling this function, the socket's PCB must - * still be around. - */ -static int -tcpsock_may_close(struct tcpsock * tcp) -{ - - assert(tcp->tcp_pcb != NULL); - - /* - * Regular closing of the PCB requires three conditions to be met: - * - * 1. all our data has been transmitted AND acknowledged, so that we do - * not risk corruption in case there are still unsent or unack'ed - * data buffers that may otherwise be recycled too soon; - * 2. we have sent our FIN to the peer; and, - * 3. we have received a FIN from the peer. - */ - return ((tcpsock_get_flags(tcp) & (TCPF_SENT_FIN | TCPF_RCVD_FIN)) == - (TCPF_SENT_FIN | TCPF_RCVD_FIN) && tcp->tcp_snd.ts_len == 0); -} - -/* - * The given socket is ready to be closed as per the tcpsock_may_close() rules. - * This implies that its send queue is already empty. Gracefully close the - * PCB. In addition, if the socket is being closed gracefully, meaning we - * suspended an earlier tcpsock_close() call (and as such already emptied the - * receive queue as well), then tell libsockevent that the close is finished, - * freeing the socket. Return TRUE if the socket has indeed been freed this - * way, or FALSE if the socket is still around. - */ -static int -tcpsock_finish_close(struct tcpsock * tcp) -{ - - assert(tcp->tcp_snd.ts_len == 0); - assert(tcp->tcp_listener == NULL); - - /* - * If we get here, we have already shut down the sending side of the - * PCB. Technically, we are interested only in shutting down the - * receiving side of the PCB here, so that lwIP may decide to recycle - * the socket later etcetera. We call tcp_close() because we do not - * want to rely on tcp_shutdown(RX) doing the exact same thing. - * However, we do rely on the fact that the PCB is not immediately - * destroyed by the tcp_close() call: otherwise we may have to return - * ERR_ABRT if this function is called from a lwIP-generated event. - */ - tcpsock_pcb_close(tcp); - - /* - * If we suspended an earlier tcpsock_close() call, we have to tell - * libsockevent that the close operation is now complete. - */ - if (sockevent_is_closing(tcpsock_get_sock(tcp))) { - assert(tcp->tcp_rcv.tr_len == 0); - - sockevent_raise(tcpsock_get_sock(tcp), SEV_CLOSE); - - return TRUE; - } else - return FALSE; -} - -/* - * Attempt to start or resume enqueuing data and/or a FIN to send on the given - * TCP socket. Return TRUE if anything at all could be newly enqueued on the - * lwIP PCB, even if less than desired. In that case, the caller should try to - * send whatever was enqueued, and if applicable, check if the socket may now - * be closed (due to the FIN being enqueued). In particular, in any situation - * where the socket may be in the process of being closed, the caller must use - * tcpsock_may_close() if TRUE is returned. Return FALSE if nothing new could - * be enqueued, in which case no send attempt need to be made either. - */ -static int -tcpsock_pcb_enqueue(struct tcpsock * tcp) -{ - struct pbuf *punsent; - size_t space, chunk; - unsigned int flags; - err_t err; - int enqueued; - - assert(tcp->tcp_pcb != NULL); - - if (tcpsock_get_flags(tcp) & TCPF_FULL) - return FALSE; - - /* - * Attempt to enqueue more unsent data, if any, on the PCB's send - * queue. - */ - enqueued = FALSE; - - while (tcp->tcp_snd.ts_unsent != NULL) { - if ((space = tcp_sndbuf(tcp->tcp_pcb)) == 0) - break; - - /* - * We may maintain a non-NULL unsent pointer even when there is - * nothing more to send right now, because the tail buffer may - * be filled up further later on. - */ - punsent = tcp->tcp_snd.ts_unsent; - - assert(punsent->len >= tcp->tcp_snd.ts_unsent_off); - - chunk = (size_t)punsent->len - tcp->tcp_snd.ts_unsent_off; - if (chunk == 0) - break; - - if (chunk > space) - chunk = space; - - /* Try to enqueue more data for sending. */ - if (chunk < punsent->len || punsent->next != NULL) - flags = TCP_WRITE_FLAG_MORE; - else - flags = 0; - - err = tcp_write(tcp->tcp_pcb, (char *)punsent->payload + - tcp->tcp_snd.ts_unsent_off, chunk, flags); - - /* - * Since tcp_write() enqueues data only, it should only return - * out-of-memory errors; no fatal ones. In any case, stop. - */ - if (err != ERR_OK) { - assert(err == ERR_MEM); - - break; - } - - /* We have successfully enqueued data. */ - enqueued = TRUE; - - tcp->tcp_snd.ts_unsent_off += chunk; - - if (tcp->tcp_snd.ts_unsent_off < punsent->tot_len) { - assert(tcp->tcp_snd.ts_unsent_off < punsent->len || - punsent->next == NULL); - - break; - } - - tcp->tcp_snd.ts_unsent = punsent->next; - tcp->tcp_snd.ts_unsent_off = 0; - } - - /* - * If all pending data has been enqueued for sending, and we should - * shut down the sending end of the socket, try that now. - */ - if ((tcp->tcp_snd.ts_unsent == NULL || - tcp->tcp_snd.ts_unsent_off == tcp->tcp_snd.ts_unsent->len) && - tcpsock_is_shutdown(tcp, SFL_SHUT_WR) && - !(tcpsock_get_flags(tcp) & TCPF_SENT_FIN)) { - err = tcp_shutdown(tcp->tcp_pcb, 0 /*shut_rx*/, 1 /*shut_tx*/); - - if (err == ERR_OK) { - /* - * We have successfully enqueued a FIN. The caller is - * now responsible for checking whether the PCB and - * possibly even the socket object can now be freed. - */ - tcpsock_set_flag(tcp, TCPF_SENT_FIN); - - enqueued = TRUE; - } else { - assert(err == ERR_MEM); - - /* - * FIXME: the resolution for lwIP bug #47485 has taken - * away even more control over the closing process from - * us, making tracking sockets especially for SO_LINGER - * even harder. For now, we simply effectively undo - * the patch by clearing TF_CLOSEPEND if tcp_shutdown() - * returns ERR_MEM. This will not be sustainable in - * the long term, though. - */ - tcp->tcp_pcb->flags &= ~TF_CLOSEPEND; - - tcpsock_set_flag(tcp, TCPF_FULL); - } - } - - return enqueued; -} - -/* - * Request lwIP to start sending any enqueued data and/or FIN on the TCP - * socket's lwIP PCB. On success, return OK. On failure, return a negative - * error code, after cleaning up the socket, freeing the PCB. If the socket - * was already being closed, also free the socket object in that case; the - * caller must then not touch the socket object anymore upon return. If the - * socket object is not freed, and if 'raise_error' is TRUE, raise the error - * on the socket object. - */ -static int -tcpsock_pcb_send(struct tcpsock * tcp, int raise_error) -{ - err_t err; - int r; - - assert(tcp->tcp_pcb != NULL); - - /* - * If we have enqueued something, ask lwIP to send TCP packets now. - * This may result in a fatal error, in which case we clean up the - * socket and return the error to the caller. Since cleaning up the - * socket may free the socket object, and the caller cannot tell - * whether that will happen or has happened, also possibly raise the - * error on the socket object if it is not gone. As such, callers that - * set 'raise_error' to FALSE must know for sure that the socket was - * not being closed, for example because the caller is processing a - * (send) call from userland. - */ - err = tcp_output(tcp->tcp_pcb); - - if (err != ERR_OK && err != ERR_MEM) { - tcpsock_pcb_abort(tcp); - - r = util_convert_err(err); - - if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) { - if (raise_error) - sockevent_set_error(tcpsock_get_sock(tcp), r); - } - /* Otherwise, do not touch the socket object anymore! */ - - return r; - } else - return OK; -} - -/* - * Callback from lwIP. The given number of data bytes have been acknowledged - * as received by the remote end. Dequeue and free data from the TCP socket's - * send queue as appropriate. - */ -static err_t -tcpsock_event_sent(void * arg, struct tcp_pcb * pcb __unused, uint16_t len) -{ - struct tcpsock *tcp = (struct tcpsock *)arg; - struct pbuf *phead; - size_t left; - - assert(tcp != NULL); - assert(pcb == tcp->tcp_pcb); - assert(len > 0); - - assert(tcp->tcp_snd.ts_len >= len); - assert(tcp->tcp_snd.ts_head != NULL); - - left = len; - - /* - * First see if we can free up whole buffers. Check against the head - * buffer's 'len' rather than 'tot_len', or we may end up leaving an - * empty buffer on the chain. - */ - while ((phead = tcp->tcp_snd.ts_head) != NULL && - left >= (size_t)phead->len - tcp->tcp_snd.ts_head_off) { - left -= (size_t)phead->len - tcp->tcp_snd.ts_head_off; - - tcp->tcp_snd.ts_head = phead->next; - tcp->tcp_snd.ts_head_off = 0; - - if (phead == tcp->tcp_snd.ts_unsent) { - assert(tcp->tcp_snd.ts_unsent_off == phead->len); - - tcp->tcp_snd.ts_unsent = phead->next; - tcp->tcp_snd.ts_unsent_off = 0; - } - - assert(tcpsock_sendbufs > 0); - tcpsock_sendbufs--; - - tcpsock_free_buf(phead); - } - - /* - * The rest of the given length is for less than the current head - * buffer. - */ - if (left > 0) { - assert(tcp->tcp_snd.ts_head != NULL); - assert((size_t)tcp->tcp_snd.ts_head->len - - tcp->tcp_snd.ts_head_off > left); - - tcp->tcp_snd.ts_head_off += left; - } - - tcp->tcp_snd.ts_len -= (size_t)len; - - if (tcp->tcp_snd.ts_head == NULL) { - assert(tcp->tcp_snd.ts_len == 0); - assert(tcp->tcp_snd.ts_unsent == NULL); - tcp->tcp_snd.ts_tail = NULL; - } else - assert(tcp->tcp_snd.ts_len > 0); - - /* - * If we emptied the send queue, and we already managed to send a FIN - * earlier, we may now have met all requirements to close the socket's - * PCB. Otherwise, we may also be able to send more now, so try to - * resume sending. Since we are invoked from the "sent" event, - * tcp_output() will not actually process anything, and so we do not - * call it either. If we did, we would have to deal with errors here. - */ - if (tcpsock_may_close(tcp)) { - if (tcpsock_finish_close(tcp)) - return ERR_OK; - } else { - tcpsock_clear_flag(tcp, TCPF_FULL); - - /* - * If we now manage to enqueue a FIN, we may be ready to close - * the PCB after all. - */ - if (tcpsock_pcb_enqueue(tcp)) { - if (tcpsock_may_close(tcp) && - tcpsock_finish_close(tcp)) - return ERR_OK; - } - } - - /* The user may also be able to send more now. */ - sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND); - - return ERR_OK; -} - -/* - * Check whether any (additional) data previously received on a TCP socket - * should be acknowledged, possibly allowing the remote end to send additional - * data as a result. - */ -static void -tcpsock_ack_recv(struct tcpsock * tcp) -{ - size_t rcvbuf, left, delta, ack; - - assert(tcp->tcp_pcb != NULL); - - /* - * We must make sure that at all times, we can still add an entire - * window's worth of data to the receive queue. If the amount of free - * space drops below that threshold, we stop acknowledging received - * data. The user may change the receive buffer size at all times; we - * update the window size lazily as appropriate. - */ - rcvbuf = tcpsock_get_rcvbuf(tcp); - - if (rcvbuf > tcp->tcp_rcv.tr_len && tcp->tcp_rcv.tr_unacked > 0) { - /* - * The number of bytes that lwIP can still give us at any time - * is represented as 'left'. The number of bytes that we still - * allow to be stored in the receive queue is represented as - * 'delta'. We must make sure that 'left' does not ever exceed - * 'delta' while acknowledging as many bytes as possible under - * that rule. - */ - left = TCP_WND - tcp->tcp_rcv.tr_unacked; - delta = rcvbuf - tcp->tcp_rcv.tr_len; - - if (left < delta) { - ack = delta - left; - - if (ack > tcp->tcp_rcv.tr_unacked) - ack = tcp->tcp_rcv.tr_unacked; - - tcp_recved(tcp->tcp_pcb, ack); - - tcp->tcp_rcv.tr_unacked -= ack; - - assert(tcp->tcp_rcv.tr_len + TCP_WND - - tcp->tcp_rcv.tr_unacked <= rcvbuf); - } - } -} - -/* - * Attempt to merge two consecutive underfilled buffers in the receive queue of - * a TCP socket, freeing up one of the two buffers as a result. The first - * (oldest) buffer is 'ptail', and the pointer to this buffer is stored at - * 'pnext'. The second (new) buffer is 'pbuf', which is already attached to - * the first buffer. The second buffer may be followed by additional buffers - * with even more new data. Return TRUE if buffers have been merged, in which - * case the pointer at 'pnext' may have changed, and no assumptions should be - * made about whether 'ptail' and 'pbuf' still exist in any form. Return FALSE - * if no merging was necessary or if no new buffer could be allocated. - */ -static int -tcpsock_try_merge(struct pbuf **pnext, struct pbuf * ptail, struct pbuf * pbuf) -{ - struct pbuf *pnew; - - assert(*pnext == ptail); - assert(ptail->next == pbuf); - - /* - * Unfortunately, we cannot figure out what kind of pbuf we were given - * by the lower layers, so we cannot merge two buffers without first - * allocating a third. Once we have done that, though, we can easily - * merge more into that new buffer. For now we use the following - * policies: - * - * 1. if two consecutive lwIP-provided buffers are both used less than - * half the size of a full buffer, try to allocate a new buffer and - * copy both lwIP-provided buffers into that new buffer, freeing up - * the pair afterwards; - * 2. if the tail buffer on the chain is allocated by us and not yet - * full, and the next buffer's contents can be added to the tail - * buffer in their entirety, do just that. - * - * Obviously there is a trade-off between the performance overhead of - * copying and the resource overhead of keeping less-than-full buffers - * on the receive queue, but this policy should both keep actual memory - * usage to no more than twice the receive queue length and prevent - * excessive copying. The policy deliberately performs more aggressive - * merging into a buffer that we allocated ourselves. - */ - if (ptail->tot_len <= MEMPOOL_BUFSIZE / 2 && - pbuf->len <= MEMPOOL_BUFSIZE / 2) { - /* - * Case #1. - */ - assert(ptail->tot_len == ptail->len); - assert(pbuf->tot_len == pbuf->len); - - pnew = tcpsock_alloc_buf(); - if (pnew == NULL) - return FALSE; - - memcpy(pnew->payload, ptail->payload, ptail->len); - memcpy((char *)pnew->payload + ptail->len, pbuf->payload, - pbuf->len); - pnew->len = ptail->len + pbuf->len; - assert(pnew->len <= pnew->tot_len); - - pnew->next = pbuf->next; - /* For now, we need not inherit any flags from either pbuf. */ - - *pnext = pnew; - - /* One allocated, two about to be deallocated. */ - assert(tcpsock_recvbufs > 0); - tcpsock_recvbufs--; - - tcpsock_free_buf(ptail); - tcpsock_free_buf(pbuf); - - return TRUE; - } else if (ptail->tot_len - ptail->len >= pbuf->len) { - /* - * Case #2. - */ - memcpy((char *)ptail->payload + ptail->len, pbuf->payload, - pbuf->len); - - ptail->len += pbuf->len; - - ptail->next = pbuf->next; - - assert(tcpsock_recvbufs > 0); - tcpsock_recvbufs--; - - tcpsock_free_buf(pbuf); - - return TRUE; - } else - return FALSE; -} - -/* - * Callback from lwIP. New data or flags have been received on a TCP socket. - */ -static err_t -tcpsock_event_recv(void * arg, struct tcp_pcb * pcb __unused, - struct pbuf * pbuf, err_t err) -{ - struct tcpsock *tcp = (struct tcpsock *)arg; - struct pbuf *ptail, **pprevp; - size_t len; - - assert(tcp != NULL); - assert(pcb == tcp->tcp_pcb); - - /* - * lwIP should never provide anything other than ERR_OK in 'err', and - * it is not clear what we should do if it would. If lwIP ever changes - * in this regard, we will likely have to change this code accordingly. - */ - if (err != ERR_OK) - panic("TCP receive event with error: %d", err); - - /* If the given buffer is NULL, we have received a FIN. */ - if (pbuf == NULL) { - tcpsock_set_flag(tcp, TCPF_RCVD_FIN); - - /* Userland may now receive EOF. */ - if (!tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) - sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV); - - /* - * If we were in the process of closing the socket, and we - * receive a FIN before our FIN got acknowledged, we close the - * socket anyway, as described in tcpsock_close(). However, if - * there is still unacknowledged outgoing data or we did not - * even manage to send our FIN yet, hold off closing the socket - * for now. - */ - if (tcpsock_may_close(tcp)) - (void)tcpsock_finish_close(tcp); - - return ERR_OK; - } - - /* - * If the socket is being closed, receiving new data should cause a - * reset. - */ - if (sockevent_is_closing(tcpsock_get_sock(tcp))) { - tcpsock_pcb_abort(tcp); - - (void)tcpsock_cleanup(tcp, TRUE /*may_free*/); - /* Do not touch the socket object anymore! */ - - pbuf_free(pbuf); - - return ERR_ABRT; - } - - /* - * If the socket has already been shut down for reading, discard the - * incoming data and do nothing else. - */ - if (tcpsock_is_shutdown(tcp, SFL_SHUT_RD)) { - tcp_recved(tcp->tcp_pcb, pbuf->tot_len); - - pbuf_free(pbuf); - - return ERR_OK; - } - - /* - * We deliberately ignore the PBUF_FLAG_PUSH flag. This flag would - * enable the receive functionality to delay delivering "un-pushed" - * data to applications. The implementation of this scheme could track - * the amount of data up to and including the last-pushed segment using - * a "tr_push_len" field or so. Deciding when to deliver "un-pushed" - * data after all is a bit tricker though. As far as I can tell, the - * BSDs do not implement anything like that. Windows does, and this - * results in interaction problems with even more lightweight TCP/IP - * stacks that do not send the TCP PSH flag. Currently, there is no - * obvious benefit for us to support delaying data delivery like that. - * In addition, testing its implementation reliably would be difficult. - */ - - len = (size_t)pbuf->tot_len; - - /* - * Count the number of buffers that are now owned by us. The new total - * of buffers owned by us must not exceed the size of the memory pool. - * Any more would indicate an accounting error. Note that - * tcpsock_recvbufs is currently used for debugging only! - */ - tcpsock_recvbufs += pbuf_clen(pbuf); - assert(tcpsock_recvbufs < mempool_cur_buffers()); - - /* - * The pre-tail pointer points to whatever is pointing to the tail - * buffer. The latter pointer may be the 'tr_head' field in our - * tcpsock structure, or the 'next' field in the penultimate buffer, - * or NULL if there are currently no buffers on the receive queue. - */ - if ((pprevp = tcp->tcp_rcv.tr_pre_tailp) != NULL) { - ptail = *pprevp; - - assert(ptail != NULL); - assert(ptail->next == NULL); - assert(tcp->tcp_rcv.tr_head != NULL); - - ptail->next = pbuf; - pbuf->tot_len = pbuf->len; /* to help freeing on merges */ - - if (tcpsock_try_merge(pprevp, ptail, pbuf)) { - ptail = *pprevp; - pbuf = ptail->next; - } - - if (pbuf != NULL) - pprevp = &ptail->next; - } else { - assert(tcp->tcp_rcv.tr_head == NULL); - assert(tcp->tcp_rcv.tr_head_off == 0); - - tcp->tcp_rcv.tr_head = pbuf; - - pprevp = &tcp->tcp_rcv.tr_head; - } - - /* - * Chop up the chain into individual buffers. This is necessary as we - * overload 'tot_len' to mean "space available in the buffer", as we - * want for buffers allocated by us as part of buffer merges. Also get - * a pointer to the pointer to the new penultimate tail buffer. Due to - * merging, the chain may already be empty by now, though. - */ - if (pbuf != NULL) { - for (; pbuf->next != NULL; pbuf = pbuf->next) { - pbuf->tot_len = pbuf->len; - - pprevp = &pbuf->next; - } - assert(pbuf->len == pbuf->tot_len); - } - - assert(*pprevp != NULL); - assert((*pprevp)->next == NULL); - tcp->tcp_rcv.tr_pre_tailp = pprevp; - - tcp->tcp_rcv.tr_len += len; - tcp->tcp_rcv.tr_unacked += len; - - assert(tcp->tcp_rcv.tr_unacked <= TCP_WND); - - /* - * Note that tr_len may now exceed the receive buffer size in the - * highly exceptional case that the user is adjusting the latter after - * the socket had already received data. - */ - - /* See if we can immediately acknowledge some or all of the data. */ - tcpsock_ack_recv(tcp); - - /* Also wake up any receivers now. */ - sockevent_raise(tcpsock_get_sock(tcp), SEV_RECV); - - return ERR_OK; -} - -/* - * Callback from lwIP. The PCB corresponding to the socket identified by 'arg' - * has been closed by lwIP, with the reason specified in 'err': either the - * connection has been aborted locally (ERR_ABRT), it has been reset by the - * remote end (ERR_RST), or it is closed due to state transitions (ERR_CLSD). - */ -static void -tcpsock_event_err(void * arg, err_t err) -{ - struct tcpsock *tcp = (struct tcpsock *)arg; - int r; - - assert(tcp != NULL); - assert(tcp->tcp_pcb != NULL); - assert(err != ERR_OK); - - /* The original PCB is now gone, or will be shortly. */ - tcp->tcp_pcb = NULL; - - /* - * Clean up the socket. As a result it may be freed, in which case we - * must not touch it anymore. No need to return ERR_ABRT from here, as - * the PCB has been aborted already. - */ - if (tcpsock_cleanup(tcp, TRUE /*may_free*/)) - return; - - if (err == ERR_CLSD) { - /* - * We may get here if the socket is shut down for writing and - * we already received a FIN from the remote side, thus putting - * the socket in LAST_ACK state, and we receive that last - * acknowledgment. There is nothing more we need to do. - * - * We will never get here in the other case that ERR_CLSD is - * raised, which is when the socket is reset because of - * unacknowledged data while closing: we handle the - * reset-on-ACK case ourselves in tcpsock_close(), and the - * socket is in closing state after that. - */ - assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR)); - assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN); - } else { - /* - * Anything else should be an error directly from lwIP; - * currently either ERR_ABRT and ERR_RST. Covert it to a - * regular error and set it on the socket. Doing so will also - * raise the appropriate events. - */ - /* - * Unfortunately, lwIP is not throwing accurate errors even - * when it can. We convert some errors to reflect more - * accurately the most likely cause. - * - * TODO: fix lwIP in this regard.. - */ - r = util_convert_err(err); - - if (tcpsock_get_flags(tcp) & TCPF_CONNECTING) { - switch (err) { - case ERR_ABRT: r = ETIMEDOUT; break; - case ERR_RST: r = ECONNREFUSED; break; - } - } - - sockevent_set_error(tcpsock_get_sock(tcp), r); - } -} - -/* - * Callback from lwIP. Perform regular checks on a TCP socket. This function - * is called one per five seconds on connected sockets, and twice per second on - * closing sockets. - */ -static err_t -tcpsock_event_poll(void * arg, struct tcp_pcb * pcb __unused) -{ - struct tcpsock *tcp = (struct tcpsock *)arg; - err_t err; - int r; - - assert(tcp != NULL); - assert(pcb == tcp->tcp_pcb); - - /* - * If we ended up running out of buffers earlier, try resuming any send - * requests now, both for enqueuing TCP data with lwIP and for user - * requests. - */ - if (tcpsock_get_flags(tcp) & (TCPF_FULL | TCPF_OOM)) { - tcpsock_clear_flag(tcp, TCPF_FULL); - tcpsock_clear_flag(tcp, TCPF_OOM); - - /* See if we can enqueue more data with lwIP. */ - if (tcpsock_pcb_enqueue(tcp)) { - /* In some cases, we can now close the PCB. */ - if (tcpsock_may_close(tcp)) { - (void)tcpsock_finish_close(tcp); - /* - * The PCB is definitely gone here, and the - * entire socket object may be gone now too. - * Do not touch either anymore! - */ - - return ERR_OK; - } - - /* - * If actually sending the data fails, the PCB will be - * gone, and the socket object may be gone as well. Do - * not touch either anymore in that case! - */ - if (tcpsock_pcb_send(tcp, TRUE /*raise_error*/) != OK) - return ERR_ABRT; - } - - /* - * If we ran out of buffers earlier, it may be possible to take - * in more data from a user process now, even if we did not - * manage to enqueue any more pending data with lwIP. - */ - sockevent_raise(tcpsock_get_sock(tcp), SEV_SEND); - - assert(tcp->tcp_pcb != NULL); - } else if (tcp->tcp_snd.ts_unsent != NULL && - tcp->tcp_snd.ts_unsent_off < tcp->tcp_snd.ts_unsent->len) { - /* - * If the send buffer is full, we will no longer call - * tcp_output(), which means we may also miss out on fatal - * errors that would otherwise kill the connection (e.g., no - * route). As a result, the connection may erroneously - * continue to exist for a long time. To avoid this, we call - * tcp_output() every once in a while when there are still - * unsent data. - */ - err = tcp_output(tcp->tcp_pcb); - - if (err != ERR_OK && err != ERR_MEM) { - tcpsock_pcb_abort(tcp); - - if (!tcpsock_cleanup(tcp, TRUE /*may_free*/)) { - r = util_convert_err(err); - - sockevent_set_error(tcpsock_get_sock(tcp), r); - } - /* Otherwise do not touch the socket object anymore! */ - - return ERR_ABRT; - } - } - - /* - * If we are closing the socket, and we sent a FIN, see if the FIN got - * acknowledged. If so, finish closing the socket. Unfortunately, we - * can perform this check by polling only. TODO: change lwIP.. - */ - if (sockevent_is_closing(tcpsock_get_sock(tcp)) && - (tcpsock_get_flags(tcp) & TCPF_SENT_FIN) && - tcp->tcp_pcb->unsent == NULL && tcp->tcp_pcb->unacked == NULL) { - assert(tcp->tcp_snd.ts_len == 0); - - tcpsock_finish_close(tcp); - } - - return ERR_OK; -} - -/* - * Bind a TCP socket to a local address. - */ -static int -tcpsock_bind(struct sock * sock, const struct sockaddr * addr, - socklen_t addr_len, endpoint_t user_endpt) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - ip_addr_t ipaddr; - uint16_t port; - err_t err; - int r; - - if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED) - return EINVAL; - - if ((r = ipsock_get_src_addr(tcpsock_get_ipsock(tcp), addr, addr_len, - user_endpt, &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port, - FALSE /*allow_mcast*/, &ipaddr, &port)) != OK) - return r; - - err = tcp_bind(tcp->tcp_pcb, &ipaddr, port); - - return util_convert_err(err); -} - -/* - * Callback from lwIP. A new connection 'pcb' has arrived on the listening - * socket identified by 'arg'. Note that 'pcb' may be NULL in the case that - * lwIP could not accept the connection itself. - */ -static err_t -tcpsock_event_accept(void * arg, struct tcp_pcb * pcb, err_t err) -{ - struct tcpsock *tcp = (struct tcpsock *)arg; - - assert(tcp != NULL); - assert(tcpsock_is_listening(tcp)); - - /* - * If the given PCB is NULL, then lwIP ran out of memory allocating a - * PCB for the new connection. There is nothing we can do with that - * information. Also check 'err' just to make sure. - */ - if (pcb == NULL || err != OK) - return ERR_OK; - - /* - * The TCP socket is the listening socket, but the PCB is for the - * incoming connection. - */ - if (tcpsock_clone(tcp, pcb) != OK) { - /* - * We could not allocate the resources necessary to accept the - * connection. Abort it immediately. - */ - tcp_abort(pcb); - - return ERR_ABRT; - } - - /* - * The connection has not yet been accepted, and thus should still be - * considered on the listen queue. - */ - tcp_backlog_delayed(pcb); - - /* Set the callback functions. */ - tcp_recv(pcb, tcpsock_event_recv); - tcp_sent(pcb, tcpsock_event_sent); - tcp_err(pcb, tcpsock_event_err); - tcp_poll(pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL); - - sockevent_raise(tcpsock_get_sock(tcp), SEV_ACCEPT); - - return ERR_OK; -} - -/* - * Put a TCP socket in listening mode. - */ -static int -tcpsock_listen(struct sock * sock, int backlog) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - struct tcp_pcb *pcb; - err_t err; - - /* The maximum backlog value must not exceed its field size. */ - assert(SOMAXCONN <= UINT8_MAX); - - /* - * Allow only CLOSED sockets to enter listening mode. If the socket - * was already in listening mode, allow its backlog value to be - * updated, even if it was shut down already (making this a no-op). - */ - if (!tcpsock_is_listening(tcp) && - (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state != CLOSED)) - return EINVAL; - - /* - * If the socket was not already in listening mode, put it in that mode - * now. That involves switching PCBs as lwIP attempts to save memory - * by replacing the original PCB with a smaller one. If the socket was - * already in listening mode, simply update its backlog value--this has - * no effect on the sockets already in the backlog. - */ - if (!tcpsock_is_listening(tcp)) { - assert(tcp->tcp_pcb != NULL); - - /* - * If the socket has not been bound to a port yet, do that - * first. This does mean that the listen call may fail with - * side effects, but that is acceptable in this case. - */ - if (tcp->tcp_pcb->local_port == 0) { - err = tcp_bind(tcp->tcp_pcb, &tcp->tcp_pcb->local_ip, - 0 /*port*/); - - if (err != ERR_OK) - return util_convert_err(err); - } - - /* - * Clear the argument on the PCB that is about to be replaced, - * because if we do not, once the PCB is reused (which does not - * clear the argument), we might get weird events. Do this - * before the tcp_listen() call, because we should no longer - * access the old PCB afterwards (even if we can). - */ - tcp_arg(tcp->tcp_pcb, NULL); - - pcb = tcp_listen_with_backlog_and_err(tcp->tcp_pcb, backlog, - &err); - - if (pcb == NULL) { - tcp_arg(tcp->tcp_pcb, tcp); /* oops, undo. */ - - return util_convert_err(err); - } - - tcp_arg(pcb, tcp); - tcp->tcp_pcb = pcb; - - tcp_accept(pcb, tcpsock_event_accept); - - /* Initialize the queue head for sockets pending acceptance. */ - TAILQ_INIT(&tcp->tcp_queue.tq_head); - } else if (tcp->tcp_pcb != NULL) - tcp_backlog_set(tcp->tcp_pcb, backlog); - - return OK; -} - -/* - * Callback from lwIP. A socket connection attempt has succeeded. Note that - * failed socket events will trigger the tcpsock_event_err() callback instead. - */ -static err_t -tcpsock_event_connected(void * arg, struct tcp_pcb * pcb __unused, err_t err) -{ - struct tcpsock *tcp = (struct tcpsock *)arg; - - assert(tcp != NULL); - assert(pcb == tcp->tcp_pcb); - assert(tcpsock_get_flags(tcp) & TCPF_CONNECTING); - - /* - * If lwIP ever changes so that this callback is called for connect - * failures as well, then we need to change the code here accordingly. - */ - if (err != ERR_OK) - panic("TCP connected event with error: %d", err); - - tcpsock_clear_flag(tcp, TCPF_CONNECTING); - - sockevent_raise(tcpsock_get_sock(tcp), SEV_CONNECT | SEV_SEND); - - return ERR_OK; -} - -/* - * Connect a TCP socket to a remote address. - */ -static int -tcpsock_connect(struct sock * sock, const struct sockaddr * addr, - socklen_t addr_len, endpoint_t user_endpt) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - ip_addr_t dst_addr; - uint16_t dst_port; - err_t err; - int r; - - /* - * Listening sockets may not have a PCB, so we use higher-level flags - * to throw the correct error code for those instead. - */ - if (tcpsock_is_listening(tcp)) - return EOPNOTSUPP; - - /* - * If there is no longer any PCB, we obviously cannot perform the - * connection, but POSIX is not clear on which error to return. We - * copy NetBSD's. - */ - if (tcp->tcp_pcb == NULL) - return EINVAL; - - /* - * The only state from which a connection can be initiated, is CLOSED. - * Some of the other states require distinct error codes, though. - */ - switch (tcp->tcp_pcb->state) { - case CLOSED: - break; - case SYN_SENT: - return EALREADY; - case LISTEN: - assert(0); /* we just checked.. */ - default: - return EISCONN; - } - - /* - * Get the destination address, and attempt to start connecting. If - * the socket was not bound before, or it was bound to a port only, - * then lwIP will select a source address for us. We cannot do this - * ourselves even if we wanted to: it is impossible to re-bind a TCP - * PCB in the case it was previously bound to a port only. - */ - if ((r = ipsock_get_dst_addr(tcpsock_get_ipsock(tcp), addr, addr_len, - &tcp->tcp_pcb->local_ip, &dst_addr, &dst_port)) != OK) - return r; - - err = tcp_connect(tcp->tcp_pcb, &dst_addr, dst_port, - tcpsock_event_connected); - - /* - * Note that various tcp_connect() error cases will leave the PCB with - * a newly set local and remote IP address anyway. We should be - * careful not to rely on the addresses being as they were before. - */ - if (err != ERR_OK) - return util_convert_err(err); - - /* Set the other callback functions. */ - tcp_recv(tcp->tcp_pcb, tcpsock_event_recv); - tcp_sent(tcp->tcp_pcb, tcpsock_event_sent); - tcp_err(tcp->tcp_pcb, tcpsock_event_err); - tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, TCP_POLL_REG_INTERVAL); - - /* - * Set a flag so that we can correct lwIP's error codes in case the - * connection fails. - */ - tcpsock_set_flag(tcp, TCPF_CONNECTING); - - return SUSPEND; -} - -/* - * Test whether any new connections are pending on a listening TCP socket. - */ -static int -tcpsock_test_accept(struct sock * sock) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - - /* Is this socket in listening mode at all? */ - if (!tcpsock_is_listening(tcp)) - return EINVAL; - - /* Are there any connections to accept right now? */ - if (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) - return OK; - - /* If the socket has been shut down, we return ECONNABORTED. */ - if (tcp->tcp_pcb == NULL) - return ECONNABORTED; - - /* Otherwise, wait for a new connection first. */ - return SUSPEND; -} - -/* - * Accept a connection on a listening TCP socket, creating a new TCP socket. - */ -static sockid_t -tcpsock_accept(struct sock * sock, struct sockaddr * addr, - socklen_t * addr_len, endpoint_t user_endpt __unused, - struct sock ** newsockp) -{ - struct tcpsock *listener = (struct tcpsock *)sock; - struct tcpsock *tcp; - int r; - - if ((r = tcpsock_test_accept(sock)) != OK) - return r; - /* Below, we must not assume that the listener has a PCB. */ - - tcp = TAILQ_FIRST(&listener->tcp_queue.tq_head); - assert(tcp->tcp_listener == listener); - assert(tcp->tcp_pcb != NULL); - - TAILQ_REMOVE(&listener->tcp_queue.tq_head, tcp, tcp_queue.tq_next); - tcp->tcp_listener = NULL; - - tcp_backlog_accepted(tcp->tcp_pcb); - - ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len, - &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port); - - /* - * Set 'newsockp' to NULL so that libsockevent knows we already cloned - * the socket, and it must not be reinitialized anymore. - */ - *newsockp = NULL; - return tcpsock_get_id(tcp); -} - -/* - * Perform preliminary checks on a send request. - */ -static int -tcpsock_pre_send(struct sock * sock, size_t len __unused, - socklen_t ctl_len __unused, const struct sockaddr * addr __unused, - socklen_t addr_len __unused, endpoint_t user_endpt __unused, int flags) -{ - - /* - * Reject calls with unknown flags. Since libsockevent strips out the - * flags it handles itself here, we only have to test for ones we can - * not handle. Currently, there are no send flags that we support. - */ - if (flags != 0) - return EOPNOTSUPP; - - return OK; -} - -/* - * Test whether the given number of data bytes can be sent on a TCP socket. - */ -static int -tcpsock_test_send(struct sock * sock, size_t min) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - size_t sndbuf; - - if (tcp->tcp_pcb == NULL) - return EPIPE; - - switch (tcp->tcp_pcb->state) { - case CLOSED: /* new */ - case LISTEN: /* listening */ - return ENOTCONN; - case SYN_SENT: /* connecting */ - case SYN_RCVD: /* simultaneous open, maybe someday? */ - return SUSPEND; - case ESTABLISHED: /* connected */ - case CLOSE_WAIT: /* closed remotely */ - break; - default: /* shut down locally */ - assert(tcpsock_is_shutdown(tcp, SFL_SHUT_WR)); - return EPIPE; - } - - sndbuf = tcpsock_get_sndbuf(tcp); - if (min > sndbuf) - min = sndbuf; - - if (tcp->tcp_snd.ts_len + min > sndbuf) - return SUSPEND; - else - return OK; -} - -/* - * Send data on a TCP socket. - */ -static int -tcpsock_send(struct sock * sock, const struct sockdriver_data * data, - size_t len, size_t * offp, const struct sockdriver_data * ctl __unused, - socklen_t ctl_len __unused, socklen_t * ctl_off __unused, - const struct sockaddr * addr __unused, socklen_t addr_len __unused, - endpoint_t user_endpt __unused, int flags __unused, size_t min) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - struct pbuf *ptail, *pfirst, *pnext, *plast; - size_t off, tail_off, chunk, left, sndbuf; - int r; - - if ((r = tcpsock_test_send(sock, min)) != OK) - return r; - - if (len == 0) - return OK; /* nothing to do */ - - sndbuf = tcpsock_get_sndbuf(tcp); - if (min > sndbuf) - min = sndbuf; - assert(min > 0); - - assert(sndbuf > tcp->tcp_snd.ts_len); - left = sndbuf - tcp->tcp_snd.ts_len; - if (left > len) - left = len; - - /* - * First see if we can fit any more data in the current tail buffer. - * If so, we set 'ptail' to point to it and 'tail_off' to the previous - * length of the tail buffer, while optimistically extending it to - * include the new data. If not, we set them to NULL/0. - */ - if ((ptail = tcp->tcp_snd.ts_tail) != NULL && - ptail->len < ptail->tot_len) { - assert(ptail->len > 0); - tail_off = (size_t)ptail->len; - - /* - * Optimistically extend the head buffer to include whatever - * fits in it. This is needed for util_copy_data(). - */ - assert(ptail->tot_len > ptail->len); - off = (size_t)ptail->tot_len - (size_t)ptail->len; - if (off > left) - off = left; - ptail->len += off; - } else { - ptail = NULL; - tail_off = 0; - off = 0; - } - - /* - * Then, if there is more to send, allocate new buffers as needed. If - * we run out of memory, work with whatever we did manage to grab. - */ - pfirst = NULL; - plast = NULL; - while (off < left) { - if (tcpsock_sendbufs >= TCP_MAX_SENDBUFS || - (pnext = tcpsock_alloc_buf()) == NULL) { - /* - * Chances are that we will end up suspending this send - * request because of being out of buffers. We try to - * resume such requests from the polling function. - */ - tcpsock_set_flag(tcp, TCPF_OOM); - - break; - } - - tcpsock_sendbufs++; - - if (pfirst == NULL) - pfirst = pnext; - else - plast->next = pnext; - plast = pnext; - - chunk = (size_t)pnext->tot_len; - if (chunk > left - off) - chunk = left - off; - pnext->len = chunk; - off += chunk; - } - - /* - * Copy in the data and continue, unless we did not manage to find - * enough space to even meet the low send watermark, in which case we - * undo any allocation and suspend the call until later. - */ - if (off >= min) { - /* - * Optimistically attach the new buffers to the tail, also for - * util_copy_data(). We undo all this if the copy fails. - */ - if (ptail != NULL) { - ptail->next = pfirst; - - pnext = ptail; - } else - pnext = pfirst; - - assert(pnext != NULL); - - r = util_copy_data(data, off, *offp, pnext, tail_off, - TRUE /*copy_in*/); - } else - r = SUSPEND; - - if (r != OK) { - /* Undo the modifications made so far. */ - while (pfirst != NULL) { - pnext = pfirst->next; - - assert(tcpsock_sendbufs > 0); - tcpsock_sendbufs--; - - tcpsock_free_buf(pfirst); - - pfirst = pnext; - } - - if (ptail != NULL) { - ptail->next = NULL; - - ptail->len = tail_off; - } - - return r; - } - - /* Attach the new buffers, if any, to the buffer tail. */ - if (pfirst != NULL) { - if ((ptail = tcp->tcp_snd.ts_tail) != NULL) { - assert(ptail->len == ptail->tot_len); - - /* - * Due to our earlier optimistic modifications, this - * may or may not be redundant. - */ - ptail->next = pfirst; - } - - assert(plast != NULL); - tcp->tcp_snd.ts_tail = plast; - - if (tcp->tcp_snd.ts_head == NULL) { - tcp->tcp_snd.ts_head = pfirst; - assert(tcp->tcp_snd.ts_head_off == 0); - } - if (tcp->tcp_snd.ts_unsent == NULL) { - tcp->tcp_snd.ts_unsent = pfirst; - assert(tcp->tcp_snd.ts_unsent_off == 0); - } - } - - tcp->tcp_snd.ts_len += off; - - /* - * See if we can send any of the data we just enqueued. The socket is - * still open as we are still processing a call from userland on it; - * this saves us from having to deal with the cases that the following - * calls end up freeing the socket object. - */ - if (tcpsock_pcb_enqueue(tcp) && - (r = tcpsock_pcb_send(tcp, FALSE /*raise_error*/)) != OK) { - /* - * That did not go well. Return the error immediately if we - * had not made any progress earlier. Otherwise, return our - * partial progress and leave the error to be picked up later. - */ - if (*offp > 0) { - sockevent_set_error(tcpsock_get_sock(tcp), r); - - return OK; - } else - return r; - } - - *offp += off; - return (off < len) ? SUSPEND : OK; -} - -/* - * Perform preliminary checks on a receive request. - */ -static int -tcpsock_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, - int flags) -{ - - /* - * Reject calls with unknown flags. Since libsockevent strips out the - * flags it handles itself here, we only have to test for ones we can - * not handle. - */ - if ((flags & ~(MSG_PEEK | MSG_WAITALL)) != 0) - return EOPNOTSUPP; - - return OK; -} - -/* - * Return TRUE if receive calls may wait for more data to come in on the - * connection, or FALSE if we already know that that is not going to happen. - */ -static int -tcpsock_may_wait(struct tcpsock * tcp) -{ - - return (tcp->tcp_pcb != NULL && - !(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN)); -} - -/* - * Test whether data can be received on a TCP socket, and if so, how many bytes - * of data. - */ -static int -tcpsock_test_recv(struct sock * sock, size_t min, size_t * size) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - int may_wait; - - /* If there is and never was a connection, refuse the call at all. */ - if (tcp->tcp_pcb != NULL && (tcp->tcp_pcb->state == CLOSED || - tcp->tcp_pcb->state == LISTEN)) - return ENOTCONN; - - /* - * If we are certain that no more data will come in later, ignore the - * low receive watermark. Otherwise, bound it to the size of the - * receive buffer, or receive calls may block forever. - */ - if (!(may_wait = tcpsock_may_wait(tcp))) - min = 1; - else if (min > tcpsock_get_rcvbuf(tcp)) - min = tcpsock_get_rcvbuf(tcp); - - if (tcp->tcp_rcv.tr_len >= min) { - if (size != NULL) - *size = tcp->tcp_rcv.tr_len; - - return OK; - } - - return (may_wait) ? SUSPEND : SOCKEVENT_EOF; -} - -/* - * Receive data on a TCP socket. - */ -static int -tcpsock_recv(struct sock * sock, const struct sockdriver_data * data, - size_t len, size_t * offp, const struct sockdriver_data * ctl __unused, - socklen_t ctl_len __unused, socklen_t * ctl_off __unused, - struct sockaddr * addr __unused, socklen_t * addr_len __unused, - endpoint_t user_endpt __unused, int flags, size_t min, - int * rflags __unused) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - struct pbuf *ptail; - size_t off, left; - int r; - - /* See if we can receive at all, and if so, how much at most. */ - if ((r = tcpsock_test_recv(sock, min, NULL)) != OK) - return r; - - if (len == 0) - return OK; /* nothing to do */ - - off = tcp->tcp_rcv.tr_len; - if (off > len) - off = len; - - assert(tcp->tcp_rcv.tr_head != NULL); - assert(tcp->tcp_rcv.tr_head_off < tcp->tcp_rcv.tr_head->len); - - /* Copy out the data to the caller. */ - if ((r = util_copy_data(data, off, *offp, tcp->tcp_rcv.tr_head, - tcp->tcp_rcv.tr_head_off, FALSE /*copy_in*/)) != OK) - return r; - - /* Unless peeking, remove the data from the receive queue. */ - if (!(flags & MSG_PEEK)) { - left = off; - - /* Dequeue and free as many entire buffers as possible. */ - while ((ptail = tcp->tcp_rcv.tr_head) != NULL && - left >= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off) { - left -= (size_t)ptail->len - tcp->tcp_rcv.tr_head_off; - - tcp->tcp_rcv.tr_head = ptail->next; - tcp->tcp_rcv.tr_head_off = 0; - - if (tcp->tcp_rcv.tr_head == NULL) - tcp->tcp_rcv.tr_pre_tailp = NULL; - else if (tcp->tcp_rcv.tr_pre_tailp == &ptail->next) - tcp->tcp_rcv.tr_pre_tailp = - &tcp->tcp_rcv.tr_head; - - assert(tcpsock_recvbufs > 0); - tcpsock_recvbufs--; - - tcpsock_free_buf(ptail); - } - - /* - * If only part of the (new) head buffer is consumed, adjust - * the saved offset into that buffer. - */ - if (left > 0) { - assert(tcp->tcp_rcv.tr_head != NULL); - assert((size_t)tcp->tcp_rcv.tr_head->len - - tcp->tcp_rcv.tr_head_off > left); - - tcp->tcp_rcv.tr_head_off += left; - } - - tcp->tcp_rcv.tr_len -= off; - - if (tcp->tcp_rcv.tr_head != NULL) { - assert(tcp->tcp_rcv.tr_pre_tailp != NULL); - assert(tcp->tcp_rcv.tr_len > 0); - } else { - assert(tcp->tcp_rcv.tr_pre_tailp == NULL); - assert(tcp->tcp_rcv.tr_len == 0); - } - - /* - * The receive buffer has shrunk, so there may now be space to - * receive more data. - */ - if (tcp->tcp_pcb != NULL) - tcpsock_ack_recv(tcp); - } else - flags &= ~MSG_WAITALL; /* for the check below */ - - /* Advance the current copy position, and see if we are done. */ - *offp += off; - if ((flags & MSG_WAITALL) && off < len && tcpsock_may_wait(tcp)) - return SUSPEND; - else - return OK; -} - -/* - * Update the set of flag-type socket options on a TCP socket. - */ -static void -tcpsock_setsockmask(struct sock * sock, unsigned int mask) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - - if (tcp->tcp_pcb == NULL) - return; - - if (mask & SO_REUSEADDR) - ip_set_option(tcp->tcp_pcb, SOF_REUSEADDR); - else - ip_reset_option(tcp->tcp_pcb, SOF_REUSEADDR); - - if (mask & SO_KEEPALIVE) - ip_set_option(tcp->tcp_pcb, SOF_KEEPALIVE); - else - ip_reset_option(tcp->tcp_pcb, SOF_KEEPALIVE); -} - -/* - * Prepare a helper structure for IP-level option processing. - */ -static void -tcpsock_get_ipopts(struct tcpsock * tcp, struct ipopts * ipopts) -{ - - ipopts->local_ip = &tcp->tcp_pcb->local_ip; - ipopts->remote_ip = &tcp->tcp_pcb->remote_ip; - ipopts->tos = &tcp->tcp_pcb->tos; - ipopts->ttl = &tcp->tcp_pcb->ttl; - ipopts->sndmin = TCP_SNDBUF_MIN; - ipopts->sndmax = TCP_SNDBUF_MAX; - ipopts->rcvmin = TCP_RCVBUF_MIN; - ipopts->rcvmax = TCP_RCVBUF_MAX; -} - -/* - * Set socket options on a TCP socket. - */ -static int -tcpsock_setsockopt(struct sock * sock, int level, int name, - const struct sockdriver_data * data, socklen_t len) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - struct ipopts ipopts; - uint32_t uval; - int r, val; - - if (tcp->tcp_pcb == NULL) - return ECONNRESET; - - /* Handle TCP-level options. */ - switch (level) { - case IPPROTO_IPV6: - switch (name) { - case IPV6_RECVTCLASS: - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - /* - * This option is not supported for TCP sockets; it - * would not even make sense. However, named(8) - * insists on trying to set it anyway. We accept the - * request but ignore the value, not even returning - * what was set through getsockopt(2). - */ - return OK; - - case IPV6_FAITH: - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - /* - * This option is not supported at all, but to save - * ourselves from having to remember the current state - * for getsockopt(2), we also refuse to enable it. - */ - if (val != 0) - return EINVAL; - - return OK; - } - - break; - - case IPPROTO_TCP: - switch (name) { - case TCP_NODELAY: - /* - * lwIP's listening TCP PCBs do not have this field. - * If this ever becomes an issue, we can create our own - * shadow flag and do the inheritance ourselves. - */ - if (tcp->tcp_pcb->state == LISTEN) - return EINVAL; - - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val) - tcp_nagle_disable(tcp->tcp_pcb); - else - tcp_nagle_enable(tcp->tcp_pcb); - - return OK; - - case TCP_KEEPIDLE: - case TCP_KEEPINTVL: - /* - * lwIP's listening TCP PCBs do not have these fields. - */ - if (tcp->tcp_pcb->state == LISTEN) - return EINVAL; - - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val == 0) - return EINVAL; - - /* - * The given value is unsigned, but lwIP stores the - * value in milliseconds in a uint32_t field, so we - * have to limit large values to whatever fits in the - * field anyway. - */ - if (val < 0 || (uint32_t)val > UINT32_MAX / 1000) - uval = UINT32_MAX; - else - uval = (uint32_t)val * 1000; - - if (name == TCP_KEEPIDLE) - tcp->tcp_pcb->keep_idle = uval; - else - tcp->tcp_pcb->keep_intvl = uval; - - return OK; - - case TCP_KEEPCNT: - /* lwIP's listening TCP PCBs do not have this field. */ - if (tcp->tcp_pcb->state == LISTEN) - return EINVAL; - - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val == 0) - return EINVAL; - - tcp->tcp_pcb->keep_cnt = (uint32_t)val; - - return OK; - } - - return EOPNOTSUPP; - } - - /* Handle all other options at the IP level. */ - tcpsock_get_ipopts(tcp, &ipopts); - - return ipsock_setsockopt(tcpsock_get_ipsock(tcp), level, name, data, - len, &ipopts); -} - -/* - * Retrieve socket options on a TCP socket. - */ -static int -tcpsock_getsockopt(struct sock * sock, int level, int name, - const struct sockdriver_data * data, socklen_t * len) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - struct ipopts ipopts; - int val; - - if (tcp->tcp_pcb == NULL) - return ECONNRESET; - - /* Handle TCP-level options. */ - switch (level) { - case IPPROTO_IPV6: - switch (name) { - case IPV6_RECVTCLASS: - case IPV6_FAITH: - val = 0; - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - } - - break; - - case IPPROTO_TCP: - switch (name) { - case TCP_NODELAY: - /* lwIP's listening TCP PCBs do not have this field. */ - if (tcp->tcp_pcb->state == LISTEN) - return EINVAL; - - val = tcp_nagle_disabled(tcp->tcp_pcb); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case TCP_MAXSEG: - /* lwIP's listening TCP PCBs do not have this field. */ - if (tcp->tcp_pcb->state == LISTEN) - return EINVAL; - - /* This option is read-only at this time. */ - val = tcp->tcp_pcb->mss; - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case TCP_KEEPIDLE: - /* lwIP's listening TCP PCBs do not have this field. */ - if (tcp->tcp_pcb->state == LISTEN) - return EINVAL; - - val = (int)(tcp->tcp_pcb->keep_idle / 1000); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case TCP_KEEPINTVL: - /* lwIP's listening TCP PCBs do not have this field. */ - if (tcp->tcp_pcb->state == LISTEN) - return EINVAL; - - val = (int)(tcp->tcp_pcb->keep_intvl / 1000); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case TCP_KEEPCNT: - /* lwIP's listening TCP PCBs do not have this field. */ - if (tcp->tcp_pcb->state == LISTEN) - return EINVAL; - - val = (int)tcp->tcp_pcb->keep_cnt; - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - } - - return EOPNOTSUPP; - } - - /* Handle all other options at the IP level. */ - tcpsock_get_ipopts(tcp, &ipopts); - - return ipsock_getsockopt(tcpsock_get_ipsock(tcp), level, name, data, - len, &ipopts); -} - -/* - * Retrieve the local socket address of a TCP socket. - */ -static int -tcpsock_getsockname(struct sock * sock, struct sockaddr * addr, - socklen_t * addr_len) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - - if (tcp->tcp_pcb == NULL) - return EINVAL; - - ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len, - &tcp->tcp_pcb->local_ip, tcp->tcp_pcb->local_port); - - return OK; -} - -/* - * Retrieve the remote socket address of a TCP socket. - */ -static int -tcpsock_getpeername(struct sock * sock, struct sockaddr * addr, - socklen_t * addr_len) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - - if (tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED || - tcp->tcp_pcb->state == LISTEN || tcp->tcp_pcb->state == SYN_SENT) - return ENOTCONN; - - ipsock_put_addr(tcpsock_get_ipsock(tcp), addr, addr_len, - &tcp->tcp_pcb->remote_ip, tcp->tcp_pcb->remote_port); - - return OK; -} - -/* - * Perform a TCP half-close on a TCP socket. This operation may not complete - * immediately due to memory conditions, in which case it will be completed at - * a later time. - */ -static void -tcpsock_send_fin(struct tcpsock * tcp) -{ - - sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_WR); - - /* - * Attempt to send the FIN. If a fatal error occurs as a result, raise - * it as an asynchronous error, because this function's callers cannot - * do much with it. That happens to match the way these functions are - * used elsewhere. In any case, as a result, the PCB may be closed. - * However, we are never called from a situation where the socket is - * being closed here, so the socket object will not be freed either. - */ - if (tcpsock_pcb_enqueue(tcp)) { - assert(!sockevent_is_closing(tcpsock_get_sock(tcp))); - - if (tcpsock_may_close(tcp)) - tcpsock_finish_close(tcp); - else - (void)tcpsock_pcb_send(tcp, TRUE /*raise_error*/); - } -} - -/* - * Shut down a TCP socket for reading and/or writing. - */ -static int -tcpsock_shutdown(struct sock * sock, unsigned int mask) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - - /* - * If the PCB is gone, we want to allow shutdowns for reading but not - * writing: shutting down for writing affects the PCB, shutting down - * for reading does not. Also, if the PCB is in CLOSED state, we would - * not know how to deal with subsequent operations after a shutdown for - * writing, so forbid such calls altogether. - */ - if ((tcp->tcp_pcb == NULL || tcp->tcp_pcb->state == CLOSED) && - (mask & SFL_SHUT_WR)) - return ENOTCONN; - - /* - * Handle listening sockets as a special case. Shutting down a - * listening socket frees its PCB. Sockets pending on the accept queue - * may still be accepted, but after that, accept(2) will start - * returning ECONNABORTED. This feature allows multi-process server - * applications to shut down gracefully, supposedly.. - */ - if (tcpsock_is_listening(tcp)) { - if (tcp->tcp_pcb != NULL) - tcpsock_pcb_close(tcp); - - return OK; - } - - /* - * We control shutdown-for-reading locally, and intentially do not tell - * lwIP about it: if we do that and also shut down for writing, the PCB - * may disappear (now or eventually), which is not what we want. - * Instead, we only tell lwIP to shut down for reading once we actually - * want to get rid of the PCB, using tcp_close(). In the meantime, if - * the socket is shut down for reading by the user, we simply discard - * received data as fast as we can--one out of a number of possible - * design choices there, and (reportedly) the one used by the BSDs. - */ - if (mask & SFL_SHUT_RD) - (void)tcpsock_clear_recv(tcp, TRUE /*ack_data*/); - - /* - * Shutting down for writing a connecting socket simply closes its PCB. - * Closing a PCB in SYN_SENT state simply deallocates it, so this can - * not fail. On the other hand, for connected sockets we want to send - * a FIN, which may fail due to memory shortage, in which case we have - * to try again later.. - */ - if (mask & SFL_SHUT_WR) { - if (tcp->tcp_pcb->state == SYN_SENT) - tcpsock_pcb_close(tcp); - else if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR)) - tcpsock_send_fin(tcp); - } - - return OK; -} - -/* - * Close a TCP socket. Complete the operation immediately if possible, or - * otherwise initiate the closing process and complete it later, notifying - * libsockevent about that as well. Depending on linger settings, this - * function may be called twice on the same socket: the first time with the - * 'force' flag cleared, and the second time with the 'force' flag set. - */ -static int -tcpsock_close(struct sock * sock, int force) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - struct tcpsock *queued; - size_t rlen; - - assert(tcp->tcp_listener == NULL); - - /* - * If this was a listening socket, so abort and clean up any and all - * connections on its listener queue. Note that the listening socket - * may or may not have a PCB at this point. - */ - if (tcpsock_is_listening(tcp)) { - while (!TAILQ_EMPTY(&tcp->tcp_queue.tq_head)) { - queued = TAILQ_FIRST(&tcp->tcp_queue.tq_head); - - tcpsock_pcb_abort(queued); - - (void)tcpsock_cleanup(queued, TRUE /*may_free*/); - } - } - - /* - * Clear the receive queue, and make sure that we no longer add new - * data to it. The latter is relevant only for the case that we end up - * returning SUSPEND below. Remember whether there were bytes left, - * because we should reset the connection if there were. - */ - rlen = tcpsock_clear_recv(tcp, FALSE /*ack_data*/); - - sockevent_set_shutdown(tcpsock_get_sock(tcp), SFL_SHUT_RD); - - /* - * If the socket is connected, perform a graceful shutdown, unless 1) - * we are asked to force-close the socket, or 2) if the local side has - * not consumed all data, as per RFC 1122 Sec.4.2.2.13. Normally lwIP - * would take care of the second point, but we may have data in our - * receive buffer of which lwIP is not aware. - * - * Implementing proper linger support is somewhat difficult with lwIP. - * In particular, we cannot reliably wait for our FIN to be ACK'ed by - * the other side in all cases: - * - * - the lwIP TCP transition from states CLOSING to TIME_WAIT does not - * trigger any event and once in the TIME_WAIT state, the poll event - * no longer triggers either; - * - the lwIP TCP transition from states FIN_WAIT_1 and FIN_WAIT_2 to - * TIME_WAIT will trigger a receive event, but it is not clear - * whether we can reliably check that our FIN was ACK'ed from there. - * - * That means we have to compromise. Instead of the proper approach, - * we complete our side of the close operation whenever: - * - * 1. all of or data was acknowledged, AND, - * 2. our FIN was sent, AND, - * 3a. our FIN was acknowledged, OR, - * 3b. we received a FIN from the other side. - * - * With the addition of the rule 3b, we do not run into the above - * reliability problems, but we may return from SO_LINGER-blocked close - * calls too early and thus give callers a false impression of success. - * TODO: if lwIP ever gets improved on this point, the code in this - * module should be rewritten to make use of the improvements. - * - * The set of rules is basically the same as for closing the PCB early - * as per tcpsock_may_close(), except with the check for our FIN being - * acknowledged. Unfortunately only the FIN_WAIT_2, TIME_WAIT, and - * (reentered) CLOSED TCP states guarantee that there are no - * unacknowledged data segments anymore, so we may have to wait for - * reaching any one of these before we can actually finish closing the - * socket with tcp_close(). - * - * In addition, lwIP does not tell us when our FIN gets acknowledged, - * so we have to use polling and direct access to lwIP's PCB fields - * instead, just like lwIP's BSD API does. There is no other way. - * Also, we may not even be able to send the FIN right away, in which - * case we must defer that until later. - */ - if (tcp->tcp_pcb != NULL) { - switch (tcp->tcp_pcb->state) { - case CLOSE_WAIT: - case CLOSING: - case LAST_ACK: - assert(tcpsock_get_flags(tcp) & TCPF_RCVD_FIN); - - /* FALLTHROUGH */ - case SYN_RCVD: - case ESTABLISHED: - case FIN_WAIT_1: - /* First check if we should abort the connection. */ - if (force || rlen > 0) - break; - - /* - * If we have not sent a FIN yet, try sending it now; - * if all other conditions are met for closing the - * socket, successful FIN transmission will complete - * the close. Otherwise, perform the close check - * explicitly. - */ - if (!tcpsock_is_shutdown(tcp, SFL_SHUT_WR)) - tcpsock_send_fin(tcp); - else if (tcpsock_may_close(tcp)) - tcpsock_pcb_close(tcp); - - /* - * If at this point the PCB is gone, we managed to - * close the connection immediately, and the socket has - * already been cleaned up by now. This may occur if - * there is no unacknowledged data and we already - * received a FIN earlier on. - */ - if (tcp->tcp_pcb == NULL) - return OK; - - /* - * Complete the close operation at a later time. - * Adjust the polling interval, so that we can detect - * completion of the close as quickly as possible. - */ - tcp_poll(tcp->tcp_pcb, tcpsock_event_poll, - TCP_POLL_CLOSE_INTERVAL); - - return SUSPEND; - - default: - /* - * The connection is either not yet established, or - * already in a state where we can close it right now. - */ - tcpsock_pcb_close(tcp); - } - } - - /* - * Abort the connection is the PCB is still around, and clean up the - * socket. We cannot let tcpsock_cleanup() free the socket object yet, - * because we are still in the callback from libsockevent, and the - * latter cannot handle the socket object being freed from here. - */ - if (tcp->tcp_pcb != NULL) - tcpsock_pcb_abort(tcp); - - (void)tcpsock_cleanup(tcp, FALSE /*may_free*/); - - return OK; -} - -/* - * Free up a closed TCP socket. - */ -static void -tcpsock_free(struct sock * sock) -{ - struct tcpsock *tcp = (struct tcpsock *)sock; - - assert(tcp->tcp_pcb == NULL); - assert(tcp->tcp_snd.ts_len == 0); - assert(tcp->tcp_snd.ts_head == NULL); - assert(tcp->tcp_rcv.tr_len == 0); - assert(tcp->tcp_rcv.tr_head == NULL); - - TAILQ_INSERT_HEAD(&tcp_freelist, tcp, tcp_queue.tq_next); -} - -/* This table maps TCP states from lwIP numbers to NetBSD numbers. */ -static const struct { - int tsm_tstate; - int tsm_sostate; -} tcpsock_statemap[] = { - [CLOSED] = { TCPS_CLOSED, SS_ISDISCONNECTED }, - [LISTEN] = { TCPS_LISTEN, 0 }, - [SYN_SENT] = { TCPS_SYN_SENT, SS_ISCONNECTING }, - [SYN_RCVD] = { TCPS_SYN_RECEIVED, SS_ISCONNECTING }, - [ESTABLISHED] = { TCPS_ESTABLISHED, SS_ISCONNECTED }, - [FIN_WAIT_1] = { TCPS_FIN_WAIT_1, SS_ISDISCONNECTING }, - [FIN_WAIT_2] = { TCPS_FIN_WAIT_2, SS_ISDISCONNECTING }, - [CLOSE_WAIT] = { TCPS_CLOSE_WAIT, SS_ISCONNECTED }, - [CLOSING] = { TCPS_CLOSING, SS_ISDISCONNECTING }, - [LAST_ACK] = { TCPS_LAST_ACK, SS_ISDISCONNECTING }, - [TIME_WAIT] = { TCPS_TIME_WAIT, SS_ISDISCONNECTED }, -}; - -/* - * Fill the given kinfo_pcb sysctl(7) structure with information about the TCP - * PCB identified by the given pointer. - */ -static void -tcpsock_get_info(struct kinfo_pcb * ki, const void * ptr) -{ - const struct tcp_pcb *pcb = (const struct tcp_pcb *)ptr; - struct tcpsock *tcp; - - /* - * Not all TCP PCBs have an associated tcpsock structure. We are - * careful enough clearing the callback argument for PCBs on any of the - * TCP lists that we can use that callback argument to determine - * whether there is an associated tcpsock structure, although with one - * exception: PCBs for incoming connections that have not yet been - * fully established (i.e., in SYN_RCVD state). These will have the - * callback argument of the listening socket (which itself may already - * have been deallocated at this point) but should not be considered as - * associated with the listening socket's tcpsock structure. - */ - if (pcb->callback_arg != NULL && pcb->state != SYN_RCVD) { - tcp = (struct tcpsock *)pcb->callback_arg; - assert(tcp >= tcp_array && - tcp < &tcp_array[__arraycount(tcp_array)]); - - /* TODO: change this so that sockstat(1) may work one day. */ - ki->ki_sockaddr = (uint64_t)(uintptr_t)tcpsock_get_sock(tcp); - } else { - /* No tcpsock. Could also be in TIME_WAIT state etc. */ - tcp = NULL; - - ki->ki_sostate = SS_NOFDREF; - } - - ki->ki_type = SOCK_STREAM; - - if ((unsigned int)pcb->state < __arraycount(tcpsock_statemap)) { - ki->ki_tstate = tcpsock_statemap[pcb->state].tsm_tstate; - /* TODO: this needs work, but does anything rely on it? */ - ki->ki_sostate |= tcpsock_statemap[pcb->state].tsm_sostate; - } - - /* Careful with the LISTEN state here (see below). */ - ipsock_get_info(ki, &pcb->local_ip, pcb->local_port, - &pcb->remote_ip, (pcb->state != LISTEN) ? pcb->remote_port : 0); - - /* - * The PCBs for listening sockets are actually smaller. Thus, for - * listening sockets, do not attempt to access any of the fields beyond - * those provided in the smaller structure. - */ - if (pcb->state == LISTEN) { - assert(tcp != NULL); - ki->ki_refs = - (uint64_t)(uintptr_t)TAILQ_FIRST(&tcp->tcp_queue.tq_head); - } else { - if (tcp_nagle_disabled(pcb)) - ki->ki_tflags |= NETBSD_TF_NODELAY; - - if (tcp != NULL) { - ki->ki_rcvq = tcp->tcp_rcv.tr_len; - ki->ki_sndq = tcp->tcp_snd.ts_len; - - if (tcp->tcp_listener != NULL) - ki->ki_nextref = (uint64_t)(uintptr_t) - TAILQ_NEXT(tcp, tcp_queue.tq_next); - } - } -} - -/* - * Given either NULL or a previously returned TCP PCB pointer, return the first - * or next TCP PCB pointer, or NULL if there are no more. The current - * implementation supports only one concurrent iteration at once. - */ -static const void * -tcpsock_enum(const void * last) -{ - static struct { - unsigned int i; - const struct tcp_pcb *pcb; - } iter; - - if (last != NULL && (iter.pcb = iter.pcb->next) != NULL) - return (const void *)iter.pcb; - - for (iter.i = (last != NULL) ? iter.i + 1 : 0; - iter.i < __arraycount(tcp_pcb_lists); iter.i++) { - if ((iter.pcb = *tcp_pcb_lists[iter.i]) != NULL) - return (const void *)iter.pcb; - } - - return NULL; -} - -/* - * Obtain the list of TCP protocol control blocks, for sysctl(7). - */ -static ssize_t -tcpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused, - struct rmib_oldp * oldp, struct rmib_newp * newp __unused) -{ - - return util_pcblist(call, oldp, tcpsock_enum, tcpsock_get_info); -} - -static const struct sockevent_ops tcpsock_ops = { - .sop_bind = tcpsock_bind, - .sop_listen = tcpsock_listen, - .sop_connect = tcpsock_connect, - .sop_accept = tcpsock_accept, - .sop_test_accept = tcpsock_test_accept, - .sop_pre_send = tcpsock_pre_send, - .sop_send = tcpsock_send, - .sop_test_send = tcpsock_test_send, - .sop_pre_recv = tcpsock_pre_recv, - .sop_recv = tcpsock_recv, - .sop_test_recv = tcpsock_test_recv, - .sop_ioctl = ifconf_ioctl, - .sop_setsockmask = tcpsock_setsockmask, - .sop_setsockopt = tcpsock_setsockopt, - .sop_getsockopt = tcpsock_getsockopt, - .sop_getsockname = tcpsock_getsockname, - .sop_getpeername = tcpsock_getpeername, - .sop_shutdown = tcpsock_shutdown, - .sop_close = tcpsock_close, - .sop_free = tcpsock_free -}; diff --git a/minix/net/lwip/udpsock.c b/minix/net/lwip/udpsock.c deleted file mode 100644 index eea8194fc..000000000 --- a/minix/net/lwip/udpsock.c +++ /dev/null @@ -1,997 +0,0 @@ -/* LWIP service - udpsock.c - UDP sockets */ - -#include "lwip.h" -#include "ifaddr.h" -#include "pktsock.h" - -#include "lwip/udp.h" - -#include -#include -#include - -/* The number of UDP sockets. Inherited from the lwIP configuration. */ -#define NR_UDPSOCK MEMP_NUM_UDP_PCB - -/* - * Outgoing packets are not getting buffered, so the send buffer size simply - * determines the maximum size for sent packets. The send buffer maximum is - * therefore limited to the maximum size of a single packet (64K-1 bytes), - * which is already enforced by lwIP's 16-bit length parameter to pbuf_alloc(). - * - * The actual transmission may enforce a lower limit, though. The full packet - * size must not exceed the same 64K-1 limit, and that includes any headers - * that still have to be prepended to the given packet. The size of those - * headers depends on the socket type (IPv4/IPv6) and the IP_HDRINCL setting. - */ -#define UDP_MAX_PAYLOAD (UINT16_MAX) - -#define UDP_SNDBUF_MIN 1 /* minimum UDP send buffer size */ -#define UDP_SNDBUF_DEF 8192 /* default UDP send buffer size */ -#define UDP_SNDBUF_MAX UDP_MAX_PAYLOAD /* maximum UDP send buffer size */ -#define UDP_RCVBUF_MIN MEMPOOL_BUFSIZE /* minimum UDP receive buffer size */ -#define UDP_RCVBUF_DEF 32768 /* default UDP receive buffer size */ -#define UDP_RCVBUF_MAX 65536 /* maximum UDP receive buffer size */ - -static struct udpsock { - struct pktsock udp_pktsock; /* pkt socket, MUST be first */ - struct udp_pcb *udp_pcb; /* lwIP UDP control block */ - SIMPLEQ_ENTRY(udpsock) udp_next; /* next in free list */ -} udp_array[NR_UDPSOCK]; - -static SIMPLEQ_HEAD(, udpsock) udp_freelist; /* list of free UDP sockets */ - -static const struct sockevent_ops udpsock_ops; - -#define udpsock_get_sock(udp) (ipsock_get_sock(udpsock_get_ipsock(udp))) -#define udpsock_get_ipsock(udp) (pktsock_get_ipsock(&(udp)->udp_pktsock)) -#define udpsock_is_ipv6(udp) (ipsock_is_ipv6(udpsock_get_ipsock(udp))) -#define udpsock_is_conn(udp) \ - (udp_flags((udp)->udp_pcb) & UDP_FLAGS_CONNECTED) - -static ssize_t udpsock_pcblist(struct rmib_call *, struct rmib_node *, - struct rmib_oldp *, struct rmib_newp *); - -/* The CTL_NET {PF_INET,PF_INET6} IPPROTO_UDP subtree. */ -/* TODO: add many more and make some of them writable.. */ -static struct rmib_node net_inet_udp_table[] = { -/* 1*/ [UDPCTL_CHECKSUM] = RMIB_INT(RMIB_RO, 1, "checksum", - "Compute UDP checksums"), -/* 2*/ [UDPCTL_SENDSPACE] = RMIB_INT(RMIB_RO, UDP_SNDBUF_DEF, - "sendspace", - "Default UDP send buffer size"), -/* 3*/ [UDPCTL_RECVSPACE] = RMIB_INT(RMIB_RO, UDP_RCVBUF_DEF, - "recvspace", - "Default UDP receive buffer size"), -/* 4*/ [UDPCTL_LOOPBACKCKSUM] = RMIB_FUNC(RMIB_RW | CTLTYPE_INT, sizeof(int), - loopif_cksum, "do_loopback_cksum", - "Perform UDP checksum on loopback"), -/*+0*/ [UDPCTL_MAXID] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, - udpsock_pcblist, "pcblist", - "UDP protocol control block list"), -}; - -static struct rmib_node net_inet_udp_node = - RMIB_NODE(RMIB_RO, net_inet_udp_table, "udp", "UDPv4 related settings"); -static struct rmib_node net_inet6_udp6_node = - RMIB_NODE(RMIB_RO, net_inet_udp_table, "udp6", "UDPv6 related settings"); - -/* - * Initialize the UDP sockets module. - */ -void -udpsock_init(void) -{ - unsigned int slot; - - /* Initialize the list of free UDP sockets. */ - SIMPLEQ_INIT(&udp_freelist); - - for (slot = 0; slot < __arraycount(udp_array); slot++) - SIMPLEQ_INSERT_TAIL(&udp_freelist, &udp_array[slot], udp_next); - - /* Register the net.inet.udp and net.inet6.udp6 RMIB subtrees. */ - mibtree_register_inet(PF_INET, IPPROTO_UDP, &net_inet_udp_node); - mibtree_register_inet(PF_INET6, IPPROTO_UDP, &net_inet6_udp6_node); -} - -/* - * A packet has arrived on a UDP socket. We own the given packet buffer, and - * so we must free it if we do not want to keep it. - */ -static void -udpsock_input(void * arg, struct udp_pcb * pcb __unused, struct pbuf * pbuf, - const ip_addr_t * ipaddr, uint16_t port) -{ - struct udpsock *udp = (struct udpsock *)arg; - - /* All UDP input processing is handled by pktsock. */ - pktsock_input(&udp->udp_pktsock, pbuf, ipaddr, port); -} - -/* - * Create a UDP socket. - */ -sockid_t -udpsock_socket(int domain, int protocol, struct sock ** sockp, - const struct sockevent_ops ** ops) -{ - struct udpsock *udp; - unsigned int flags; - uint8_t ip_type; - - switch (protocol) { - case 0: - case IPPROTO_UDP: - break; - - /* NetBSD does not support IPPROTO_UDPLITE, even though lwIP does. */ - default: - return EPROTONOSUPPORT; - } - - if (SIMPLEQ_EMPTY(&udp_freelist)) - return ENOBUFS; - - udp = SIMPLEQ_FIRST(&udp_freelist); - - ip_type = pktsock_socket(&udp->udp_pktsock, domain, UDP_SNDBUF_DEF, - UDP_RCVBUF_DEF, sockp); - - /* We should have enough PCBs so this call should not fail.. */ - if ((udp->udp_pcb = udp_new_ip_type(ip_type)) == NULL) - return ENOBUFS; - udp_recv(udp->udp_pcb, udpsock_input, (void *)udp); - - /* By default, the multicast TTL is 1 and looping is enabled. */ - udp_set_multicast_ttl(udp->udp_pcb, 1); - - flags = udp_flags(udp->udp_pcb); - udp_setflags(udp->udp_pcb, flags | UDP_FLAGS_MULTICAST_LOOP); - - SIMPLEQ_REMOVE_HEAD(&udp_freelist, udp_next); - - *ops = &udpsock_ops; - return SOCKID_UDP | (sockid_t)(udp - udp_array); -} - -/* - * Bind a UDP socket to a local address. - */ -static int -udpsock_bind(struct sock * sock, const struct sockaddr * addr, - socklen_t addr_len, endpoint_t user_endpt) -{ - struct udpsock *udp = (struct udpsock *)sock; - ip_addr_t ipaddr; - uint16_t port; - err_t err; - int r; - - if ((r = ipsock_get_src_addr(udpsock_get_ipsock(udp), addr, addr_len, - user_endpt, &udp->udp_pcb->local_ip, udp->udp_pcb->local_port, - TRUE /*allow_mcast*/, &ipaddr, &port)) != OK) - return r; - - err = udp_bind(udp->udp_pcb, &ipaddr, port); - - return util_convert_err(err); -} - -/* - * Connect a UDP socket to a remote address. - */ -static int -udpsock_connect(struct sock * sock, const struct sockaddr * addr, - socklen_t addr_len, endpoint_t user_endpt __unused) -{ - struct udpsock *udp = (struct udpsock *)sock; - struct ifdev *ifdev; - const ip_addr_t *src_addr; - ip_addr_t dst_addr; - uint16_t dst_port; - uint32_t ifindex, ifindex2; - err_t err; - int r; - - /* - * One may "unconnect" socket by providing an address with family - * AF_UNSPEC. Providing an :0 address does not achieve the same. - */ - if (addr_is_unspec(addr, addr_len)) { - udp_disconnect(udp->udp_pcb); - - return OK; - } - - if ((r = ipsock_get_dst_addr(udpsock_get_ipsock(udp), addr, - addr_len, &udp->udp_pcb->local_ip, &dst_addr, &dst_port)) != OK) - return r; - - /* - * Bind explicitly to a source address if the PCB is not bound to one - * yet. This is expected in the BSD socket API, but lwIP does not do - * it for us. - */ - if (ip_addr_isany(&udp->udp_pcb->local_ip)) { - /* Help the multicast case a bit, if possible. */ - ifdev = NULL; - - if (ip_addr_ismulticast(&dst_addr)) { - ifindex = pktsock_get_ifindex(&udp->udp_pktsock); - ifindex2 = udp_get_multicast_netif_index(udp->udp_pcb); - if (ifindex == 0) - ifindex = ifindex2; - - if (ifindex != 0) { - ifdev = ifdev_get_by_index(ifindex); - - if (ifdev == NULL) - return ENXIO; - } - } - - src_addr = ifaddr_select(&dst_addr, ifdev, NULL /*ifdevp*/); - - if (src_addr == NULL) - return EHOSTUNREACH; - - err = udp_bind(udp->udp_pcb, src_addr, - udp->udp_pcb->local_port); - - if (err != ERR_OK) - return util_convert_err(err); - } - - /* - * Connecting a UDP socket serves two main purposes: 1) the socket uses - * the address as destination when sending, and 2) the socket receives - * packets from only the connected address. - */ - err = udp_connect(udp->udp_pcb, &dst_addr, dst_port); - - if (err != ERR_OK) - return util_convert_err(err); - - return OK; -} - -/* - * Perform preliminary checks on a send request. - */ -static int -udpsock_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused, - const struct sockaddr * addr, socklen_t addr_len __unused, - endpoint_t user_endpt __unused, int flags) -{ - struct udpsock *udp = (struct udpsock *)sock; - - if ((flags & ~MSG_DONTROUTE) != 0) - return EOPNOTSUPP; - - if (!udpsock_is_conn(udp) && addr == NULL) - return EDESTADDRREQ; - - /* - * This is only one part of the length check. The rest is done from - * udpsock_send(), once we have more information. - */ - if (len > ipsock_get_sndbuf(udpsock_get_ipsock(udp))) - return EMSGSIZE; - - return OK; -} - -/* - * Swap IP-level options between the UDP PCB and the packet options structure, - * for all options that have their flag set in the packet options structure. - * This function is called twice when sending a packet. The result is that the - * flagged options are overridden for only the packet being sent. - */ -static void -udpsock_swap_opt(struct udpsock * udp, struct pktopt * pkto) -{ - uint8_t tos, ttl, mcast_ttl; - - if (pkto->pkto_flags & PKTOF_TOS) { - tos = udp->udp_pcb->tos; - udp->udp_pcb->tos = pkto->pkto_tos; - pkto->pkto_tos = tos; - } - - if (pkto->pkto_flags & PKTOF_TTL) { - ttl = udp->udp_pcb->ttl; - mcast_ttl = udp_get_multicast_ttl(udp->udp_pcb); - udp->udp_pcb->ttl = pkto->pkto_ttl; - udp_set_multicast_ttl(udp->udp_pcb, pkto->pkto_mcast_ttl); - pkto->pkto_ttl = ttl; - pkto->pkto_mcast_ttl = mcast_ttl; - } -} - -/* - * Send a packet on a UDP socket. - */ -static int -udpsock_send(struct sock * sock, const struct sockdriver_data * data, - size_t len, size_t * off, const struct sockdriver_data * ctl, - socklen_t ctl_len, socklen_t * ctl_off __unused, - const struct sockaddr * addr, socklen_t addr_len, - endpoint_t user_endpt __unused, int flags, size_t min __unused) -{ - struct udpsock *udp = (struct udpsock *)sock; - struct pktopt pktopt; - struct pbuf *pbuf; - struct ifdev *ifdev; - struct netif *netif; - const ip_addr_t *src_addrp, *dst_addrp; - ip_addr_t src_addr, dst_addr; /* for storage only; not always used! */ - uint16_t dst_port; - uint32_t ifindex; - size_t hdrlen; - err_t err; - int r; - - /* Copy in and parse any packet options. */ - pktopt.pkto_flags = 0; - - if ((r = pktsock_get_ctl(&udp->udp_pktsock, ctl, ctl_len, - &pktopt)) != OK) - return r; - - /* - * The code below will both determine an outgoing interface and a - * source address for the packet. Even though lwIP could do this for - * us in some cases, there are other cases where we must do so - * ourselves, with as main reasons 1) the possibility that either or - * both have been provided through IPV6_PKTINFO, and 2) our intent to - * detect and stop zone violations for (combinations of) scoped IPv6 - * addresses. As a result, it is easier to simply take over the - * selection tasks lwIP in their entirety. - * - * Much of the same applies to rawsock_send() as well. Functional - * differences (e.g. IP_HDRINCL support) as well as the PCB accesses in - * the code make it hard to merge the two into a single pktsock copy. - * Please do keep the two in sync as much as possible. - */ - - /* - * Start by checking whether the source address and/or the outgoing - * interface are overridden using sticky and/or ancillary options. The - * call to pktsock_get_pktinfo(), if successful, will either set - * 'ifdev' to NULL, in which case there is no override, or it will set - * 'ifdev' to the outgoing interface to use, and (only) in that case - * also fill 'src_addr', with an address that may either be a locally - * owned unicast address or the unspecified ('any') address. If it is - * a unicast address, that is the source address to use for the packet. - * Otherwise, fall back to the address to which the socket is bound, - * which may also be the unspecified address or even a multicast - * address. In those case we will pick a source address further below. - */ - if ((r = pktsock_get_pktinfo(&udp->udp_pktsock, &pktopt, &ifdev, - &src_addr)) != OK) - return r; - - if (ifdev != NULL && !ip_addr_isany(&src_addr)) { - /* This is guaranteed to be a proper local unicast address. */ - src_addrp = &src_addr; - } else { - src_addrp = &udp->udp_pcb->local_ip; - - /* - * If the socket is bound to a multicast address, use the - * unspecified ('any') address as source address instead, until - * we select a real source address (further below). This - * substitution keeps the rest of the code a bit simpler. - */ - if (ip_addr_ismulticast(src_addrp)) - src_addrp = IP46_ADDR_ANY(IP_GET_TYPE(src_addrp)); - } - - /* - * Determine the destination address to use. If the socket is - * connected, always ignore any address provided in the send call. - */ - if (!udpsock_is_conn(udp)) { - assert(addr != NULL); /* already checked in pre_send */ - - if ((r = ipsock_get_dst_addr(udpsock_get_ipsock(udp), addr, - addr_len, src_addrp, &dst_addr, &dst_port)) != OK) - return r; - - dst_addrp = &dst_addr; - } else { - dst_addrp = &udp->udp_pcb->remote_ip; - dst_port = udp->udp_pcb->remote_port; - } - - /* - * If the destination is a multicast address, select the outgoing - * interface based on the multicast interface index, if one is set. - * This must be done here in order to allow the code further below to - * detect zone violations, because if we leave this selection to lwIP, - * it will not perform zone violation detection at all. Also note that - * this case must *not* override an interface index already specified - * using IPV6_PKTINFO, as per RFC 3542 Sec. 6.7. - */ - if (ifdev == NULL && ip_addr_ismulticast(dst_addrp)) { - ifindex = udp_get_multicast_netif_index(udp->udp_pcb); - - if (ifindex != NETIF_NO_INDEX) - ifdev = ifdev_get_by_index(ifindex); /* (may fail) */ - } - - /* - * If an interface has been determined already now, the send operation - * will bypass routing. In that case, we must perform our own checks - * on address zone violations, because those will not be made anywhere - * else. Subsequent steps below will never introduce violations. - */ - if (ifdev != NULL && IP_IS_V6(dst_addrp)) { - if (ifaddr_is_zone_mismatch(ip_2_ip6(dst_addrp), ifdev)) - return EHOSTUNREACH; - - if (IP_IS_V6(src_addrp) && - ifaddr_is_zone_mismatch(ip_2_ip6(src_addrp), ifdev)) - return EHOSTUNREACH; - } - - /* - * If we do not yet have an interface at this point, perform a route - * lookup to determine the outgoing interface. Unless MSG_DONTROUTE is - * set (which covers SO_DONTROUTE as well), in which case we look for a - * local subnet that matches the destination address. - */ - if (ifdev == NULL) { - if (!(flags & MSG_DONTROUTE)) { - /* - * ip_route() should never be called with an - * IPADDR_TYPE_ANY type address. This is a lwIP- - * internal requirement; while we override both routing - * functions, we do not deviate from it. - */ - if (IP_IS_ANY_TYPE_VAL(*src_addrp)) - src_addrp = - IP46_ADDR_ANY(IP_GET_TYPE(dst_addrp)); - - /* Perform the route lookup. */ - if ((netif = ip_route(src_addrp, dst_addrp)) == NULL) - return EHOSTUNREACH; - - ifdev = netif_get_ifdev(netif); - } else { - if ((ifdev = ifaddr_map_by_subnet(dst_addrp)) == NULL) - return EHOSTUNREACH; - } - } - - /* - * At this point we have an outgoing interface. If we do not have a - * source address yet, pick one now. - */ - assert(ifdev != NULL); - - if (ip_addr_isany(src_addrp)) { - src_addrp = ifaddr_select(dst_addrp, ifdev, NULL /*ifdevp*/); - - if (src_addrp == NULL) - return EHOSTUNREACH; - } - - /* - * Now that we know the full conditions of what we are about to send, - * check whether the packet size leaves enough room for lwIP to prepend - * headers. If so, allocate a chain of pbufs for the packet. - */ - assert(len <= UDP_MAX_PAYLOAD); - - if (IP_IS_V6(dst_addrp)) - hdrlen = IP6_HLEN + UDP_HLEN; - else - hdrlen = IP_HLEN + UDP_HLEN; - - if (hdrlen + len > UDP_MAX_PAYLOAD) - return EMSGSIZE; - - if ((pbuf = pchain_alloc(PBUF_TRANSPORT, len)) == NULL) - return ENOBUFS; - - /* Copy in the packet data. */ - if ((r = pktsock_get_data(&udp->udp_pktsock, data, len, pbuf)) != OK) { - pbuf_free(pbuf); - - return r; - } - - /* - * Set broadcast/multicast flags for accounting purposes. Only the - * multicast flag is used for output accounting, but for loopback - * traffic, both flags are copied and used for input accounting and - * setting MSG_MCAST/MSG_BCAST. - */ - if (ip_addr_ismulticast(dst_addrp)) - pbuf->flags |= PBUF_FLAG_LLMCAST; - else if (ip_addr_isbroadcast(dst_addrp, ifdev_get_netif(ifdev))) - pbuf->flags |= PBUF_FLAG_LLBCAST; - - /* Send the packet. */ - udpsock_swap_opt(udp, &pktopt); - - assert(!ip_addr_isany(src_addrp)); - assert(!ip_addr_ismulticast(src_addrp)); - - err = udp_sendto_if_src(udp->udp_pcb, pbuf, dst_addrp, dst_port, - ifdev_get_netif(ifdev), src_addrp); - - udpsock_swap_opt(udp, &pktopt); - - /* Free the pbuf, as a copy has been made. */ - pbuf_free(pbuf); - - /* - * On success, make sure to return the size of the sent packet as well. - * As an aside: ctl_off need not be updated, as it is not returned. - */ - if ((r = util_convert_err(err)) == OK) - *off = len; - return r; -} - -/* - * Update the set of flag-type socket options on a UDP socket. - */ -static void -udpsock_setsockmask(struct sock * sock, unsigned int mask) -{ - struct udpsock *udp = (struct udpsock *)sock; - - if (mask & SO_REUSEADDR) - ip_set_option(udp->udp_pcb, SOF_REUSEADDR); - else - ip_reset_option(udp->udp_pcb, SOF_REUSEADDR); - - if (mask & SO_BROADCAST) - ip_set_option(udp->udp_pcb, SOF_BROADCAST); - else - ip_reset_option(udp->udp_pcb, SOF_BROADCAST); -} - -/* - * Prepare a helper structure for IP-level option processing. - */ -static void -udpsock_get_ipopts(struct udpsock * udp, struct ipopts * ipopts) -{ - - ipopts->local_ip = &udp->udp_pcb->local_ip; - ipopts->remote_ip = &udp->udp_pcb->remote_ip; - ipopts->tos = &udp->udp_pcb->tos; - ipopts->ttl = &udp->udp_pcb->ttl; - ipopts->sndmin = UDP_SNDBUF_MIN; - ipopts->sndmax = UDP_SNDBUF_MAX; - ipopts->rcvmin = UDP_RCVBUF_MIN; - ipopts->rcvmax = UDP_RCVBUF_MAX; -} - -/* - * Set socket options on a UDP socket. - */ -static int -udpsock_setsockopt(struct sock * sock, int level, int name, - const struct sockdriver_data * data, socklen_t len) -{ - struct udpsock *udp = (struct udpsock *)sock; - struct ipopts ipopts; - ip_addr_t ipaddr; - struct in_addr in_addr; - struct ifdev *ifdev; - unsigned int flags; - uint32_t ifindex; - uint8_t byte; - int r, val; - - /* - * Unfortunately, we have to duplicate most of the multicast options - * rather than sharing them with rawsock at the pktsock level. The - * reason is that each of the PCBs have their own multicast abstraction - * functions and so we cannot merge the rest. Same for getsockopt. - */ - - switch (level) { - case IPPROTO_IP: - if (udpsock_is_ipv6(udp)) - break; - - switch (name) { - case IP_MULTICAST_IF: - pktsock_set_mcaware(&udp->udp_pktsock); - - if ((r = sockdriver_copyin_opt(data, &in_addr, - sizeof(in_addr), len)) != OK) - return r; - - ip_addr_set_ip4_u32(&ipaddr, in_addr.s_addr); - - if ((ifdev = ifaddr_map_by_addr(&ipaddr)) == NULL) - return EADDRNOTAVAIL; - - udp_set_multicast_netif_index(udp->udp_pcb, - ifdev_get_index(ifdev)); - - return OK; - - case IP_MULTICAST_LOOP: - pktsock_set_mcaware(&udp->udp_pktsock); - - if ((r = sockdriver_copyin_opt(data, &byte, - sizeof(byte), len)) != OK) - return r; - - flags = udp_flags(udp->udp_pcb); - - if (byte) - flags |= UDP_FLAGS_MULTICAST_LOOP; - else - flags &= ~UDP_FLAGS_MULTICAST_LOOP; - - udp_setflags(udp->udp_pcb, flags); - - return OK; - - case IP_MULTICAST_TTL: - pktsock_set_mcaware(&udp->udp_pktsock); - - if ((r = sockdriver_copyin_opt(data, &byte, - sizeof(byte), len)) != OK) - return r; - - udp_set_multicast_ttl(udp->udp_pcb, byte); - - return OK; - } - - break; - - case IPPROTO_IPV6: - if (!udpsock_is_ipv6(udp)) - break; - - switch (name) { - case IPV6_MULTICAST_IF: - pktsock_set_mcaware(&udp->udp_pktsock); - - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val != 0) { - ifindex = (uint32_t)val; - - ifdev = ifdev_get_by_index(ifindex); - - if (ifdev == NULL) - return ENXIO; - } else - ifindex = NETIF_NO_INDEX; - - udp_set_multicast_netif_index(udp->udp_pcb, ifindex); - - return OK; - - case IPV6_MULTICAST_LOOP: - pktsock_set_mcaware(&udp->udp_pktsock); - - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val < 0 || val > 1) - return EINVAL; - - flags = udp_flags(udp->udp_pcb); - - if (val) - flags |= UDP_FLAGS_MULTICAST_LOOP; - else - flags &= ~UDP_FLAGS_MULTICAST_LOOP; - - /* - * lwIP's IPv6 functionality does not actually check - * this flag at all yet. We set it in the hope that - * one day this will magically start working. - */ - udp_setflags(udp->udp_pcb, flags); - - return OK; - - case IPV6_MULTICAST_HOPS: - pktsock_set_mcaware(&udp->udp_pktsock); - - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val < -1 || val > UINT8_MAX) - return EINVAL; - - if (val == -1) - val = 1; - - udp_set_multicast_ttl(udp->udp_pcb, val); - - return OK; - } - - break; - } - - /* Handle all other options at the packet or IP level. */ - udpsock_get_ipopts(udp, &ipopts); - - return pktsock_setsockopt(&udp->udp_pktsock, level, name, data, len, - &ipopts); -} - -/* - * Retrieve socket options on a UDP socket. - */ -static int -udpsock_getsockopt(struct sock * sock, int level, int name, - const struct sockdriver_data * data, socklen_t * len) -{ - struct udpsock *udp = (struct udpsock *)sock; - struct ipopts ipopts; - const ip4_addr_t *ip4addr; - struct in_addr in_addr; - struct ifdev *ifdev; - unsigned int flags; - uint32_t ifindex; - uint8_t byte; - int val; - - switch (level) { - case IPPROTO_IP: - if (udpsock_is_ipv6(udp)) - break; - - switch (name) { - case IP_MULTICAST_IF: - ifindex = udp_get_multicast_netif_index(udp->udp_pcb); - - /* - * Map back from the interface index to the IPv4 - * address assigned to the corresponding interface. - * Should this not work out, return the 'any' address. - */ - if (ifindex != NETIF_NO_INDEX && - (ifdev = ifdev_get_by_index(ifindex)) != NULL) { - ip4addr = - netif_ip4_addr(ifdev_get_netif(ifdev)); - - in_addr.s_addr = ip4_addr_get_u32(ip4addr); - } else - in_addr.s_addr = PP_HTONL(INADDR_ANY); - - return sockdriver_copyout_opt(data, &in_addr, - sizeof(in_addr), len); - - case IP_MULTICAST_LOOP: - flags = udp_flags(udp->udp_pcb); - - byte = !!(flags & UDP_FLAGS_MULTICAST_LOOP); - - return sockdriver_copyout_opt(data, &byte, - sizeof(byte), len); - - case IP_MULTICAST_TTL: - byte = udp_get_multicast_ttl(udp->udp_pcb); - - return sockdriver_copyout_opt(data, &byte, - sizeof(byte), len); - } - - break; - - case IPPROTO_IPV6: - if (!udpsock_is_ipv6(udp)) - break; - - switch (name) { - case IPV6_MULTICAST_IF: - ifindex = udp_get_multicast_netif_index(udp->udp_pcb); - - val = (int)ifindex; - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case IPV6_MULTICAST_LOOP: - flags = udp_flags(udp->udp_pcb); - - val = !!(flags & UDP_FLAGS_MULTICAST_LOOP); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case IPV6_MULTICAST_HOPS: - val = udp_get_multicast_ttl(udp->udp_pcb); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - } - - break; - } - - /* Handle all other options at the packet or IP level. */ - udpsock_get_ipopts(udp, &ipopts); - - return pktsock_getsockopt(&udp->udp_pktsock, level, name, data, len, - &ipopts); -} - -/* - * Retrieve the local socket address of a UDP socket. - */ -static int -udpsock_getsockname(struct sock * sock, struct sockaddr * addr, - socklen_t * addr_len) -{ - struct udpsock *udp = (struct udpsock *)sock; - - ipsock_put_addr(udpsock_get_ipsock(udp), addr, addr_len, - &udp->udp_pcb->local_ip, udp->udp_pcb->local_port); - - return OK; -} - -/* - * Retrieve the remote socket address of a UDP socket. - */ -static int -udpsock_getpeername(struct sock * sock, struct sockaddr * addr, - socklen_t * addr_len) -{ - struct udpsock *udp = (struct udpsock *)sock; - - if (!udpsock_is_conn(udp)) - return ENOTCONN; - - ipsock_put_addr(udpsock_get_ipsock(udp), addr, addr_len, - &udp->udp_pcb->remote_ip, udp->udp_pcb->remote_port); - - return OK; -} - -/* - * Shut down a UDP socket for reading and/or writing. - */ -static int -udpsock_shutdown(struct sock * sock, unsigned int mask) -{ - struct udpsock *udp = (struct udpsock *)sock; - - if (mask & SFL_SHUT_RD) - udp_recv(udp->udp_pcb, NULL, NULL); - - pktsock_shutdown(&udp->udp_pktsock, mask); - - return OK; -} - -/* - * Close a UDP socket. - */ -static int -udpsock_close(struct sock * sock, int force __unused) -{ - struct udpsock *udp = (struct udpsock *)sock; - - udp_recv(udp->udp_pcb, NULL, NULL); - - udp_remove(udp->udp_pcb); - udp->udp_pcb = NULL; - - pktsock_close(&udp->udp_pktsock); - - return OK; -} - -/* - * Free up a closed UDP socket. - */ -static void -udpsock_free(struct sock * sock) -{ - struct udpsock *udp = (struct udpsock *)sock; - - assert(udp->udp_pcb == NULL); - - SIMPLEQ_INSERT_HEAD(&udp_freelist, udp, udp_next); -} - -/* - * Fill the given kinfo_pcb sysctl(7) structure with information about the UDP - * PCB identified by the given pointer. - */ -static void -udpsock_get_info(struct kinfo_pcb * ki, const void * ptr) -{ - const struct udp_pcb *pcb = (const struct udp_pcb *)ptr; - struct udpsock *udp; - - ki->ki_type = SOCK_DGRAM; - - /* - * All UDP sockets should be created by this module, but protect - * ourselves from the case that that is not true anyway. - */ - if (pcb->recv_arg != NULL) { - udp = (struct udpsock *)pcb->recv_arg; - - assert(udp >= udp_array && - udp < &udp_array[__arraycount(udp_array)]); - } else - udp = NULL; - - ipsock_get_info(ki, &pcb->local_ip, pcb->local_port, &pcb->remote_ip, - pcb->remote_port); - - if (udp != NULL) { - /* TODO: change this so that sockstat(1) may work one day. */ - ki->ki_sockaddr = (uint64_t)(uintptr_t)udpsock_get_sock(udp); - - ki->ki_rcvq = pktsock_get_recvlen(&udp->udp_pktsock); - } -} - -/* - * Given either NULL or a previously returned UDP PCB pointer, return the first - * or next UDP PCB pointer, or NULL if there are no more. Skip UDP PCBs that - * are not bound to an address, as there is no use reporting them. - */ -static const void * -udpsock_enum(const void * last) -{ - const struct udp_pcb *pcb; - - if (last != NULL) - pcb = (const void *)((const struct udp_pcb *)last)->next; - else - pcb = (const void *)udp_pcbs; - - while (pcb != NULL && pcb->local_port == 0) - pcb = pcb->next; - - return pcb; -} - -/* - * Obtain the list of UDP protocol control blocks, for sysctl(7). - */ -static ssize_t -udpsock_pcblist(struct rmib_call * call, struct rmib_node * node __unused, - struct rmib_oldp * oldp, struct rmib_newp * newp __unused) -{ - - return util_pcblist(call, oldp, udpsock_enum, udpsock_get_info); -} - -static const struct sockevent_ops udpsock_ops = { - .sop_bind = udpsock_bind, - .sop_connect = udpsock_connect, - .sop_pre_send = udpsock_pre_send, - .sop_send = udpsock_send, - .sop_pre_recv = pktsock_pre_recv, - .sop_recv = pktsock_recv, - .sop_test_recv = pktsock_test_recv, - .sop_ioctl = ifconf_ioctl, - .sop_setsockmask = udpsock_setsockmask, - .sop_setsockopt = udpsock_setsockopt, - .sop_getsockopt = udpsock_getsockopt, - .sop_getsockname = udpsock_getsockname, - .sop_getpeername = udpsock_getpeername, - .sop_shutdown = udpsock_shutdown, - .sop_close = udpsock_close, - .sop_free = udpsock_free -}; diff --git a/minix/net/uds/io.c b/minix/net/uds/io.c deleted file mode 100644 index 0314e2dff..000000000 --- a/minix/net/uds/io.c +++ /dev/null @@ -1,1805 +0,0 @@ -/* UNIX Domain Sockets - io.c - sending and receiving */ - -#include "uds.h" -#include - -/* - * Our UDS sockets do not have a send buffer. They only have a receive buffer. - * This receive buffer, when not empty, is split up in segments. Each segment - * may contain regular data, ancillary data, both, or (for SOCK_SEQPACKET and - * (SOCK_DGRAM) neither. There are two types of ancillary data: in-flight file - * descriptors and sender credentials. In addition, for SOCK_DGRAM sockets, - * the segment may contain the sender's socket path (if the sender's socket is - * bound). Each segment has a header, containing the full segment size, the - * size of the actual data in the segment (if any), and a flags field that - * states which ancillary are associated with the segment (if any). For - * SOCK_STREAM type sockets, new data may be merged into a previous segment, - * but only if it has no ancillary data. For the other two socket types, each - * packet has its own header. The resulting behavior should be in line with - * the POSIX "Socket Receive Queue" specification. - * - * More specifically, each segment consists of the following parts: - * - always a five-byte header, containing a two-byte segment length (including - * the header, so always non-zero), a two-byte regular data length (zero or - * more), and a one-byte flags field which is a bitwise combination of - * UDS_HAS_{FD,CRED,PATH} flags; - * - next, if UDS_HAS_CRED is set in the segment header: a sockcred structure; - * since this structure is variable-size, the structure is prepended by a - * single byte that contains the length of the structure (excluding the byte - * itself, thus ranging from sizeof(struct sockcred) to UDS_MAXCREDLEN); - * - next, if UDS_HAS_PATH is set in the segment header: - * - next, if the data length is non-zero, the actual regular data. - * If the segment is not the last in the receive buffer, it is followed by the - * next segment immediately afterward. There is no alignment. - * - * It is the sender's responsibility to merge new data into the last segment - * whenever possible, so that the receiver side never needs to consider more - * than one segment at once. In order to allow such merging, each receive - * buffer has not only a tail and in-use length (pointing to the head when - * combined) but also an offset from the tail to the last header, if any. Note - * that the receiver may over time still look at multiple segments for a single - * request: this happens when a MSG_WAITALL request empties the buffer and then - * blocks - the next piece of arriving data can then obviously not be merged. - * - * If a segment has the UDS_HAS_FD flag set, then one or more in-flight file - * descriptors are associated with the segment. These are stored in a separate - * data structure, mainly to simplify cleaning up when the socket is shut down - * for reading or closed. That structure also contains the number of file - * descriptors associated with the current segment, so this is not stored in - * the segment itself. As mentioned later, this may be changed in the future. - * - * On the sender side, there is a trade-off between fully utilizing the receive - * buffer, and not repeatedly performing expensive actions for the same call: - * it may be costly to determine exactly how many in-flight file descriptors - * there will be (if any) and/or how much space is needed to store credentials. - * We currently use the policy that we rather block/reject a send request that - * may (just) have fit in the remaining part of the receive buffer, than obtain - * the same information multiple times or keep state between callbacks. In - * practice this is not expected to make a difference, especially since - * transfer of ancillary data should be rare anyway. - */ -/* - * The current layout of the segment header is as follows. - * - * The first byte contains the upper eight bits of the total segment length. - * The second byte contains the lower eight bits of the total segment length. - * The third byte contains the upper eight bits of the data length. - * The fourth byte contains the lower eight bits of the data length. - * The fifth byte is a bitmask for ancillary data associated with the segment. - */ -#define UDS_HDRLEN 5 - -#define UDS_HAS_FDS 0x01 /* segment has in-flight file descriptors */ -#define UDS_HAS_CRED 0x02 /* segment has sender credentials */ -#define UDS_HAS_PATH 0x04 /* segment has source socket path */ - -#define UDS_MAXCREDLEN SOCKCREDSIZE(NGROUPS_MAX) - -#define uds_get_head(uds) \ - ((size_t)((uds)->uds_tail + (uds)->uds_len) % UDS_BUF) -#define uds_get_last(uds) \ - ((size_t)((uds)->uds_tail + (uds)->uds_last) % UDS_BUF) -#define uds_advance(pos,add) (((pos) + (add)) % UDS_BUF) - -/* - * All in-flight file descriptors are (co-)owned by the UDS driver itself, as - * local open file descriptors. Like any other process, the UDS driver can not - * have more than OPEN_MAX open file descriptors at any time. Thus, this is - * also the inherent maximum number of in-flight file descriptors. Therefore, - * we maintain a single pool of in-flight FD structures, and we associate these - * structures with sockets as needed. - */ -static struct uds_fd uds_fds[OPEN_MAX]; -static SIMPLEQ_HEAD(uds_freefds, uds_fd) uds_freefds; - -static char uds_ctlbuf[UDS_CTL_MAX]; -static int uds_ctlfds[UDS_CTL_MAX / sizeof(int)]; - -/* - * Initialize the input/output part of the UDS service. - */ -void -uds_io_init(void) -{ - unsigned int slot; - - SIMPLEQ_INIT(&uds_freefds); - - for (slot = 0; slot < __arraycount(uds_fds); slot++) - SIMPLEQ_INSERT_TAIL(&uds_freefds, &uds_fds[slot], ufd_next); -} - -/* - * Set up all input/output state for the given socket, which has just been - * allocated. As part of this, allocate memory for the receive buffer of the - * socket. Return OK or a negative error code. - */ -int -uds_io_setup(struct udssock * uds) -{ - - /* TODO: decide if we should preallocate the memory. */ - if ((uds->uds_buf = mmap(NULL, UDS_BUF, PROT_READ | PROT_WRITE, - MAP_ANON | MAP_PRIVATE, -1, 0)) == MAP_FAILED) - return ENOMEM; - - uds->uds_tail = 0; - uds->uds_len = 0; - uds->uds_last = 0; - - SIMPLEQ_INIT(&uds->uds_fds); - - return OK; -} - -/* - * Clean up the input/output state for the given socket, which is about to be - * freed. As part of this, deallocate memory for the receive buffer and close - * any file descriptors still in flight on the socket. - */ -void -uds_io_cleanup(struct udssock * uds) -{ - - /* Close any in-flight file descriptors. */ - uds_io_reset(uds); - - /* Free the receive buffer memory. */ - if (munmap(uds->uds_buf, UDS_BUF) != 0) - panic("UDS: munmap failed: %d", errno); -} - -/* - * The socket is being closed or shut down for reading. If there are still any - * in-flight file descriptors, theey will never be received anymore, so close - * them now. - */ -void -uds_io_reset(struct udssock * uds) -{ - struct uds_fd *ufd; - - /* - * The UDS service may have the last and only reference to any of these - * file descriptors here. For that reason, we currently disallow - * transfer of UDS file descriptors, because the close(2) here could - * block on a socket close operation back to us, leading to a deadlock. - * Also, we use a non-blocking variant of close(2), to prevent that we - * end up hanging on sockets with SO_LINGER turned on. - */ - SIMPLEQ_FOREACH(ufd, &uds->uds_fds, ufd_next) { - dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); - - closenb(ufd->ufd_fd); - } - - SIMPLEQ_CONCAT(&uds_freefds, &uds->uds_fds); - - /* - * If this reset happens as part of a shutdown, it might be done - * again on close, so ensure that it will find a clean state. The - * receive buffer should never be looked at again either way, but reset - * it too just to be sure. - */ - uds->uds_tail = 0; - uds->uds_len = 0; - uds->uds_last = 0; - - SIMPLEQ_INIT(&uds->uds_fds); -} - -/* - * Return the maximum usable part of the receive buffer, in bytes. The return - * value is used for the SO_SNDBUF and SO_RCVBUF socket options. - */ -size_t -uds_io_buflen(void) -{ - - /* - * TODO: it would be nicer if at least for SOCK_STREAM-type sockets, we - * could use the full receive buffer for data. This would require that - * we store up to one header in the socket object rather than in the - * receive buffer. - */ - return UDS_BUF - UDS_HDRLEN; -} - -/* - * Fetch 'len' bytes starting from absolute position 'pos' into the receive - * buffer of socket 'uds', and copy them into the buffer pointed to by 'ptr'. - * Return the absolute position of the first byte after the fetched data in the - * receive buffer. - */ -static size_t -uds_fetch(struct udssock * uds, size_t off, void * ptr, size_t len) -{ - size_t left; - - assert(off < UDS_BUF); - - left = UDS_BUF - off; - if (len >= left) { - memcpy(ptr, &uds->uds_buf[off], left); - - if ((len -= left) > 0) - memcpy((char *)ptr + left, &uds->uds_buf[0], len); - - return len; - } else { - memcpy(ptr, &uds->uds_buf[off], len); - - return off + len; - } -} - -/* - * Store 'len' bytes from the buffer pointed to by 'ptr' into the receive - * buffer of socket 'uds', starting at absolute position 'pos' into the receive - * buffer. Return the absolute position of the first byte after the stored - * data in the receive buffer. - */ -static size_t -uds_store(struct udssock * uds, size_t off, const void * ptr, size_t len) -{ - size_t left; - - assert(off < UDS_BUF); - - left = UDS_BUF - off; - if (len >= left) { - memcpy(&uds->uds_buf[off], ptr, left); - - if ((len -= left) > 0) - memcpy(&uds->uds_buf[0], (const char *)ptr + left, - len); - - return len; - } else { - memcpy(&uds->uds_buf[off], ptr, len); - - return off + len; - } -} - -/* - * Fetch a segment header previously stored in the receive buffer of socket - * 'uds' at absolute position 'off'. Return the absolute position of the first - * byte after the header, as well as the entire segment length in 'seglen', the - * length of the data in the segment in 'datalen', and the segment flags in - * 'segflags'. - */ -static size_t -uds_fetch_hdr(struct udssock * uds, size_t off, size_t * seglen, - size_t * datalen, unsigned int * segflags) -{ - unsigned char hdr[UDS_HDRLEN]; - - off = uds_fetch(uds, off, hdr, sizeof(hdr)); - - *seglen = ((size_t)hdr[0] << 8) | (size_t)hdr[1]; - *datalen = ((size_t)hdr[2] << 8) | (size_t)hdr[3]; - *segflags = hdr[4]; - - assert(*seglen >= UDS_HDRLEN); - assert(*seglen <= uds->uds_len); - assert(*datalen <= *seglen - UDS_HDRLEN); - assert(*segflags != 0 || *datalen == *seglen - UDS_HDRLEN); - assert(!(*segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH))); - - return off; -} - -/* - * Store a segment header in the receive buffer of socket 'uds' at absolute - * position 'off', with the segment length 'seglen', the segment data length - * 'datalen', and the segment flags 'segflags'. Return the absolute receive - * buffer position of the first data byte after the stored header. - */ -static size_t -uds_store_hdr(struct udssock * uds, size_t off, size_t seglen, size_t datalen, - unsigned int segflags) -{ - unsigned char hdr[UDS_HDRLEN]; - - assert(seglen <= USHRT_MAX); - assert(datalen <= seglen); - assert(segflags <= UCHAR_MAX); - assert(!(segflags & ~(UDS_HAS_FDS | UDS_HAS_CRED | UDS_HAS_PATH))); - - hdr[0] = (seglen >> 8) & 0xff; - hdr[1] = seglen & 0xff; - hdr[2] = (datalen >> 8) & 0xff; - hdr[3] = datalen & 0xff; - hdr[4] = segflags; - - return uds_store(uds, off, hdr, sizeof(hdr)); -} - -/* - * Perform initial checks on a send request, before it may potentially be - * suspended. Return OK if this send request is valid, or a negative error - * code if it is not. - */ -int -uds_pre_send(struct sock * sock, size_t len, socklen_t ctl_len __unused, - const struct sockaddr * addr, socklen_t addr_len __unused, - endpoint_t user_endpt __unused, int flags) -{ - struct udssock *uds = (struct udssock *)sock; - size_t pathlen; - - /* - * Reject calls with unknown flags. Besides the flags handled entirely - * by libsockevent (which are not part of 'flags' here), that is all of - * them. TODO: ensure that we should really reject all other flags - * rather than ignore them. - */ - if (flags != 0) - return EOPNOTSUPP; - - /* - * Perform very basic address and message size checks on the send call. - * For non-stream sockets, we must reject packets that may never fit in - * the receive buffer, or otherwise (at least for SOCK_SEQPACKET) the - * send call may end up being suspended indefinitely. Therefore, we - * assume the worst-case scenario, which is that a full set of - * credentials must be associated with the packet. As a result, we may - * reject some large packets that could actually just fit. Checking - * the peer's LOCAL_CREDS setting here is not safe: even if we know the - * peer already at all (for SOCK_DGRAM we do not), the send may still - * block and the option toggled before it unblocks. - */ - switch (uds_get_type(uds)) { - case SOCK_STREAM: - /* Nothing to check for this case. */ - break; - - case SOCK_SEQPACKET: - if (len > UDS_BUF - UDS_HDRLEN - 1 - UDS_MAXCREDLEN) - return EMSGSIZE; - - break; - - case SOCK_DGRAM: - if (!uds_has_link(uds) && addr == NULL) - return EDESTADDRREQ; - - /* - * The path is stored without null terminator, but with leading - * byte containing the path length--if there is a path at all. - */ - pathlen = (size_t)uds->uds_pathlen; - if (pathlen > 0) - pathlen++; - - if (len > UDS_BUF - UDS_HDRLEN - pathlen - 1 - UDS_MAXCREDLEN) - return EMSGSIZE; - - break; - - default: - assert(0); - } - - return OK; -} - -/* - * Determine whether the (real or pretend) send request should be processed - * now, suspended until later, or rejected based on the current socket state. - * Return OK if the send request should be processed now. Return SUSPEND if - * the send request should be retried later. Return an appropriate negative - * error code if the send request should fail. - */ -static int -uds_send_test(struct udssock * uds, size_t len, socklen_t ctl_len, size_t min, - int partial) -{ - struct udssock *conn; - size_t avail, hdrlen, credlen; - - assert(!uds_is_shutdown(uds, SFL_SHUT_WR)); - - if (uds_get_type(uds) != SOCK_DGRAM) { - if (uds_is_connecting(uds)) - return SUSPEND; - if (!uds_is_connected(uds) && !uds_is_disconnected(uds)) - return ENOTCONN; - if (!uds_has_conn(uds)) - return EPIPE; - - conn = uds->uds_conn; - - if (uds_is_shutdown(conn, SFL_SHUT_RD)) - return EPIPE; - - /* - * For connection-type sockets, we now have to check if there - * is enough room in the receive buffer. For SOCK_STREAM - * sockets, we must check if at least 'min' bytes can be moved - * into the receive buffer, at least if that is a reasonable - * value for ever making any forward progress at all. For - * SOCK_SEQPACKET sockets, we must check if the entire packet - * of size 'len' can be stored in the receive buffer. In both - * cases, we must take into account any metadata to store along - * with the data. - * - * Unlike in uds_pre_send(), we can now check safely whether - * the peer is expecting credentials, but we still don't know - * the actual size of the credentials, so again we take the - * maximum possible size. The same applies to file descriptors - * transferred via control data: all we have the control length - * right now, which if non-zero we assume to mean there might - * be file descriptors. - * - * In both cases, the reason of overestimating is that actually - * getting accurate sizes, by obtaining credentials or copying - * in control data, is very costly. We want to do that only - * when we are sure we will not suspend the send call after - * all. It is no problem to overestimate how much space will - * be needed here, but not to underestimate: that could cause - * applications that use select(2) and non-blocking sockets to - * end up in a busy-wait loop. - */ - if (!partial && (conn->uds_flags & UDSF_PASSCRED)) - credlen = 1 + UDS_MAXCREDLEN; - else - credlen = 0; - - avail = UDS_BUF - conn->uds_len; - - if (uds_get_type(uds) == SOCK_STREAM) { - /* - * Limit the low threshold to the maximum that can ever - * be sent at once. - */ - if (min > UDS_BUF - UDS_HDRLEN - credlen) - min = UDS_BUF - UDS_HDRLEN - credlen; - - /* - * Suspend the call only if not even the low threshold - * is met. Otherwise we may make (partial) progress. - */ - if (len > min) - len = min; - - /* - * If the receive buffer already has at least one - * segment, and there are certainly no file descriptors - * to transfer now, and we do not have to store - * credentials either, then this segment can be merged - * with the previous one. In that case, we need no - * space for a header. That is certainly the case if - * we are resuming an already partially completed send. - */ - hdrlen = (avail == UDS_BUF || ctl_len != 0 || - credlen > 0) ? UDS_HDRLEN : 0; - } else - hdrlen = UDS_HDRLEN; - - if (avail < hdrlen + credlen + len) - return SUSPEND; - } - - return OK; -} - -/* - * Get the destination peer for a send request. The send test has already been - * performed first. On success, return OK, with a pointer to the peer socket - * stored in 'peerp'. On failure, return an appropriate error code. - */ -static int -uds_send_peer(struct udssock * uds, const struct sockaddr * addr, - socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp) -{ - struct udssock *peer; - int r; - - if (uds_get_type(uds) == SOCK_DGRAM) { - if (!uds_has_link(uds)) { - /* This was already checked in uds_pre_check(). */ - assert(addr != NULL); - - /* - * Find the socket identified by the given address. - * If it exists at all, see if it is a proper match. - */ - if ((r = uds_lookup(uds, addr, addr_len, user_endpt, - &peer)) != OK) - return r; - - /* - * If the peer socket is connected to a target, it - * must be this socket. Unfortunately, POSIX does not - * specify an error code for this. We borrow Linux's. - */ - if (uds_has_link(peer) && peer->uds_link != uds) - return EPERM; - } else - peer = uds->uds_link; - - /* - * If the receiving end will never receive this packet, we - * might as well not send it, so drop it immediately. Indicate - * this condition to the caller using the MINIX error code for a - * full buffer. - */ - if (uds_is_shutdown(peer, SFL_SHUT_RD)) - return ENOBUFS; - } else { - assert(uds_has_conn(uds)); - - peer = uds->uds_conn; - } - - *peerp = peer; - return OK; -} - -/* - * Generate a new segment for the current send request, or arrange things such - * that new data can be merged with a previous segment. As part of this, - * decide whether we can merge data at all. The segment will be merged if, and - * only if, all of the following requirements are met: - * - * 1) the socket is of type SOCK_STREAM; - * 2) there is a previous segment in the receive buffer; - * 3) there is no ancillary data for the current send request. - * - * Also copy in regular data (if any), retrieve the sender's credentials (if - * needed), and copy over the source path (if applicable). However, do not yet - * commit the segment (or the new part to be merged), because the send request - * may still fail for other reasons. - * - * On success, return the length of the new segment (or, when merging, the - * length to be added to the last segment), as well as a flag indicating - * whether we are merging into the last segment in 'mergep', the length of the - * (new) data in the segment in 'datalenp', and the new segment's flags in - * 'segflagsp' (always zero when merging). Note that a return value of zero - * implies that we are merging zero extra bytes into the last segment, which - * means that effectively nothing changes; in that case the send call will be - * cut short and return zero to the caller as well. On failure, return a - * negative error code. - */ -static int -uds_send_data(struct udssock * uds, struct udssock * peer, - const struct sockdriver_data * data, size_t len, size_t off, - endpoint_t user_endpt, unsigned int nfds, int * __restrict mergep, - size_t * __restrict datalenp, unsigned int * __restrict segflagsp) -{ - struct sockcred sockcred; - gid_t groups[NGROUPS_MAX]; - iovec_t iov[2]; - unsigned int iovcnt, segflags; - unsigned char lenbyte; - size_t credlen, pathlen, datalen, seglen; - size_t avail, pos, left; - int r, merge; - - /* - * At this point we should add the data to the peer's receive buffer. - * In the case of SOCK_STREAM sockets, we should add as much of the - * data as possible and suspend the call to send the rest later, if - * applicable. In the case of SOCK_DGRAM sockets, we should drop the - * packet if it does not fit in the buffer. - * - * Due to the checks in uds_can_send(), we know for sure that we no - * longer have to suspend without making any progress at this point. - */ - segflags = (nfds > 0) ? UDS_HAS_FDS : 0; - - /* - * Obtain the credentials now. Doing so allows us to determine how - * much space we actually need for them. - */ - if (off == 0 && (peer->uds_flags & UDSF_PASSCRED)) { - memset(&sockcred, 0, sizeof(sockcred)); - - if ((r = getsockcred(user_endpt, &sockcred, groups, - __arraycount(groups))) != OK) - return r; - - /* - * getsockcred(3) returns the total number of groups for the - * process, which may exceed the size of the given array. Our - * groups array should always be large enough for all groups, - * but we check to be sure anyway. - */ - assert(sockcred.sc_ngroups <= (int)__arraycount(groups)); - - credlen = 1 + SOCKCREDSIZE(sockcred.sc_ngroups); - - segflags |= UDS_HAS_CRED; - } else - credlen = 0; - - /* For bound source datagram sockets, include the source path. */ - if (uds_get_type(uds) == SOCK_DGRAM && uds->uds_pathlen != 0) { - pathlen = (size_t)uds->uds_pathlen + 1; - - segflags |= UDS_HAS_PATH; - } else - pathlen = 0; - - avail = UDS_BUF - peer->uds_len; - - if (uds_get_type(uds) == SOCK_STREAM) { - /* - * Determine whether we can merge data into the previous - * segment. This is a more refined version of the test in - * uds_can_send(), as we now know whether there are actually - * any FDs to transfer. - */ - merge = (peer->uds_len != 0 && nfds == 0 && credlen == 0); - - /* Determine how much we can send at once. */ - if (!merge) { - assert(avail > UDS_HDRLEN + credlen); - datalen = avail - UDS_HDRLEN - credlen; - } else - datalen = avail; - - if (datalen > len) - datalen = len; - - /* If we cannot make progress, we should have suspended.. */ - assert(datalen != 0 || len == 0); - } else { - merge = FALSE; - - datalen = len; - } - assert(datalen <= len); - assert(datalen <= UDS_BUF); - - /* - * Compute the total amount of space we need for the segment in the - * receive buffer. Given that we have done will-it-fit tests in - * uds_can_send() for SOCK_STREAM and SOCK_SEQPACKET, there is only one - * case left where the result may not fit, and that is for SOCK_DGRAM - * packets. In that case, we drop the packet. POSIX says we should - * throw an error in that case, and that is also what NetBSD does. - */ - if (!merge) - seglen = UDS_HDRLEN + credlen + pathlen + datalen; - else - seglen = datalen; - - if (seglen > avail) { - assert(uds_get_type(uds) == SOCK_DGRAM); - - /* Drop the packet and return the MINIX error code for - * insufficient buffer space. */ - return ENOBUFS; - } - - /* - * Generate the full segment, but do not yet update the buffer head. - * We may still run into an error (copying in file descriptors) or even - * decide that nothing gets sent after all (if there are no data or - * file descriptors). If we are merging the new data into the previous - * segment, do not generate a header. - */ - pos = uds_get_head(peer); - - /* Generate the header, if needed. */ - if (!merge) - pos = uds_store_hdr(peer, pos, seglen, datalen, segflags); - else - assert(segflags == 0); - - /* Copy in and store the sender's credentials, if desired. */ - if (credlen > 0) { - assert(credlen >= 1 + sizeof(sockcred)); - assert(credlen <= UCHAR_MAX); - - lenbyte = credlen - 1; - pos = uds_store(peer, pos, &lenbyte, 1); - - if (sockcred.sc_ngroups > 0) { - pos = uds_store(peer, pos, &sockcred, - offsetof(struct sockcred, sc_groups)); - pos = uds_store(peer, pos, groups, - sockcred.sc_ngroups * sizeof(gid_t)); - } else - pos = uds_store(peer, pos, &sockcred, - sizeof(sockcred)); - } - - /* Store the sender's address if any. Datagram sockets only. */ - if (pathlen > 0) { - assert(pathlen > 1); - assert(pathlen <= UCHAR_MAX); - - lenbyte = uds->uds_pathlen; - pos = uds_store(peer, pos, &lenbyte, 1); - pos = uds_store(peer, pos, uds->uds_path, pathlen - 1); - } - - /* Lastly, copy in the actual data (if any) from the caller. */ - if (datalen > 0) { - iov[0].iov_addr = (vir_bytes)&peer->uds_buf[pos]; - left = UDS_BUF - pos; - - if (left < datalen) { - assert(left > 0); - iov[0].iov_size = left; - iov[1].iov_addr = (vir_bytes)&peer->uds_buf[0]; - iov[1].iov_size = datalen - left; - iovcnt = 2; - } else { - iov[0].iov_size = datalen; - iovcnt = 1; - } - - if ((r = sockdriver_vcopyin(data, off, iov, iovcnt)) != OK) - return r; - } - - *mergep = merge; - *datalenp = datalen; - *segflagsp = segflags; - return seglen; -} - -/* - * Copy in control data for the current send request, and extract any file - * descriptors to be transferred. Do not yet duplicate the file descriptors, - * but rather store a list in a temporary buffer: the send request may still - * fail in which case we want to avoid having to undo the duplication. - * - * On success, return the number of (zero or more) file descriptors extracted - * from the request and stored in the temporary buffer. On failure, return a - * negative error code. - */ -static int -uds_send_ctl(const struct sockdriver_data * ctl, socklen_t ctl_len, - endpoint_t user_endpt) -{ - struct msghdr msghdr; - struct cmsghdr *cmsg; - socklen_t left; - unsigned int i, n, nfds; - int r; - - /* - * Copy in the control data. We can spend a lot of effort copying in - * the data in small chunks, and change the receiving side to do the - * same, but it is really not worth it: applications never send a whole - * lot of file descriptors at once, and the buffer size is currently - * such that the UDS service itself will exhaust its OPEN_MAX limit - * anyway if they do. - */ - if (ctl_len > sizeof(uds_ctlbuf)) - return ENOBUFS; - - if ((r = sockdriver_copyin(ctl, 0, uds_ctlbuf, ctl_len)) != OK) - return r; - - if (ctl_len < sizeof(uds_ctlbuf)) - memset(&uds_ctlbuf[ctl_len], 0, sizeof(uds_ctlbuf) - ctl_len); - - /* - * Look for any file descriptors, and store their remote file - * descriptor numbers into a temporary array. - */ - memset(&msghdr, 0, sizeof(msghdr)); - msghdr.msg_control = uds_ctlbuf; - msghdr.msg_controllen = ctl_len; - - nfds = 0; - r = OK; - - /* - * The sender may provide file descriptors in multiple chunks. - * Currently we do not preserve these chunk boundaries, instead - * generating one single chunk with all file descriptors for the - * segment upon receipt. If needed, we can fairly easily adapt this - * later. - */ - for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL; - cmsg = CMSG_NXTHDR(&msghdr, cmsg)) { - /* - * Check for bogus lengths. There is no excuse for this; - * either the caller does not know what they are doing or we - * are looking at a hacking attempt. - */ - assert((socklen_t)((char *)cmsg - uds_ctlbuf) <= ctl_len); - left = ctl_len - (socklen_t)((char *)cmsg - uds_ctlbuf); - assert(left >= CMSG_LEN(0)); /* guaranteed by CMSG_xxHDR */ - - if (cmsg->cmsg_len < CMSG_LEN(0) || cmsg->cmsg_len > left) { - printf("UDS: malformed control data from %u\n", - user_endpt); - r = EINVAL; - break; - } - - if (cmsg->cmsg_level != SOL_SOCKET || - cmsg->cmsg_type != SCM_RIGHTS) - continue; - - n = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int); - - for (i = 0; i < n; i++) { - /* - * Copy the file descriptor to the temporary buffer, - * whose size is based on the control data buffer, so - * it is always large enough to contain all FDs. - */ - assert(nfds < __arraycount(uds_ctlfds)); - - memcpy(&uds_ctlfds[nfds], - &((int *)CMSG_DATA(cmsg))[i], sizeof(int)); - - nfds++; - } - } - - return nfds; -} - -/* - * Actually duplicate any file descriptors that we extracted from the sender's - * control data and stored in our temporary buffer. On success, return OK, - * with all file descriptors stored in file descriptor objects that are - * appended to the socket's list of in-flight FD objects. Thus, on success, - * the send request may no longer fail. On failure, return a negative error - * code, with any partial duplication undone. - */ -static int -uds_send_fds(struct udssock * peer, unsigned int nfds, endpoint_t user_endpt) -{ - SIMPLEQ_HEAD(, uds_fd) fds; - struct uds_fd *ufd; - unsigned int i; - int r; - - SIMPLEQ_INIT(&fds); - - for (i = 0; i < nfds; i++) { - if (SIMPLEQ_EMPTY(&uds_freefds)) { - /* UDS itself may already have OPEN_MAX FDs. */ - r = ENFILE; - break; - } - - /* - * The caller may have given an invalid FD, or UDS itself may - * unexpectedly have run out of available file descriptors etc. - */ - if ((r = copyfd(user_endpt, uds_ctlfds[i], COPYFD_FROM)) < 0) - break; - - ufd = SIMPLEQ_FIRST(&uds_freefds); - SIMPLEQ_REMOVE_HEAD(&uds_freefds, ufd_next); - - ufd->ufd_fd = r; - ufd->ufd_count = 0; - - SIMPLEQ_INSERT_TAIL(&fds, ufd, ufd_next); - - dprintf(("UDS: copied in fd %d -> %d\n", uds_ctlfds[i], r)); - } - - /* Did we experience an error while copying in the file descriptors? */ - if (r < 0) { - /* Revert the successful copyfd() calls made so far. */ - SIMPLEQ_FOREACH(ufd, &fds, ufd_next) { - dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); - - closenb(ufd->ufd_fd); - } - - SIMPLEQ_CONCAT(&uds_freefds, &fds); - - return r; - } - - /* - * Success. If there were any file descriptors at all, add them to the - * peer's list of in-flight file descriptors. Assign the number of - * file descriptors copied in to the first file descriptor object, so - * that we know how many to copy out (or discard) for this segment. - * Also set the UDS_HAS_FDS flag on the segment. - */ - ufd = SIMPLEQ_FIRST(&fds); - ufd->ufd_count = nfds; - - SIMPLEQ_CONCAT(&peer->uds_fds, &fds); - - return OK; -} - -/* - * The current send request is successful or at least has made progress. - * Commit the new segment or, if we decided to merge the new data into the last - * segment, update the header of the last segment. Also wake up the receiving - * side, because there will now be new data to receive. - */ -static void -uds_send_advance(struct udssock * uds, struct udssock * peer, size_t datalen, - int merge, size_t seglen, unsigned int segflags) -{ - size_t pos, prevseglen, prevdatalen; - - /* - * For non-datagram sockets, credentials are sent only once after - * setting the LOCAL_CREDS option. After that, the option is unset. - */ - if ((segflags & UDS_HAS_CRED) && uds_get_type(uds) != SOCK_DGRAM) - peer->uds_flags &= ~UDSF_PASSCRED; - - if (merge) { - assert(segflags == 0); - - pos = uds_get_last(peer); - - (void)uds_fetch_hdr(peer, pos, &prevseglen, &prevdatalen, - &segflags); - - peer->uds_len += seglen; - assert(peer->uds_len <= UDS_BUF); - - seglen += prevseglen; - datalen += prevdatalen; - assert(seglen <= UDS_BUF); - - uds_store_hdr(peer, pos, seglen, datalen, segflags); - } else { - peer->uds_last = peer->uds_len; - - peer->uds_len += seglen; - assert(peer->uds_len <= UDS_BUF); - } - - /* Now that there are new data, wake up the receiver side. */ - sockevent_raise(&peer->uds_sock, SEV_RECV); -} - -/* - * Process a send request. Return OK if the send request has successfully - * completed, SUSPEND if it should be tried again later, or a negative error - * code on failure. In all cases, the values of 'off' and 'ctl_off' must be - * updated if any progress has been made; if either is non-zero, libsockevent - * will return the partial progress rather than an error code. - */ -int -uds_send(struct sock * sock, const struct sockdriver_data * data, size_t len, - size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len, - socklen_t * ctl_off, const struct sockaddr * addr, socklen_t addr_len, - endpoint_t user_endpt, int flags __unused, size_t min) -{ - struct udssock *uds = (struct udssock *)sock; - struct udssock *peer; - size_t seglen, datalen = 0 /*gcc*/; - unsigned int nfds, segflags = 0 /*gcc*/; - int r, partial, merge = 0 /*gcc*/; - - dprintf(("UDS: send(%d,%zu,%zu,%u,%u,0x%x)\n", - uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len, - (ctl_off != NULL) ? *ctl_off : 0, flags)); - - partial = (off != NULL && *off > 0); - - /* - * First see whether we can process this send call at all right now. - * Most importantly, for connected sockets, if the peer's receive - * buffer is full, we may have to suspend the call until some space has - * been freed up. - */ - if ((r = uds_send_test(uds, len, ctl_len, min, partial)) != OK) - return r; - - /* - * Then get the peer socket. For connected sockets, this is trivial. - * For unconnected sockets, it may involve a lookup of the given - * address. - */ - if ((r = uds_send_peer(uds, addr, addr_len, user_endpt, &peer)) != OK) - return r; - - /* - * We now know for sure that we will not suspend this call without - * making any progress. However, the call may still fail. Copy in - * control data first now, so that we know whether there are any file - * descriptors to transfer. This aspect may determine whether or not - * we can merge data with a previous segment. Do not actually copy in - * the actual file descriptors yet, because that is much harder to undo - * in case of a failure later on. - */ - if (ctl_len > 0) { - /* We process control data once, in full. */ - assert(*ctl_off == 0); - - if ((r = uds_send_ctl(ctl, ctl_len, user_endpt)) < 0) - return r; - nfds = (unsigned int)r; - } else - nfds = 0; - - /* - * Now generate a new segment, or (if possible) merge new data into the - * last segment. Since the call may still fail, prepare the segment - * but do not update the buffer head yet. Note that the segment - * contains not just regular data (in fact it may contain no data at - * all) but (also) certain ancillary data. - */ - if ((r = uds_send_data(uds, peer, data, len, *off, user_endpt, nfds, - &merge, &datalen, &segflags)) <= 0) - return r; - seglen = (size_t)r; - - /* - * If we extracted any file descriptors from the control data earlier, - * copy them over to ourselves now. The resulting in-flight file - * descriptors are stored in a separate data structure. This is the - * last point where the send call may actually fail. - */ - if (nfds > 0) { - if ((r = uds_send_fds(peer, nfds, user_endpt)) != OK) - return r; - } - - /* - * The transmission is now known to be (partially) successful. Commit - * the new work by moving the receive buffer head. - */ - uds_send_advance(uds, peer, datalen, merge, seglen, segflags); - - /* - * Register the result. For stream-type sockets, the expected behavior - * is that all data be sent, and so we may still have to suspend the - * call after partial progress. Otherwise, we are now done. Either - * way, we are done with the control data, so mark it as consumed. - */ - *off += datalen; - *ctl_off += ctl_len; - if (uds_get_type(uds) == SOCK_STREAM && datalen < len) - return SUSPEND; - else - return OK; -} - -/* - * Test whether a send request would block. The given 'min' parameter contains - * the minimum number of bytes that should be possible to send without blocking - * (the low send watermark). Return SUSPEND if the send request would block, - * or any other error code if it would not. - */ -int -uds_test_send(struct sock * sock, size_t min) -{ - struct udssock *uds = (struct udssock *)sock; - - return uds_send_test(uds, min, 0, min, FALSE /*partial*/); -} - -/* - * Perform initial checks on a receive request, before it may potentially be - * suspended. Return OK if this receive request is valid, or a negative error - * code if it is not. - */ -int -uds_pre_recv(struct sock * sock __unused, endpoint_t user_endpt __unused, - int flags) -{ - - /* - * Reject calls with unknown flags. TODO: ensure that we should really - * reject all other flags rather than ignore them. - */ - if ((flags & ~(MSG_PEEK | MSG_WAITALL | MSG_CMSG_CLOEXEC)) != 0) - return EOPNOTSUPP; - - return OK; -} - -/* - * Determine whether the (real or pretend) receive request should be processed - * now, suspended until later, or rejected based on the current socket state. - * Return OK if the receive request should be processed now, along with a first - * indication whether the call may still be suspended later in 'may_block'. - * Return SUSPEND if the receive request should be retried later. Return an - * appropriate negative error code if the receive request should fail. - */ -static int -uds_recv_test(struct udssock * uds, size_t len, size_t min, int partial, - int * may_block) -{ - size_t seglen, datalen; - unsigned int segflags; - int r; - - /* - * If there are any pending data, those should always be received - * first. However, if there is nothing to receive, then whether we - * should suspend the receive call or fail immediately depends on other - * conditions. We first look at these other conditions. - */ - r = OK; - - if (uds_get_type(uds) != SOCK_DGRAM) { - if (uds_is_connecting(uds)) - r = SUSPEND; - else if (!uds_is_connected(uds) && !uds_is_disconnected(uds)) - r = ENOTCONN; - else if (!uds_has_conn(uds) || - uds_is_shutdown(uds->uds_conn, SFL_SHUT_WR)) - r = SOCKEVENT_EOF; - } - - if (uds->uds_len == 0) { - /* - * For stream-type sockets, we use the policy: if no regular - * data is requested, then end the call without receiving - * anything. For packet-type sockets, the request should block - * until there is a packet to discard, though. - */ - if (r != OK || (uds_get_type(uds) == SOCK_STREAM && len == 0)) - return r; - - return SUSPEND; - } - - /* - * For stream-type sockets, we should still suspend the call if fewer - * than 'min' bytes are available right now, and there is a possibility - * that more data may arrive later. More may arrive later iff 'r' is - * OK (i.e., no EOF or error will follow) and, in case we already - * received some partial results, there is not already a next segment - * with ancillary data (i.e, nonzero segment flags), or in any case - * there isn't more than one segment in the buffer. Limit 'min' to the - * maximum that can ever be received, though. Since that is difficult - * in our case, we check whether the buffer is entirely full instead. - */ - if (r == OK && uds_get_type(uds) == SOCK_STREAM && min > 0 && - uds->uds_len < UDS_BUF) { - assert(uds->uds_len >= UDS_HDRLEN); - - (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, &datalen, - &segflags); - - if (datalen < min && seglen == uds->uds_len && - (!partial || segflags == 0)) - return SUSPEND; - } - - /* - * Also start the decision process as to whether we should suspend the - * current call if MSG_WAITALL is given. Unfortunately there is no one - * place where we can conveniently do all the required checks. - */ - if (may_block != NULL) - *may_block = (r == OK && uds_get_type(uds) == SOCK_STREAM); - return OK; -} - -/* - * Receive regular data, and possibly the source path, from the tail segment in - * the receive buffer. On success, return the positive non-zero length of the - * tail segment, with 'addr' and 'addr_len' modified to store the source - * address if applicable, the result flags in 'rflags' updated as appropriate, - * the tail segment's data length stored in 'datalen', the number of received - * regular data bytes stored in 'reslen', the segment flags stored in - * 'segflags', and the absolute receive buffer position of the credentials in - * the segment stored in 'credpos' if applicable. Since the receive call may - * still fail, this function must not yet update the tail or any other aspect - * of the receive buffer. Return zero if the current receive call was already - * partially successful (due to MSG_WAITALL) and can no longer make progress, - * and thus should be ended. Return a negative error code on failure. - */ -static int -uds_recv_data(struct udssock * uds, const struct sockdriver_data * data, - size_t len, size_t off, struct sockaddr * addr, socklen_t * addr_len, - int * __restrict rflags, size_t * __restrict datalen, - size_t * __restrict reslen, unsigned int * __restrict segflags, - size_t * __restrict credpos) -{ - iovec_t iov[2]; - unsigned char lenbyte; - unsigned int iovcnt; - size_t pos, seglen, left; - int r; - - pos = uds_fetch_hdr(uds, uds->uds_tail, &seglen, datalen, segflags); - - /* - * If a partially completed receive now runs into a segment that cannot - * be logically merged with the previous one (because it has at least - * one segment flag set, meaning it has ancillary data), then we must - * shortcut the receive now. - */ - if (off != 0 && *segflags != 0) - return OK; - - /* - * As stated, for stream-type sockets, we choose to ignore zero-size - * receive calls. This has the consequence that reading a zero-sized - * segment (with ancillary data) requires a receive request for at - * least one regular data byte. Such a receive call would then return - * zero. The problem with handling zero-data receive requests is that - * we need to know whether the current segment is terminated (i.e., no - * more data can possibly be merged into it later), which is a test - * that we rather not perform, not in the least because we do not know - * whether there is an error pending on the socket. - * - * For datagrams, we currently allow a zero-size receive call to - * discard the next datagram. - * - * TODO: compare this against policies on other platforms. - */ - if (len == 0 && uds_get_type(uds) == SOCK_STREAM) - return OK; - - /* - * We have to skip the credentials for now: these are copied out as - * control data, and thus will (well, may) be looked at when dealing - * with the control data. For the same reason, we do not even look at - * UDS_HAS_FDS here. - */ - if (*segflags & UDS_HAS_CRED) { - *credpos = pos; - - pos = uds_fetch(uds, pos, &lenbyte, 1); - pos = uds_advance(pos, (size_t)lenbyte); - } - - /* - * Copy out the source address, but only if the (datagram) socket is - * not connected. TODO: even when it is connected, it may still - * receive packets sent to it from other sockets *before* being - * connected, and the receiver has no way of knowing that those packets - * did not come from its new peer. Ideally, the older packets should - * be dropped.. - */ - if (*segflags & UDS_HAS_PATH) { - pos = uds_fetch(uds, pos, &lenbyte, 1); - - if (uds_get_type(uds) == SOCK_DGRAM && !uds_has_link(uds)) - uds_make_addr((const char *)&uds->uds_buf[pos], - (size_t)lenbyte, addr, addr_len); - - pos = uds_advance(pos, (size_t)lenbyte); - } - - /* - * We can receive no more data than those that are present in the - * segment, obviously. For stream-type sockets, any more data that - * could have been received along with the current data would have been - * merged in the current segment, so we need not search for any next - * segments. - * - * For non-stream sockets, the caller may receive less than a whole - * packet if it supplied a small buffer. In that case, the rest of the - * packet will be discarded (but not here yet!) and the caller gets - * the MSG_TRUNC flag in its result, if it was using sendmsg(2) anyway. - */ - if (len > *datalen) - len = *datalen; - else if (len < *datalen && uds_get_type(uds) != SOCK_STREAM) - *rflags |= MSG_TRUNC; - - /* Copy out the data to the caller. */ - if (len > 0) { - iov[0].iov_addr = (vir_bytes)&uds->uds_buf[pos]; - left = UDS_BUF - pos; - - if (left < len) { - iov[0].iov_size = left; - iov[1].iov_addr = (vir_bytes)&uds->uds_buf[0]; - iov[1].iov_size = len - left; - iovcnt = 2; - } else { - iov[0].iov_size = len; - iovcnt = 1; - } - - if ((r = sockdriver_vcopyout(data, off, iov, iovcnt)) != OK) - return r; - } - - *reslen = len; - assert(seglen > 0 && seglen <= INT_MAX); - return (int)seglen; -} - -/* - * The current segment has associated file descriptors. If possible, copy out - * all file descriptors to the receiver, and generate and copy out a chunk of - * control data that contains their file descriptor numbers. If not all - * file descriptors fit in the receiver's buffer, or if any error occurs, no - * file descriptors are copied out. - */ -static int -uds_recv_fds(struct udssock * uds, const struct sockdriver_data * ctl, - socklen_t ctl_len, socklen_t ctl_off, endpoint_t user_endpt, int flags) -{ - struct msghdr msghdr; - struct cmsghdr *cmsg; - struct uds_fd *ufd; - unsigned int i, nfds; - socklen_t chunklen, chunkspace; - int r, fd, what; - - /* See how many file descriptors should be part of this chunk. */ - assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); - ufd = SIMPLEQ_FIRST(&uds->uds_fds); - nfds = ufd->ufd_count; - assert(nfds > 0); - - /* - * We produce and copy out potentially unaligned chunks, using - * CMSG_LEN, but return the aligned size at the end, using CMSG_SPACE. - * This may leave "gap" bytes unchanged in userland, but that should - * not be a problem. By producing unaligned chunks, we eliminate a - * potential boundary case where the unaligned chunk passed in (by the - * sender) no longer fits in the same buffer after being aligned here. - */ - chunklen = CMSG_LEN(sizeof(int) * nfds); - chunkspace = CMSG_SPACE(sizeof(int) * nfds); - assert(chunklen <= sizeof(uds_ctlbuf)); - if (chunklen > ctl_len) - return 0; /* chunk would not fit, so produce nothing instead */ - if (chunkspace > ctl_len) - chunkspace = ctl_len; - - memset(&msghdr, 0, sizeof(msghdr)); - msghdr.msg_control = uds_ctlbuf; - msghdr.msg_controllen = sizeof(uds_ctlbuf); - - memset(uds_ctlbuf, 0, chunklen); - cmsg = CMSG_FIRSTHDR(&msghdr); - cmsg->cmsg_len = chunklen; - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_RIGHTS; - - /* - * Copy the group's local file descriptors to the target endpoint, and - * store the resulting remote file descriptors in the chunk buffer. - */ - r = OK; - - for (i = 0; i < nfds; i++) { - assert(ufd != SIMPLEQ_END(&uds->uds_fds)); - assert(i == 0 || ufd->ufd_count == 0); - - what = COPYFD_TO; - if (flags & MSG_CMSG_CLOEXEC) - what |= COPYFD_CLOEXEC; - - /* Failure may happen legitimately here (e.g., EMFILE). */ - if ((r = copyfd(user_endpt, ufd->ufd_fd, what)) < 0) - break; /* we keep our progress so far in 'i' */ - - fd = r; - - dprintf(("UDS: copied out fd %d -> %d\n", ufd->ufd_fd, fd)); - - memcpy(&((int *)CMSG_DATA(cmsg))[i], &fd, sizeof(int)); - - ufd = SIMPLEQ_NEXT(ufd, ufd_next); - } - - /* If everything went well so far, copy out the produced chunk. */ - if (r >= 0) - r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen); - - /* - * Handle errors. At this point, the 'i' variable contains the number - * of file descriptors that have already been successfully copied out. - */ - if (r < 0) { - /* Revert the successful copyfd() calls made so far. */ - while (i-- > 0) { - memcpy(&fd, &((int *)CMSG_DATA(cmsg))[i], sizeof(int)); - - (void)copyfd(user_endpt, fd, COPYFD_CLOSE); - } - - return r; - } - - /* - * Success. Return the aligned size of the produced chunk, if the - * given length permits it. From here on, the receive call may no - * longer fail, as that would result in lost file descriptors. - */ - return chunkspace; -} - -/* - * Generate and copy out a chunk of control data with the sender's credentials. - * Return the aligned chunk size on success, or a negative error code on - * failure. - */ -static int -uds_recv_cred(struct udssock * uds, const struct sockdriver_data * ctl, - socklen_t ctl_len, socklen_t ctl_off, size_t credpos) -{ - struct msghdr msghdr; - struct cmsghdr *cmsg; - socklen_t chunklen, chunkspace; - unsigned char lenbyte; - size_t credlen; - int r; - - /* - * Since the sender side already did the hard work of producing the - * (variable-size) sockcred structure as it should be received, there - * is relatively little work to be done here. - */ - credpos = uds_fetch(uds, credpos, &lenbyte, 1); - credlen = (size_t)lenbyte; - - chunklen = CMSG_LEN(credlen); - chunkspace = CMSG_SPACE(credlen); - assert(chunklen <= sizeof(uds_ctlbuf)); - if (chunklen > ctl_len) - return 0; /* chunk would not fit, so produce nothing instead */ - if (chunkspace > ctl_len) - chunkspace = ctl_len; - - memset(&msghdr, 0, sizeof(msghdr)); - msghdr.msg_control = uds_ctlbuf; - msghdr.msg_controllen = sizeof(uds_ctlbuf); - - memset(uds_ctlbuf, 0, chunklen); - cmsg = CMSG_FIRSTHDR(&msghdr); - cmsg->cmsg_len = chunklen; - cmsg->cmsg_level = SOL_SOCKET; - cmsg->cmsg_type = SCM_CREDS; - - uds_fetch(uds, credpos, CMSG_DATA(cmsg), credlen); - - if ((r = sockdriver_copyout(ctl, ctl_off, uds_ctlbuf, chunklen)) != OK) - return r; - - return chunkspace; -} - -/* - * Copy out control data for the ancillary data associated with the current - * segment, if any. Return OK on success, at which point the current receive - * call may no longer fail. 'rflags' may be updated with additional result - * flags. Return a negative error code on failure. - */ -static int -uds_recv_ctl(struct udssock * uds, const struct sockdriver_data * ctl, - socklen_t ctl_len, socklen_t * ctl_off, endpoint_t user_endpt, - int flags, unsigned int segflags, size_t credpos, int * rflags) -{ - int r; - - /* - * We first copy out all file descriptors, if any. We put them in one - * SCM_RIGHTS chunk, even if the sender put them in separate SCM_RIGHTS - * chunks. We believe that this should not cause application-level - * issues, but if it does, we can change that later with some effort. - * We then copy out credentials, if any. - * - * We copy out each control chunk independently of the others, and also - * perform error recovery on a per-chunk basis. This implies the - * following. If producing or copying out the first chunk fails, the - * entire recvmsg(2) call will fail with an appropriate error. If - * producing or copying out any subsequent chunk fails, the recvmsg(2) - * call will still return the previously generated chunks (a "short - * control read" if you will) as well as the MSG_CTRUNC flag. This - * approach is simple and clean, and it guarantees that we can always - * copy out at least as many file descriptors as we copied in for this - * segment, even if credentials are present as well. However, the - * approach does cause slightly more overhead when there are multiple - * chunks per call, as those are copied out separately. - * - * Since the generated SCM_RIGHTS chunk is never larger than the - * originally received SCM_RIGHTS chunk, the temporary "uds_ctlbuf" - * buffer is always large enough to contain the chunk in its entirety. - * SCM_CREDS chunks should always fit easily as well. - * - * The MSG_CTRUNC flag will be returned iff not the entire user-given - * control buffer was filled and not all control chunks were delivered. - * Our current implementation does not deliver partial chunks. NetBSD - * does, except for SCM_RIGHTS chunks. - * - * TODO: get rid of the redundancy in processing return values. - */ - if (segflags & UDS_HAS_FDS) { - r = uds_recv_fds(uds, ctl, ctl_len, *ctl_off, user_endpt, - flags); - - /* - * At this point, 'r' contains one of the following: - * - * r > 0 a chunk of 'r' bytes was added successfully. - * r == 0 not enough space left; the chunk was not added. - * r < 0 an error occurred; the chunk was not added. - */ - if (r < 0 && *ctl_off == 0) - return r; - - if (r > 0) { - ctl_len -= r; - *ctl_off += r; - } else - *rflags |= MSG_CTRUNC; - } - - if (segflags & UDS_HAS_CRED) { - r = uds_recv_cred(uds, ctl, ctl_len, *ctl_off, credpos); - - /* As above. */ - if (r < 0 && *ctl_off == 0) - return r; - - if (r > 0) { - ctl_len -= r; - *ctl_off += r; - } else - *rflags |= MSG_CTRUNC; - } - - return OK; -} - -/* - * The current receive request is successful or, in the case of MSG_WAITALL, - * has made progress. Advance the receive buffer tail, either by discarding - * the entire tail segment or by generating a new, smaller tail segment that - * contains only the regular data left to be received from the original tail - * segment. Also wake up the sending side for connection-oriented sockets if - * applicable, because there may now be room for more data to be sent. Update - * 'may_block' if we are now sure that the call may not block on MSG_WAITALL - * after all. - */ -static void -uds_recv_advance(struct udssock * uds, size_t seglen, size_t datalen, - size_t reslen, unsigned int segflags, int * may_block) -{ - struct udssock *conn; - struct uds_fd *ufd; - size_t delta, nseglen, advance; - unsigned int nfds; - - /* Note that 'reslen' may be legitimately zero. */ - assert(reslen <= datalen); - - if (uds_get_type(uds) != SOCK_STREAM && reslen < datalen) - reslen = datalen; - - delta = datalen - reslen; - - if (delta == 0) { - /* - * Fully consume the tail segment. We advance the tail by the - * full segment length, thus moving up to either the next - * segment in the receive buffer, or an empty receive buffer. - */ - advance = seglen; - - uds->uds_tail = uds_advance(uds->uds_tail, advance); - } else { - /* - * Partially consume the tail segment. We put a new segment - * header right in front of the remaining data, which obviously - * always fits. Since any ancillary data was consumed along - * with the first data byte of the segment, the new segment has - * no ancillary data anymore (and thus a zero flags field). - */ - nseglen = UDS_HDRLEN + delta; - assert(nseglen < seglen); - - advance = seglen - nseglen; - - uds->uds_tail = uds_advance(uds->uds_tail, advance); - - uds_store_hdr(uds, uds->uds_tail, nseglen, delta, 0); - } - - /* - * For datagram-oriented sockets, we always consume at least a header. - * For stream-type sockets, we either consume a zero-data segment along - * with its ancillary data, or we consume at least one byte from a - * segment that does have regular data. In all other cases, the - * receive call has already been ended by now. Thus, we always advance - * the tail of the receive buffer here. - */ - assert(advance > 0); - - /* - * The receive buffer's used length (uds_len) and pointer to the - * previous segment header (uds_last) are offsets from the tail. Now - * that we have moved the tail, we need to adjust these accordingly. - * If the buffer is now empty, reset the tail to the buffer start so as - * to avoid splitting inter-process copies whenever possible. - */ - assert(uds->uds_len >= advance); - uds->uds_len -= advance; - - if (uds->uds_len == 0) - uds->uds_tail = 0; - - /* - * If uds_last is zero here, it was pointing to the segment we just - * (partially) consumed. By leaving it zero, it will still point to - * the new or next segment. - */ - if (uds->uds_last > 0) { - assert(uds->uds_len > 0); - assert(uds->uds_last >= advance); - uds->uds_last -= advance; - } - - /* - * If there were any file descriptors associated with this segment, - * close and free them now. - */ - if (segflags & UDS_HAS_FDS) { - assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); - ufd = SIMPLEQ_FIRST(&uds->uds_fds); - nfds = ufd->ufd_count; - assert(nfds > 0); - - while (nfds-- > 0) { - assert(!SIMPLEQ_EMPTY(&uds->uds_fds)); - ufd = SIMPLEQ_FIRST(&uds->uds_fds); - SIMPLEQ_REMOVE_HEAD(&uds->uds_fds, ufd_next); - - dprintf(("UDS: closing local fd %d\n", ufd->ufd_fd)); - - closenb(ufd->ufd_fd); - - SIMPLEQ_INSERT_TAIL(&uds_freefds, ufd, ufd_next); - } - } - - /* - * If there is now any data left in the receive buffer, then there has - * been a reason that we haven't received it. For stream sockets, that - * reason is that the next segment has ancillary data. In any case, - * this means we should never block the current receive operation - * waiting for more data. Otherwise, we may block on MSG_WAITALL. - */ - if (uds->uds_len > 0) - *may_block = FALSE; - - /* - * If the (non-datagram) socket has a peer that is not shut down for - * writing, see if it can be woken up to send more data. Note that - * the event will never be processed immediately. - */ - if (uds_is_connected(uds)) { - assert(uds_get_type(uds) != SOCK_DGRAM); - - conn = uds->uds_conn; - - if (!uds_is_shutdown(conn, SFL_SHUT_WR)) - sockevent_raise(&conn->uds_sock, SEV_SEND); - } -} - -/* - * Process a receive request. Return OK if the receive request has completed - * successfully, SUSPEND if it should be tried again later, SOCKEVENT_EOF if an - * end-of-file condition is reached, or a negative error code on failure. In - * all cases, the values of 'off' and 'ctl_off' must be updated if any progress - * has been made; if either is non-zero, libsockevent will return the partial - * progress rather than an error code or EOF. - */ -int -uds_recv(struct sock * sock, const struct sockdriver_data * data, size_t len, - size_t * off, const struct sockdriver_data * ctl, socklen_t ctl_len, - socklen_t * ctl_off, struct sockaddr * addr, socklen_t * addr_len, - endpoint_t user_endpt, int flags, size_t min, int * rflags) -{ - struct udssock *uds = (struct udssock *)sock; - size_t seglen, datalen, reslen = 0 /*gcc*/, credpos = 0 /*gcc*/; - unsigned int segflags; - int r, partial, may_block = 0 /*gcc*/; - - dprintf(("UDS: recv(%d,%zu,%zu,%u,%u,0x%x)\n", - uds_get_id(uds), len, (off != NULL) ? *off : 0, ctl_len, - (ctl_off != NULL) ? *ctl_off : 0, flags)); - - /* - * Start by testing whether anything can be received at all, or whether - * an error or EOF should be returned instead, or whether the receive - * call should be suspended until later otherwise. If no (regular or - * control) data can be received, or if this was a test for select, - * we bail out right after. - */ - partial = (off != NULL && *off > 0); - - if ((r = uds_recv_test(uds, len, min, partial, &may_block)) != OK) - return r; - - /* - * Copy out regular data, if any. Do this before copying out control - * data, because the latter is harder to undo on failure. This data - * copy function returns returns OK (0) if we are to return a result of - * zero bytes (which is *not* EOF) to the caller without doing anything - * else. The function returns a nonzero positive segment length if we - * should carry on with the receive call (as it happens, all its other - * returned values may in fact be zero). - */ - if ((r = uds_recv_data(uds, data, len, *off, addr, addr_len, rflags, - &datalen, &reslen, &segflags, &credpos)) <= 0) - return r; - seglen = (size_t)r; - - /* - * Copy out control data, if any: transfer and copy out records of file - * descriptors, and/or copy out sender credentials. This is the last - * part of the call that may fail. - */ - if ((r = uds_recv_ctl(uds, ctl, ctl_len, ctl_off, user_endpt, flags, - segflags, credpos, rflags)) != OK) - return r; - - /* - * Now that the call has succeeded, move the tail of the receive - * buffer, unless we were merely peeking. - */ - if (!(flags & MSG_PEEK)) - uds_recv_advance(uds, seglen, datalen, reslen, segflags, - &may_block); - else - may_block = FALSE; - - /* - * If the MSG_WAITALL flag was given, we may still have to suspend the - * call after partial success. In particular, the receive call may - * suspend after partial success if all of these conditions are met: - * - * 1) the socket is a stream-type socket; - * 2) MSG_WAITALL is set; - * 3) MSG_PEEK is not set; - * 4) MSG_DONTWAIT is not set (tested upon return); - * 5) the socket must not have a pending error (tested upon return); - * 6) the socket must not be shut down for reading (tested later); - * 7) the socket must still be connected to a peer (no EOF); - * 8) the peer must not have been shut down for writing (no EOF); - * 9) the next segment, if any, contains no ancillary data. - * - * Together, these points guarantee that the call could conceivably - * receive more after being resumed. Points 4 to 6 are covered by - * libsockevent, which will end the call even if we return SUSPEND - * here. Due to segment merging, we cover point 9 by checking that - * there is currently no next segment at all. Once a new segment - * arrives, the ancillary-data test is done then. - */ - *off += reslen; - if ((flags & MSG_WAITALL) && reslen < len && may_block) - return SUSPEND; - else - return OK; -} - -/* - * Test whether a receive request would block. The given 'min' parameter - * contains the minimum number of bytes that should be possible to receive - * without blocking (the low receive watermark). Return SUSPEND if the send - * request would block. Otherwise, return any other error code (including OK - * or SOCKEVENT_EOF), and if 'size' is not a NULL pointer, it should be filled - * with the number of bytes available for receipt right now (if not zero). - * Note that if 'size' is not NULL, 'min' will always be zero. - */ -int -uds_test_recv(struct sock * sock, size_t min, size_t * size) -{ - struct udssock *uds = (struct udssock *)sock; - size_t seglen; - unsigned int segflags; - int r; - - if ((r = uds_recv_test(uds, min, min, FALSE /*partial*/, - NULL /*may_block*/)) == SUSPEND) - return r; - - if (size != NULL && uds->uds_len > 0) - (void)uds_fetch_hdr(uds, uds->uds_tail, &seglen, size, - &segflags); - - return r; -} diff --git a/minix/net/uds/stat.c b/minix/net/uds/stat.c deleted file mode 100644 index 2759f6318..000000000 --- a/minix/net/uds/stat.c +++ /dev/null @@ -1,186 +0,0 @@ -/* UNIX Domain Sockets - stat.c - network status */ - -#include "uds.h" -#include -#include - -/* - * Fill the given 'ki' structure with information about the socket 'uds'. - */ -static void -uds_get_info(struct kinfo_pcb * ki, const struct udssock * uds) -{ - struct udssock *peer; - socklen_t len; - int type; - - type = uds_get_type(uds); - peer = uds_get_peer(uds); - - ki->ki_pcbaddr = (uint64_t)(uintptr_t)uds; - ki->ki_ppcbaddr = (uint64_t)(uintptr_t)uds; - ki->ki_sockaddr = (uint64_t)(uintptr_t)&uds->uds_sock; - ki->ki_family = AF_UNIX; - ki->ki_type = type; - ki->ki_protocol = UDSPROTO_UDS; - ki->ki_pflags = 0; - if (uds->uds_flags & UDSF_CONNWAIT) - ki->ki_pflags |= UNP_CONNWAIT; - if (uds->uds_flags & UDSF_PASSCRED) - ki->ki_pflags |= UNP_WANTCRED; - if (type != SOCK_DGRAM && uds->uds_cred.unp_pid != -1) { - if (uds_is_listening(uds)) - ki->ki_pflags |= UNP_EIDSBIND; - else if (uds_is_connecting(uds) || uds_is_connected(uds)) - ki->ki_pflags |= UNP_EIDSVALID; - } - /* Not sure about NetBSD connection states. First attempt here. */ - if (uds_is_connecting(uds)) - ki->ki_sostate = SS_ISCONNECTING; - else if (uds_is_connected(uds)) - ki->ki_sostate = SS_ISCONNECTED; - else if (uds_is_disconnected(uds)) - ki->ki_sostate = SS_ISDISCONNECTED; - ki->ki_rcvq = uds->uds_len; - /* We currently mirror the peer's receive queue size when connected. */ - if (uds_is_connected(uds)) - ki->ki_sndq = peer->uds_len; - /* The source is not set for bound connection-type sockets here. */ - if (type == SOCK_DGRAM || uds_is_listening(uds)) - uds_make_addr(uds->uds_path, (size_t)uds->uds_pathlen, - &ki->ki_src, &len); - if (peer != NULL) - uds_make_addr(peer->uds_path, (size_t)peer->uds_pathlen, - &ki->ki_dst, &len); - /* TODO: we should set ki_inode and ki_vnode, but to what? */ - ki->ki_conn = (uint64_t)(uintptr_t)peer; - if (!TAILQ_EMPTY(&uds->uds_queue)) - ki->ki_refs = - (uint64_t)(uintptr_t)TAILQ_FIRST(&uds->uds_queue); - if (uds_has_link(uds)) - ki->ki_nextref = - (uint64_t)(uintptr_t)TAILQ_NEXT(uds, uds_next); -} - -/* - * Remote MIB implementation of CTL_NET PF_LOCAL {SOCK_STREAM,SOCK_DGRAM, - * SOCK_SEQPACKET} 0. This function handles all queries on the - * "net.local.{stream,dgram,seqpacket}.pcblist" sysctl(7) nodes. - * - * The 0 for "pcblist" is a MINIXism: we use it to keep our arrays small. - * NetBSD numbers these nodes dynamically and so they have numbers above - * CREATE_BASE. That also means that no userland application can possibly - * hardcode their numbers, and must perform lookups by name. In turn, that - * means that we can safely change the 0 to another number if NetBSD ever - * introduces statically numbered nodes in these subtrees. - */ -static ssize_t -net_local_pcblist(struct rmib_call * call, struct rmib_node * node __unused, - struct rmib_oldp * oldp, struct rmib_newp * newp __unused) -{ - struct udssock *uds; - struct kinfo_pcb ki; - ssize_t off; - int r, type, size, max; - - if (call->call_namelen != 4) - return EINVAL; - - /* The first two added name fields are not used. */ - - size = call->call_name[2]; - if (size < 0 || (size_t)size > sizeof(ki)) - return EINVAL; - if (size == 0) - size = sizeof(ki); - max = call->call_name[3]; - - type = call->call_oname[2]; - - off = 0; - - for (uds = uds_enum(NULL, type); uds != NULL; - uds = uds_enum(uds, type)) { - if (rmib_inrange(oldp, off)) { - memset(&ki, 0, sizeof(ki)); - - uds_get_info(&ki, uds); - - if ((r = rmib_copyout(oldp, off, &ki, size)) < 0) - return r; - } - - off += size; - if (max > 0 && --max == 0) - break; - } - - /* - * Margin to limit the possible effects of the inherent race condition - * between receiving just the data size and receiving the actual data. - */ - if (oldp == NULL) - off += PCB_SLOP * size; - - return off; -} - -/* The CTL_NET PF_LOCAL SOCK_STREAM subtree. */ -static struct rmib_node net_local_stream_table[] = { - [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist, - "pcblist", "SOCK_STREAM protocol control block list"), -}; - -/* The CTL_NET PF_LOCAL SOCK_DGRAM subtree. */ -static struct rmib_node net_local_dgram_table[] = { - [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist, - "pcblist", "SOCK_DGRAM protocol control block list"), -}; - -/* The CTL_NET PF_LOCAL SOCK_SEQPACKET subtree. */ -static struct rmib_node net_local_seqpacket_table[] = { - [0] = RMIB_FUNC(RMIB_RO | CTLTYPE_NODE, 0, net_local_pcblist, - "pcblist", "SOCK_SEQPACKET protocol control block list"), -}; - -/* The CTL_NET PF_LOCAL subtree. */ -static struct rmib_node net_local_table[] = { -/* 1*/ [SOCK_STREAM] = RMIB_NODE(RMIB_RO, net_local_stream_table, - "stream", "SOCK_STREAM settings"), -/* 2*/ [SOCK_DGRAM] = RMIB_NODE(RMIB_RO, net_local_dgram_table, - "dgram", "SOCK_DGRAM settings"), -/* 5*/ [SOCK_SEQPACKET] = RMIB_NODE(RMIB_RO, net_local_seqpacket_table, - "seqpacket", "SOCK_SEQPACKET settings"), -}; - -static struct rmib_node net_local_node = - RMIB_NODE(RMIB_RO, net_local_table, "local", "PF_LOCAL related settings"); - -/* - * Initialize the status module. - */ -void -uds_stat_init(void) -{ - const int mib[] = { CTL_NET, PF_LOCAL }; - int r; - - /* - * Register our own "net.local" subtree with the MIB service. - * - * This call only returns local failures. Remote failures (in the MIB - * service) are silently ignored. So, we can safely panic on failure. - */ - if ((r = rmib_register(mib, __arraycount(mib), &net_local_node)) != OK) - panic("UDS: unable to register remote MIB tree: %d", r); -} - -/* - * Clean up the status module. - */ -void -uds_stat_cleanup(void) -{ - - rmib_deregister(&net_local_node); -} diff --git a/minix/net/uds/uds.c b/minix/net/uds/uds.c deleted file mode 100644 index 2052a2ac2..000000000 --- a/minix/net/uds/uds.c +++ /dev/null @@ -1,1417 +0,0 @@ -/* UNIX Domain Sockets - uds.c - socket management */ - -#include "uds.h" - -static struct udssock uds_array[NR_UDSSOCK]; -static TAILQ_HEAD(uds_freelist, udssock) uds_freelist; -static unsigned int uds_in_use; -static int uds_running; - -static const struct sockevent_ops uds_ops; - -static SLIST_HEAD(udshash, udssock) udshash[UDSHASH_SLOTS]; - -/* - * Initialize file-to-socket hash table. - */ -static void -udshash_init(void) -{ - unsigned int slot; - - for (slot = 0; slot < __arraycount(udshash); slot++) - SLIST_INIT(&udshash[slot]); -} - -/* - * Return a hash table slot number for the given pair. - */ -static unsigned int -udshash_slot(dev_t dev, ino_t ino) -{ - - assert(dev != NO_DEV); - assert(ino != 0); - - /* - * Effectively combining two 64-bit numbers into a single 6-or-so-bit - * hash is not too easy. This hash function is probably among the - * worst options. Then again it is not all that critical as we are not - * expecting that many bound UDS sockets in the system anyway. - */ - return (unsigned int)(dev ^ ino) % UDSHASH_SLOTS; -} - -/* - * Look for a socket that is bound to the given pair. Return a - * pointer to the socket if found, or NULL otherwise. - */ -static struct udssock * -udshash_get(dev_t dev, ino_t ino) -{ - struct udssock *uds; - unsigned int slot; - - slot = udshash_slot(dev, ino); - - SLIST_FOREACH(uds, &udshash[slot], uds_hash) { - if (uds->uds_dev == dev && uds->uds_ino == ino) - return uds; - } - - return NULL; -} - -/* - * Add a socket to the file-to-socket hash table. The socket must have its - * device and inode fields set, and must not be in the hash table already. - */ -static void -udshash_add(struct udssock * uds) -{ - unsigned int slot; - - slot = udshash_slot(uds->uds_dev, uds->uds_ino); - - SLIST_INSERT_HEAD(&udshash[slot], uds, uds_hash); -} - -/* - * Remove a socket from the file-to-socket hash table. The socket must be in - * the hash table. - */ -static void -udshash_del(struct udssock * uds) -{ - unsigned int slot; - - slot = udshash_slot(uds->uds_dev, uds->uds_ino); - - /* This macro is O(n). */ - SLIST_REMOVE(&udshash[slot], uds, udssock, uds_hash); -} - -/* - * Return the socket identifier for the given UDS socket object. - */ -sockid_t -uds_get_id(struct udssock * uds) -{ - - return (sockid_t)(uds - uds_array); -} - -/* - * Given either NULL or a previously returned socket, return the next in-use - * UDS socket of the given socket type, or NULL if there are no more matches. - * The sockets are returned in random order, but each matching socket is - * returned exactly once (until any socket is allocated or freed). - */ -struct udssock * -uds_enum(struct udssock * prev, int type) -{ - sockid_t id; - - if (prev != NULL) - id = uds_get_id(prev) + 1; - else - id = 0; - - for (; id < NR_UDSSOCK; id++) - if ((uds_array[id].uds_flags & UDSF_IN_USE) && - uds_get_type(&uds_array[id]) == type) - return &uds_array[id]; - - return NULL; -} - -/* - * Invalidate credentials on the socket. - */ -static void -uds_clear_cred(struct udssock * uds) -{ - - uds->uds_cred.unp_pid = -1; - uds->uds_cred.unp_euid = -1; - uds->uds_cred.unp_egid = -1; -} - -/* - * Obtain the credentials (process, user, and group ID) of the given user - * endpoint and associate them with the socket for later retrieval. It is - * important to note that this information is obtained once at connect time, - * and never updated later. The party receiving the credentials must take this - * into account. - */ -static void -uds_get_cred(struct udssock * uds, endpoint_t user_endpt) -{ - int r; - - if ((uds->uds_cred.unp_pid = r = getepinfo(user_endpt, - &uds->uds_cred.unp_euid, &uds->uds_cred.unp_egid)) < 0) { - printf("UDS: failed obtaining credentials of %d (%d)\n", - user_endpt, r); - - uds_clear_cred(uds); - } -} - -/* - * Allocate and initialize a UDS socket. On succes, return OK with a pointer - * to the new socket in 'udsp'. On failure, return a negative error code. - */ -static int -uds_alloc(struct udssock ** udsp) -{ - struct udssock *uds; - int r; - - /* Allocate, initialize, and return a UNIX domain socket object. */ - if (TAILQ_EMPTY(&uds_freelist)) - return ENOBUFS; - - uds = TAILQ_FIRST(&uds_freelist); - - uds->uds_conn = NULL; /* not connected */ - uds->uds_link = NULL; /* not connecting or linked */ - uds->uds_queued = 0; - uds->uds_flags = UDSF_IN_USE; /* may be found through enumeration */ - uds->uds_pathlen = 0; /* not bound: no path */ - uds->uds_dev = NO_DEV; /* not hashed: no socket file device */ - uds->uds_ino = 0; /* not hashed: no socket file inode */ - uds_clear_cred(uds); /* no bind/connect-time credentials */ - TAILQ_INIT(&uds->uds_queue); /* an empty queue */ - - if ((r = uds_io_setup(uds)) != OK) - return r; - - TAILQ_REMOVE(&uds_freelist, uds, uds_next); - - assert(uds_in_use < NR_UDSSOCK); - uds_in_use++; - - *udsp = uds; - return OK; -} - -/* - * Free a previously allocated socket. - */ -static void -uds_free(struct sock * sock) -{ - struct udssock *uds = (struct udssock *)sock; - - uds_io_cleanup(uds); - - uds->uds_flags = 0; /* no longer in use */ - - TAILQ_INSERT_HEAD(&uds_freelist, uds, uds_next); - - assert(uds_in_use > 0); - if (--uds_in_use == 0 && uds_running == FALSE) - sef_cancel(); -} - -/* - * Create a new socket. - */ -static sockid_t -uds_socket(int domain, int type, int protocol, endpoint_t user_endpt __unused, - struct sock ** sockp, const struct sockevent_ops ** ops) -{ - struct udssock *uds; - int r; - - dprintf(("UDS: socket(%d,%d,%d)\n", domain, type, protocol)); - - if (domain != PF_UNIX) { - /* This means the service was configured incorrectly. */ - printf("UDS: got request for domain %d\n", domain); - - return EAFNOSUPPORT; - } - - /* We support the following three socket types. */ - switch (type) { - case SOCK_STREAM: - case SOCK_SEQPACKET: - case SOCK_DGRAM: - break; - default: - return EPROTOTYPE; - } - - /* - * The PF_UNIX domain does not support particular protocols, so the - * given protocol must be zero (= anything that matches). - */ - if (protocol != UDSPROTO_UDS) - return EPROTONOSUPPORT; - - if ((r = uds_alloc(&uds)) != OK) - return r; - - dprintf(("UDS: socket returns %d\n", uds_get_id(uds))); - - *sockp = &uds->uds_sock; - *ops = &uds_ops; - return uds_get_id(uds); -} - -/* - * Connect a pair of sockets. - */ -static int -uds_pair(struct sock * sock1, struct sock * sock2, endpoint_t user_endpt) -{ - struct udssock *uds1 = (struct udssock *)sock1; - struct udssock *uds2 = (struct udssock *)sock2; - - dprintf(("UDS: pair(%d,%d)\n", uds_get_id(uds1), uds_get_id(uds2))); - - /* Only connection-oriented types are acceptable. */ - if (uds_get_type(uds1) == SOCK_DGRAM) - return EOPNOTSUPP; - - /* Connect the sockets. */ - uds1->uds_conn = uds2; - uds2->uds_conn = uds1; - uds1->uds_flags |= UDSF_CONNECTED; - uds2->uds_flags |= UDSF_CONNECTED; - - /* Obtain the (same) credentials for both sides of the connection. */ - uds_get_cred(uds1, user_endpt); - memcpy(&uds2->uds_cred, &uds1->uds_cred, sizeof(uds2->uds_cred)); - - return OK; -} - -/* - * Disconnect a UDS socket, notifying or freeing up the other end of the - * connection depending on whether the socket was linked, that is, on the - * accept queue of a listening socket. - */ -static void -uds_disconnect(struct udssock * uds, int was_linked) -{ - struct udssock *conn; - - assert(uds_is_connected(uds)); - assert(uds_has_conn(uds)); - - conn = uds->uds_conn; - - assert(uds_is_connected(conn)); - assert(uds_has_conn(conn)); - assert(!uds_has_link(conn)); - assert(conn->uds_conn == uds); - - /* Disconnect the sockets. */ - uds->uds_conn = NULL; - conn->uds_conn = NULL; - - /* - * If the given socket is linked, then it is a connected socket for - * which the other end has been created but not yet accepted. In that - * case, the other end ('conn') will have to be freed up. Otherwise, - * it is a regular user-created socket and we must properly transition - * it into disconnected state. - */ - if (!was_linked) { - sockevent_raise(&conn->uds_sock, SEV_SEND | SEV_RECV); - - /* - * Clear the peer credentials so that they will not be mistaken - * for having been obtained at bind time. - */ - uds_clear_cred(conn); - } else - sockevent_raise(&conn->uds_sock, SEV_CLOSE); -} - -/* - * Add the socket 'link' to the queue of the socket 'uds'. This also implies - * that 'link's link socket is set to 'uds'. - */ -static void -uds_add_queue(struct udssock * uds, struct udssock * link) -{ - - dprintf(("UDS: add_queue(%d,%d)\n", - uds_get_id(uds), uds_get_id(link))); - - TAILQ_INSERT_TAIL(&uds->uds_queue, link, uds_next); - - uds->uds_queued++; - assert(uds->uds_queued != 0); - - link->uds_link = uds; -} - -/* - * Remove the socket 'link' from the queue of the socket 'uds'. This also - * reset 'link's link to NULL. - */ -static void -uds_del_queue(struct udssock * uds, struct udssock * link) -{ - - dprintf(("UDS: del_queue(%d,%d)\n", - uds_get_id(uds), uds_get_id(link))); - - assert(link->uds_link == uds); - - TAILQ_REMOVE(&uds->uds_queue, link, uds_next); - - assert(uds->uds_queued > 0); - uds->uds_queued--; - - link->uds_link = NULL; -} - -/* - * Remove all sockets from the queue of the socket 'uds', with the exception of - * 'except' if non-NULL. Raise an ECONNRESET error on all removed sockets that - * are not equal to 'uds'. - */ -static void -uds_clear_queue(struct udssock * uds, struct udssock * except) -{ - struct udssock *link, *tmp; - int found; - - dprintf(("UDS: clear_queue(%d,%d)\n", - uds_get_id(uds), (except != NULL) ? uds_get_id(except) : -1)); - - found = 0; - - /* - * Abort all connecting sockets queued on this socket, except for the - * given exception, which may be NULL. - */ - TAILQ_FOREACH_SAFE(link, &uds->uds_queue, uds_next, tmp) { - if (link == except) { - found++; - - continue; - } - - dprintf(("UDS: clear_queue removes %d\n", uds_get_id(link))); - - assert(uds_get_type(link) == SOCK_DGRAM || - uds_is_connecting(link) || uds_is_connected(link)); - - uds_del_queue(uds, link); - - /* - * Generate an error only if the socket was not linked to - * itself (only datagram sockets can be linked to themselves). - * The error is not helpful for applications in that case. - */ - if (uds != link) - sockevent_set_error(&link->uds_sock, ECONNRESET); - - /* - * If this is a listening socket, disconnect the connecting or - * connected end. If a connected peer was already created for - * the queued socket, dispose of that peer. - * - * Clear credentials obtained when starting to connect (in - * which case the socket is always a connection-oriented - * socket), so that they will not be mistaken for credentials - * obtained at bind time. - */ - if (uds_get_type(link) != SOCK_DGRAM) { - if (uds_is_connected(link)) - uds_disconnect(link, TRUE /*was_linked*/); - else - uds_clear_cred(link); - } - } - - assert(uds->uds_queued == found); -} - -/* - * Check whether the socket address given in 'addr', with length 'addr_len', is - * a valid UNIX domain socket address (including a path to a socket file). On - * success, return the (non-zero) length of the socket file's path, minus the - * null terminator which may in fact not be present. The caller is responsible - * for copying and terminating the path as needed. A pointer to the path as - * stored in 'addr' is returned in 'pathp'. On failure, return an error code. - */ -static int -uds_check_addr(const struct sockaddr * addr, socklen_t addr_len, - const char ** pathp) -{ - const char *p; - size_t len; - - /* - * We could cast to a sockaddr_un structure pointer first, but that - * would not provide any benefits here. Instead, we use sa_data as the - * generic equivalent of sun_path. - */ - if (addr_len < offsetof(struct sockaddr, sa_data)) - return EINVAL; - - if (addr->sa_family != AF_UNIX) - return EAFNOSUPPORT; - - len = (size_t)addr_len - offsetof(struct sockaddr, sa_data); - if (len > 0 && (p = memchr(addr->sa_data, '\0', len)) != NULL) - len = (size_t)(p - addr->sa_data); - - /* The given path name must not be an empty string. */ - if (len == 0) - return ENOENT; - - /* This check should be redundant but better safe than sorry. */ - if (len >= UDS_PATH_MAX) - return EINVAL; - - *pathp = (const char *)addr->sa_data; - return len; -} - -/* - * Given the socket file path given as 'path' with length 'path_len' (not - * necessarily null terminated), store a socket address with the path in - * 'addr', and return the socket address length in 'addr_len'. The calling - * libraries (libsockdriver, libsockevent) and the static assert in uds.h - * guarantee that 'addr' is sufficiently large to store any address we generate - * here. The libraries may subsequently copy out only a part of it to the user - * process. This function always succeeds. - */ -void -uds_make_addr(const char * path, size_t len, struct sockaddr * addr, - socklen_t * addr_len) -{ - - /* - * Generate the address. The stored length (sa_len/sun_len) does not - * include a null terminator. The entire structure does include a null - * terminator, but only if the socket is bound. - */ - addr->sa_len = offsetof(struct sockaddr, sa_data) + len; - addr->sa_family = AF_UNIX; - if (len > 0) { - /* This call may (intentionally) overrun the sa_data size. */ - memcpy((char *)addr->sa_data, path, len); - ((char *)addr->sa_data)[len] = '\0'; - - /* The socket is bound, so include the null terminator. */ - len++; - assert(len <= UDS_PATH_MAX); - } - - /* Note that this length may be different from sa_len/sun_len now. */ - *addr_len = offsetof(struct sockaddr, sa_data) + len; -} - -/* - * Bind a socket to a local address. - */ -static int -uds_bind(struct sock * sock, const struct sockaddr * addr, socklen_t addr_len, - endpoint_t user_endpt) -{ - struct udssock *uds = (struct udssock *)sock; - struct udssock *uds2; - const char *path; - size_t len; - dev_t dev; - ino_t ino; - int r; - - dprintf(("UDS: bind(%d)\n", uds_get_id(uds))); - - /* A socket may be bound at any time, but only once. */ - if (uds_is_bound(uds)) - return EINVAL; - - /* Verify that the user gave us an acceptable address. */ - if ((r = uds_check_addr(addr, addr_len, &path)) < 0) - return r; - len = (size_t)r; - - /* Attempt to create the socket file on the file system. */ - r = socketpath(user_endpt, path, len, SPATH_CREATE, &dev, &ino); - if (r != OK) - return r; - assert(dev != NO_DEV && ino != 0); - - /* - * It is possible that a socket file of a previously bound socket was - * unlinked, and due to inode number reuse, a new socket file has now - * been created with the same pair. In that case, we must - * unbind the old socket, because it must no longer be found. The old - * socket will still have a path (and behave as though it is bound) but - * no longer be found through hash lookups. - */ - if ((uds2 = udshash_get(dev, ino)) != NULL) { - udshash_del(uds2); - - uds2->uds_dev = NO_DEV; - uds2->uds_ino = 0; - } - - /* - * Obtain credentials for the socket, unless the socket is already - * connecting or connected, in which case we must not replace the - * credentials we obtained already. We later clear those credentials - * upon a connection failure or disconnect, so that if the socket is - * then put in listening mode, we know there are no bind-time - * credentials. Not ideal, but we really need two separate sets of - * credentials if we want to get this right, which is a waste of memory - * as no sane application writer would ever rely on credential passing - * after recycling a socket.. - */ - if (uds_get_type(uds) != SOCK_DGRAM && !uds_is_connecting(uds) && - !uds_is_connected(uds)) - uds_get_cred(uds, user_endpt); - - /* Asssign the address to the socket. */ - uds->uds_pathlen = len; - memcpy(&uds->uds_path, path, len); - uds->uds_dev = dev; - uds->uds_ino = ino; - - udshash_add(uds); - - return OK; -} - -/* - * Look up a UDS socket based on a user-given address. If a socket exists for - * the address, check if it is type-compatible with the given UDS socket. - * On succes, return OK, with 'peerp' set to the socket that was found. On - * failure, return a negative error code. - */ -int -uds_lookup(struct udssock * uds, const struct sockaddr * addr, - socklen_t addr_len, endpoint_t user_endpt, struct udssock ** peerp) -{ - struct udssock *peer; - const char *path; - size_t len; - dev_t dev; - ino_t ino; - int r; - - /* Verify that the user gave us an acceptable address. */ - if ((r = uds_check_addr(addr, addr_len, &path)) < 0) - return r; - len = (size_t)r; - - /* Attempt to look up the socket file on the file system. */ - r = socketpath(user_endpt, path, len, SPATH_CHECK, &dev, &ino); - if (r != OK) - return r; - assert(dev != NO_DEV && ino != 0); - - if ((peer = udshash_get(dev, ino)) == NULL) - return ECONNREFUSED; - if (uds_get_type(peer) != uds_get_type(uds)) - return EPROTOTYPE; - - *peerp = peer; - return OK; -} - -/* - * Given the listening socket 'uds', and the socket 'link' that is calling or - * has called connect(2) and is or will be linked to the listening socket's - * queue, create a new socket and connect it to 'link', putting both sockets in - * the connected state. The given link socket may be in unconnected, - * connecting, or disconnected state prior to the call. Return OK or an error - * code. The link state of the link socket remains unchanged in any case. - */ -static int -uds_attach(struct udssock * uds, struct udssock * link) -{ - struct udssock *conn; - int r; - - /* - * Allocate a new socket to use as peer socket for the connection that - * is about to be established. The new socket is not yet known by - * libsockevent. - */ - if ((r = uds_alloc(&conn)) != OK) - return r; - - /* - * Ask libsockevent to clone the sock object in the new UDS socket from - * the listening socket. This adds the sock object to libsockevent's - * data structures and ensures that we can safely use the socket - * despite the fact that it has not yet been accepted (and thus - * returned to libsockevent). From this moment on, we must either - * return the socket's ID (but not a pointer to it!) from uds_accept() - * or raise SEV_CLOSE on it. - */ - sockevent_clone(&uds->uds_sock, &conn->uds_sock, uds_get_id(conn)); - - /* Connect the link socket to the new socket. */ - link->uds_conn = conn; - link->uds_flags |= UDSF_CONNECTED; - - /* - * Connect the new socket to the link socket as well. The child - * socket should also inherit pretty much all settings from the - * listening socket, including the bind path and the listening socket's - * bind-time credentials. - */ - conn->uds_conn = link; - conn->uds_flags = uds->uds_flags & (UDSF_PASSCRED | UDSF_CONNWAIT); - conn->uds_flags |= UDSF_CONNECTED; - conn->uds_pathlen = uds->uds_pathlen; - memcpy(conn->uds_path, uds->uds_path, (size_t)uds->uds_pathlen); - memcpy(&conn->uds_cred, &uds->uds_cred, sizeof(conn->uds_cred)); - - return OK; -} - -/* - * Connect a socket to a remote address. - */ -static int -uds_connect(struct sock * sock, const struct sockaddr * addr, - socklen_t addr_len, endpoint_t user_endpt) -{ - struct udssock *uds = (struct udssock *)sock; - struct udssock *link; - int r; - - dprintf(("UDS: connect(%d)\n", uds_get_id(uds))); - - /* For connection-oriented sockets, several state checks apply. */ - if (uds_get_type(uds) != SOCK_DGRAM) { - if (uds_is_listening(uds)) - return EOPNOTSUPP; - if (uds_is_connecting(uds)) - return EALREADY; - if (uds_is_connected(uds)) - return EISCONN; - /* Disconnected sockets may be reconnected, see below. */ - } else { - /* - * Connectionless sockets may be unconnected by providing an - * address with family AF_UNSPEC. Handle this case first here. - */ - if (addr_len >= offsetof(struct sockaddr, sa_data) && - addr->sa_family == AF_UNSPEC) { - /* - * Reset this socket's previous connection to another - * socket, if any. Unconnecting has no effect on other - * sockets connected to this socket, though. - */ - if (uds_has_link(uds)) - uds_del_queue(uds->uds_link, uds); - - return OK; - } - } - - /* - * Find the socket identified by the given address. If it exists at - * all, see if it is a proper match. - */ - if ((r = uds_lookup(uds, addr, addr_len, user_endpt, &link)) != OK) - return r; - - /* - * Handle connectionless sockets first, in which case a connect links - * the socket to a send target and limits receipt to datagrams from - * that target. We actually point the socket to the peer socket, - * through uds_link. That also means that if the target socket - * disappears, we have to reset any sockets connected to it, in which - * case we return them to the unconnected state. In order to allow - * finding all sockets connected to a particular socket, we put all - * those sockets on their target's queue, hence why we use uds_link and - * not uds_conn. As mentioned before, we allow reconnecting without - * restrictions. - * TODO: see if reconnecting should clear a pending ECONNRESET. - * - * An important note: 'uds' and 'link' may actually be the same socket, - * if the caller chooses to connect a socket with itself! - */ - if (uds_get_type(uds) == SOCK_DGRAM) { - /* Reconnecting to the same socket has no effect. */ - if (uds_has_link(uds) && uds->uds_link == link) - return OK; - - /* - * If the intended target is linked to another socket, we - * refuse linking to it. Sending or receiving would never work - * anyway. Do allow a socket to link to itself after being - * linked to another socket. The error code is the same as in - * the sending code, borrowed from Linux. - */ - if (uds != link && uds_has_link(link) && link->uds_link != uds) - return EPERM; - - /* - * Reset this socket's previous link to another socket, if any. - */ - if (uds_has_link(uds)) - uds_del_queue(uds->uds_link, uds); - - /* - * Reset any links to this socket, except for the one by - * the intended target. Sending or receiving would no longer - * work anyway. If the socket was linked to itself, clear its - * self-link without generating an ECONNRESET. If the socket - * is relinking to itself, reestablish the link after first - * clearing it. - */ - uds_clear_queue(uds, (uds != link) ? link : NULL); - - uds_add_queue(link, uds); - - return OK; - } - - /* - * For connection-oriented sockets there is more to do. First, make - * sure that the peer is a listening socket, that it has not been shut - * down, and that its backlog is not already at the configured maximum. - */ - if (!uds_is_listening(link)) - return ECONNREFUSED; - - if (uds_is_shutdown(link, SFL_SHUT_RD | SFL_SHUT_WR)) - return ECONNREFUSED; - - if (link->uds_queued >= link->uds_backlog) - return ECONNREFUSED; - - /* - * The behavior of connect(2) now depends on whether LOCAL_CONNWAIT is - * set on either the connecting or the listening socket. If it is not, - * the socket will be connected to a new as-yet invisible socket, which - * will be the one returned from accept(2) later. If it was, the - * socket will be put in the connecting state. - */ - if (!((uds->uds_flags | link->uds_flags) & UDSF_CONNWAIT)) { - if ((r = uds_attach(link, uds)) != OK) - return r; - - assert(uds_is_connected(uds)); - } else { - /* - * Disconnected sockets now stop being connected. Any pending - * data can still be received, though. - */ - uds->uds_flags &= ~UDSF_CONNECTED; - - r = SUSPEND; - } - - /* Obtain credentials for the socket. */ - uds_get_cred(uds, user_endpt); - - /* Add the socket at the end of the listening socket's queue. */ - uds_add_queue(link, uds); - - assert(r != SUSPEND || uds_is_connecting(uds)); - - /* - * Let an accept call handle the rest, which will in turn resume this - * connect call. The sockevent library ensures that this works even if - * the call is non-blocking. - */ - sockevent_raise(&link->uds_sock, SEV_ACCEPT); - - return r; -} - -/* - * Put a socket in listening mode. - */ -static int -uds_listen(struct sock * sock, int backlog) -{ - struct udssock *uds = (struct udssock *)sock; - - /* The maximum backlog value must not exceed its field size. */ - assert(SOMAXCONN <= USHRT_MAX); - - dprintf(("UDS: listen(%d)\n", uds_get_id(uds))); - - /* Only connection-oriented types may be put in listening mode. */ - if (uds_get_type(uds) == SOCK_DGRAM) - return EOPNOTSUPP; - - /* A connecting or connected socket may not listen. */ - if (uds_is_connecting(uds) || uds_is_connected(uds)) - return EINVAL; - - /* POSIX says that this is now the appropriate error code here. */ - if (!uds_is_bound(uds)) - return EDESTADDRREQ; - - /* - * The socket is now entering the listening state. If it was - * previously disconnected, clear the connection flag. - */ - uds->uds_flags &= ~UDSF_CONNECTED; - - /* - * We do not remove sockets from the backlog if it is now being dropped - * below the current number of queued sockets. We only refuse newly - * connecting sockets beyond the backlog size. - */ - uds->uds_backlog = backlog; - - return OK; -} - -/* - * Test whether an accept request would block. Return OK if a socket could be - * accepted, an appropriate error code if an accept call would fail instantly, - * or SUSPEND if the accept request would block waiting for a connection. - */ -static int -uds_test_accept(struct sock * sock) -{ - struct udssock *uds = (struct udssock *)sock; - - /* - * Ensure that the socket is in listening mode. If not, we must return - * the error code that is appropriate for this socket type. - */ - if (uds_get_type(uds) == SOCK_DGRAM) - return EOPNOTSUPP; - if (!uds_is_listening(uds)) - return EINVAL; - - /* - * If the socket has been shut down, new connections are no longer - * accepted and accept calls no longer block. This is not a POSIX - * requirement, but rather an application convenience feature. - */ - if (uds->uds_queued == 0) { - if (uds_is_shutdown(uds, SFL_SHUT_RD | SFL_SHUT_WR)) - return ECONNABORTED; - - return SUSPEND; - } - - return OK; -} - -/* - * Accept a connection on a listening socket, creating a new socket. On - * success, return the new socket identifier, with the new socket stored in - * 'newsockp'. Otherwise, return an error code. - */ -static sockid_t -uds_accept(struct sock * sock, struct sockaddr * addr, socklen_t * addr_len, - endpoint_t user_endpt __unused, struct sock ** newsockp) -{ - struct udssock *uds = (struct udssock *)sock; - struct udssock *link, *conn; - sockid_t r; - - dprintf(("UDS: accept(%d)\n", uds_get_id(uds))); - - if ((r = uds_test_accept(sock)) != OK) - return r; - - /* - * Take the first connecting socket off the listening queue. - */ - assert(!TAILQ_EMPTY(&uds->uds_queue)); - - link = TAILQ_FIRST(&uds->uds_queue); - - /* - * Depending on the LOCAL_CONNWAIT setting at the time of connect(2), - * the socket may be connecting or connected. In the latter case, its - * attached socket is the socket we will return now. Otherwise we have - * to attach a socket first. - */ - assert(uds_is_connecting(link) || uds_is_connected(link)); - - if (uds_is_connecting(link)) { - /* - * Attach a new socket. If this fails, return the error but - * leave the connecting socket on the listening queue. - */ - if ((r = uds_attach(uds, link)) != OK) - return r; - - assert(uds_is_connected(link)); - - /* - * Wake up blocked (connect, send, select) calls on the peer - * socket. - */ - sockevent_raise(&link->uds_sock, SEV_CONNECT); - } - - uds_del_queue(uds, link); - - /* Return the peer socket's address to the caller. */ - uds_make_addr(link->uds_path, link->uds_pathlen, addr, addr_len); - - conn = link->uds_conn; - - dprintf(("UDS: accept returns %d\n", uds_get_id(conn))); - - /* - * We already cloned the sock object, so return its ID but not a - * pointer to it. That tells libsockevent not to reinitialize it. - */ - *newsockp = NULL; - return uds_get_id(conn); -} - -/* - * Set socket options. - */ -static int -uds_setsockopt(struct sock * sock, int level, int name, - const struct sockdriver_data * data, socklen_t len) -{ - struct udssock *uds = (struct udssock *)sock; - int r, val; - - dprintf(("UDS: setsockopt(%d,%d,%d)\n", uds_get_id(uds), level, name)); - - switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_SNDBUF: - case SO_RCVBUF: - /* - * The send buffer size may not be changed because the - * buffer is the same as the other side's receive - * buffer, and what the other side is may vary from - * send call to send call. Changing the receive buffer - * size would disallow us from even accurately guessing - * the send buffer size in getsockopt calls. Therefore - * both are hardcoded and cannot actually be changed. - * In order to support applications that want at least - * a certain minimum, we do accept requests to shrink - * either buffer, but we ignore the given size. - */ - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val <= 0 || (size_t)val > uds_io_buflen()) - return EINVAL; - - return OK; /* ignore new value */ - } - - break; - - case UDSPROTO_UDS: - switch (name) { - case LOCAL_CREDS: - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val) - uds->uds_flags |= UDSF_PASSCRED; - else - uds->uds_flags &= ~UDSF_PASSCRED; - - /* - * In incredibly rare cases, disabling this flag may - * allow blocked sends to be resumed, because suddenly - * no room for the credentials is needed in the receive - * buffer anymore. - */ - if (!val) - sockevent_raise(&uds->uds_sock, SEV_SEND); - - return OK; - - case LOCAL_CONNWAIT: - if ((r = sockdriver_copyin_opt(data, &val, sizeof(val), - len)) != OK) - return r; - - if (val) - uds->uds_flags |= UDSF_CONNWAIT; - else - uds->uds_flags &= ~UDSF_CONNWAIT; - - /* - * Changing the setting does not affect sockets that - * are currently pending to be accepted. Therefore, - * uds_accept() may have to deal with either case on a - * socket-by-socket basis. - */ - return OK; - - case LOCAL_PEEREID: - /* This option may be retrieved but not set. */ - return ENOPROTOOPT; - } - - break; - } - - return ENOPROTOOPT; -} - -/* - * Retrieve socket options. - */ -static int -uds_getsockopt(struct sock * sock, int level, int name, - const struct sockdriver_data * data, socklen_t * len) -{ - struct udssock *uds = (struct udssock *)sock; - int val; - - dprintf(("UDS: getsockopt(%d,%d,%d)\n", uds_get_id(uds), level, name)); - - switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_SNDBUF: - case SO_RCVBUF: - /* See uds_setsockopt() for why this is static. */ - val = (int)uds_io_buflen(); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - } - - break; - - case UDSPROTO_UDS: - switch (name) { - case LOCAL_CREDS: - val = !!(uds->uds_flags & UDSF_PASSCRED); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case LOCAL_CONNWAIT: - val = !!(uds->uds_flags & UDSF_CONNWAIT); - - return sockdriver_copyout_opt(data, &val, sizeof(val), - len); - - case LOCAL_PEEREID: - /* getpeereid(3) documents these error codes. */ - if (uds_get_type(uds) == SOCK_DGRAM) - return EINVAL; - if (!uds_is_connected(uds)) - return ENOTCONN; - - /* - * This is a custom MINIX3 error, indicating that there - * are no credentials to return. This could be due to - * a failure to obtain them (which *should* not happen) - * but also if the socket was bound while connected, - * disconnected, and then reused as listening socket. - */ - if (uds->uds_conn->uds_cred.unp_pid == -1) - return EINVAL; - - return sockdriver_copyout_opt(data, - &uds->uds_conn->uds_cred, - sizeof(uds->uds_conn->uds_cred), len); - } - - break; - } - - return ENOPROTOOPT; -} - -/* - * Retrieve a socket's local address. - */ -static int -uds_getsockname(struct sock * sock, struct sockaddr * addr, - socklen_t * addr_len) -{ - struct udssock *uds = (struct udssock *)sock; - - dprintf(("UDS: getsockname(%d)\n", uds_get_id(uds))); - - uds_make_addr(uds->uds_path, uds->uds_pathlen, addr, addr_len); - - return OK; -} - -/* - * Retrieve a socket's remote address. - */ -static int -uds_getpeername(struct sock * sock, struct sockaddr * addr, - socklen_t * addr_len) -{ - struct udssock *uds = (struct udssock *)sock; - struct udssock *peer; - - dprintf(("UDS: getpeername(%d)\n", uds_get_id(uds))); - - /* - * For disconnected sockets, we no longer have a peer socket and thus - * also no peer address. Too bad, but NetBSD does the same. - * - * For connecting sockets we could in fact return a peer address, but - * POSIX says (and other platforms agree) that we should deny the call. - */ - peer = uds_get_peer(uds); - - if (peer == NULL || uds_is_connecting(uds)) - return ENOTCONN; - - uds_make_addr(peer->uds_path, peer->uds_pathlen, addr, addr_len); - - return OK; -} - -/* - * Shut down socket send and receive operations. Note that 'flags' is a - * bitwise mask with libsockevent's SFL_SHUT_{RD,WR} flags rather than the set - * of SHUT_{RD,WR,RDWR} values from userland. - */ -static int -uds_shutdown(struct sock * sock, unsigned int flags) -{ - struct udssock *uds = (struct udssock *)sock; - struct udssock *conn; - unsigned int mask; - - dprintf(("UDS: shutdown(%d,0x%x)\n", uds_get_id(uds), flags)); - - /* - * If we are shutting down the socket for reading, we can already close - * any in-flight file descriptors associated with this socket. - */ - if (flags & SFL_SHUT_RD) - uds_io_reset(uds); - - /* - * A shutdown on this side of a connection may have an effect on - * ongoing operations on the other side. Fire appropriate events. - */ - if (uds_is_connected(uds)) { - assert(uds_get_type(uds) != SOCK_DGRAM); - - conn = uds->uds_conn; - - mask = 0; - if (flags & SFL_SHUT_RD) - mask |= SEV_SEND; - if (flags & SFL_SHUT_WR) - mask |= SEV_RECV; - - sockevent_raise(&conn->uds_sock, mask); - } - - return OK; -} - -/* - * Close a socket. - * - * The 'force' flag is unused because we need never wait for data to be sent, - * since we keep all in-flight data on the receiver side. - */ -static int -uds_close(struct sock * sock, int force __unused) -{ - struct udssock *uds = (struct udssock *)sock; - - dprintf(("UDS: close(%d)\n", uds_get_id(uds))); - - if (uds_get_type(uds) == SOCK_DGRAM) { - /* If this socket is linked to a target, disconnect it. */ - if (uds_has_link(uds)) - uds_del_queue(uds->uds_link, uds); - - /* Reset all sockets linked to this socket as a target. */ - uds_clear_queue(uds, NULL); - } else if (uds_is_listening(uds)) { - /* - * Abort all connecting sockets queued on this socket, and - * break all connections for connected sockets queued on this - * socket, freeing their peers. - */ - uds_clear_queue(uds, NULL); - } else if (uds_has_link(uds)) { - /* - * This socket is connecting or connected while the other side - * has not been accepted yet. Remove the socket from the - * listening socket's queue, and if it was connected, get rid - * of its peer socket altogether. - */ - assert(uds_is_listening(uds->uds_link)); - - uds_del_queue(uds->uds_link, uds); - - if (uds_is_connected(uds)) - uds_disconnect(uds, TRUE /*was_linked*/); - } else if (uds_is_connected(uds)) { - /* - * Decouple the peer socket from this socket, and possibly wake - * up any pending operations on it. The socket remains marked - * as connected, but will now be disconnected. - */ - uds_disconnect(uds, FALSE /*was_linked*/); - } - - if (uds_is_hashed(uds)) - udshash_del(uds); - - return OK; -} - -static const struct sockevent_ops uds_ops = { - .sop_pair = uds_pair, - .sop_bind = uds_bind, - .sop_connect = uds_connect, - .sop_listen = uds_listen, - .sop_accept = uds_accept, - .sop_test_accept = uds_test_accept, - .sop_pre_send = uds_pre_send, - .sop_send = uds_send, - .sop_test_send = uds_test_send, - .sop_pre_recv = uds_pre_recv, - .sop_recv = uds_recv, - .sop_test_recv = uds_test_recv, - .sop_setsockopt = uds_setsockopt, - .sop_getsockopt = uds_getsockopt, - .sop_getsockname = uds_getsockname, - .sop_getpeername = uds_getpeername, - .sop_shutdown = uds_shutdown, - .sop_close = uds_close, - .sop_free = uds_free -}; - -/* - * Initialize the service. - */ -static int -uds_init(int type __unused, sef_init_info_t * info __unused) -{ - unsigned int i; - - /* Initialize the list of free sockets. */ - TAILQ_INIT(&uds_freelist); - - for (i = 0; i < __arraycount(uds_array); i++) { - uds_array[i].uds_flags = 0; - - TAILQ_INSERT_TAIL(&uds_freelist, &uds_array[i], uds_next); - } - - /* Initialize the file-to-socket hash table. */ - udshash_init(); - - /* Initialize the input/output module. */ - uds_io_init(); - - /* Initialize the status module. */ - uds_stat_init(); - - /* Initialize the sockevent library. */ - sockevent_init(uds_socket); - - uds_in_use = 0; - uds_running = TRUE; - - return OK; -} - -/* - * Clean up before shutdown. - */ -static void -uds_cleanup(void) -{ - - /* Tell the status module to clean up. */ - uds_stat_cleanup(); -} - -/* - * The service has received a signal. - */ -static void -uds_signal(int signo) -{ - - /* Only check for the termination signal. Ignore anything else. */ - if (signo != SIGTERM) - return; - - /* Exit only once all sockets have been closed. */ - uds_running = FALSE; - - if (uds_in_use == 0) - sef_cancel(); -} - -/* - * Perform initialization using the System Event Framework (SEF). - */ -static void -uds_startup(void) -{ - - /* Register initialization callbacks. */ - sef_setcb_init_fresh(uds_init); - - /* Register signal callback. */ - sef_setcb_signal_handler(uds_signal); - - /* Let SEF perform startup. */ - sef_startup(); -} - -/* - * The UNIX Domain Sockets driver. - */ -int -main(void) -{ - message m; - int r, ipc_status; - - /* Initialize the service. */ - uds_startup(); - - /* Loop receiving and processing messages until instructed to stop. */ - while (uds_running || uds_in_use > 0) { - if ((r = sef_receive_status(ANY, &m, &ipc_status)) != OK) { - if (r == EINTR) - continue; /* sef_cancel() was called */ - - panic("UDS: sef_receive_status failed: %d", r); - } - - /* - * Messages from the MIB service are (ultimately) for the - * status module. Everything else is assumed to be a socket - * request and passed to libsockevent, which will ignore - * anything it does not recognize. - */ - if (m.m_source == MIB_PROC_NR) - rmib_process(&m, ipc_status); - else - sockevent_process(&m, ipc_status); - } - - /* Clean up before graceful shutdown. */ - uds_cleanup(); - - return EXIT_SUCCESS; -} diff --git a/minix/tests/Makefile b/minix/tests/Makefile deleted file mode 100644 index 4662fe872..000000000 --- a/minix/tests/Makefile +++ /dev/null @@ -1,136 +0,0 @@ -NOGCCERROR:= yes - -BINDIR?= /usr/tests/minix-posix -FILESDIR?= /usr/tests/minix-posix -WARNS?= 1 - -# Tests have no manpages -MKMAN= no - -# They are all bin-owned; by default normal executable mode -BINOWN= bin - -# Needed by testsh1.sh -FILES= test1.c - -CFLAGS+= -fno-builtin -LDADD+= -lm - -.include - -SUBDIR+= blocktest -SUBDIR+= ddekit -SUBDIR+= rmibtest - -# Some have special flags compiling -CPPFLAGS.test56.c += -D_MINIX_SYSTEM=1 - -COPTS.test9.c= -O0 -COPTS.test37.c= -O0 -COPTS.test53.c= -O0 -COPTS.test68.c= -O0 - -# Some have special libraries -LDADD.test59= -lmthread -LDADD.test76= -lutil -LDADD.test77= -lutil - -# Some have an extra file -OBJS.test57= test57loop.o -OBJS.test56+= common-socket.o -OBJS.test80+= common-socket.o -OBJS.test81+= common-socket.o - -# Cache testing programs -OBJS.test71+= testcache.o -OBJS.test72+= testcache.o -OBJS.test74+= testcache.o -LDADD.test72+= -lminixfs - -PROGS += testvm -OBJS.testvm+= testcache.o -LDSTATIC.testvm= -static -LDADD.testvm+= -lsys -ltimers - -FILES += testvm.conf - -# Network stack testing programs -OBJS.test90+= socklib.o -OBJS.test91+= socklib.o -OBJS.test92+= socklib.o -OBJS.test93+= socklib.o -# Uncomment the following lines to use SOCKLIB_SWEEP_GENERATE=1/2 in socklib.c -#.PATH: ${NETBSDSRCDIR}/minix/usr.bin/trace -#OBJS.test90+= error.o -#OBJS.test91+= error.o -#OBJS.test92+= error.o -#OBJS.test93+= error.o - -.if ${USE_INET6} == "no" -# Tests 91-94 will fail without IPv6 support, but they should at least compile. -CPPFLAGS.socklib.c += -DNO_INET6 -CPPFLAGS.test94.c += -DNO_INET6 -.endif # ${USE_INET6} == "no" - -# Tests to compile, For every architecture -MINIX_TESTS= \ - 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 \ -21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 \ -41 42 43 44 45 46 48 49 50 52 53 54 55 56 58 59 60 \ -61 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 \ -81 82 83 84 85 86 87 88 89 90 91 92 93 94 - -FILES += t84_h_nonexec.sh - -.if ${MACHINE_ARCH} == "i386" -MINIX_TESTS+= \ - 47 51 57 \ - 62 -.endif # ${MACHINE_ARCH} == "i386" - -.for t in ${MINIX_TESTS} -PROGS+= test${t} -.endfor - -PROGS+= t10a t11a t11b t40a t40b t40c t40d t40e t40f t40g t60a t60b \ - t67a t67b t68a t68b tvnd t84_h_spawn t84_h_spawnattr - -SCRIPTS+= run check-install testinterp.sh testsh1.sh testsh2.sh testmfs.sh \ - testisofs.sh testvnd.sh testkyua.sh testrelpol.sh testrmib.sh - -# test57loop.S is not linked into the .bcl file. -# This way, we can link it in when linking the final binary -LDADD.test57+= ${${USE_BITCODE:Uno} != "no":? test57loop.o -Wl,-allow-multiple-definition:} - -.if ${MKPIC} == "yes" -# Build them as dynamic executables by default if shared libraries -# are available; so that the building and executing of dynamic -# executables is tested -LDSTATIC= -dynamic - -LDFLAGS.mod+= -shared # make shared object - -# Files which have to be compiled with -fPIC -mod.o: mod.c - ${COMPILE.c} -fPIC ${.IMPSRC} - -common.o: common.c - ${COMPILE.c} -fPIC ${.IMPSRC} - -# Add test that must be linked dynamically, and its dynamically loaded -# module -PROGS+= test63 mod -.endif # ${MKPIC} == "yes" - -.for o in ${PROGS} -OBJS.${o} += common.o -.endfor - -.include "./arch/${MACHINE_ARCH}/Makefile.inc" - -# LSC Make sure there is not leftover after a failed testrun -clean: .PHONY .MAKE - @rm -rf DIR* - -.include -.include diff --git a/minix/tests/kernel/sys_padconf/Makefile b/minix/tests/kernel/sys_padconf/Makefile deleted file mode 100644 index ad82370c5..000000000 --- a/minix/tests/kernel/sys_padconf/Makefile +++ /dev/null @@ -1,16 +0,0 @@ -# Makefile for the sys_padconf test. - -.include - -PROG= padconftest -SRCS= padconftest.c - -DPADD+= ${LIBSYS} -LDADD+= -lsys - -MAN= - -BINDIR?= /usr/tests/minix-posix - -.include "${NETBSDSRCDIR}/drivers/Makefile.inc" -.include diff --git a/minix/tests/test27.c b/minix/tests/test27.c deleted file mode 100644 index fb5345734..000000000 --- a/minix/tests/test27.c +++ /dev/null @@ -1,310 +0,0 @@ -/* test27: stat() fstat() Author: Jan-Mark Wams (jms@cs.vu.nl) */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define MODE_MASK (S_IRWXU | S_IRWXG | S_IRWXO | S_ISUID | S_ISGID) -int max_error = 4; -#include "common.h" - -#define ITERATIONS 2 - -#define System(cmd) if (system(cmd) != 0) printf("``%s'' failed\n", cmd) -#define Chdir(dir) if (chdir(dir) != 0) printf("Can't goto %s\n", dir) - - -int superuser; -char *MaxName; /* Name of maximum length */ -char MaxPath[PATH_MAX]; -char *ToLongName; /* Name of maximum +1 length */ -char ToLongPath[PATH_MAX + 1]; - -void test27a(void); -void test27b(void); -void test27c(void); -void makelongnames(void); - -int main(int argc, char *argv[]) -{ - int i, m = 0xFFFF; - - start(27); - if (argc == 2) m = atoi(argv[1]); - superuser = (getuid() == 0); - makelongnames(); - - for (i = 0; i < ITERATIONS; i++) { - if (m & 0001) test27a(); - if (m & 0002) test27b(); - if (m & 0004) test27c(); - } - quit(); - - return(-1); /* Unreachable */ -} - -void test27a() -{ /* Test Normal operation. */ - struct stat st1, st2; - time_t time1, time2; - int fd, pfd[2]; - - subtest = 1; - - time(&time1); /* get time before */ - while (time1 >= time((time_t *)0)) - ; /* Wait for time to change. */ - System("echo 7bytes > foo; chmod 4750 foo"); - if (stat("foo", &st1) != 0) e(1); /* get foo's info */ - time(&time2); - while (time2 >= time((time_t *)0)) - ; /* Wait for next second. */ - time(&time2); /* get time after */ - if ((st1.st_mode & MODE_MASK) != 04750) e(2); - if (st1.st_nlink != 1) e(3); /* check stat */ - if (st1.st_uid != geteuid()) e(4); -#if defined(NGROUPS_MAX) && NGROUPS_MAX == 0 - if (st1.st_gid != getegid()) e(5); -#endif /* defined(NGROUPS_MAX) && NGROUPS_MAX == 0 */ - if (st1.st_size != (size_t) 7) e(6); - if (st1.st_atime <= time1) e(7); - if (st1.st_atime >= time2) e(8); - if (st1.st_ctime <= time1) e(9); - if (st1.st_ctime >= time2) e(10); - if (st1.st_mtime <= time1) e(11); - if (st1.st_mtime >= time2) e(12); - - /* Compair stat and fstat. */ - System("echo 7bytes > bar"); - fd = open("bar", O_RDWR | O_APPEND); /* the bar is open! */ - if (fd != 3) e(13); /* should be stderr + 1 */ - if (stat("bar", &st1) != 0) e(14); /* get bar's info */ - if (fstat(fd, &st2) != 0) e(15); /* get bar's info */ - - /* St1 en st2 should be the same. */ - if (st1.st_dev != st2.st_dev) e(16); - if (st1.st_ino != st2.st_ino) e(17); - if (st1.st_mode != st2.st_mode) e(18); - if (st1.st_nlink != st2.st_nlink) e(19); - if (st1.st_uid != st2.st_uid) e(20); - if (st1.st_gid != st2.st_gid) e(21); - if (st1.st_size != st2.st_size) e(22); - if (st1.st_atime != st2.st_atime) e(23); - if (st1.st_ctime != st2.st_ctime) e(24); - if (st1.st_mtime != st2.st_mtime) e(25); - time(&time1); /* wait a sec. */ - while (time1 >= time((time_t *)0)) - ; - System("chmod 755 bar"); /* chainge mode */ - System("rm -f foobar; ln bar foobar"); /* chainge # links */ - if (write(fd, "foo", 4) != 4) e(26); /* write a bit (or two) */ - if (stat("bar", &st2) != 0) e(27); /* get new info */ - if (st2.st_dev != st1.st_dev) e(28); - if (st2.st_ino != st1.st_ino) e(29); /* compair the fealds */ - if ((st2.st_mode & MODE_MASK) != 0755) e(30); - if (!S_ISREG(st2.st_mode)) e(31); - if (st2.st_nlink != st1.st_nlink + 1) e(32); - if (st2.st_uid != st1.st_uid) e(33); - if (st2.st_gid != st1.st_gid) e(34); - if (st2.st_size != (size_t) 11) e(35); - if (st2.st_atime != st1.st_atime) e(36); - if (st2.st_ctime <= st1.st_ctime) e(37); - if (st2.st_mtime <= st1.st_mtime) e(38); - if (close(fd) != 0) e(39); /* sorry the bar is closed */ - - /* Check special file. */ - if (stat("/dev/tty", &st1) != 0) e(40); - if (!S_ISCHR(st1.st_mode)) e(41); -#if defined(__minix) && defined(_NETBSD_SOURCE) - if (stat("/dev/ram", &st1) != 0) e(42); - if (!S_ISBLK(st1.st_mode)) e(43); -#endif - - /* Check fifos. */ - time(&time1); - while (time1 >= time((time_t *)0)) - ; - if (mkfifo("fifo", 0640) != 0) e(44); - if (stat("fifo", &st1) != 0) e(45); /* get fifo's info */ - time(&time2); - while (time2 >= time((time_t *)0)) - ; - time(&time2); - if (!S_ISFIFO(st1.st_mode)) e(46); - if (st1.st_nlink != 1) e(47); /* check the stat info */ - if (st1.st_uid != geteuid()) e(48); -#if defined(NGROUPS_MAX) && NGROUPS_MAX == 0 - if (st1.st_gid != getegid()) e(49); -#endif /* defined(NGROUPS_MAX) && NGROUPS_MAX == 0 */ - if (st1.st_size != (size_t) 0) e(50); - if (st1.st_atime <= time1) e(51); - if (st1.st_atime >= time2) e(52); - if (st1.st_ctime <= time1) e(53); - if (st1.st_ctime >= time2) e(54); - if (st1.st_mtime <= time1) e(55); - if (st1.st_mtime >= time2) e(56); - - /* Note: the st_mode of a fstat on a pipe should contain a isfifo bit. */ - /* Check pipes. */ - time(&time1); - while (time1 >= time((time_t *)0)) - ; - if (pipe(pfd) != 0) e(57); - if (fstat(pfd[0], &st1) != 0) e(58); /* get pipe input info */ - time(&time2); - while (time2 >= time((time_t *)0)) - ; - time(&time2); - if (!(S_ISFIFO(st1.st_mode))) e(59); /* check stat struct */ - if (st1.st_uid != geteuid()) e(60); - if (st1.st_gid != getegid()) e(61); - if (st1.st_size != (size_t) 0) e(62); - if (st1.st_atime <= time1) e(63); - if (st1.st_atime >= time2) e(64); - if (st1.st_ctime <= time1) e(65); - if (st1.st_ctime >= time2) e(66); - if (st1.st_mtime <= time1) e(67); - if (st1.st_mtime >= time2) e(68); - if (fstat(pfd[1], &st1) != 0) e(69); /* get pipe output info */ - if (!(S_ISFIFO(st1.st_mode))) e(70); - if (st1.st_uid != geteuid()) e(71); - if (st1.st_gid != getegid()) e(72); - if (st1.st_size != (size_t) 0) e(73); - if (st1.st_atime < time1) e(74); - if (st1.st_atime > time2) e(75); - if (st1.st_ctime < time1) e(76); - if (st1.st_ctime > time2) e(77); - if (st1.st_mtime < time1) e(78); - if (st1.st_mtime > time2) e(79); - if (close(pfd[0]) != 0) e(80); - if (close(pfd[1]) != 0) e(81);/* close pipe */ - - /* Check dirs. */ - time(&time1); - while (time1 >= time((time_t *)0)) - ; - System("mkdir dir"); - if (stat("dir", &st1) != 0) e(82); /* get dir info */ - time(&time2); - while (time2 >= time((time_t *)0)) - ; - time(&time2); - if (!(S_ISDIR(st1.st_mode))) e(83); /* check stat struct */ - if (st1.st_uid != geteuid()) e(84); -#if defined(NGROUPS_MAX) && NGROUPS_MAX == 0 - if (st1.st_gid != getegid()) e(85); -#endif /* defined(NGROUPS_MAX) && NGROUPS_MAX == 0 */ - if (st1.st_atime < time1) e(86); - if (st1.st_atime > time2) e(87); - if (st1.st_ctime < time1) e(88); - if (st1.st_ctime > time2) e(89); - if (st1.st_mtime < time1) e(90); - if (st1.st_mtime > time2) e(91); - System("rm -rf ../DIR_27/*"); -} - -void test27b() -{ /* Test maxima. */ - struct stat st; - int fd; - - subtest = 2; - - /* Check stats on maximum length files names. */ - if (mkdir(MaxName, 0777) != 0) e(1); - if (stat(MaxName, &st) != 0) e(2); - if ((fd = open(MaxName, O_RDONLY)) != 3) e(3); - if (fstat(fd, &st) != 0) e(4); - if (close(fd) != 0) e(5); - if (rmdir(MaxName) != 0) e(6); - if (stat(MaxPath, &st) != 0) e(7); - if ((fd = open(MaxPath, O_RDONLY)) != 3) e(8); - if (fstat(fd, &st) != 0) e(9); - if (close(fd) != 0) e(10); - System("rm -rf ../DIR_27/*"); -} - -void test27c() -{ /* Test error response. */ - struct stat st; - int fd, i; - - subtest = 3; - - System("echo Hi > foo"); /* Make a file called foo. */ - /* Check if a un searchable dir is handled ok. */ - Chdir(".."); /* cd .. */ - System("chmod 677 DIR_27"); /* no search permission */ - if (stat("DIR_27/nono", &st) != -1) e(1); - if (superuser) { - if (errno != ENOENT) e(2); /* su has access */ - } - if (!superuser) { - if (errno != EACCES) e(3); /* we don't ;-) */ - } - System("chmod 777 DIR_27"); - Chdir("DIR_27"); /* back to test dir */ - - /* Check on ToLongName etc. */ - if (stat(ToLongPath, &st) != -1) e(6); /* path is too long */ - if (errno != ENAMETOOLONG) e(7); - - /* Test some common errors. */ - if (stat("nono", &st) != -1) e(8); /* nono nonexistent */ - if (errno != ENOENT) e(9); - if (stat("", &st) != -1) e(10); /* try empty */ - if (errno != ENOENT) e(11); - if (stat("foo/bar", &st) != -1) e(12); /* foo is a file */ - if (errno != ENOTDIR) e(13); - - /* Test fstat on file descriptors that are not open. */ - for (i = 3; i < 6; i++) { - if (fstat(i, &st) != -1) e(14); - if (errno != EBADF) e(15); - } - - /* Test if a just closed file is `fstat()'-able. */ - if ((fd = open("foo", O_RDONLY)) != 3) e(16); /* open foo */ - if (fstat(fd, &st) != 0) e(17); /* get stat */ - if (close(fd) != 0) e(18); /* close it */ - if (fstat(fd, &st) != -1) e(19); /* get stat */ - if (errno != EBADF) e(20); - System("rm -rf ../DIR_27/*"); -} - -void makelongnames() -{ - register int i; - int max_name_length; - - max_name_length = name_max("."); /* Aka NAME_MAX, but not every FS supports - * the same length, hence runtime check */ - MaxName = malloc(max_name_length + 1); - ToLongName = malloc(max_name_length + 1 + 1); /* Name of maximum +1 length */ - memset(MaxName, 'a', max_name_length); - MaxName[max_name_length] = '\0'; - - for (i = 0; i < PATH_MAX - 1; i++) { /* idem path */ - MaxPath[i++] = '.'; - MaxPath[i] = '/'; - } - MaxPath[PATH_MAX - 1] = '\0'; - - strcpy(ToLongName, MaxName); /* copy them Max to ToLong */ - strcpy(ToLongPath, MaxPath); - - ToLongName[max_name_length] = 'a'; - ToLongName[max_name_length+1] = '\0';/* extend ToLongName by one too many */ - ToLongPath[PATH_MAX - 1] = '/'; - ToLongPath[PATH_MAX] = '\0'; /* inc ToLongPath by one */ -} - diff --git a/minix/tests/test28.c b/minix/tests/test28.c deleted file mode 100644 index ce8b3557f..000000000 --- a/minix/tests/test28.c +++ /dev/null @@ -1,405 +0,0 @@ - /* test28: mkdir() rmdir() Author: Jan-Mark Wams (jms@cs.vu.nl) */ - -/* -** Not tested readonly file systems (EROFS.) -** Not tested fs full (ENOSPC.) -** Not really tested EBUSY. -** Not tested unlinking busy directories. -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -int max_error = 4; -#include "common.h" - -#define ITERATIONS 2 - - -#define DIRENT0 ((struct dirent *) NULL) - -#define System(cmd) if (system(cmd) != 0) printf("``%s'' failed\n", cmd) -#define Chdir(dir) if (chdir(dir) != 0) printf("Can't goto %s\n", dir) - -int subtest = 1; -int superuser; -char *MaxName; /* Name of maximum length */ -char MaxPath[PATH_MAX]; -char *ToLongName; /* Name of maximum +1 length */ -char ToLongPath[PATH_MAX + 1]; - -void test28a(void); -void test28c(void); -void test28b(void); -void makelongnames(void); - -int main(int argc, char *argv[]) -{ - int i, m = 0xFFFF; - - sync(); - if (argc == 2) m = atoi(argv[1]); - start(28); - superuser = (getuid() == 0); - makelongnames(); - umask(0000); /* no umask */ - - for (i = 0; i < ITERATIONS; i++) { - if (m & 0001) test28a(); - if (m & 0002) test28b(); - if (m & 0004) test28c(); - } - quit(); - - return(-1); /* Unreachable */ -} - -void test28a() -{ - int mode; /* used in for loop */ - struct stat st; - time_t time1, time2; - DIR *dirp; - struct dirent *dep; - int dot = 0, dotdot = 0; - - subtest = 1; - - System("rm -rf foo /tmp/foo");/* clean up junk */ - - /* Check relative path names */ - if (mkdir("./foo", 0777) != 0) e(1); /* make a dir foo */ - if (mkdir("./foo/bar", 0777) != 0) e(2); /* make foo/bar */ - if (rmdir("foo/bar") != 0) e(3); /* delete bar */ - if (mkdir("foo/../foo/bar", 0777) != 0) e(4); /* make bar again */ - if (rmdir("./foo/bar") != 0) e(5); /* and remove again */ - - /* Foo should be empty (ie. contain only "." and ".." */ - if ((dirp = opendir("foo")) == (DIR *) NULL) e(6); /* open foo */ - if ((dep = readdir(dirp)) == DIRENT0) e(7); /* get first entry */ - if (strcmp(dep->d_name, ".") == 0) dot += 1; /* record what it is */ - if (strcmp(dep->d_name, "..") == 0) dotdot += 1; - if ((dep = readdir(dirp)) == DIRENT0) e(8); /* get second entry */ - if (strcmp(dep->d_name, ".") == 0) dot += 1; /* record again */ - if (strcmp(dep->d_name, "..") == 0) dotdot += 1; - if ((dep = readdir(dirp)) != DIRENT0) e(9); /* no 3d entry */ - if (dot == 1 && dotdot != 1) e(10); /* only . and .. */ - if (closedir(dirp) != 0) e(11); /* close foo */ - if (rmdir("./foo") != 0) e(12); /* remove dir foo */ - - /* Check absolute path names */ - if (mkdir("/tmp/foo", 0777) != 0) e(13); - if (mkdir("/tmp/foo/bar", 0777) != 0) e(14); - if (rmdir("/tmp/foo/bar") != 0) e(15); /* make some dirs */ - if (rmdir("/tmp/foo") != 0) e(16); - - /* Check the mode arument for mkdir() */ - for (mode = 0; mode <= 0777; mode++) { - if (mkdir("foo", mode) != 0) e(17); /* make foo */ - if (stat("foo", &st) != 0) e(18); - if ((st.st_mode & 0777) != mode) e(19); /* check it's mode */ - if (rmdir("foo") != 0) e(20); /* and remove it */ - } - - /* Check the stat */ - time(&time1); - while (time1 >= time((time_t *)0)) - ; - if (mkdir("foo", 0765) != 0) e(21); /* make foo */ - if (stat("foo", &st) != 0) e(22); - time(&time2); - while (time2 >= time((time_t *)0)) - ; - time(&time2); - if (st.st_nlink != 2) e(23); - if (st.st_uid != geteuid()) e(24); - if (st.st_gid != getegid()) e(25); - if (st.st_size < 0) e(26); - if ((st.st_mode & 0777) != 0765) e(27); - if (st.st_atime <= time1) e(28); - if (st.st_atime >= time2) e(29); - if (st.st_ctime <= time1) e(30); - if (st.st_ctime >= time2) e(31); - if (st.st_mtime <= time1) e(32); - if (st.st_mtime >= time2) e(33); - - /* Check if parent is updated */ - if (stat(".", &st) != 0) e(34); - time(&time2); - while (time2 >= time((time_t *)0)) - ; - time(&time2); - if (st.st_ctime <= time1) e(35); - if (st.st_ctime >= time2) e(36); - if (st.st_mtime <= time1) e(37); - if (st.st_mtime >= time2) e(38); - time(&time1); - while (time1 >= time((time_t *)0)) - ; - if (rmdir("foo") != 0) e(39); - if (stat(".", &st) != 0) e(40); - time(&time2); - while (time2 >= time((time_t *)0)) - ; - time(&time2); - if (st.st_ctime <= time1) e(41); - if (st.st_ctime >= time2) e(42); - if (st.st_mtime <= time1) e(43); - if (st.st_mtime >= time2) e(44); -} - -void test28b() -{ /* Test critical values. */ - struct stat st; - DIR *dirp; - struct dirent *dep; - int fd; /* file descriptor */ - int other = 0, dot = 0, dotdot = 0; /* dirent counters */ - int r; /* Intermediate result */ - int rmdir_result; /* tmp var */ - int stat_loc, does_truncate; - - subtest = 2; - - System("rm -rf ../DIR_28/*"); - - /* Check funny but valid path names */ - if (mkdir("/../../..////.//../tmp/foo/", 0777) != 0) e(1); - if (mkdir("/tmp/foo//////..//foo//../foo/bar/", 0777) != 0) e(2); - if (rmdir("///tmp/..//tmp/foo/bar//../..//foo/bar") != 0) e(3); - if (mkdir("///tmp/foo/foobar//", 0777) != 0) e(4); - if (rmdir("/tmp/foo/foobar//") != 0) e(5); - if (rmdir("/.././/././/tmp/foo///////////////") != 0) e(6); - if (rmdir("/tmp/foo") != -1) e(7); /* try again */ - - /* Test max path ed. */ - if (mkdir(MaxName, 0777) != 0) e(9); /* make dir MaxName */ - if (rmdir(MaxName) != 0) e(10); /* and remove it */ - MaxPath[strlen(MaxPath) - 2] = '/'; /* convert MaxPath */ - MaxPath[strlen(MaxPath) - 1] = 'a'; /* to ././.../a */ - if (mkdir(MaxPath, 0777) != 0) e(11); /* it should be */ - if (rmdir(MaxPath) != 0) e(12); /* ok */ - - /* Test too long path ed. */ - does_truncate = does_fs_truncate(); - r = mkdir(ToLongName, 0777); - if (does_truncate ) { - /* FS truncates names, mkdir should've worked */ - if (r != 0) e(13); /* Try ToLongName */ - if (rmdir(ToLongName) != 0) e(14); /* and remove it */ - } else { - /* Too long, should've failed with ENAMETOOLONG */ - if (r == 0) e(15); - if (errno != ENAMETOOLONG) e(16); - } - ToLongPath[strlen(ToLongPath) - 2] = '/'; /* make ToLongPath */ - ToLongPath[strlen(ToLongPath) - 1] = 'a'; /* contain ././.../a */ - if (mkdir(ToLongPath, 0777) != -1) e(17); /* it should */ - if (errno != ENAMETOOLONG) e(18); /* not be ok */ - if (rmdir(ToLongPath) != -1) e(19); - if (errno != ENAMETOOLONG) e(20); - - if (mkdir("foo", 0777) != 0) e(21); - System("touch foo/xyzzy"); - - /* Test if rmdir removes only empty dirs */ - if (rmdir("foo") != -1) e(29);/* not empty */ - if (errno != EEXIST && errno != ENOTEMPTY) e(30); - /* Test if rmdir removes a dir with an empty file (it shouldn't.) */ - System("rm -rf foo"); /* cleanup */ - if (mkdir("foo", 0777) != 0) e(31); - System("> foo/empty"); /* > empty */ - if (rmdir("foo") != -1) e(32);/* not empty */ - if (errno != EEXIST && errno != ENOTEMPTY) e(33); - if (unlink("foo/empty") != 0) e(34); /* rm empty */ - - /* See what happens if foo is linked. */ -#if 0 - if (superuser) { - if (link("foo", "footoo") != 0) e(35); /* foo still */ - if (rmdir("footoo") != 0) e(36); /* exist */ - if (chdir("footoo") != -1) e(37); /* footoo */ - if (errno != ENOENT) e(38); /* is gone */ - } -#endif -#if defined(__minix) && defined(_NETBSD_SOURCE) - /* Some implementations might allow users to link directories. */ - if (!superuser) { - if (link("foo", "footoo") != -1) e(39); - if (errno != EPERM) e(40); - if (unlink("foo") != -1) e(41); - if (errno != EPERM) e(42); - } -#endif - - /* See if ".." and "." are removed from the dir, and if it is - * unwriteable - * Note, we can not remove any files in the PARENT - * process, because this - * will make readdir unpredicatble. (see - * 1003.1 page 84 line 30.) However - * removal of the directory is - * not specified in the standard. - */ - System("rm -rf /tmp/sema[12].07"); - switch (fork()) { - case -1: printf("Can't fork\n"); break; - - case 0: - alarm(20); - if ((fd = open("foo", O_RDONLY)) <= 2) e(43); /* open */ - if ((dirp = opendir("foo")) == (DIR *) NULL) e(44); /* opendir */ - /* UpA downB */ - system(">/tmp/sema1.07; while test -f /tmp/sema1.07; do sleep 1;done"); - while ((dep = readdir(dirp)) != DIRENT0) { - if (strcmp(dep->d_name, "..") == 0) - dotdot += 1; - else if (strcmp(dep->d_name, ".") == 0) - dot += 1; - else - other += 1; - } - if (dotdot != 0) e(45); /* no entrys */ - if (dot != 0) e(46); /* shoul be */ - if (other != 0) e(47); /* left or */ - - /* No new files (entrys) are allowed on foo */ - if (creat("foo/nono", 0777) != -1) e(48); /* makeable */ - if (closedir(dirp) != 0) e(49); /* close foo */ - system("while test ! -f /tmp/sema2.07; do sleep 1; done"); /* downA */ - System("rm -f /tmp/sema2.07"); /* clean up */ - - /* Foo still exist, so we should be able to get a fstat */ - if (fstat(fd, &st) != 0) e(50); - if (st.st_nlink != (nlink_t) 0) e(51); /* 0 left */ - if (close(fd) != 0) e(52); /* last one */ - exit(0); - - default: - system("while test ! -f /tmp/sema1.07; do sleep 1; done"); /* downA */ - if (rmdir("foo") != 0) e(53); /* cleanerup */ - System("rm -f /tmp/sema1.07"); /* upB */ - if (chdir("foo") != -1) e(54); /* it should */ - if (errno != ENOENT) e(55); /* be gone */ - System("> /tmp/sema2.07"); /* upA */ - if (wait(&stat_loc) == -1) e(56); - if (stat_loc != 0) e(57); - } - - /* See if foo isn't accessible any more */ - if (chdir("foo") != -1) e(58); - if (errno != ENOENT) e(59); - - /* Let's see if we can get a EBUSSY..... */ - if (mkdir("foo", 0777) != 0) e(60); /* mkdir foo */ - System("rm -f /tmp/sema.07"); /* unness */ - switch (fork()) { - case -1: printf("Can't fork\n"); break; - case 0: - alarm(20); - if (chdir("foo") != 0) e(61); /* child goes */ - System("> /tmp/sema.07"); /* upA */ - system("while test -f /tmp/sema.07; do sleep 1; done"); /* downB */ - sleep(1); - exit(0); - default: - system("while test ! -f /tmp/sema.07; do sleep 1; done"); /* downA */ - rmdir_result = rmdir("foo"); /* try remove */ - if (rmdir_result == -1) { /* if it failed */ - if (errno != EBUSY) e(62); /* foo is busy */ - } else { - if (rmdir_result != 0) e(63); - if (rmdir("foo") != -1) e(64); /* not removable */ - if (errno != ENOENT) e(65); /* again. */ - if (chdir("foo") != -1) e(66); /* we can't go */ - if (errno != ENOENT) e(67); /* there any more */ - if (mkdir("foo", 0777) != 0) e(68); /* we can remake foo */ - } - System("rm -f /tmp/sema.07"); /* upB */ - if (wait(&stat_loc) == -1) e(69); - if (stat_loc != 0) e(70); - } - if (rmdir("foo") != 0) e(71); /* clean up */ -} - -void test28c() -{ /* Test error handeling. */ - subtest = 3; - - System("rm -rf ../DIR_28/*"); - System("rm -rf foo /tmp/foo");/* clean up junk */ - - /* Test common errors */ - if (mkdir("foo", 0777) != 0) e(1); /* mkdir shouldn't fail */ - if (mkdir("foo", 0777) != -1) e(2); /* should fail the 2d time */ - if (errno != EEXIST) e(3); /* because it exists already */ - if (rmdir("foo") != 0) e(4); /* rmdir shouldn't fail */ - if (rmdir("foo") != -1) e(5); /* but it should now because */ - if (errno != ENOENT) e(6); /* it's gone the 1st time */ - /* Test on access etc. */ - if (mkdir("foo", 0777) != 0) e(7); - if (mkdir("foo/bar", 0777) != 0) e(8); - if (!superuser) { - System("chmod 677 foo");/* make foo inaccesable */ - if (mkdir("foo/foo", 0777) != -1) e(9); - if (errno != EACCES) e(10); - if (rmdir("foo/bar") != -1) e(11); - if (errno != EACCES) e(12); - System("chmod 577 foo");/* make foo unwritable */ - if (mkdir("foo/foo", 0777) != -1) e(13); - if (errno != EACCES) e(14); - if (rmdir("foo/bar") != -1) e(15); - if (errno != EACCES) e(16); - System("chmod 777 foo");/* make foo full accessable */ - } - if (rmdir("foo/bar") != 0) e(17); /* bar should be removable */ - if (mkdir("foo/no/foo", 0777) != -1) e(18); /* Note: "no" doesn't exist */ - if (errno != ENOENT) e(19); - if (mkdir("", 0777) != -1) e(20); /* empty string isn't ok */ - if (errno != ENOENT) e(21); - if (rmdir("") != -1) e(22); /* empty string isn't ok */ - if (errno != ENOENT) e(23); - System("> foo/no"); /* make a file "no" */ - if (mkdir("foo/no/foo", 0777) != -1) e(24); - if (errno != ENOTDIR) e(25); /* note: "no" is not a a dir */ - if (rmdir("foo/no/foo") != -1) e(26); - if (errno != ENOTDIR) e(27); - System("rm -rf foo"); /* clean up */ -} - -void makelongnames() -{ - register int i; - int max_name_length; - - max_name_length = name_max("."); /* Aka NAME_MAX, but not every FS supports - * the same length, hence runtime check */ - MaxName = malloc(max_name_length + 1); - ToLongName = malloc(max_name_length + 1 + 1); /* Name of maximum +1 length */ - memset(MaxName, 'a', max_name_length); - MaxName[max_name_length] = '\0'; - - for (i = 0; i < PATH_MAX - 1; i++) { /* idem path */ - MaxPath[i++] = '.'; - MaxPath[i] = '/'; - } - MaxPath[PATH_MAX - 1] = '\0'; - - strcpy(ToLongName, MaxName); /* copy them Max to ToLong */ - strcpy(ToLongPath, MaxPath); - - ToLongName[max_name_length] = 'a'; - ToLongName[max_name_length+1] = '\0';/* extend ToLongName by one too many */ - ToLongPath[PATH_MAX - 1] = '/'; - ToLongPath[PATH_MAX] = '\0'; /* inc ToLongPath by one */ -} diff --git a/releasetools/Makefile b/releasetools/Makefile deleted file mode 100644 index 7b4bed4fb..000000000 --- a/releasetools/Makefile +++ /dev/null @@ -1,109 +0,0 @@ -# Makefile for the kernel image. -.include - -GEN_FILES= *.bak image kernel *.iso *.iso.gz cdfdimage rootimage src - -# LSC detect where were built the objects files -PROGROOT:= .. -.if "${MAKEOBJDIR:S,${.CURDIR},,}" != "" -PROGROOT:= ${MAKEOBJDIR:S,releasetools,,} -.endif - -# Specify the programs that are part of the system image. -KERNEL= ${PROGROOT}/minix/kernel/kernel - -# PROGRAMS are in the order they should be loaded by boot -PROGRAMS+= ${PROGROOT}/minix/servers/ds/ds -PROGRAMS+= ${PROGROOT}/minix/servers/rs/rs -PROGRAMS+= ${PROGROOT}/minix/servers/pm/pm -PROGRAMS+= ${PROGROOT}/minix/servers/sched/sched -PROGRAMS+= ${PROGROOT}/minix/servers/vfs/vfs -PROGRAMS+= ${PROGROOT}/minix/drivers/storage/memory/memory -PROGRAMS+= ${PROGROOT}/minix/drivers/tty/tty/tty -PROGRAMS+= ${PROGROOT}/minix/servers/mib/mib -PROGRAMS+= ${PROGROOT}/minix/servers/vm/vm -PROGRAMS+= ${PROGROOT}/minix/fs/pfs/pfs -PROGRAMS+= ${PROGROOT}/minix/fs/mfs/mfs -PROGRAMS+= ${PROGROOT}/sbin/init/init - -all usage help: - @echo " " >&2 - @echo "Master Makefile to create new MINIX configuration." >&2 - @echo "Root privileges are required." >&2 - @echo " " >&2 - @echo "Usage:" >&2 - @echo " make includes # Install include files" >&2 - @echo " make depend # Generate dependency files" >&2 - @echo " make services # Compile and install all services" >&2 - @echo " make hdboot # Make image, and install to hard disk" >&2 - @echo " make clean # Remove all compiler results" >&2 - @echo " " >&2 - @echo "To create a fresh MINIX configuration, try:" >&2 - @echo " make clean install # new boot image" >&2 - @echo " " >&2 - -.gitignore: Makefile - echo ${GEN_FILES} | tr ' ' '\n' >.gitignore - -includes: - ${MAKE} -C ${NETBSDSRCDIR} includes - -depend: includes .gitignore - ${MAKE} -C ${NETBSDSRCDIR} depend - -libraries: includes - ${MAKE} -C ${NETBSDSRCDIR} do-lib - -kernel: libraries - ${MAKE} -C ${NETBSDSRCDIR}/minix/kernel - -servers: libraries - ${MAKE} -C ${NETBSDSRCDIR}/minix/fs all install - ${MAKE} -C ${NETBSDSRCDIR}/minix/net all install - ${MAKE} -C ${NETBSDSRCDIR}/minix/servers all install - -sbin: libraries - ${MAKE} -C ${NETBSDSRCDIR}/sbin all install - ${MAKE} -C ${NETBSDSRCDIR}/minix/sbin all install - -drivers: libraries - ${MAKE} -C ${NETBSDSRCDIR}/minix/drivers all install - -services: kernel servers drivers sbin - -do-hdboot: - @rm -rf ${DESTDIR}/boot/minix/.temp/ - ${INSTALL_DIR} ${DESTDIR}/boot/minix/.temp -# mod_0 is used to make alphabetical order equal to the boot order - @n=0; \ - for i in ${PROGRAMS}; \ - do \ - n=`expr $$n + 1`; \ - [ "$$n" -ge 10 ] && prefix="mod" || prefix="mod0"; \ - newname="${DESTDIR}/boot/minix/.temp/$${prefix}$${n}_`basename $$i`"; \ - ${INSTALL_FILE} $$i $$newname; \ - echo ${INSTALL_FILE} $$i $$newname; \ - done - @${INSTALL_FILE} ${KERNEL} ${DESTDIR}/boot/minix/.temp/ - @if [ "${MKINSTALLBOOT:Uno}" != "no" ] ; then \ - ${STRIP} -s ${DESTDIR}/boot/minix/.temp/* ; \ - gzip ${DESTDIR}/boot/minix/.temp/mod* ; \ - ${HOST_SH} mkboot hdboot ${DESTDIR}; \ - ${HOST_SH} ../minix/commands/update_bootcfg/update_bootcfg.sh;\ - fi - -hdboot: services .WAIT do-hdboot - -clean: - ${MAKE} -C ${NETBSDSRCDIR}/lib $@ - ${MAKE} -C ${NETBSDSRCDIR}/minix/kernel $@ - ${MAKE} -C ${NETBSDSRCDIR}/minix/fs $@ - ${MAKE} -C ${NETBSDSRCDIR}/minix/net $@ - ${MAKE} -C ${NETBSDSRCDIR}/minix/servers $@ - ${MAKE} -C ${NETBSDSRCDIR}/minix/drivers $@ - ${MAKE} -C ${NETBSDSRCDIR}/sbin $@ - ${MAKE} -C ${NETBSDSRCDIR}/minix/sbin $@ - rm -rf ${GEN_FILES} - -# LSC: For STRIP and HOST_SH -.include diff --git a/releasetools/arm_sdimage.sh b/releasetools/arm_sdimage.sh deleted file mode 100755 index 94d107695..000000000 --- a/releasetools/arm_sdimage.sh +++ /dev/null @@ -1,209 +0,0 @@ -#!/usr/bin/env bash -set -e - -# -# This script creates a bootable image and should at some point in the future -# be replaced by the proper NetBSD infrastructure. -# - -# -# Source settings if present -# -: ${SETTINGS_MINIX=.settings} -if [ -f "${SETTINGS_MINIX}" ] -then - echo "Sourcing settings from ${SETTINGS_MINIX}" - # Display the content (so we can check in the build logs - # what the settings contain. - cat ${SETTINGS_MINIX} | sed "s,^,CONTENT ,g" - . ${SETTINGS_MINIX} -fi - -: ${ARCH=evbearm-el} -: ${OBJ=../obj.${ARCH}} -: ${TOOLCHAIN_TRIPLET=arm-elf32-minix-} -: ${BUILDSH=build.sh} - -: ${SETS="minix-base minix-comp minix-games minix-man minix-tests tests"} -: ${IMG=minix_arm_sd.img} - -# ARM definitions: -: ${BUILDVARS=-V MKGCCCMDS=yes -V MKLLVM=no} -# These BUILDVARS are for building with LLVM: -#: ${BUILDVARS=-V MKLIBCXX=no -V MKKYUA=no -V MKATF=no -V MKLLVMCMDS=no} -: ${FAT_SIZE=$(( 10*(2**20) / 512))} # This is in sectors - -# Beagleboard-xm -: ${U_BOOT_BIN_DIR=build/omap3_beagle/} -: ${CONSOLE=tty02} - -# BeagleBone (and black) -#: ${U_BOOT_BIN_DIR=build/am335x_evm/} -#: ${CONSOLE=tty00} - -# -# We host u-boot binaries. -# -: ${MLO=MLO} -: ${UBOOT=u-boot.img} -U_BOOT_GIT_VERSION=cb5178f12787c690cb1c888d88733137e5a47b15 - -if [ ! -f ${BUILDSH} ] -then - echo "Please invoke me from the root source dir, where ${BUILDSH} is." - exit 1 -fi - -if [ -n "$BASE_URL" ] -then - #we no longer download u-boot but do a checkout - #BASE_URL used to be the base url for u-boot - #Downloads - echo "Warning:** Setting BASE_URL (u-boot) is no longer possible use U_BOOT_BIN_DIR" - echo "Look in ${RELEASETOOLSDIR}/arm_sdimage.sh for suggested values" - exit 1 -fi - -export PATH=/bin:/sbin:/usr/bin:/usr/sbin:/usr/local/bin:/usr/local/sbin:${PATH} - -# we create a disk image of about 2 gig's -# for alignment reasons, prefer sizes which are multiples of 4096 bytes -: ${IMG_SIZE=$(( 2*(2**30) ))} -: ${ROOT_SIZE=$(( 64*(2**20) ))} -: ${HOME_SIZE=$(( 128*(2**20) ))} -: ${USR_SIZE=$(( 1792*(2**20) ))} - -# set up disk creation environment -. releasetools/image.defaults -. releasetools/image.functions - -# all sizes are written in 512 byte blocks -ROOTSIZEARG="-b $((${ROOT_SIZE} / 512 / 8))" -USRSIZEARG="-b $((${USR_SIZE} / 512 / 8))" -HOMESIZEARG="-b $((${HOME_SIZE} / 512 / 8))" - -# where the kernel & boot modules will be -MODDIR=${DESTDIR}/boot/minix/.temp - -echo "Building work directory..." -build_workdir "$SETS" - -echo "Adding extra files..." - -# create a fstab entry in /etc -cat >${ROOT_DIR}/etc/fstab < ${WORK_DIR}/uEnv.txt -${RELEASETOOLSDIR}/gen_uEnv.txt.sh -c ${CONSOLE} > ${ROOT_DIR}/uEnv.txt - -# Do some last processing of the kernel and servers and then put them on the FAT -# partition. -${CROSS_PREFIX}objcopy ${OBJ}/minix/kernel/kernel -O binary ${ROOT_DIR}/kernel.bin - -for f in servers/vm/vm servers/rs/rs servers/pm/pm servers/sched/sched \ - servers/vfs/vfs servers/ds/ds servers/mib/mib fs/pfs/pfs fs/mfs/mfs \ - ../sbin/init/init drivers/tty/tty/tty drivers/storage/memory/memory -do - fn=`basename $f`.elf - cp ${OBJ}/minix/${f} ${ROOT_DIR}/${fn} - ${CROSS_PREFIX}strip -s ${ROOT_DIR}/${fn} -done -cat >${WORK_DIR}/boot.mtree < ${WORK_DIR}/extra.base <${ROOT_DIR}/boot.cfg <${ROOT_DIR}/boot.cfg < ${OBJ}/efi.img - EFI_START=$((${HOME_START} + ${_HOME_SIZE})) - echo " * EFI" - ${CROSS_TOOLS}/nbmakefs -t msdos -s ${EFI_SIZE} -o "F=32,c=1" ${OBJ}/efi.img ${EFI_DIR} - dd if=${OBJ}/efi.img >> ${IMG} - ${CROSS_TOOLS}/nbpartition -m ${IMG} ${BOOTXX_SECS} 81:${_ROOT_SIZE}* 81:${_USR_SIZE} 81:${_HOME_SIZE} EF:1+ -else - ${CROSS_TOOLS}/nbpartition -m ${IMG} ${BOOTXX_SECS} 81:${_ROOT_SIZE}* 81:${_USR_SIZE} 81:${_HOME_SIZE} -fi - -${CROSS_TOOLS}/nbinstallboot -f -m ${ARCH} ${IMG} ${DESTDIR}/usr/mdec/bootxx_minixfs3 - -echo "" -echo "Disk image at `pwd`/${IMG}" -echo "" -echo "To boot this image on kvm using the bootloader:" -echo "qemu-system-i386 --enable-kvm -m 256 -hda `pwd`/${IMG}" -echo "" -echo "To boot this image on kvm:" -echo "cd ${MODDIR} && qemu-system-i386 --enable-kvm -m 256M -kernel kernel -append \"rootdevname=c0d0p0\" -initrd \"${mods}\" -hda `pwd`/${IMG}" -echo "To boot this image on kvm with EFI (tianocore OVMF):" -echo "qemu-system-i386 -L . -bios OVMF-i32.fd -m 256M -drive file=minix_x86.img,if=ide,format=raw" diff --git a/releasetools/x86_ramimage.sh b/releasetools/x86_ramimage.sh deleted file mode 100755 index 6e691900d..000000000 --- a/releasetools/x86_ramimage.sh +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env bash -set -e - -# -# This script creates a bootable image and should at some point in the future -# be replaced by the proper NetBSD infrastructure. -# - -: ${ARCH=i386} -: ${OBJ=../obj.${ARCH}} -: ${TOOLCHAIN_TRIPLET=i586-elf32-minix-} -: ${BUILDSH=build.sh} - -: ${SETS="minix-base"} - -if [ ! -f ${BUILDSH} ] -then - echo "Please invoke me from the root source dir, where ${BUILDSH} is." - exit 1 -fi - -#: ${RAMDISK_SIZE=$(( 200*(2**20) ))} - -# set up disk creation environment -. releasetools/image.defaults -. releasetools/image.functions - -# where the kernel & boot modules will be -MODDIR=${DESTDIR}/boot/minix/.temp - -echo "Building work directory..." -build_workdir "$SETS" - -echo "Adding extra files..." -workdir_add_ramdisk_files - -# set correct message of the day (log in and install tip) -cp releasetools/release/ramdisk/etc/issue ${ROOT_DIR}/etc/issue -add_file_spec "etc/issue" extra.cdfiles - -echo "Bundling packages..." -bundle_packages "$BUNDLE_PACKAGES" - -echo "Creating specification files..." -create_input_spec -create_protos - -echo "Writing ramdisk image..." -# add the other modules for boot -cp ${MODDIR}/* ${WORK_DIR} -create_ramdisk_image ${RAMDISK_SIZE} - -echo "" -echo "RAM image modules at ${WORK_DIR}" -echo "" -echo "To boot this image on kvm:" -echo "cd ${WORK_DIR} && qemu-system-i386 --enable-kvm -m 1G -kernel kernel -append \"bootramdisk=1\" -initrd \"${mods}\"" diff --git a/releasetools/x86_usbimage.sh b/releasetools/x86_usbimage.sh deleted file mode 100755 index 63b07a32f..000000000 --- a/releasetools/x86_usbimage.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env bash -set -e - -# -# This script creates a bootable image and should at some point in the future -# be replaced by the proper NetBSD infrastructure. -# - -: ${ARCH=i386} -: ${OBJ=../obj.${ARCH}} -: ${TOOLCHAIN_TRIPLET=i586-elf32-minix-} -: ${BUILDSH=build.sh} - -: ${SETS="minix-base"} -: ${IMG=minix_x86_usb.img} - -if [ ! -f ${BUILDSH} ] -then - echo "Please invoke me from the root source dir, where ${BUILDSH} is." - exit 1 -fi - -#: ${RAMDISK_SIZE=$(( 200*(2**20) ))} -: ${BOOTXX_SECS=32} - -# set up disk creation environment -. releasetools/image.defaults -. releasetools/image.functions - -# where the kernel & boot modules will be -MODDIR=${DESTDIR}/boot/minix/.temp - -echo "Building work directory..." -build_workdir "$SETS" - -echo "Adding extra files..." -workdir_add_ramdisk_files - -# set correct message of the day (log in and install tip) -cp releasetools/release/ramdisk/etc/issue ${ROOT_DIR}/etc/issue -add_file_spec "etc/issue" extra.cdfiles - -echo "Bundling packages..." -bundle_packages "$BUNDLE_PACKAGES" - -echo "Creating specification files..." -create_input_spec -create_protos - -echo "Writing ramdisk image..." -# add the other modules for boot -cp ${MODDIR}/* ${WORK_DIR} -create_ramdisk_image ${RAMDISK_SIZE} - -echo "Writing USB image..." -# clear ROOT_DIR -rm -rf ${ROOT_DIR}/* -echo ". type=dir uid=0 gid=0 mode=0755" > ${WORK_DIR}/extra.boot - -# move all modules back to ROOT_DIR -mv ${WORK_DIR}/kernel ${WORK_DIR}/mod* ${ROOT_DIR}/ -add_file_spec "kernel" extra.boot -for i in ${ROOT_DIR}/mod*; do - add_file_spec $(basename $i) extra.boot -done - -# add boot.cfg -cat >${ROOT_DIR}/boot.cfg <