socket.c

Go to the documentation of this file.
00001 /*
00002  * Copyright (C) 2004-2015  Internet Systems Consortium, Inc. ("ISC")
00003  * Copyright (C) 1998-2003  Internet Software Consortium.
00004  *
00005  * Permission to use, copy, modify, and/or distribute this software for any
00006  * purpose with or without fee is hereby granted, provided that the above
00007  * copyright notice and this permission notice appear in all copies.
00008  *
00009  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
00010  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
00011  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
00012  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
00013  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
00014  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
00015  * PERFORMANCE OF THIS SOFTWARE.
00016  */
00017 
00018 /* $Id$ */
00019 
00020 /*! \file */
00021 
00022 #include <config.h>
00023 
00024 #include <sys/param.h>
00025 #include <sys/types.h>
00026 #include <sys/socket.h>
00027 #include <sys/stat.h>
00028 #include <sys/time.h>
00029 #include <sys/uio.h>
00030 
00031 #if defined(HAVE_LINUX_NETLINK_H) && defined(HAVE_LINUX_RTNETLINK_H)
00032 #include <linux/netlink.h>
00033 #include <linux/rtnetlink.h>
00034 #endif
00035 
00036 #include <errno.h>
00037 #include <fcntl.h>
00038 #include <stddef.h>
00039 #include <stdlib.h>
00040 #include <string.h>
00041 #include <unistd.h>
00042 #ifdef HAVE_INTTYPES_H
00043 #include <inttypes.h> /* uintptr_t */
00044 #endif
00045 
00046 #include <isc/buffer.h>
00047 #include <isc/bufferlist.h>
00048 #include <isc/condition.h>
00049 #include <isc/formatcheck.h>
00050 #include <isc/json.h>
00051 #include <isc/list.h>
00052 #include <isc/log.h>
00053 #include <isc/mem.h>
00054 #include <isc/msgs.h>
00055 #include <isc/mutex.h>
00056 #include <isc/net.h>
00057 #include <isc/once.h>
00058 #include <isc/platform.h>
00059 #include <isc/print.h>
00060 #include <isc/region.h>
00061 #include <isc/resource.h>
00062 #include <isc/socket.h>
00063 #include <isc/stats.h>
00064 #include <isc/strerror.h>
00065 #include <isc/task.h>
00066 #include <isc/thread.h>
00067 #include <isc/util.h>
00068 #include <isc/xml.h>
00069 
00070 #ifdef ISC_PLATFORM_HAVESYSUNH
00071 #include <sys/un.h>
00072 #endif
00073 #ifdef ISC_PLATFORM_HAVEKQUEUE
00074 #include <sys/event.h>
00075 #endif
00076 #ifdef ISC_PLATFORM_HAVEEPOLL
00077 #include <sys/epoll.h>
00078 #endif
00079 #ifdef ISC_PLATFORM_HAVEDEVPOLL
00080 #if defined(HAVE_SYS_DEVPOLL_H)
00081 #include <sys/devpoll.h>
00082 #elif defined(HAVE_DEVPOLL_H)
00083 #include <devpoll.h>
00084 #endif
00085 #endif
00086 
00087 #include "errno2result.h"
00088 
00089 /* See task.c about the following definition: */
00090 #ifdef ISC_PLATFORM_USETHREADS
00091 #define USE_WATCHER_THREAD
00092 #else
00093 #define USE_SHARED_MANAGER
00094 #endif  /* ISC_PLATFORM_USETHREADS */
00095 
00096 #ifndef USE_WATCHER_THREAD
00097 #include "socket_p.h"
00098 #include "../task_p.h"
00099 #endif /* USE_WATCHER_THREAD */
00100 
00101 #if defined(SO_BSDCOMPAT) && defined(__linux__)
00102 #include <sys/utsname.h>
00103 #endif
00104 
00105 /*%
00106  * Choose the most preferable multiplex method.
00107  */
00108 #ifdef ISC_PLATFORM_HAVEKQUEUE
00109 #define USE_KQUEUE
00110 #elif defined (ISC_PLATFORM_HAVEEPOLL)
00111 #define USE_EPOLL
00112 #elif defined (ISC_PLATFORM_HAVEDEVPOLL)
00113 #define USE_DEVPOLL
00114 typedef struct {                  /* read/write flags; only defined in /dev/poll builds */
00115         unsigned int want_read : 1,     /* presumably: fd is watched for reads -- confirm */
00116                 want_write : 1;         /* presumably: fd is watched for writes -- confirm */
00117 } pollinfo_t;
00118 #else
00119 #define USE_SELECT
00120 #endif  /* ISC_PLATFORM_HAVEKQUEUE */
00121 
00122 #ifndef USE_WATCHER_THREAD
00123 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
00124 struct isc_socketwait {           /* kqueue/epoll/devpoll variant; only without a watcher thread */
00125         int nevents;
00126 };
00127 #elif defined (USE_SELECT)
00128 struct isc_socketwait {           /* select() variant; only without a watcher thread */
00129         fd_set *readset;
00130         fd_set *writeset;
00131         int nfds;
00132         int maxfd;
00133 };
00134 #endif  /* USE_KQUEUE */
00135 #endif /* !USE_WATCHER_THREAD */
00136 
00137 /*
00138  * Set by the -T dscp option on the command line. If set to a value
00139  * other than -1, we check to make sure DSCP values match it, and
00140  * assert if not.
00141  */
00142 int isc_dscp_check_value = -1;
00143 
00144 /*%
00145  * Maximum number of allowable open sockets.  This is also the maximum
00146  * allowable socket file descriptor.
00147  *
00148  * Care should be taken before modifying this value for select():
00149  * The API standard doesn't ensure select() accepts more than (the system default
00150  * of) FD_SETSIZE descriptors, and the default size should in fact be fine in
00151  * the vast majority of cases.  This constant should therefore be increased only
00152  * when absolutely necessary and possible, i.e., the server is exhausting all
00153  * available file descriptors (up to FD_SETSIZE) and the select() function
00154  * and FD_xxx macros support larger values than FD_SETSIZE (which may not
00155  * always be true, but we keep using some of them to ensure as much
00156  * portability as possible).  Note also that overall server performance
00157  * may be rather worsened with a larger value of this constant due to
00158  * inherent scalability problems of select().
00159  *
00160  * As a special note, this value shouldn't have to be touched if
00161  * this is a build for an authoritative only DNS server.
00162  */
00163 #ifndef ISC_SOCKET_MAXSOCKETS
00164 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
00165 #ifdef TUNE_LARGE
00166 #define ISC_SOCKET_MAXSOCKETS 21000
00167 #else
00168 #define ISC_SOCKET_MAXSOCKETS 4096
00169 #endif /* TUNE_LARGE */
00170 #elif defined(USE_SELECT)
00171 #define ISC_SOCKET_MAXSOCKETS FD_SETSIZE
00172 #endif  /* USE_KQUEUE... */
00173 #endif  /* ISC_SOCKET_MAXSOCKETS */
00174 
00175 #ifdef USE_SELECT
00176 /*%
00177  * Mac OS X needs a special definition to support larger values in select().
00178  * We always define this because a larger value can be specified run-time.
00179  */
00180 #ifdef __APPLE__
00181 #define _DARWIN_UNLIMITED_SELECT
00182 #endif  /* __APPLE__ */
00183 #endif  /* USE_SELECT */
00184 
00185 #ifdef ISC_SOCKET_USE_POLLWATCH
00186 /*%
00187  * If this macro is defined, enable workaround for a Solaris /dev/poll kernel
00188  * bug: DP_POLL ioctl could keep sleeping even if socket I/O is possible for
00189  * some of the specified FD.  The idea is based on the observation that it's
00190  * likely for a busy server to keep receiving packets.  It specifically works
00191  * as follows: the socket watcher is first initialized with the state of
00192  * "poll_idle".  While it's in the idle state it keeps sleeping until a socket
00193  * event occurs.  When it wakes up for a socket I/O event, it moves to the
00194  * poll_active state, and sets the poll timeout to a short period
00195  * (ISC_SOCKET_POLLWATCH_TIMEOUT msec).  If timeout occurs in this state, the
00196  * watcher goes to the poll_checking state with the same timeout period.
00197  * In this state, the watcher tries to detect whether this is a break
00198  * during intermittent events or the kernel bug is triggered.  If the next
00199  * polling reports an event within the short period, the previous timeout is
00200  * likely to be a kernel bug, and so the watcher goes back to the active state.
00201  * Otherwise, it moves to the idle state again.
00202  *
00203  * It's not clear whether this is a thread-related bug, but since we've only
00204  * seen this with threads, this workaround is used only when enabling threads.
00205  */
00206 
00207 typedef enum { poll_idle, poll_active, poll_checking } pollstate_t; /* watcher states of the workaround described above */
00208 
00209 #ifndef ISC_SOCKET_POLLWATCH_TIMEOUT
00210 #define ISC_SOCKET_POLLWATCH_TIMEOUT 10
00211 #endif  /* ISC_SOCKET_POLLWATCH_TIMEOUT */
00212 #endif  /* ISC_SOCKET_USE_POLLWATCH */
00213 
00214 /*%
00215  * Size of per-FD lock buckets.
00216  */
00217 #ifdef ISC_PLATFORM_USETHREADS
00218 #define FDLOCK_COUNT            1024
00219 #define FDLOCK_ID(fd)           ((fd) % FDLOCK_COUNT)
00220 #else
00221 #define FDLOCK_COUNT            1
00222 #define FDLOCK_ID(fd)           0
00223 #endif  /* ISC_PLATFORM_USETHREADS */
00224 
00225 /*%
00226  * Maximum number of events communicated with the kernel.  There should normally
00227  * be no need for having a large number.
00228  */
00229 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
00230 #ifndef ISC_SOCKET_MAXEVENTS
00231 #ifdef TUNE_LARGE
00232 #define ISC_SOCKET_MAXEVENTS    2048
00233 #else
00234 #define ISC_SOCKET_MAXEVENTS    64
00235 #endif /* TUNE_LARGE */
00236 #endif
00237 #endif
00238 
00239 /*%
00240  * Some systems define the socket length argument as an int, some as size_t,
00241  * some as socklen_t.  This is here so it can be easily changed if needed.
00242  */
00243 #ifndef ISC_SOCKADDR_LEN_T
00244 #define ISC_SOCKADDR_LEN_T unsigned int
00245 #endif
00246 
00247 /*%
00248  * Define what the possible "soft" errors can be.  These are non-fatal returns
00249  * of various network related functions, like recv() and so on.
00250  *
00251  * For some reason, BSDI (and perhaps others) will sometimes return <0
00252  * from recv() but will have errno==0.  This is broken, but we have to
00253  * work around it here.
00254  */
00255 #define SOFT_ERROR(e)   ((e) == EAGAIN || \
00256                          (e) == EWOULDBLOCK || \
00257                          (e) == EINTR || \
00258                          (e) == 0)
00259 
00260 #define DLVL(x) ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_SOCKET, ISC_LOG_DEBUG(x)
00261 
00262 /*!<
00263  * DLVL(90)  --  Function entry/exit and other tracing.
00264  * DLVL(70)  --  Socket "correctness" -- including returning of events, etc.
00265  * DLVL(60)  --  Socket data send/receive
00266  * DLVL(50)  --  Event tracing, including receiving/sending completion events.
00267  * DLVL(20)  --  Socket creation/destruction.
00268  */
00269 #define TRACE_LEVEL             90
00270 #define CORRECTNESS_LEVEL       70
00271 #define IOEVENT_LEVEL           60
00272 #define EVENT_LEVEL             50
00273 #define CREATION_LEVEL          20
00274 
00275 #define TRACE           DLVL(TRACE_LEVEL)
00276 #define CORRECTNESS     DLVL(CORRECTNESS_LEVEL)
00277 #define IOEVENT         DLVL(IOEVENT_LEVEL)
00278 #define EVENT           DLVL(EVENT_LEVEL)
00279 #define CREATION        DLVL(CREATION_LEVEL)
00280 
00281 typedef isc_event_t intev_t;
00282 
00283 #define SOCKET_MAGIC            ISC_MAGIC('I', 'O', 'i', 'o')
00284 #define VALID_SOCKET(s)         ISC_MAGIC_VALID(s, SOCKET_MAGIC)
00285 
00286 /*!
00287  * IPv6 control information.  If the socket is an IPv6 socket we want
00288  * to collect the destination address and interface so the client can
00289  * set them on outgoing packets.
00290  */
00291 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
00292 #ifndef USE_CMSG
00293 #define USE_CMSG        1
00294 #endif
00295 #endif
00296 
00297 /*%
00298  * NetBSD and FreeBSD can timestamp packets.  XXXMLG Should we have
00299  * a setsockopt() like interface to request timestamps, and if the OS
00300  * doesn't do it for us, call gettimeofday() on every UDP receive?
00301  */
00302 #ifdef SO_TIMESTAMP
00303 #ifndef USE_CMSG
00304 #define USE_CMSG        1
00305 #endif
00306 #endif
00307 
00308 /*%
00309  * The size to raise the receive buffer to (from BIND 8).
00310  */
00311 #ifdef TUNE_LARGE
00312 #ifdef sun
00313 #define RCVBUFSIZE (1*1024*1024)
00314 #else
00315 #define RCVBUFSIZE (16*1024*1024)
00316 #endif
00317 #else
00318 #define RCVBUFSIZE (32*1024)
00319 #endif /* TUNE_LARGE */
00320 
00321 /*%
00322  * The number of times a send operation is repeated if the result is EINTR.
00323  */
00324 #define NRETRIES 10
00325 
00326 typedef struct isc__socket isc__socket_t;
00327 typedef struct isc__socketmgr isc__socketmgr_t;
00328 
00329 #define NEWCONNSOCK(ev) ((isc__socket_t *)(ev)->newsocket)
00330 
00331 struct isc__socket {              /* implementation-private state of one socket */
00332         /* Not locked. */
00333         isc_socket_t            common;
00334         isc__socketmgr_t        *manager;
00335         isc_mutex_t             lock;       /* the "socket lock" named below */
00336         isc_sockettype_t        type;
00337         const isc_statscounter_t        *statsindex; /* presumably one of the *statsindex[] tables -- confirm */
00338 
00339         /* Locked by socket lock. */
00340         ISC_LINK(isc__socket_t) link;
00341         unsigned int            references; /* SOCK_DEAD() tests this for 0 */
00342         int                     fd;
00343         int                     pf;
00344         char                            name[16];
00345         void *                          tag;
00346 
00347         ISC_LIST(isc_socketevent_t)             send_list;
00348         ISC_LIST(isc_socketevent_t)             recv_list;
00349         ISC_LIST(isc_socket_newconnev_t)        accept_list;
00350         ISC_LIST(isc_socket_connev_t)           connect_list;
00351 
00352         /*
00353          * Internal events.  Posted when a descriptor is readable or
00354          * writable.  These are statically allocated and never freed.
00355          * They will be set to non-purgeable before use.
00356          */
00357         intev_t                 readable_ev;
00358         intev_t                 writable_ev;
00359 
00360         isc_sockaddr_t          peer_address;       /* remote address */
00361 
00362         unsigned int            pending_recv : 1,
00363                                 pending_send : 1,
00364                                 pending_accept : 1,
00365                                 listener : 1,       /* listener socket */
00366                                 connected : 1,
00367                                 connecting : 1,     /* connect pending */
00368                                 bound : 1,          /* bound to local addr */
00369                                 dupped : 1,
00370                                 active : 1,         /* currently active */
00371                                 pktdscp : 1;        /* per packet dscp */
00372 
00373 #ifdef ISC_NET_RECVOVERFLOW
00374         unsigned char           overflow; /* used for MSG_TRUNC fake */
00375 #endif
00376 
00377         char                    *recvcmsgbuf;
00378         ISC_SOCKADDR_LEN_T      recvcmsgbuflen;
00379         char                    *sendcmsgbuf;
00380         ISC_SOCKADDR_LEN_T      sendcmsgbuflen;
00381 
00382         void                    *fdwatcharg;   /* fdwatch* fields: presumably the arguments given */
00383         isc_sockfdwatch_t       fdwatchcb;     /* to isc__socket_fdwatchcreate() -- confirm */
00384         int                     fdwatchflags;
00385         isc_task_t              *fdwatchtask;
00386         unsigned int            dscp;
00387 };
00388 
00389 #define SOCKET_MANAGER_MAGIC    ISC_MAGIC('I', 'O', 'm', 'g')
00390 #define VALID_MANAGER(m)        ISC_MAGIC_VALID(m, SOCKET_MANAGER_MAGIC)
00391 
00392 struct isc__socketmgr {           /* implementation-private socket manager state */
00393         /* Not locked. */
00394         isc_socketmgr_t         common;
00395         isc_mem_t              *mctx;
00396         isc_mutex_t             lock;       /* the "manager lock" named below */
00397         isc_mutex_t             *fdlock;    /* presumably FDLOCK_COUNT locks chosen by FDLOCK_ID(fd) -- confirm */
00398         isc_stats_t             *stats;
00399 #ifdef USE_KQUEUE
00400         int                     kqueue_fd;
00401         int                     nevents;
00402         struct kevent           *events;
00403 #endif  /* USE_KQUEUE */
00404 #ifdef USE_EPOLL
00405         int                     epoll_fd;
00406         int                     nevents;
00407         struct epoll_event      *events;
00408 #endif  /* USE_EPOLL */
00409 #ifdef USE_DEVPOLL
00410         int                     devpoll_fd;
00411         isc_resourcevalue_t     open_max;
00412         unsigned int            calls;
00413         int                     nevents;
00414         struct pollfd           *events;
00415 #endif  /* USE_DEVPOLL */
00416 #ifdef USE_SELECT
00417         int                     fd_bufsize;
00418 #endif  /* USE_SELECT */
00419         unsigned int            maxsocks;
00420 #ifdef ISC_PLATFORM_USETHREADS
00421         int                     pipe_fds[2]; /* present in threaded builds only */
00422 #endif
00423 
00424         /* Locked by fdlock. */
00425         isc__socket_t          **fds;
00426         int                     *fdstate;   /* presumably CLOSED, MANAGED or CLOSE_PENDING per fd -- confirm */
00427 #ifdef USE_DEVPOLL
00428         pollinfo_t              *fdpollinfo;
00429 #endif
00430 
00431         /* Locked by manager lock. */
00432         ISC_LIST(isc__socket_t) socklist;
00433 #ifdef USE_SELECT
00434         fd_set                  *read_fds;
00435         fd_set                  *read_fds_copy;
00436         fd_set                  *write_fds;
00437         fd_set                  *write_fds_copy;
00438         int                     maxfd;
00439 #endif  /* USE_SELECT */
00440         int                     reserved;       /* unlocked */
00441 #ifdef USE_WATCHER_THREAD
00442         isc_thread_t            watcher;
00443         isc_condition_t         shutdown_ok;
00444 #else /* USE_WATCHER_THREAD */
00445         unsigned int            refs;       /* exists only when there is no watcher thread */
00446 #endif /* USE_WATCHER_THREAD */
00447         int                     maxudp;
00448 };
00449 
00450 #ifdef USE_SHARED_MANAGER
00451 static isc__socketmgr_t *socketmgr = NULL; /* non-threaded builds only; starts out NULL */
00452 #endif /* USE_SHARED_MANAGER */
00453 
00454 #define CLOSED                  0       /* this one must be zero */
00455 #define MANAGED                 1
00456 #define CLOSE_PENDING           2
00457 
00458 /*
00459  * send() and recv() iovec counts
00460  */
00461 #define MAXSCATTERGATHER_SEND   (ISC_SOCKET_MAXSCATTERGATHER)
00462 #ifdef ISC_NET_RECVOVERFLOW
00463 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER + 1)
00464 #else
00465 # define MAXSCATTERGATHER_RECV  (ISC_SOCKET_MAXSCATTERGATHER)
00466 #endif
00467 
00468 static isc_result_t socket_create(isc_socketmgr_t *manager0, int pf,
00469                                   isc_sockettype_t type,
00470                                   isc_socket_t **socketp,
00471                                   isc_socket_t *dup_socket);
00472 static void send_recvdone_event(isc__socket_t *, isc_socketevent_t **);
00473 static void send_senddone_event(isc__socket_t *, isc_socketevent_t **);
00474 static void send_connectdone_event(isc__socket_t *, isc_socket_connev_t **);
00475 static void free_socket(isc__socket_t **);
00476 static isc_result_t allocate_socket(isc__socketmgr_t *, isc_sockettype_t,
00477                                     isc__socket_t **);
00478 static void destroy(isc__socket_t **);
00479 static void internal_accept(isc_task_t *, isc_event_t *);
00480 static void internal_connect(isc_task_t *, isc_event_t *);
00481 static void internal_recv(isc_task_t *, isc_event_t *);
00482 static void internal_send(isc_task_t *, isc_event_t *);
00483 static void internal_fdwatch_write(isc_task_t *, isc_event_t *);
00484 static void internal_fdwatch_read(isc_task_t *, isc_event_t *);
00485 static void process_cmsg(isc__socket_t *, struct msghdr *, isc_socketevent_t *);
00486 static void build_msghdr_send(isc__socket_t *, isc_socketevent_t *,
00487                               struct msghdr *, struct iovec *, size_t *);
00488 static void build_msghdr_recv(isc__socket_t *, isc_socketevent_t *,
00489                               struct msghdr *, struct iovec *, size_t *);
00490 #ifdef USE_WATCHER_THREAD
00491 static isc_boolean_t process_ctlfd(isc__socketmgr_t *manager);
00492 #endif
00493 static void setdscp(isc__socket_t *sock, isc_dscp_t dscp);
00494 
00495 /*%
00496  * The following are intended for internal use (indicated by "isc__"
00497  * prefix) but are not declared as static, allowing direct access from
00498  * unit tests etc.
00499  */
00500 
00501 isc_result_t
00502 isc__socket_open(isc_socket_t *sock0);
00503 isc_result_t
00504 isc__socket_close(isc_socket_t *sock0);
00505 isc_result_t
00506 isc__socket_create(isc_socketmgr_t *manager, int pf, isc_sockettype_t type,
00507                    isc_socket_t **socketp);
00508 void
00509 isc__socket_attach(isc_socket_t *sock, isc_socket_t **socketp);
00510 void
00511 isc__socket_detach(isc_socket_t **socketp);
00512 isc_result_t
00513 isc__socket_recvv(isc_socket_t *sock, isc_bufferlist_t *buflist,
00514                  unsigned int minimum, isc_task_t *task,
00515                   isc_taskaction_t action, void *arg);
00516 isc_result_t
00517 isc__socket_recv(isc_socket_t *sock, isc_region_t *region,
00518                  unsigned int minimum, isc_task_t *task,
00519                  isc_taskaction_t action, void *arg);
00520 isc_result_t
00521 isc__socket_recv2(isc_socket_t *sock, isc_region_t *region,
00522                   unsigned int minimum, isc_task_t *task,
00523                   isc_socketevent_t *event, unsigned int flags);
00524 isc_result_t
00525 isc__socket_send(isc_socket_t *sock, isc_region_t *region,
00526                  isc_task_t *task, isc_taskaction_t action, void *arg);
00527 isc_result_t
00528 isc__socket_sendto(isc_socket_t *sock, isc_region_t *region,
00529                    isc_task_t *task, isc_taskaction_t action, void *arg,
00530                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
00531 isc_result_t
00532 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
00533                   isc_task_t *task, isc_taskaction_t action, void *arg);
00534 isc_result_t
00535 isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
00536                     isc_task_t *task, isc_taskaction_t action, void *arg,
00537                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo);
00538 isc_result_t
00539 isc__socket_sendtov2(isc_socket_t *sock, isc_bufferlist_t *buflist,
00540                      isc_task_t *task, isc_taskaction_t action, void *arg,
00541                      isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
00542                      unsigned int flags);
00543 isc_result_t
00544 isc__socket_sendto2(isc_socket_t *sock, isc_region_t *region,
00545                     isc_task_t *task,
00546                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
00547                     isc_socketevent_t *event, unsigned int flags);
00548 isc_socketevent_t *
00549 isc_socket_socketevent(isc_mem_t *mctx, void *sender,
00550                        isc_eventtype_t eventtype, isc_taskaction_t action,
00551                        void *arg);
00552 
00553 void
00554 isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active);
00555 isc_result_t
00556 isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
00557                      isc_uint32_t owner, isc_uint32_t group);
00558 isc_result_t
00559 isc__socket_bind(isc_socket_t *sock, isc_sockaddr_t *sockaddr,
00560                  unsigned int options);
00561 isc_result_t
00562 isc__socket_filter(isc_socket_t *sock, const char *filter);
00563 isc_result_t
00564 isc__socket_listen(isc_socket_t *sock, unsigned int backlog);
00565 isc_result_t
00566 isc__socket_accept(isc_socket_t *sock,
00567                    isc_task_t *task, isc_taskaction_t action, void *arg);
00568 isc_result_t
00569 isc__socket_connect(isc_socket_t *sock, isc_sockaddr_t *addr,
00570                     isc_task_t *task, isc_taskaction_t action,
00571                     void *arg);
00572 isc_result_t
00573 isc__socket_getpeername(isc_socket_t *sock, isc_sockaddr_t *addressp);
00574 isc_result_t
00575 isc__socket_getsockname(isc_socket_t *sock, isc_sockaddr_t *addressp);
00576 void
00577 isc__socket_cancel(isc_socket_t *sock, isc_task_t *task, unsigned int how);
00578 isc_sockettype_t
00579 isc__socket_gettype(isc_socket_t *sock);
00580 isc_boolean_t
00581 isc__socket_isbound(isc_socket_t *sock);
00582 void
00583 isc__socket_ipv6only(isc_socket_t *sock, isc_boolean_t yes);
00584 void
00585 isc__socket_dscp(isc_socket_t *sock, isc_dscp_t dscp);
00586 isc_result_t
00587 isc__socket_fdwatchcreate(isc_socketmgr_t *manager, int fd, int flags,
00588                           isc_sockfdwatch_t callback, void *cbarg,
00589                           isc_task_t *task, isc_socket_t **socketp);
00590 isc_result_t
00591 isc__socket_fdwatchpoke(isc_socket_t *sock, int flags);
00592 isc_result_t
00593 isc__socket_dup(isc_socket_t *sock, isc_socket_t **socketp);
00594 int
00595 isc__socket_getfd(isc_socket_t *sock);
00596 
00597 isc_result_t
00598 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp);
00599 isc_result_t
00600 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
00601                        unsigned int maxsocks);
00602 isc_result_t
00603 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp);
00604 void
00605 isc_socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats);
00606 void
00607 isc__socketmgr_destroy(isc_socketmgr_t **managerp);
00608 void
00609 isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag);
00610 const char *
00611 isc__socket_getname(isc_socket_t *socket0);
00612 void *
00613 isc__socket_gettag(isc_socket_t *socket0);
00614 
00615 #ifdef HAVE_LIBXML2
00616 void
00617 isc__socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer);
00618 #endif
00619 #ifdef HAVE_JSON
00620 isc_result_t
00621 isc__socketmgr_renderjson(isc_socketmgr_t *mgr0, json_object *stats);
00622 #endif
00623 
00624 static struct {                   /* method table plus references that keep functions "used" */
00625         isc_socketmethods_t methods;
00626 
00627         /*%
00628          * The following are defined just for avoiding unused static functions.
00629          */
00630         void *recvv, *send, *sendv, *sendto2, *cleanunix, *permunix, *filter,
00631              *listen, *accept, *getpeername, *isbound;
00632 } socketmethods = {
00633         {                               /* positional: order must match isc_socketmethods_t */
00634                 isc__socket_attach,
00635                 isc__socket_detach,
00636                 isc__socket_bind,
00637                 isc__socket_sendto,
00638                 isc__socket_sendto2,
00639                 isc__socket_connect,
00640                 isc__socket_recv,
00641                 isc__socket_recv2,
00642                 isc__socket_cancel,
00643                 isc__socket_getsockname,
00644                 isc__socket_gettype,
00645                 isc__socket_ipv6only,
00646                 isc__socket_fdwatchpoke,
00647                 isc__socket_dup,
00648                 isc__socket_getfd,
00649                 isc__socket_dscp
00650         },
00651         (void *)isc__socket_recvv, (void *)isc__socket_send,
00652         (void *)isc__socket_sendv, (void *)isc__socket_sendto2, /* sendto2 is also in methods above */
00653         (void *)isc__socket_cleanunix, (void *)isc__socket_permunix,
00654         (void *)isc__socket_filter, (void *)isc__socket_listen,
00655         (void *)isc__socket_accept, (void *)isc__socket_getpeername,
00656         (void *)isc__socket_isbound
00657 };
00658 
00659 static isc_socketmgrmethods_t socketmgrmethods = { /* positional: order must match isc_socketmgrmethods_t */
00660         isc__socketmgr_destroy,
00661         isc__socket_create,
00662         isc__socket_fdwatchcreate
00663 };
00664 
00665 #define SELECT_POKE_SHUTDOWN            (-1)
00666 #define SELECT_POKE_NOTHING             (-2)
00667 #define SELECT_POKE_READ                (-3)
00668 #define SELECT_POKE_ACCEPT              (-3) /*%< Same as _READ */
00669 #define SELECT_POKE_WRITE               (-4)
00670 #define SELECT_POKE_CONNECT             (-4) /*%< Same as _WRITE */
00671 #define SELECT_POKE_CLOSE               (-5)
00672 
00673 #define SOCK_DEAD(s)                    ((s)->references == 0)
00674 
00675 /*%
00676  * Shortcut index arrays to get access to statistics counters.
00677  */
00678 enum {                            /* slot numbers; each *statsindex[] table below lists its entries in this order */
00679         STATID_OPEN = 0,
00680         STATID_OPENFAIL = 1,
00681         STATID_CLOSE = 2,
00682         STATID_BINDFAIL = 3,
00683         STATID_CONNECTFAIL = 4,
00684         STATID_CONNECT = 5,
00685         STATID_ACCEPTFAIL = 6,
00686         STATID_ACCEPT = 7,
00687         STATID_SENDFAIL = 8,
00688         STATID_RECVFAIL = 9,
00689         STATID_ACTIVE = 10
00690 };
00691 static const isc_statscounter_t udp4statsindex[] = { /* udp4 counters in STATID_* order */
00692         isc_sockstatscounter_udp4open,
00693         isc_sockstatscounter_udp4openfail,
00694         isc_sockstatscounter_udp4close,
00695         isc_sockstatscounter_udp4bindfail,
00696         isc_sockstatscounter_udp4connectfail,
00697         isc_sockstatscounter_udp4connect,
00698         -1,     /* STATID_ACCEPTFAIL slot; -1 presumably means "no counter" -- confirm */
00699         -1,     /* STATID_ACCEPT slot */
00700         isc_sockstatscounter_udp4sendfail,
00701         isc_sockstatscounter_udp4recvfail,
00702         isc_sockstatscounter_udp4active
00703 };
00704 static const isc_statscounter_t udp6statsindex[] = { /* udp6 counters in STATID_* order */
00705         isc_sockstatscounter_udp6open,
00706         isc_sockstatscounter_udp6openfail,
00707         isc_sockstatscounter_udp6close,
00708         isc_sockstatscounter_udp6bindfail,
00709         isc_sockstatscounter_udp6connectfail,
00710         isc_sockstatscounter_udp6connect,
00711         -1,     /* STATID_ACCEPTFAIL slot; -1 presumably means "no counter" -- confirm */
00712         -1,     /* STATID_ACCEPT slot */
00713         isc_sockstatscounter_udp6sendfail,
00714         isc_sockstatscounter_udp6recvfail,
00715         isc_sockstatscounter_udp6active
00716 };
00717 static const isc_statscounter_t tcp4statsindex[] = { /* tcp4 counters in STATID_* order */
00718         isc_sockstatscounter_tcp4open,
00719         isc_sockstatscounter_tcp4openfail,
00720         isc_sockstatscounter_tcp4close,
00721         isc_sockstatscounter_tcp4bindfail,
00722         isc_sockstatscounter_tcp4connectfail,
00723         isc_sockstatscounter_tcp4connect,
00724         isc_sockstatscounter_tcp4acceptfail,
00725         isc_sockstatscounter_tcp4accept,
00726         isc_sockstatscounter_tcp4sendfail,
00727         isc_sockstatscounter_tcp4recvfail,
00728         isc_sockstatscounter_tcp4active
00729 };
00730 static const isc_statscounter_t tcp6statsindex[] = { /* tcp6 counters in STATID_* order */
00731         isc_sockstatscounter_tcp6open,
00732         isc_sockstatscounter_tcp6openfail,
00733         isc_sockstatscounter_tcp6close,
00734         isc_sockstatscounter_tcp6bindfail,
00735         isc_sockstatscounter_tcp6connectfail,
00736         isc_sockstatscounter_tcp6connect,
00737         isc_sockstatscounter_tcp6acceptfail,
00738         isc_sockstatscounter_tcp6accept,
00739         isc_sockstatscounter_tcp6sendfail,
00740         isc_sockstatscounter_tcp6recvfail,
00741         isc_sockstatscounter_tcp6active
00742 };
00743 static const isc_statscounter_t unixstatsindex[] = { /* unix counters in STATID_* order */
00744         isc_sockstatscounter_unixopen,
00745         isc_sockstatscounter_unixopenfail,
00746         isc_sockstatscounter_unixclose,
00747         isc_sockstatscounter_unixbindfail,
00748         isc_sockstatscounter_unixconnectfail,
00749         isc_sockstatscounter_unixconnect,
00750         isc_sockstatscounter_unixacceptfail,
00751         isc_sockstatscounter_unixaccept,
00752         isc_sockstatscounter_unixsendfail,
00753         isc_sockstatscounter_unixrecvfail,
00754         isc_sockstatscounter_unixactive
00755 };
00756 static const isc_statscounter_t fdwatchstatsindex[] = { /* fdwatch counters in STATID_* order */
00757         -1,     /* STATID_OPEN slot; -1 presumably means "no counter" -- confirm */
00758         -1,     /* STATID_OPENFAIL slot */
00759         isc_sockstatscounter_fdwatchclose,
00760         isc_sockstatscounter_fdwatchbindfail,
00761         isc_sockstatscounter_fdwatchconnectfail,
00762         isc_sockstatscounter_fdwatchconnect,
00763         -1,     /* STATID_ACCEPTFAIL slot */
00764         -1,     /* STATID_ACCEPT slot */
00765         isc_sockstatscounter_fdwatchsendfail,
00766         isc_sockstatscounter_fdwatchrecvfail,
00767         -1      /* STATID_ACTIVE slot */
00768 };
00769 static const isc_statscounter_t rawstatsindex[] = { /* raw counters in STATID_* order */
00770         isc_sockstatscounter_rawopen,
00771         isc_sockstatscounter_rawopenfail,
00772         isc_sockstatscounter_rawclose,
00773         -1,     /* STATID_BINDFAIL slot; -1 presumably means "no counter" -- confirm */
00774         -1,     /* STATID_CONNECTFAIL slot */
00775         -1,     /* STATID_CONNECT slot */
00776         -1,     /* STATID_ACCEPTFAIL slot */
00777         -1,     /* STATID_ACCEPT slot */
00778         -1,     /* STATID_SENDFAIL slot */
00779         isc_sockstatscounter_rawrecvfail,
00780         isc_sockstatscounter_rawactive
00781 };
00782 
00783 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL) || \
00784     defined(USE_WATCHER_THREAD)
00785 /*% Format a printf-style message and log it prefixed with "sockmgr <ptr>: ". */
00786 static void
00787 manager_log(isc__socketmgr_t *sockmgr, isc_logcategory_t *category,
00788             isc_logmodule_t *module, int level, const char *fmt, ...)
00789         ISC_FORMAT_PRINTF(5, 6);
00790 static void
00791 manager_log(isc__socketmgr_t *sockmgr, isc_logcategory_t *category,
00792             isc_logmodule_t *module, int level, const char *fmt, ...)
00793 {
00794         /* Do the formatting work only if this level would be logged. */
00795         if (isc_log_wouldlog(isc_lctx, level)) {
00796                 char text[2048];
00797                 va_list args;
00798 
00799                 va_start(args, fmt);
00800                 vsnprintf(text, sizeof(text), fmt, args);
00801                 va_end(args);
00802 
00803                 isc_log_write(isc_lctx, category, module, level,
00804                               "sockmgr %p: %s", sockmgr, text);
00805         }
00806 }
00807 #endif
00808 
00809 static void
00810 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
00811            isc_logcategory_t *category, isc_logmodule_t *module, int level,
00812            isc_msgcat_t *msgcat, int msgset, int message,
00813            const char *fmt, ...) ISC_FORMAT_PRINTF(9, 10);
00814 static void
00815 socket_log(isc__socket_t *sock, isc_sockaddr_t *address,
00816            isc_logcategory_t *category, isc_logmodule_t *module, int level,
00817            isc_msgcat_t *msgcat, int msgset, int message,
00818            const char *fmt, ...)
00819 {
00820         char msgbuf[2048];
00821         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
00822         va_list ap;
00823 
00824         if (! isc_log_wouldlog(isc_lctx, level))
00825                 return;
00826 
00827         va_start(ap, fmt);
00828         vsnprintf(msgbuf, sizeof(msgbuf), fmt, ap);
00829         va_end(ap);
00830 
00831         if (address == NULL) {
00832                 isc_log_iwrite(isc_lctx, category, module, level,
00833                                msgcat, msgset, message,
00834                                "socket %p: %s", sock, msgbuf);
00835         } else {
00836                 isc_sockaddr_format(address, peerbuf, sizeof(peerbuf));
00837                 isc_log_iwrite(isc_lctx, category, module, level,
00838                                msgcat, msgset, message,
00839                                "socket %p %s: %s", sock, peerbuf, msgbuf);
00840         }
00841 }
00842 
#if defined(_AIX) && defined(ISC_NET_BSD44MSGHDR) && \
    defined(USE_CMSG) && defined(IPV6_RECVPKTINFO)
/*
 * Work around an AIX kernel bug: setting IPV6_V6ONLY clears
 * IPV6_RECVPKTINFO, so turn it back on.
 *
 * Only IPv6 UDP sockets are touched.  A setsockopt() failure is reported
 * through UNEXPECTED_ERROR and otherwise ignored.
 */
static void
FIX_IPV6_RECVPKTINFO(isc__socket_t *sock)
{
        char strbuf[ISC_STRERRORSIZE];
        int on = 1;

        if (sock->pf == AF_INET6 && sock->type == isc_sockettype_udp &&
            setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
                       (void *)&on, sizeof(on)) < 0)
        {
                isc__strerror(errno, strbuf, sizeof(strbuf));
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "setsockopt(%d, IPV6_RECVPKTINFO) "
                                 "%s: %s", sock->fd,
                                 isc_msgcat_get(isc_msgcat,
                                                ISC_MSGSET_GENERAL,
                                                ISC_MSG_FAILED,
                                                "failed"),
                                 strbuf);
        }
}
#else
/* No workaround needed on this platform: expands to a no-op. */
#define FIX_IPV6_RECVPKTINFO(sock) (void)0
#endif
00875 
00876 /*%
00877  * Increment socket-related statistics counters.
00878  */
00879 static inline void
00880 inc_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
00881         REQUIRE(counterid != -1);
00882 
00883         if (stats != NULL)
00884                 isc_stats_increment(stats, counterid);
00885 }
00886 
00887 /*%
00888  * Decrement socket-related statistics counters.
00889  */
00890 static inline void
00891 dec_stats(isc_stats_t *stats, isc_statscounter_t counterid) {
00892         REQUIRE(counterid != -1);
00893 
00894         if (stats != NULL)
00895                 isc_stats_decrement(stats, counterid);
00896 }
00897 
00898 static inline isc_result_t
00899 watch_fd(isc__socketmgr_t *manager, int fd, int msg) {
00900         isc_result_t result = ISC_R_SUCCESS;
00901 
00902 #ifdef USE_KQUEUE
00903         struct kevent evchange;
00904 
00905         memset(&evchange, 0, sizeof(evchange));
00906         if (msg == SELECT_POKE_READ)
00907                 evchange.filter = EVFILT_READ;
00908         else
00909                 evchange.filter = EVFILT_WRITE;
00910         evchange.flags = EV_ADD;
00911         evchange.ident = fd;
00912         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
00913                 result = isc__errno2result(errno);
00914 
00915         return (result);
00916 #elif defined(USE_EPOLL)
00917         struct epoll_event event;
00918 
00919         if (msg == SELECT_POKE_READ)
00920                 event.events = EPOLLIN;
00921         else
00922                 event.events = EPOLLOUT;
00923         memset(&event.data, 0, sizeof(event.data));
00924         event.data.fd = fd;
00925         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1 &&
00926             errno != EEXIST) {
00927                 result = isc__errno2result(errno);
00928         }
00929 
00930         return (result);
00931 #elif defined(USE_DEVPOLL)
00932         struct pollfd pfd;
00933         int lockid = FDLOCK_ID(fd);
00934 
00935         memset(&pfd, 0, sizeof(pfd));
00936         if (msg == SELECT_POKE_READ)
00937                 pfd.events = POLLIN;
00938         else
00939                 pfd.events = POLLOUT;
00940         pfd.fd = fd;
00941         pfd.revents = 0;
00942         LOCK(&manager->fdlock[lockid]);
00943         if (write(manager->devpoll_fd, &pfd, sizeof(pfd)) == -1)
00944                 result = isc__errno2result(errno);
00945         else {
00946                 if (msg == SELECT_POKE_READ)
00947                         manager->fdpollinfo[fd].want_read = 1;
00948                 else
00949                         manager->fdpollinfo[fd].want_write = 1;
00950         }
00951         UNLOCK(&manager->fdlock[lockid]);
00952 
00953         return (result);
00954 #elif defined(USE_SELECT)
00955         LOCK(&manager->lock);
00956         if (msg == SELECT_POKE_READ)
00957                 FD_SET(fd, manager->read_fds);
00958         if (msg == SELECT_POKE_WRITE)
00959                 FD_SET(fd, manager->write_fds);
00960         UNLOCK(&manager->lock);
00961 
00962         return (result);
00963 #endif
00964 }
00965 
00966 static inline isc_result_t
00967 unwatch_fd(isc__socketmgr_t *manager, int fd, int msg) {
00968         isc_result_t result = ISC_R_SUCCESS;
00969 
00970 #ifdef USE_KQUEUE
00971         struct kevent evchange;
00972 
00973         memset(&evchange, 0, sizeof(evchange));
00974         if (msg == SELECT_POKE_READ)
00975                 evchange.filter = EVFILT_READ;
00976         else
00977                 evchange.filter = EVFILT_WRITE;
00978         evchange.flags = EV_DELETE;
00979         evchange.ident = fd;
00980         if (kevent(manager->kqueue_fd, &evchange, 1, NULL, 0, NULL) != 0)
00981                 result = isc__errno2result(errno);
00982 
00983         return (result);
00984 #elif defined(USE_EPOLL)
00985         struct epoll_event event;
00986 
00987         if (msg == SELECT_POKE_READ)
00988                 event.events = EPOLLIN;
00989         else
00990                 event.events = EPOLLOUT;
00991         memset(&event.data, 0, sizeof(event.data));
00992         event.data.fd = fd;
00993         if (epoll_ctl(manager->epoll_fd, EPOLL_CTL_DEL, fd, &event) == -1 &&
00994             errno != ENOENT) {
00995                 char strbuf[ISC_STRERRORSIZE];
00996                 isc__strerror(errno, strbuf, sizeof(strbuf));
00997                 UNEXPECTED_ERROR(__FILE__, __LINE__,
00998                                  "epoll_ctl(DEL), %d: %s", fd, strbuf);
00999                 result = ISC_R_UNEXPECTED;
01000         }
01001         return (result);
01002 #elif defined(USE_DEVPOLL)
01003         struct pollfd pfds[2];
01004         size_t writelen = sizeof(pfds[0]);
01005         int lockid = FDLOCK_ID(fd);
01006 
01007         memset(pfds, 0, sizeof(pfds));
01008         pfds[0].events = POLLREMOVE;
01009         pfds[0].fd = fd;
01010 
01011         /*
01012          * Canceling read or write polling via /dev/poll is tricky.  Since it
01013          * only provides a way of canceling per FD, we may need to re-poll the
01014          * socket for the other operation.
01015          */
01016         LOCK(&manager->fdlock[lockid]);
01017         if (msg == SELECT_POKE_READ &&
01018             manager->fdpollinfo[fd].want_write == 1) {
01019                 pfds[1].events = POLLOUT;
01020                 pfds[1].fd = fd;
01021                 writelen += sizeof(pfds[1]);
01022         }
01023         if (msg == SELECT_POKE_WRITE &&
01024             manager->fdpollinfo[fd].want_read == 1) {
01025                 pfds[1].events = POLLIN;
01026                 pfds[1].fd = fd;
01027                 writelen += sizeof(pfds[1]);
01028         }
01029 
01030         if (write(manager->devpoll_fd, pfds, writelen) == -1)
01031                 result = isc__errno2result(errno);
01032         else {
01033                 if (msg == SELECT_POKE_READ)
01034                         manager->fdpollinfo[fd].want_read = 0;
01035                 else
01036                         manager->fdpollinfo[fd].want_write = 0;
01037         }
01038         UNLOCK(&manager->fdlock[lockid]);
01039 
01040         return (result);
01041 #elif defined(USE_SELECT)
01042         LOCK(&manager->lock);
01043         if (msg == SELECT_POKE_READ)
01044                 FD_CLR(fd, manager->read_fds);
01045         else if (msg == SELECT_POKE_WRITE)
01046                 FD_CLR(fd, manager->write_fds);
01047         UNLOCK(&manager->lock);
01048 
01049         return (result);
01050 #endif
01051 }
01052 
01053 static void
01054 wakeup_socket(isc__socketmgr_t *manager, int fd, int msg) {
01055         isc_result_t result;
01056         int lockid = FDLOCK_ID(fd);
01057 
01058         /*
01059          * This is a wakeup on a socket.  If the socket is not in the
01060          * process of being closed, start watching it for either reads
01061          * or writes.
01062          */
01063 
01064         INSIST(fd >= 0 && fd < (int)manager->maxsocks);
01065 
01066         if (msg == SELECT_POKE_CLOSE) {
01067                 /* No one should be updating fdstate, so no need to lock it */
01068                 INSIST(manager->fdstate[fd] == CLOSE_PENDING);
01069                 manager->fdstate[fd] = CLOSED;
01070                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
01071                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
01072                 (void)close(fd);
01073                 return;
01074         }
01075 
01076         LOCK(&manager->fdlock[lockid]);
01077         if (manager->fdstate[fd] == CLOSE_PENDING) {
01078                 UNLOCK(&manager->fdlock[lockid]);
01079 
01080                 /*
01081                  * We accept (and ignore) any error from unwatch_fd() as we are
01082                  * closing the socket, hoping it doesn't leave dangling state in
01083                  * the kernel.
01084                  * Note that unwatch_fd() must be called after releasing the
01085                  * fdlock; otherwise it could cause deadlock due to a lock order
01086                  * reversal.
01087                  */
01088                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
01089                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
01090                 return;
01091         }
01092         if (manager->fdstate[fd] != MANAGED) {
01093                 UNLOCK(&manager->fdlock[lockid]);
01094                 return;
01095         }
01096         UNLOCK(&manager->fdlock[lockid]);
01097 
01098         /*
01099          * Set requested bit.
01100          */
01101         result = watch_fd(manager, fd, msg);
01102         if (result != ISC_R_SUCCESS) {
01103                 /*
01104                  * XXXJT: what should we do?  Ignoring the failure of watching
01105                  * a socket will make the application dysfunctional, but there
01106                  * seems to be no reasonable recovery process.
01107                  */
01108                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
01109                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
01110                               "failed to start watching FD (%d): %s",
01111                               fd, isc_result_totext(result));
01112         }
01113 }
01114 
01115 #ifdef USE_WATCHER_THREAD
01116 /*
01117  * Poke the select loop when there is something for us to do.
01118  * The write is required (by POSIX) to complete.  That is, we
01119  * will not get partial writes.
01120  */
01121 static void
01122 select_poke(isc__socketmgr_t *mgr, int fd, int msg) {
01123         int cc;
01124         int buf[2];
01125         char strbuf[ISC_STRERRORSIZE];
01126 
01127         buf[0] = fd;
01128         buf[1] = msg;
01129 
01130         do {
01131                 cc = write(mgr->pipe_fds[1], buf, sizeof(buf));
01132 #ifdef ENOSR
01133                 /*
01134                  * Treat ENOSR as EAGAIN but loop slowly as it is
01135                  * unlikely to clear fast.
01136                  */
01137                 if (cc < 0 && errno == ENOSR) {
01138                         sleep(1);
01139                         errno = EAGAIN;
01140                 }
01141 #endif
01142         } while (cc < 0 && SOFT_ERROR(errno));
01143 
01144         if (cc < 0) {
01145                 isc__strerror(errno, strbuf, sizeof(strbuf));
01146                 FATAL_ERROR(__FILE__, __LINE__,
01147                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
01148                                            ISC_MSG_WRITEFAILED,
01149                                            "write() failed "
01150                                            "during watcher poke: %s"),
01151                             strbuf);
01152         }
01153 
01154         INSIST(cc == sizeof(buf));
01155 }
01156 
01157 /*
01158  * Read a message on the internal fd.
01159  */
01160 static void
01161 select_readmsg(isc__socketmgr_t *mgr, int *fd, int *msg) {
01162         int buf[2];
01163         int cc;
01164         char strbuf[ISC_STRERRORSIZE];
01165 
01166         cc = read(mgr->pipe_fds[0], buf, sizeof(buf));
01167         if (cc < 0) {
01168                 *msg = SELECT_POKE_NOTHING;
01169                 *fd = -1;       /* Silence compiler. */
01170                 if (SOFT_ERROR(errno))
01171                         return;
01172 
01173                 isc__strerror(errno, strbuf, sizeof(strbuf));
01174                 FATAL_ERROR(__FILE__, __LINE__,
01175                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
01176                                            ISC_MSG_READFAILED,
01177                                            "read() failed "
01178                                            "during watcher poke: %s"),
01179                             strbuf);
01180         }
01181         INSIST(cc == sizeof(buf));
01182 
01183         *fd = buf[0];
01184         *msg = buf[1];
01185 }
01186 #else /* USE_WATCHER_THREAD */
01187 /*
01188  * Update the state of the socketmgr when something changes.
01189  */
01190 static void
01191 select_poke(isc__socketmgr_t *manager, int fd, int msg) {
01192         if (msg == SELECT_POKE_SHUTDOWN)
01193                 return;
01194         else if (fd >= 0)
01195                 wakeup_socket(manager, fd, msg);
01196         return;
01197 }
01198 #endif /* USE_WATCHER_THREAD */
01199 
01200 /*
01201  * Make a fd non-blocking.
01202  */
01203 static isc_result_t
01204 make_nonblock(int fd) {
01205         int ret;
01206         int flags;
01207         char strbuf[ISC_STRERRORSIZE];
01208 #ifdef USE_FIONBIO_IOCTL
01209         int on = 1;
01210 
01211         ret = ioctl(fd, FIONBIO, (char *)&on);
01212 #else
01213         flags = fcntl(fd, F_GETFL, 0);
01214         flags |= PORT_NONBLOCK;
01215         ret = fcntl(fd, F_SETFL, flags);
01216 #endif
01217 
01218         if (ret == -1) {
01219                 isc__strerror(errno, strbuf, sizeof(strbuf));
01220                 UNEXPECTED_ERROR(__FILE__, __LINE__,
01221 #ifdef USE_FIONBIO_IOCTL
01222                                  "ioctl(%d, FIONBIO, &on): %s", fd,
01223 #else
01224                                  "fcntl(%d, F_SETFL, %d): %s", fd, flags,
01225 #endif
01226                                  strbuf);
01227 
01228                 return (ISC_R_UNEXPECTED);
01229         }
01230 
01231         return (ISC_R_SUCCESS);
01232 }
01233 
01234 #ifdef USE_CMSG
01235 /*
01236  * Not all OSes support advanced CMSG macros: CMSG_LEN and CMSG_SPACE.
01237  * In order to ensure as much portability as possible, we provide wrapper
01238  * functions of these macros.
01239  * Note that cmsg_space() could run slow on OSes that do not have
01240  * CMSG_SPACE.
01241  */
01242 static inline ISC_SOCKADDR_LEN_T
01243 cmsg_len(ISC_SOCKADDR_LEN_T len) {
01244 #ifdef CMSG_LEN
01245         return (CMSG_LEN(len));
01246 #else
01247         ISC_SOCKADDR_LEN_T hdrlen;
01248 
01249         /*
01250          * Cast NULL so that any pointer arithmetic performed by CMSG_DATA
01251          * is correct.
01252          */
01253         hdrlen = (ISC_SOCKADDR_LEN_T)CMSG_DATA(((struct cmsghdr *)NULL));
01254         return (hdrlen + len);
01255 #endif
01256 }
01257 
01258 static inline ISC_SOCKADDR_LEN_T
01259 cmsg_space(ISC_SOCKADDR_LEN_T len) {
01260 #ifdef CMSG_SPACE
01261         return (CMSG_SPACE(len));
01262 #else
01263         struct msghdr msg;
01264         struct cmsghdr *cmsgp;
01265         /*
01266          * XXX: The buffer length is an ad-hoc value, but should be enough
01267          * in a practical sense.
01268          */
01269         char dummybuf[sizeof(struct cmsghdr) + 1024];
01270 
01271         memset(&msg, 0, sizeof(msg));
01272         msg.msg_control = dummybuf;
01273         msg.msg_controllen = sizeof(dummybuf);
01274 
01275         cmsgp = (struct cmsghdr *)dummybuf;
01276         cmsgp->cmsg_len = cmsg_len(len);
01277 
01278         cmsgp = CMSG_NXTHDR(&msg, cmsgp);
01279         if (cmsgp != NULL)
01280                 return ((char *)cmsgp - (char *)msg.msg_control);
01281         else
01282                 return (0);
01283 #endif
01284 }
01285 #endif /* USE_CMSG */
01286 
01287 /*
01288  * Process control messages received on a socket.
01289  */
01290 static void
01291 process_cmsg(isc__socket_t *sock, struct msghdr *msg, isc_socketevent_t *dev) {
01292 #ifdef USE_CMSG
01293         struct cmsghdr *cmsgp;
01294 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
01295         struct in6_pktinfo *pktinfop;
01296 #endif
01297 #ifdef SO_TIMESTAMP
01298         void *timevalp;
01299 #endif
01300 #endif
01301 
01302         /*
01303          * sock is used only when ISC_NET_BSD44MSGHDR and USE_CMSG are defined.
01304          * msg and dev are used only when ISC_NET_BSD44MSGHDR is defined.
01305          * They are all here, outside of the CPP tests, because it is
01306          * more consistent with the usual ISC coding style.
01307          */
01308         UNUSED(sock);
01309         UNUSED(msg);
01310         UNUSED(dev);
01311 
01312 #ifdef ISC_NET_BSD44MSGHDR
01313 
01314 #ifdef MSG_TRUNC
01315         if ((msg->msg_flags & MSG_TRUNC) == MSG_TRUNC)
01316                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
01317 #endif
01318 
01319 #ifdef MSG_CTRUNC
01320         if ((msg->msg_flags & MSG_CTRUNC) == MSG_CTRUNC)
01321                 dev->attributes |= ISC_SOCKEVENTATTR_CTRUNC;
01322 #endif
01323 
01324 #ifndef USE_CMSG
01325         return;
01326 #else
01327         if (msg->msg_controllen == 0U || msg->msg_control == NULL)
01328                 return;
01329 
01330 #ifdef SO_TIMESTAMP
01331         timevalp = NULL;
01332 #endif
01333 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
01334         pktinfop = NULL;
01335 #endif
01336 
01337         cmsgp = CMSG_FIRSTHDR(msg);
01338         while (cmsgp != NULL) {
01339                 socket_log(sock, NULL, TRACE,
01340                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PROCESSCMSG,
01341                            "processing cmsg %p", cmsgp);
01342 
01343 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
01344                 if (cmsgp->cmsg_level == IPPROTO_IPV6
01345                     && cmsgp->cmsg_type == IPV6_PKTINFO) {
01346 
01347                         pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
01348                         memmove(&dev->pktinfo, pktinfop,
01349                                 sizeof(struct in6_pktinfo));
01350                         dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
01351                         socket_log(sock, NULL, TRACE,
01352                                    isc_msgcat, ISC_MSGSET_SOCKET,
01353                                    ISC_MSG_IFRECEIVED,
01354                                    "interface received on ifindex %u",
01355                                    dev->pktinfo.ipi6_ifindex);
01356                         if (IN6_IS_ADDR_MULTICAST(&pktinfop->ipi6_addr))
01357                                 dev->attributes |= ISC_SOCKEVENTATTR_MULTICAST;
01358                         goto next;
01359                 }
01360 #endif
01361 
01362 #ifdef SO_TIMESTAMP
01363                 if (cmsgp->cmsg_level == SOL_SOCKET
01364                     && cmsgp->cmsg_type == SCM_TIMESTAMP) {
01365                         struct timeval tv;
01366                         timevalp = CMSG_DATA(cmsgp);
01367                         memmove(&tv, timevalp, sizeof(tv));
01368                         dev->timestamp.seconds = tv.tv_sec;
01369                         dev->timestamp.nanoseconds = tv.tv_usec * 1000;
01370                         dev->attributes |= ISC_SOCKEVENTATTR_TIMESTAMP;
01371                         goto next;
01372                 }
01373 #endif
01374 
01375 #ifdef IPV6_TCLASS
01376                 if (cmsgp->cmsg_level == IPPROTO_IPV6
01377                     && cmsgp->cmsg_type == IPV6_TCLASS) {
01378                         dev->dscp = *(int *)CMSG_DATA(cmsgp);
01379                         dev->dscp >>= 2;
01380                         dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
01381                         goto next;
01382                 }
01383 #endif
01384 
01385 #ifdef IP_TOS
01386                 if (cmsgp->cmsg_level == IPPROTO_IP
01387                     && (cmsgp->cmsg_type == IP_TOS
01388 #ifdef IP_RECVTOS
01389                         || cmsgp->cmsg_type == IP_RECVTOS
01390 #endif
01391                         )) {
01392                         dev->dscp = (int) *(unsigned char *)CMSG_DATA(cmsgp);
01393                         dev->dscp >>= 2;
01394                         dev->attributes |= ISC_SOCKEVENTATTR_DSCP;
01395                         goto next;
01396                 }
01397 #endif
01398         next:
01399                 cmsgp = CMSG_NXTHDR(msg, cmsgp);
01400         }
01401 #endif /* USE_CMSG */
01402 
01403 #endif /* ISC_NET_BSD44MSGHDR */
01404 }
01405 
01406 /*
01407  * Construct an iov array and attach it to the msghdr passed in.  This is
01408  * the SEND constructor, which will use the used region of the buffer
01409  * (if using a buffer list) or will use the internal region (if a single
01410  * buffer I/O is requested).
01411  *
01412  * Nothing can be NULL, and the done event must list at least one buffer
01413  * on the buffer linked list for this function to be meaningful.
01414  *
01415  * If write_countp != NULL, *write_countp will hold the number of bytes
01416  * this transaction can send.
01417  */
01418 static void
01419 build_msghdr_send(isc__socket_t *sock, isc_socketevent_t *dev,
01420                   struct msghdr *msg, struct iovec *iov, size_t *write_countp)
01421 {
01422         unsigned int iovcount;
01423         isc_buffer_t *buffer;
01424         isc_region_t used;
01425         size_t write_count;
01426         size_t skip_count;
01427 #ifdef ISC_NET_BSD44MSGHDR
01428         struct cmsghdr *cmsgp;
01429 #endif
01430 
01431         memset(msg, 0, sizeof(*msg));
01432 
01433         if (!sock->connected) {
01434                 msg->msg_name = (void *)&dev->address.type.sa;
01435                 msg->msg_namelen = dev->address.length;
01436         } else {
01437                 msg->msg_name = NULL;
01438                 msg->msg_namelen = 0;
01439         }
01440 
01441         buffer = ISC_LIST_HEAD(dev->bufferlist);
01442         write_count = 0;
01443         iovcount = 0;
01444 
01445         /*
01446          * Single buffer I/O?  Skip what we've done so far in this region.
01447          */
01448         if (buffer == NULL) {
01449                 write_count = dev->region.length - dev->n;
01450                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
01451                 iov[0].iov_len = write_count;
01452                 iovcount = 1;
01453 
01454                 goto config;
01455         }
01456 
01457         /*
01458          * Multibuffer I/O.
01459          * Skip the data in the buffer list that we have already written.
01460          */
01461         skip_count = dev->n;
01462         while (buffer != NULL) {
01463                 REQUIRE(ISC_BUFFER_VALID(buffer));
01464                 if (skip_count < isc_buffer_usedlength(buffer))
01465                         break;
01466                 skip_count -= isc_buffer_usedlength(buffer);
01467                 buffer = ISC_LIST_NEXT(buffer, link);
01468         }
01469 
01470         while (buffer != NULL) {
01471                 INSIST(iovcount < MAXSCATTERGATHER_SEND);
01472 
01473                 isc_buffer_usedregion(buffer, &used);
01474 
01475                 if (used.length > 0) {
01476                         iov[iovcount].iov_base = (void *)(used.base
01477                                                           + skip_count);
01478                         iov[iovcount].iov_len = used.length - skip_count;
01479                         write_count += (used.length - skip_count);
01480                         skip_count = 0;
01481                         iovcount++;
01482                 }
01483                 buffer = ISC_LIST_NEXT(buffer, link);
01484         }
01485 
01486         INSIST(skip_count == 0U);
01487 
01488  config:
01489         msg->msg_iov = iov;
01490         msg->msg_iovlen = iovcount;
01491 
01492 #ifdef ISC_NET_BSD44MSGHDR
01493         msg->msg_control = NULL;
01494         msg->msg_controllen = 0;
01495         msg->msg_flags = 0;
01496 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
01497         if ((sock->type == isc_sockettype_udp)
01498             && ((dev->attributes & ISC_SOCKEVENTATTR_PKTINFO) != 0)) {
01499 #if defined(IPV6_USE_MIN_MTU)
01500                 int use_min_mtu = 1;    /* -1, 0, 1 */
01501 #endif
01502                 struct in6_pktinfo *pktinfop;
01503 
01504                 socket_log(sock, NULL, TRACE,
01505                            isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_SENDTODATA,
01506                            "sendto pktinfo data, ifindex %u",
01507                            dev->pktinfo.ipi6_ifindex);
01508 
01509                 msg->msg_controllen = cmsg_space(sizeof(struct in6_pktinfo));
01510                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
01511                 msg->msg_control = (void *)sock->sendcmsgbuf;
01512 
01513                 cmsgp = (struct cmsghdr *)sock->sendcmsgbuf;
01514                 cmsgp->cmsg_level = IPPROTO_IPV6;
01515                 cmsgp->cmsg_type = IPV6_PKTINFO;
01516                 cmsgp->cmsg_len = cmsg_len(sizeof(struct in6_pktinfo));
01517                 pktinfop = (struct in6_pktinfo *)CMSG_DATA(cmsgp);
01518                 memmove(pktinfop, &dev->pktinfo, sizeof(struct in6_pktinfo));
01519 #if defined(IPV6_USE_MIN_MTU)
01520                 /*
01521                  * Set IPV6_USE_MIN_MTU as a per packet option as FreeBSD
01522                  * ignores setsockopt(IPV6_USE_MIN_MTU) when IPV6_PKTINFO
01523                  * is used.
01524                  */
01525                 cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
01526                                            msg->msg_controllen);
01527                 msg->msg_controllen += cmsg_space(sizeof(use_min_mtu));
01528                 INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
01529 
01530                 cmsgp->cmsg_level = IPPROTO_IPV6;
01531                 cmsgp->cmsg_type = IPV6_USE_MIN_MTU;
01532                 cmsgp->cmsg_len = cmsg_len(sizeof(use_min_mtu));
01533                 memmove(CMSG_DATA(cmsgp), &use_min_mtu, sizeof(use_min_mtu));
01534 #endif
01535         }
01536 
01537         if (isc_dscp_check_value > -1) {
01538                 if (sock->type == isc_sockettype_udp)
01539                         INSIST((int)dev->dscp == isc_dscp_check_value);
01540                 else if (sock->type == isc_sockettype_tcp)
01541                         INSIST((int)sock->dscp == isc_dscp_check_value);
01542         }
01543 
01544         if ((sock->type == isc_sockettype_udp) &&
01545             ((dev->attributes & ISC_SOCKEVENTATTR_DSCP) != 0))
01546         {
01547                 int dscp = (dev->dscp << 2) & 0xff;
01548 
01549                 INSIST(dev->dscp < 0x40);
01550 
01551 #ifdef IP_TOS
01552                 if (sock->pf == AF_INET && sock->pktdscp) {
01553                         cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
01554                                                    msg->msg_controllen);
01555                         msg->msg_control = (void *)sock->sendcmsgbuf;
01556                         msg->msg_controllen += cmsg_space(sizeof(dscp));
01557                         INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
01558 
01559                         cmsgp->cmsg_level = IPPROTO_IP;
01560                         cmsgp->cmsg_type = IP_TOS;
01561                         cmsgp->cmsg_len = cmsg_len(sizeof(char));
01562                         *(unsigned char*)CMSG_DATA(cmsgp) = dscp;
01563                 } else if (sock->pf == AF_INET && sock->dscp != dev->dscp) {
01564                         if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
01565                                (void *)&dscp, sizeof(int)) < 0)
01566                         {
01567                                 char strbuf[ISC_STRERRORSIZE];
01568                                 isc__strerror(errno, strbuf, sizeof(strbuf));
01569                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
01570                                                  "setsockopt(%d, IP_TOS, %.02x)"
01571                                                  " %s: %s",
01572                                                  sock->fd, dscp >> 2,
01573                                                  isc_msgcat_get(isc_msgcat,
01574                                                         ISC_MSGSET_GENERAL,
01575                                                         ISC_MSG_FAILED,
01576                                                         "failed"),
01577                                                  strbuf);
01578                         } else
01579                                 sock->dscp = dscp;
01580                 }
01581 #endif
01582 #if defined(IPPROTO_IPV6) && defined(IPV6_TCLASS)
01583                 if (sock->pf == AF_INET6 && sock->pktdscp) {
01584                         cmsgp = (struct cmsghdr *)(sock->sendcmsgbuf +
01585                                                    msg->msg_controllen);
01586                         msg->msg_control = (void *)sock->sendcmsgbuf;
01587                         msg->msg_controllen += cmsg_space(sizeof(dscp));
01588                         INSIST(msg->msg_controllen <= sock->sendcmsgbuflen);
01589 
01590                         cmsgp->cmsg_level = IPPROTO_IPV6;
01591                         cmsgp->cmsg_type = IPV6_TCLASS;
01592                         cmsgp->cmsg_len = cmsg_len(sizeof(dscp));
01593                         memmove(CMSG_DATA(cmsgp), &dscp, sizeof(dscp));
01594                 } else if (sock->pf == AF_INET6 && sock->dscp != dev->dscp) {
01595                         if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
01596                                        (void *)&dscp, sizeof(int)) < 0) {
01597                                 char strbuf[ISC_STRERRORSIZE];
01598                                 isc__strerror(errno, strbuf, sizeof(strbuf));
01599                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
01600                                                  "setsockopt(%d, IPV6_TCLASS, "
01601                                                  "%.02x) %s: %s",
01602                                                  sock->fd, dscp >> 2,
01603                                                  isc_msgcat_get(isc_msgcat,
01604                                                         ISC_MSGSET_GENERAL,
01605                                                         ISC_MSG_FAILED,
01606                                                         "failed"),
01607                                                  strbuf);
01608                         } else
01609                                 sock->dscp = dscp;
01610                 }
01611 #endif
01612         }
01613 #endif /* USE_CMSG && ISC_PLATFORM_HAVEIPV6 */
01614 #else /* ISC_NET_BSD44MSGHDR */
01615         msg->msg_accrights = NULL;
01616         msg->msg_accrightslen = 0;
01617 #endif /* ISC_NET_BSD44MSGHDR */
01618 
01619         if (write_countp != NULL)
01620                 *write_countp = write_count;
01621 }
01622 
01623 /*
01624  * Construct an iov array and attach it to the msghdr passed in.  This is
01625  * the RECV constructor, which will use the available region of the buffer
01626  * (if using a buffer list) or will use the internal region (if a single
01627  * buffer I/O is requested).
01628  *
01629  * Nothing can be NULL, and the done event must list at least one buffer
01630  * on the buffer linked list for this function to be meaningful.
01631  *
01632  * If read_countp != NULL, *read_countp will hold the number of bytes
01633  * this transaction can receive.
01634  */
01635 static void
01636 build_msghdr_recv(isc__socket_t *sock, isc_socketevent_t *dev,
01637                   struct msghdr *msg, struct iovec *iov, size_t *read_countp)
01638 {
01639         unsigned int iovcount;
01640         isc_buffer_t *buffer;
01641         isc_region_t available;
01642         size_t read_count;
01643 
01644         memset(msg, 0, sizeof(struct msghdr));
01645 
01646         if (sock->type == isc_sockettype_udp) {
01647                 memset(&dev->address, 0, sizeof(dev->address));
01648 #ifdef BROKEN_RECVMSG
01649                 if (sock->pf == AF_INET) {
01650                         msg->msg_name = (void *)&dev->address.type.sin;
01651                         msg->msg_namelen = sizeof(dev->address.type.sin6);
01652                 } else if (sock->pf == AF_INET6) {
01653                         msg->msg_name = (void *)&dev->address.type.sin6;
01654                         msg->msg_namelen = sizeof(dev->address.type.sin6);
01655 #ifdef ISC_PLATFORM_HAVESYSUNH
01656                 } else if (sock->pf == AF_UNIX) {
01657                         msg->msg_name = (void *)&dev->address.type.sunix;
01658                         msg->msg_namelen = sizeof(dev->address.type.sunix);
01659 #endif
01660                 } else {
01661                         msg->msg_name = (void *)&dev->address.type.sa;
01662                         msg->msg_namelen = sizeof(dev->address.type);
01663                 }
01664 #else
01665                 msg->msg_name = (void *)&dev->address.type.sa;
01666                 msg->msg_namelen = sizeof(dev->address.type);
01667 #endif
01668 #ifdef ISC_NET_RECVOVERFLOW
01669                 /* If needed, steal one iovec for overflow detection. */
01670                 maxiov--;
01671 #endif
01672         } else { /* TCP */
01673                 msg->msg_name = NULL;
01674                 msg->msg_namelen = 0;
01675                 dev->address = sock->peer_address;
01676         }
01677 
01678         buffer = ISC_LIST_HEAD(dev->bufferlist);
01679         read_count = 0;
01680 
01681         /*
01682          * Single buffer I/O?  Skip what we've done so far in this region.
01683          */
01684         if (buffer == NULL) {
01685                 read_count = dev->region.length - dev->n;
01686                 iov[0].iov_base = (void *)(dev->region.base + dev->n);
01687                 iov[0].iov_len = read_count;
01688                 iovcount = 1;
01689 
01690                 goto config;
01691         }
01692 
01693         /*
01694          * Multibuffer I/O.
01695          * Skip empty buffers.
01696          */
01697         while (buffer != NULL) {
01698                 REQUIRE(ISC_BUFFER_VALID(buffer));
01699                 if (isc_buffer_availablelength(buffer) != 0)
01700                         break;
01701                 buffer = ISC_LIST_NEXT(buffer, link);
01702         }
01703 
01704         iovcount = 0;
01705         while (buffer != NULL) {
01706                 INSIST(iovcount < MAXSCATTERGATHER_RECV);
01707 
01708                 isc_buffer_availableregion(buffer, &available);
01709 
01710                 if (available.length > 0) {
01711                         iov[iovcount].iov_base = (void *)(available.base);
01712                         iov[iovcount].iov_len = available.length;
01713                         read_count += available.length;
01714                         iovcount++;
01715                 }
01716                 buffer = ISC_LIST_NEXT(buffer, link);
01717         }
01718 
01719  config:
01720 
01721         /*
01722          * If needed, set up to receive that one extra byte.  Note that
01723          * we know there is at least one iov left, since we stole it
01724          * at the top of this function.
01725          */
01726 #ifdef ISC_NET_RECVOVERFLOW
01727         if (sock->type == isc_sockettype_udp) {
01728                 iov[iovcount].iov_base = (void *)(&sock->overflow);
01729                 iov[iovcount].iov_len = 1;
01730                 iovcount++;
01731         }
01732 #endif
01733 
01734         msg->msg_iov = iov;
01735         msg->msg_iovlen = iovcount;
01736 
01737 #ifdef ISC_NET_BSD44MSGHDR
01738         msg->msg_control = NULL;
01739         msg->msg_controllen = 0;
01740         msg->msg_flags = 0;
01741 #if defined(USE_CMSG)
01742         msg->msg_control = sock->recvcmsgbuf;
01743         msg->msg_controllen = sock->recvcmsgbuflen;
01744 #endif /* USE_CMSG */
01745 #else /* ISC_NET_BSD44MSGHDR */
01746         msg->msg_accrights = NULL;
01747         msg->msg_accrightslen = 0;
01748 #endif /* ISC_NET_BSD44MSGHDR */
01749 
01750         if (read_countp != NULL)
01751                 *read_countp = read_count;
01752 }
01753 
01754 static void
01755 set_dev_address(isc_sockaddr_t *address, isc__socket_t *sock,
01756                 isc_socketevent_t *dev)
01757 {
01758         if (sock->type == isc_sockettype_udp) {
01759                 if (address != NULL)
01760                         dev->address = *address;
01761                 else
01762                         dev->address = sock->peer_address;
01763         } else if (sock->type == isc_sockettype_tcp) {
01764                 INSIST(address == NULL);
01765                 dev->address = sock->peer_address;
01766         }
01767 }
01768 
01769 static void
01770 destroy_socketevent(isc_event_t *event) {
01771         isc_socketevent_t *ev = (isc_socketevent_t *)event;
01772 
01773         INSIST(ISC_LIST_EMPTY(ev->bufferlist));
01774 
01775         (ev->destroy)(event);
01776 }
01777 
01778 static isc_socketevent_t *
01779 allocate_socketevent(isc_mem_t *mctx, void *sender,
01780                      isc_eventtype_t eventtype, isc_taskaction_t action,
01781                      void *arg)
01782 {
01783         isc_socketevent_t *ev;
01784 
01785         ev = (isc_socketevent_t *)isc_event_allocate(mctx, sender,
01786                                                      eventtype, action, arg,
01787                                                      sizeof(*ev));
01788 
01789         if (ev == NULL)
01790                 return (NULL);
01791 
01792         ev->result = ISC_R_UNSET;
01793         ISC_LINK_INIT(ev, ev_link);
01794         ISC_LIST_INIT(ev->bufferlist);
01795         ev->region.base = NULL;
01796         ev->n = 0;
01797         ev->offset = 0;
01798         ev->attributes = 0;
01799         ev->destroy = ev->ev_destroy;
01800         ev->ev_destroy = destroy_socketevent;
01801         ev->dscp = 0;
01802 
01803         return (ev);
01804 }
01805 
01806 #if defined(ISC_SOCKET_DEBUG)
/*
 * Debugging aid: print the contents of 'msg' to stdout -- the name
 * buffer, every iovec entry and, where the platform's msghdr has them,
 * the control buffer fields.
 *
 * Pointers are passed to "%p" as 'void *' and the unsigned loop index is
 * printed with "%u", so that each argument matches its conversion
 * specifier.
 */
static void
dump_msg(struct msghdr *msg) {
        unsigned int i;

        printf("MSGHDR %p\n", (void *)msg);
        printf("\tname %p, namelen %ld\n", msg->msg_name,
               (long) msg->msg_namelen);
        printf("\tiov %p, iovlen %ld\n", (void *)msg->msg_iov,
               (long) msg->msg_iovlen);
        for (i = 0; i < (unsigned int)msg->msg_iovlen; i++)
                printf("\t\t%u\tbase %p, len %ld\n", i,
                       msg->msg_iov[i].iov_base,
                       (long) msg->msg_iov[i].iov_len);
#ifdef ISC_NET_BSD44MSGHDR
        printf("\tcontrol %p, controllen %ld\n", msg->msg_control,
               (long) msg->msg_controllen);
#endif
}
01825 #endif
01826 
01827 #define DOIO_SUCCESS            0       /* i/o ok, event sent */
01828 #define DOIO_SOFT               1       /* i/o ok, soft error, no event sent */
01829 #define DOIO_HARD               2       /* i/o error, event sent */
01830 #define DOIO_EOF                3       /* EOF, no event sent */
01831 
01832 static int
01833 doio_recv(isc__socket_t *sock, isc_socketevent_t *dev) {
01834         int cc;
01835         struct iovec iov[MAXSCATTERGATHER_RECV];
01836         size_t read_count;
01837         size_t actual_count;
01838         struct msghdr msghdr;
01839         isc_buffer_t *buffer;
01840         int recv_errno;
01841         char strbuf[ISC_STRERRORSIZE];
01842 
01843         build_msghdr_recv(sock, dev, &msghdr, iov, &read_count);
01844 
01845 #if defined(ISC_SOCKET_DEBUG)
01846         dump_msg(&msghdr);
01847 #endif
01848 
01849         cc = recvmsg(sock->fd, &msghdr, 0);
01850         recv_errno = errno;
01851 
01852 #if defined(ISC_SOCKET_DEBUG)
01853         dump_msg(&msghdr);
01854 #endif
01855 
01856         if (cc < 0) {
01857                 if (SOFT_ERROR(recv_errno))
01858                         return (DOIO_SOFT);
01859 
01860                 if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
01861                         isc__strerror(recv_errno, strbuf, sizeof(strbuf));
01862                         socket_log(sock, NULL, IOEVENT,
01863                                    isc_msgcat, ISC_MSGSET_SOCKET,
01864                                    ISC_MSG_DOIORECV,
01865                                   "doio_recv: recvmsg(%d) %d bytes, err %d/%s",
01866                                    sock->fd, cc, recv_errno, strbuf);
01867                 }
01868 
01869 #define SOFT_OR_HARD(_system, _isc) \
01870         if (recv_errno == _system) { \
01871                 if (sock->connected) { \
01872                         dev->result = _isc; \
01873                         inc_stats(sock->manager->stats, \
01874                                   sock->statsindex[STATID_RECVFAIL]); \
01875                         return (DOIO_HARD); \
01876                 } \
01877                 return (DOIO_SOFT); \
01878         }
01879 #define ALWAYS_HARD(_system, _isc) \
01880         if (recv_errno == _system) { \
01881                 dev->result = _isc; \
01882                 inc_stats(sock->manager->stats, \
01883                           sock->statsindex[STATID_RECVFAIL]); \
01884                 return (DOIO_HARD); \
01885         }
01886 
01887                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
01888                 SOFT_OR_HARD(ENETUNREACH, ISC_R_NETUNREACH);
01889                 SOFT_OR_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
01890                 SOFT_OR_HARD(EHOSTDOWN, ISC_R_HOSTDOWN);
01891                 /* HPUX 11.11 can return EADDRNOTAVAIL. */
01892                 SOFT_OR_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
01893                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
01894                 /* Should never get this one but it was seen. */
01895 #ifdef ENOPROTOOPT
01896                 SOFT_OR_HARD(ENOPROTOOPT, ISC_R_HOSTUNREACH);
01897 #endif
01898                 /*
01899                  * HPUX returns EPROTO and EINVAL on receiving some ICMP/ICMPv6
01900                  * errors.
01901                  */
01902 #ifdef EPROTO
01903                 SOFT_OR_HARD(EPROTO, ISC_R_HOSTUNREACH);
01904 #endif
01905                 SOFT_OR_HARD(EINVAL, ISC_R_HOSTUNREACH);
01906 
01907 #undef SOFT_OR_HARD
01908 #undef ALWAYS_HARD
01909 
01910                 dev->result = isc__errno2result(recv_errno);
01911                 inc_stats(sock->manager->stats,
01912                           sock->statsindex[STATID_RECVFAIL]);
01913                 return (DOIO_HARD);
01914         }
01915 
01916         /*
01917          * On TCP and UNIX sockets, zero length reads indicate EOF,
01918          * while on UDP sockets, zero length reads are perfectly valid,
01919          * although strange.
01920          */
01921         switch (sock->type) {
01922         case isc_sockettype_tcp:
01923         case isc_sockettype_unix:
01924                 if (cc == 0)
01925                         return (DOIO_EOF);
01926                 break;
01927         case isc_sockettype_udp:
01928         case isc_sockettype_raw:
01929                 break;
01930         case isc_sockettype_fdwatch:
01931         default:
01932                 INSIST(0);
01933         }
01934 
01935         if (sock->type == isc_sockettype_udp) {
01936                 dev->address.length = msghdr.msg_namelen;
01937                 if (isc_sockaddr_getport(&dev->address) == 0) {
01938                         if (isc_log_wouldlog(isc_lctx, IOEVENT_LEVEL)) {
01939                                 socket_log(sock, &dev->address, IOEVENT,
01940                                            isc_msgcat, ISC_MSGSET_SOCKET,
01941                                            ISC_MSG_ZEROPORT,
01942                                            "dropping source port zero packet");
01943                         }
01944                         return (DOIO_SOFT);
01945                 }
01946                 /*
01947                  * Simulate a firewall blocking UDP responses bigger than
01948                  * 'maxudp' bytes.
01949                  */
01950                 if (sock->manager->maxudp != 0 && cc > sock->manager->maxudp)
01951                         return (DOIO_SOFT);
01952         }
01953 
01954         socket_log(sock, &dev->address, IOEVENT,
01955                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_PKTRECV,
01956                    "packet received correctly");
01957 
01958         /*
01959          * Overflow bit detection.  If we received MORE bytes than we should,
01960          * this indicates an overflow situation.  Set the flag in the
01961          * dev entry and adjust how much we read by one.
01962          */
01963 #ifdef ISC_NET_RECVOVERFLOW
01964         if ((sock->type == isc_sockettype_udp) && ((size_t)cc > read_count)) {
01965                 dev->attributes |= ISC_SOCKEVENTATTR_TRUNC;
01966                 cc--;
01967         }
01968 #endif
01969 
01970         /*
01971          * If there are control messages attached, run through them and pull
01972          * out the interesting bits.
01973          */
01974         process_cmsg(sock, &msghdr, dev);
01975 
01976         /*
01977          * update the buffers (if any) and the i/o count
01978          */
01979         dev->n += cc;
01980         actual_count = cc;
01981         buffer = ISC_LIST_HEAD(dev->bufferlist);
01982         while (buffer != NULL && actual_count > 0U) {
01983                 REQUIRE(ISC_BUFFER_VALID(buffer));
01984                 if (isc_buffer_availablelength(buffer) <= actual_count) {
01985                         actual_count -= isc_buffer_availablelength(buffer);
01986                         isc_buffer_add(buffer,
01987                                        isc_buffer_availablelength(buffer));
01988                 } else {
01989                         isc_buffer_add(buffer, actual_count);
01990                         actual_count = 0;
01991                         POST(actual_count);
01992                         break;
01993                 }
01994                 buffer = ISC_LIST_NEXT(buffer, link);
01995                 if (buffer == NULL) {
01996                         INSIST(actual_count == 0U);
01997                 }
01998         }
01999 
02000         /*
02001          * If we read less than we expected, update counters,
02002          * and let the upper layer poke the descriptor.
02003          */
02004         if (((size_t)cc != read_count) && (dev->n < dev->minimum))
02005                 return (DOIO_SOFT);
02006 
02007         /*
02008          * Full reads are posted, or partials if partials are ok.
02009          */
02010         dev->result = ISC_R_SUCCESS;
02011         return (DOIO_SUCCESS);
02012 }
02013 
02014 /*
02015  * Returns:
02016  *      DOIO_SUCCESS    The operation succeeded.  dev->result contains
02017  *                      ISC_R_SUCCESS.
02018  *
02019  *      DOIO_HARD       A hard or unexpected I/O error was encountered.
02020  *                      dev->result contains the appropriate error.
02021  *
02022  *      DOIO_SOFT       A soft I/O error was encountered.  No senddone
02023  *                      event was sent.  The operation should be retried.
02024  *
02025  *      No other return values are possible.
02026  */
02027 static int
02028 doio_send(isc__socket_t *sock, isc_socketevent_t *dev) {
02029         int cc;
02030         struct iovec iov[MAXSCATTERGATHER_SEND];
02031         size_t write_count;
02032         struct msghdr msghdr;
02033         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
02034         int attempts = 0;
02035         int send_errno;
02036         char strbuf[ISC_STRERRORSIZE];
02037 
02038         build_msghdr_send(sock, dev, &msghdr, iov, &write_count);
02039 
02040  resend:
02041         if (sock->type == isc_sockettype_udp &&
02042             sock->manager->maxudp != 0 &&
02043             write_count > (size_t)sock->manager->maxudp)
02044                 cc = write_count;
02045         else
02046                 cc = sendmsg(sock->fd, &msghdr, 0);
02047         send_errno = errno;
02048 
02049         /*
02050          * Check for error or block condition.
02051          */
02052         if (cc < 0) {
02053                 if (send_errno == EINTR && ++attempts < NRETRIES)
02054                         goto resend;
02055 
02056                 if (SOFT_ERROR(send_errno))
02057                         return (DOIO_SOFT);
02058 
02059 #define SOFT_OR_HARD(_system, _isc) \
02060         if (send_errno == _system) { \
02061                 if (sock->connected) { \
02062                         dev->result = _isc; \
02063                         inc_stats(sock->manager->stats, \
02064                                   sock->statsindex[STATID_SENDFAIL]); \
02065                         return (DOIO_HARD); \
02066                 } \
02067                 return (DOIO_SOFT); \
02068         }
02069 #define ALWAYS_HARD(_system, _isc) \
02070         if (send_errno == _system) { \
02071                 dev->result = _isc; \
02072                 inc_stats(sock->manager->stats, \
02073                           sock->statsindex[STATID_SENDFAIL]); \
02074                 return (DOIO_HARD); \
02075         }
02076 
02077                 SOFT_OR_HARD(ECONNREFUSED, ISC_R_CONNREFUSED);
02078                 ALWAYS_HARD(EACCES, ISC_R_NOPERM);
02079                 ALWAYS_HARD(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
02080                 ALWAYS_HARD(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
02081                 ALWAYS_HARD(EHOSTUNREACH, ISC_R_HOSTUNREACH);
02082 #ifdef EHOSTDOWN
02083                 ALWAYS_HARD(EHOSTDOWN, ISC_R_HOSTUNREACH);
02084 #endif
02085                 ALWAYS_HARD(ENETUNREACH, ISC_R_NETUNREACH);
02086                 ALWAYS_HARD(ENOBUFS, ISC_R_NORESOURCES);
02087                 ALWAYS_HARD(EPERM, ISC_R_HOSTUNREACH);
02088                 ALWAYS_HARD(EPIPE, ISC_R_NOTCONNECTED);
02089                 ALWAYS_HARD(ECONNRESET, ISC_R_CONNECTIONRESET);
02090 
02091 #undef SOFT_OR_HARD
02092 #undef ALWAYS_HARD
02093 
02094                 /*
02095                  * The other error types depend on whether or not the
02096                  * socket is UDP or TCP.  If it is UDP, some errors
02097                  * that we expect to be fatal under TCP are merely
02098                  * annoying, and are really soft errors.
02099                  *
02100                  * However, these soft errors are still returned as
02101                  * a status.
02102                  */
02103                 isc_sockaddr_format(&dev->address, addrbuf, sizeof(addrbuf));
02104                 isc__strerror(send_errno, strbuf, sizeof(strbuf));
02105                 UNEXPECTED_ERROR(__FILE__, __LINE__, "internal_send: %s: %s",
02106                                  addrbuf, strbuf);
02107                 dev->result = isc__errno2result(send_errno);
02108                 inc_stats(sock->manager->stats,
02109                           sock->statsindex[STATID_SENDFAIL]);
02110                 return (DOIO_HARD);
02111         }
02112 
02113         if (cc == 0) {
02114                 inc_stats(sock->manager->stats,
02115                           sock->statsindex[STATID_SENDFAIL]);
02116                 UNEXPECTED_ERROR(__FILE__, __LINE__,
02117                                  "doio_send: send() %s 0",
02118                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
02119                                                 ISC_MSG_RETURNED, "returned"));
02120         }
02121 
02122         /*
02123          * If we write less than we expected, update counters, poke.
02124          */
02125         dev->n += cc;
02126         if ((size_t)cc != write_count)
02127                 return (DOIO_SOFT);
02128 
02129         /*
02130          * Exactly what we wanted to write.  We're done with this
02131          * entry.  Post its completion event.
02132          */
02133         dev->result = ISC_R_SUCCESS;
02134         return (DOIO_SUCCESS);
02135 }
02136 
02137 /*
02138  * Kill.
02139  *
02140  * Caller must ensure that the socket is not locked and no external
02141  * references exist.
02142  */
02143 static void
02144 socketclose(isc__socketmgr_t *manager, isc__socket_t *sock, int fd) {
02145         isc_sockettype_t type = sock->type;
02146         int lockid = FDLOCK_ID(fd);
02147 
02148         /*
02149          * No one has this socket open, so the watcher doesn't have to be
02150          * poked, and the socket doesn't have to be locked.
02151          */
02152         LOCK(&manager->fdlock[lockid]);
02153         manager->fds[fd] = NULL;
02154         if (type == isc_sockettype_fdwatch)
02155                 manager->fdstate[fd] = CLOSED;
02156         else
02157                 manager->fdstate[fd] = CLOSE_PENDING;
02158         UNLOCK(&manager->fdlock[lockid]);
02159         if (type == isc_sockettype_fdwatch) {
02160                 /*
02161                  * The caller may close the socket once this function returns,
02162                  * and `fd' may be reassigned for a new socket.  So we do
02163                  * unwatch_fd() here, rather than defer it via select_poke().
02164                  * Note: this may complicate data protection among threads and
02165                  * may reduce performance due to additional locks.  One way to
02166                  * solve this would be to dup() the watched descriptor, but we
02167                  * take a simpler approach at this moment.
02168                  */
02169                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
02170                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
02171         } else
02172                 select_poke(manager, fd, SELECT_POKE_CLOSE);
02173 
02174         inc_stats(manager->stats, sock->statsindex[STATID_CLOSE]);
02175         if (sock->active == 1) {
02176                 dec_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
02177                 sock->active = 0;
02178         }
02179 
02180         /*
02181          * update manager->maxfd here (XXX: this should be implemented more
02182          * efficiently)
02183          */
02184 #ifdef USE_SELECT
02185         LOCK(&manager->lock);
02186         if (manager->maxfd == fd) {
02187                 int i;
02188 
02189                 manager->maxfd = 0;
02190                 for (i = fd - 1; i >= 0; i--) {
02191                         lockid = FDLOCK_ID(i);
02192 
02193                         LOCK(&manager->fdlock[lockid]);
02194                         if (manager->fdstate[i] == MANAGED) {
02195                                 manager->maxfd = i;
02196                                 UNLOCK(&manager->fdlock[lockid]);
02197                                 break;
02198                         }
02199                         UNLOCK(&manager->fdlock[lockid]);
02200                 }
02201 #ifdef ISC_PLATFORM_USETHREADS
02202                 if (manager->maxfd < manager->pipe_fds[0])
02203                         manager->maxfd = manager->pipe_fds[0];
02204 #endif
02205         }
02206 
02207         UNLOCK(&manager->lock);
02208 #endif  /* USE_SELECT */
02209 }
02210 
02211 static void
02212 destroy(isc__socket_t **sockp) {
02213         int fd;
02214         isc__socket_t *sock = *sockp;
02215         isc__socketmgr_t *manager = sock->manager;
02216 
02217         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
02218                    ISC_MSG_DESTROYING, "destroying");
02219 
02220         INSIST(ISC_LIST_EMPTY(sock->connect_list));
02221         INSIST(ISC_LIST_EMPTY(sock->accept_list));
02222         INSIST(ISC_LIST_EMPTY(sock->recv_list));
02223         INSIST(ISC_LIST_EMPTY(sock->send_list));
02224         REQUIRE(sock->fd == -1 || sock->fd < (int)manager->maxsocks);
02225 
02226         if (sock->fd >= 0) {
02227                 fd = sock->fd;
02228                 sock->fd = -1;
02229                 socketclose(manager, sock, fd);
02230         }
02231 
02232         LOCK(&manager->lock);
02233 
02234         ISC_LIST_UNLINK(manager->socklist, sock, link);
02235 
02236 #ifdef USE_WATCHER_THREAD
02237         if (ISC_LIST_EMPTY(manager->socklist))
02238                 SIGNAL(&manager->shutdown_ok);
02239 #endif /* USE_WATCHER_THREAD */
02240 
02241         /* can't unlock manager as its memory context is still used */
02242         free_socket(sockp);
02243 
02244         UNLOCK(&manager->lock);
02245 }
02246 
02247 static isc_result_t
02248 allocate_socket(isc__socketmgr_t *manager, isc_sockettype_t type,
02249                 isc__socket_t **socketp)
02250 {
02251         isc__socket_t *sock;
02252         isc_result_t result;
02253         ISC_SOCKADDR_LEN_T cmsgbuflen;
02254 
02255         sock = isc_mem_get(manager->mctx, sizeof(*sock));
02256 
02257         if (sock == NULL)
02258                 return (ISC_R_NOMEMORY);
02259 
02260         sock->common.magic = 0;
02261         sock->common.impmagic = 0;
02262         sock->references = 0;
02263 
02264         sock->manager = manager;
02265         sock->type = type;
02266         sock->fd = -1;
02267         sock->dscp = 0;         /* TOS/TCLASS is zero until set. */
02268         sock->dupped = 0;
02269         sock->statsindex = NULL;
02270         sock->active = 0;
02271 
02272         ISC_LINK_INIT(sock, link);
02273 
02274         sock->recvcmsgbuf = NULL;
02275         sock->sendcmsgbuf = NULL;
02276 
02277         /*
02278          * Set up cmsg buffers.
02279          */
02280         cmsgbuflen = 0;
02281 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
02282         cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
02283 #endif
02284 #if defined(USE_CMSG) && defined(SO_TIMESTAMP)
02285         cmsgbuflen += cmsg_space(sizeof(struct timeval));
02286 #endif
02287 #if defined(USE_CMSG) && (defined(IPV6_TCLASS) || defined(IP_TOS))
02288         cmsgbuflen += cmsg_space(sizeof(int));
02289 #endif
02290         sock->recvcmsgbuflen = cmsgbuflen;
02291         if (sock->recvcmsgbuflen != 0U) {
02292                 sock->recvcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
02293                 if (sock->recvcmsgbuf == NULL) {
02294                         result = ISC_R_NOMEMORY;
02295                         goto error;
02296                 }
02297         }
02298 
02299         cmsgbuflen = 0;
02300 #if defined(USE_CMSG) && defined(ISC_PLATFORM_HAVEIN6PKTINFO)
02301         cmsgbuflen += cmsg_space(sizeof(struct in6_pktinfo));
02302 #if defined(IPV6_USE_MIN_MTU)
02303         /*
02304          * Provide space for working around FreeBSD's broken IPV6_USE_MIN_MTU
02305          * support.
02306          */
02307         cmsgbuflen += cmsg_space(sizeof(int));
02308 #endif
02309 #endif
02310 #if defined(USE_CMSG) && (defined(IP_TOS) || defined(IPV6_TCLASS))
02311         cmsgbuflen += cmsg_space(sizeof(int));
02312 #endif
02313         sock->sendcmsgbuflen = cmsgbuflen;
02314         if (sock->sendcmsgbuflen != 0U) {
02315                 sock->sendcmsgbuf = isc_mem_get(manager->mctx, cmsgbuflen);
02316                 if (sock->sendcmsgbuf == NULL) {
02317                         result = ISC_R_NOMEMORY;
02318                         goto error;
02319                 }
02320         }
02321 
02322         memset(sock->name, 0, sizeof(sock->name));
02323         sock->tag = NULL;
02324 
02325         /*
02326          * Set up list of readers and writers to be initially empty.
02327          */
02328         ISC_LIST_INIT(sock->recv_list);
02329         ISC_LIST_INIT(sock->send_list);
02330         ISC_LIST_INIT(sock->accept_list);
02331         ISC_LIST_INIT(sock->connect_list);
02332         sock->pending_recv = 0;
02333         sock->pending_send = 0;
02334         sock->pending_accept = 0;
02335         sock->listener = 0;
02336         sock->connected = 0;
02337         sock->connecting = 0;
02338         sock->bound = 0;
02339         sock->pktdscp = 0;
02340 
02341         /*
02342          * Initialize the lock.
02343          */
02344         result = isc_mutex_init(&sock->lock);
02345         if (result != ISC_R_SUCCESS) {
02346                 sock->common.magic = 0;
02347                 sock->common.impmagic = 0;
02348                 goto error;
02349         }
02350 
02351         /*
02352          * Initialize readable and writable events.
02353          */
02354         ISC_EVENT_INIT(&sock->readable_ev, sizeof(intev_t),
02355                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTR,
02356                        NULL, sock, sock, NULL, NULL);
02357         ISC_EVENT_INIT(&sock->writable_ev, sizeof(intev_t),
02358                        ISC_EVENTATTR_NOPURGE, NULL, ISC_SOCKEVENT_INTW,
02359                        NULL, sock, sock, NULL, NULL);
02360 
02361         sock->common.magic = ISCAPI_SOCKET_MAGIC;
02362         sock->common.impmagic = SOCKET_MAGIC;
02363         *socketp = sock;
02364 
02365         return (ISC_R_SUCCESS);
02366 
02367  error:
02368         if (sock->recvcmsgbuf != NULL)
02369                 isc_mem_put(manager->mctx, sock->recvcmsgbuf,
02370                             sock->recvcmsgbuflen);
02371         if (sock->sendcmsgbuf != NULL)
02372                 isc_mem_put(manager->mctx, sock->sendcmsgbuf,
02373                             sock->sendcmsgbuflen);
02374         isc_mem_put(manager->mctx, sock, sizeof(*sock));
02375 
02376         return (result);
02377 }
02378 
02379 /*
02380  * This event requires that the various lists be empty, that the reference
02381  * count be 1, and that the magic number is valid.  The other socket bits,
02382  * like the lock, must be initialized as well.  The fd associated must be
02383  * marked as closed, by setting it to -1 on close, or this routine will
02384  * also close the socket.
02385  */
02386 static void
02387 free_socket(isc__socket_t **socketp) {
02388         isc__socket_t *sock = *socketp;
02389 
02390         INSIST(sock->references == 0);
02391         INSIST(VALID_SOCKET(sock));
02392         INSIST(!sock->connecting);
02393         INSIST(!sock->pending_recv);
02394         INSIST(!sock->pending_send);
02395         INSIST(!sock->pending_accept);
02396         INSIST(ISC_LIST_EMPTY(sock->recv_list));
02397         INSIST(ISC_LIST_EMPTY(sock->send_list));
02398         INSIST(ISC_LIST_EMPTY(sock->accept_list));
02399         INSIST(ISC_LIST_EMPTY(sock->connect_list));
02400         INSIST(!ISC_LINK_LINKED(sock, link));
02401 
02402         if (sock->recvcmsgbuf != NULL)
02403                 isc_mem_put(sock->manager->mctx, sock->recvcmsgbuf,
02404                             sock->recvcmsgbuflen);
02405         if (sock->sendcmsgbuf != NULL)
02406                 isc_mem_put(sock->manager->mctx, sock->sendcmsgbuf,
02407                             sock->sendcmsgbuflen);
02408 
02409         sock->common.magic = 0;
02410         sock->common.impmagic = 0;
02411 
02412         DESTROYLOCK(&sock->lock);
02413 
02414         isc_mem_put(sock->manager->mctx, sock, sizeof(*sock));
02415 
02416         *socketp = NULL;
02417 }
02418 
02419 #ifdef SO_RCVBUF
02420 static isc_once_t       rcvbuf_once = ISC_ONCE_INIT;
02421 static int              rcvbuf = RCVBUFSIZE;
02422 
02423 static void
02424 set_rcvbuf(void) {
02425         int fd;
02426         int max = rcvbuf, min;
02427         ISC_SOCKADDR_LEN_T len;
02428 
02429         fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
02430 #if defined(ISC_PLATFORM_HAVEIPV6)
02431         if (fd == -1) {
02432                 switch (errno) {
02433                 case EPROTONOSUPPORT:
02434                 case EPFNOSUPPORT:
02435                 case EAFNOSUPPORT:
02436                 /*
02437                  * Linux 2.2 (and maybe others) return EINVAL instead of
02438                  * EAFNOSUPPORT.
02439                  */
02440                 case EINVAL:
02441                         fd = socket(AF_INET6, SOCK_DGRAM, IPPROTO_UDP);
02442                         break;
02443                 }
02444         }
02445 #endif
02446         if (fd == -1)
02447                 return;
02448 
02449         len = sizeof(min);
02450         if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&min, &len) == 0 &&
02451             min < rcvbuf) {
02452  again:
02453                 if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *)&rcvbuf,
02454                                sizeof(rcvbuf)) == -1) {
02455                         if (errno == ENOBUFS && rcvbuf > min) {
02456                                 max = rcvbuf - 1;
02457                                 rcvbuf = (rcvbuf + min) / 2;
02458                                 goto again;
02459                         } else {
02460                                 rcvbuf = min;
02461                                 goto cleanup;
02462                         }
02463                 } else
02464                         min = rcvbuf;
02465                 if (min != max) {
02466                         rcvbuf = max;
02467                         goto again;
02468                 }
02469         }
02470  cleanup:
02471         close (fd);
02472 }
02473 #endif
02474 
#ifdef SO_BSDCOMPAT
/*
 * This really should not be necessary.  We have to work out which kernel
 * version we are running on at run time so that we don't cause the kernel
 * to issue a warning about our use of a deprecated socket option.
 * Such warnings should *never* be on by default in production kernels.
 *
 * We can't do this at build time because executables are moved between
 * machines and hence kernels.
 *
 * We can't just not set SO_BSDCOMPAT because some kernels require it.
 */

static isc_once_t         bsdcompat_once = ISC_ONCE_INIT;
isc_boolean_t bsdcompat = ISC_TRUE;

/*
 * Decide whether SO_BSDCOMPAT should still be set.  On Linux the kernel
 * release string is parsed as "<major>.<minor>..." and 'bsdcompat' is
 * switched off for release 2.4 or newer.  On every other platform this is
 * a no-op and 'bsdcompat' keeps its initial value.
 */
static void
clear_bsdcompat(void) {
#ifdef __linux__
         struct utsname un;
         char *rest;
         long int kmajor;
         long int kminor;

         uname(&un);    /* Can only fail if the buffer is bad in Linux. */

         /* Paranoia in parsing can be increased, but we trust uname(). */
         kmajor = strtol(un.release, &rest, 10);
         if (*rest != '.')
                return;

         kminor = strtol(rest + 1, &rest, 10);
         if (kmajor > 2 || (kmajor == 2 && kminor >= 4))
                bsdcompat = ISC_FALSE;
#endif /* __linux__ */
}
#endif
02512 
02513 static void
02514 use_min_mtu(isc__socket_t *sock) {
02515 #if !defined(IPV6_USE_MIN_MTU) && !defined(IPV6_MTU)
02516         UNUSED(sock);
02517 #endif
02518 #ifdef IPV6_USE_MIN_MTU
02519         /* use minimum MTU */
02520         if (sock->pf == AF_INET6) {
02521                 int on = 1;
02522                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
02523                                 (void *)&on, sizeof(on));
02524         }
02525 #endif
02526 #if defined(IPV6_MTU)
02527         /*
02528          * Use minimum MTU on IPv6 sockets.
02529          */
02530         if (sock->pf == AF_INET6) {
02531                 int mtu = 1280;
02532                 (void)setsockopt(sock->fd, IPPROTO_IPV6, IPV6_MTU,
02533                                  &mtu, sizeof(mtu));
02534         }
02535 #endif
02536 }
02537 
02538 static isc_result_t
02539 opensocket(isc__socketmgr_t *manager, isc__socket_t *sock,
02540            isc__socket_t *dup_socket)
02541 {
02542         isc_result_t result;
02543         char strbuf[ISC_STRERRORSIZE];
02544         const char *err = "socket";
02545         int tries = 0;
02546 #if defined(USE_CMSG) || defined(SO_BSDCOMPAT) || defined(SO_NOSIGPIPE)
02547         int on = 1;
02548 #endif
02549 #if defined(SO_RCVBUF)
02550         ISC_SOCKADDR_LEN_T optlen;
02551         int size = 0;
02552 #endif
02553 
02554  again:
02555         if (dup_socket == NULL) {
02556                 switch (sock->type) {
02557                 case isc_sockettype_udp:
02558                         sock->fd = socket(sock->pf, SOCK_DGRAM, IPPROTO_UDP);
02559                         break;
02560                 case isc_sockettype_tcp:
02561                         sock->fd = socket(sock->pf, SOCK_STREAM, IPPROTO_TCP);
02562                         break;
02563                 case isc_sockettype_unix:
02564                         sock->fd = socket(sock->pf, SOCK_STREAM, 0);
02565                         break;
02566                 case isc_sockettype_raw:
02567                         errno = EPFNOSUPPORT;
02568                         /*
02569                          * PF_ROUTE is a alias for PF_NETLINK on linux.
02570                          */
02571 #if defined(PF_ROUTE)
02572                         if (sock->fd == -1 && sock->pf == PF_ROUTE) {
02573 #ifdef NETLINK_ROUTE
02574                                 sock->fd = socket(sock->pf, SOCK_RAW,
02575                                                   NETLINK_ROUTE);
02576 #else
02577                                 sock->fd = socket(sock->pf, SOCK_RAW, 0);
02578 #endif
02579                                 if (sock->fd != -1) {
02580 #ifdef NETLINK_ROUTE
02581                                         struct sockaddr_nl sa;
02582                                         int n;
02583 
02584                                         /*
02585                                          * Do an implicit bind.
02586                                          */
02587                                         memset(&sa, 0, sizeof(sa));
02588                                         sa.nl_family = AF_NETLINK;
02589                                         sa.nl_groups = RTMGRP_IPV4_IFADDR |
02590                                                        RTMGRP_IPV6_IFADDR;
02591                                         n = bind(sock->fd,
02592                                                  (struct sockaddr *) &sa,
02593                                                  sizeof(sa));
02594                                         if (n < 0) {
02595                                                 close(sock->fd);
02596                                                 sock->fd = -1;
02597                                         }
02598 #endif
02599                                         sock->bound = 1;
02600                                 }
02601                         }
02602 #endif
02603                         break;
02604                 case isc_sockettype_fdwatch:
02605                         /*
02606                          * We should not be called for isc_sockettype_fdwatch
02607                          * sockets.
02608                          */
02609                         INSIST(0);
02610                         break;
02611                 }
02612         } else {
02613                 sock->fd = dup(dup_socket->fd);
02614                 sock->dupped = 1;
02615                 sock->bound = dup_socket->bound;
02616         }
02617         if (sock->fd == -1 && errno == EINTR && tries++ < 42)
02618                 goto again;
02619 
02620 #ifdef F_DUPFD
02621         /*
02622          * Leave a space for stdio and TCP to work in.
02623          */
02624         if (manager->reserved != 0 && sock->type == isc_sockettype_udp &&
02625             sock->fd >= 0 && sock->fd < manager->reserved) {
02626                 int new, tmp;
02627                 new = fcntl(sock->fd, F_DUPFD, manager->reserved);
02628                 tmp = errno;
02629                 (void)close(sock->fd);
02630                 errno = tmp;
02631                 sock->fd = new;
02632                 err = "isc_socket_create: fcntl/reserved";
02633         } else if (sock->fd >= 0 && sock->fd < 20) {
02634                 int new, tmp;
02635                 new = fcntl(sock->fd, F_DUPFD, 20);
02636                 tmp = errno;
02637                 (void)close(sock->fd);
02638                 errno = tmp;
02639                 sock->fd = new;
02640                 err = "isc_socket_create: fcntl";
02641         }
02642 #endif
02643 
02644         if (sock->fd >= (int)manager->maxsocks) {
02645                 (void)close(sock->fd);
02646                 isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
02647                                ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
02648                                isc_msgcat, ISC_MSGSET_SOCKET,
02649                                ISC_MSG_TOOMANYFDS,
02650                                "socket: file descriptor exceeds limit (%d/%u)",
02651                                sock->fd, manager->maxsocks);
02652                 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
02653                 return (ISC_R_NORESOURCES);
02654         }
02655 
02656         if (sock->fd < 0) {
02657                 switch (errno) {
02658                 case EMFILE:
02659                 case ENFILE:
02660                         isc__strerror(errno, strbuf, sizeof(strbuf));
02661                         isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
02662                                        ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
02663                                        isc_msgcat, ISC_MSGSET_SOCKET,
02664                                        ISC_MSG_TOOMANYFDS,
02665                                        "%s: %s", err, strbuf);
02666                         /* fallthrough */
02667                 case ENOBUFS:
02668                         inc_stats(manager->stats,
02669                                   sock->statsindex[STATID_OPENFAIL]);
02670                         return (ISC_R_NORESOURCES);
02671 
02672                 case EPROTONOSUPPORT:
02673                 case EPFNOSUPPORT:
02674                 case EAFNOSUPPORT:
02675                 /*
02676                  * Linux 2.2 (and maybe others) return EINVAL instead of
02677                  * EAFNOSUPPORT.
02678                  */
02679                 case EINVAL:
02680                         inc_stats(manager->stats,
02681                                   sock->statsindex[STATID_OPENFAIL]);
02682                         return (ISC_R_FAMILYNOSUPPORT);
02683 
02684                 default:
02685                         isc__strerror(errno, strbuf, sizeof(strbuf));
02686                         UNEXPECTED_ERROR(__FILE__, __LINE__,
02687                                          "%s() %s: %s", err,
02688                                          isc_msgcat_get(isc_msgcat,
02689                                                         ISC_MSGSET_GENERAL,
02690                                                         ISC_MSG_FAILED,
02691                                                         "failed"),
02692                                          strbuf);
02693                         inc_stats(manager->stats,
02694                                   sock->statsindex[STATID_OPENFAIL]);
02695                         return (ISC_R_UNEXPECTED);
02696                 }
02697         }
02698 
02699         if (dup_socket != NULL)
02700                 goto setup_done;
02701 
02702         result = make_nonblock(sock->fd);
02703         if (result != ISC_R_SUCCESS) {
02704                 (void)close(sock->fd);
02705                 inc_stats(manager->stats, sock->statsindex[STATID_OPENFAIL]);
02706                 return (result);
02707         }
02708 
02709 #ifdef SO_BSDCOMPAT
02710         RUNTIME_CHECK(isc_once_do(&bsdcompat_once,
02711                                   clear_bsdcompat) == ISC_R_SUCCESS);
02712         if (sock->type != isc_sockettype_unix && bsdcompat &&
02713             setsockopt(sock->fd, SOL_SOCKET, SO_BSDCOMPAT,
02714                        (void *)&on, sizeof(on)) < 0) {
02715                 isc__strerror(errno, strbuf, sizeof(strbuf));
02716                 UNEXPECTED_ERROR(__FILE__, __LINE__,
02717                                  "setsockopt(%d, SO_BSDCOMPAT) %s: %s",
02718                                  sock->fd,
02719                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
02720                                                 ISC_MSG_FAILED, "failed"),
02721                                  strbuf);
02722                 /* Press on... */
02723         }
02724 #endif
02725 
02726 #ifdef SO_NOSIGPIPE
02727         if (setsockopt(sock->fd, SOL_SOCKET, SO_NOSIGPIPE,
02728                        (void *)&on, sizeof(on)) < 0) {
02729                 isc__strerror(errno, strbuf, sizeof(strbuf));
02730                 UNEXPECTED_ERROR(__FILE__, __LINE__,
02731                                  "setsockopt(%d, SO_NOSIGPIPE) %s: %s",
02732                                  sock->fd,
02733                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
02734                                                 ISC_MSG_FAILED, "failed"),
02735                                  strbuf);
02736                 /* Press on... */
02737         }
02738 #endif
02739 
02740         /*
02741          * Use minimum mtu if possible.
02742          */
02743         use_min_mtu(sock);
02744 
02745 #if defined(USE_CMSG) || defined(SO_RCVBUF)
02746         if (sock->type == isc_sockettype_udp) {
02747 
02748 #if defined(USE_CMSG)
02749 #if defined(SO_TIMESTAMP)
02750                 if (setsockopt(sock->fd, SOL_SOCKET, SO_TIMESTAMP,
02751                                (void *)&on, sizeof(on)) < 0
02752                     && errno != ENOPROTOOPT) {
02753                         isc__strerror(errno, strbuf, sizeof(strbuf));
02754                         UNEXPECTED_ERROR(__FILE__, __LINE__,
02755                                          "setsockopt(%d, SO_TIMESTAMP) %s: %s",
02756                                          sock->fd,
02757                                          isc_msgcat_get(isc_msgcat,
02758                                                         ISC_MSGSET_GENERAL,
02759                                                         ISC_MSG_FAILED,
02760                                                         "failed"),
02761                                          strbuf);
02762                         /* Press on... */
02763                 }
02764 #endif /* SO_TIMESTAMP */
02765 
02766 #if defined(ISC_PLATFORM_HAVEIPV6)
02767                 if (sock->pf == AF_INET6 && sock->recvcmsgbuflen == 0U) {
02768                         /*
02769                          * Warn explicitly because this anomaly can be hidden
02770                          * in usual operation (and unexpectedly appear later).
02771                          */
02772                         UNEXPECTED_ERROR(__FILE__, __LINE__,
02773                                          "No buffer available to receive "
02774                                          "IPv6 destination");
02775                 }
02776 #ifdef ISC_PLATFORM_HAVEIN6PKTINFO
02777 #ifdef IPV6_RECVPKTINFO
02778                 /* RFC 3542 */
02779                 if ((sock->pf == AF_INET6)
02780                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVPKTINFO,
02781                                    (void *)&on, sizeof(on)) < 0)) {
02782                         isc__strerror(errno, strbuf, sizeof(strbuf));
02783                         UNEXPECTED_ERROR(__FILE__, __LINE__,
02784                                          "setsockopt(%d, IPV6_RECVPKTINFO) "
02785                                          "%s: %s", sock->fd,
02786                                          isc_msgcat_get(isc_msgcat,
02787                                                         ISC_MSGSET_GENERAL,
02788                                                         ISC_MSG_FAILED,
02789                                                         "failed"),
02790                                          strbuf);
02791                 }
02792 #else
02793                 /* RFC 2292 */
02794                 if ((sock->pf == AF_INET6)
02795                     && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_PKTINFO,
02796                                    (void *)&on, sizeof(on)) < 0)) {
02797                         isc__strerror(errno, strbuf, sizeof(strbuf));
02798                         UNEXPECTED_ERROR(__FILE__, __LINE__,
02799                                          "setsockopt(%d, IPV6_PKTINFO) %s: %s",
02800                                          sock->fd,
02801                                          isc_msgcat_get(isc_msgcat,
02802                                                         ISC_MSGSET_GENERAL,
02803                                                         ISC_MSG_FAILED,
02804                                                         "failed"),
02805                                          strbuf);
02806                 }
02807 #endif /* IPV6_RECVPKTINFO */
02808 #endif /* ISC_PLATFORM_HAVEIN6PKTINFO */
02809 #if defined(IPV6_MTU_DISCOVER) && defined(IPV6_PMTUDISC_DONT)
02810                 /*
02811                  * Turn off Path MTU discovery on IPv6/UDP sockets.
02812                  */
02813                 if (sock->pf == AF_INET6) {
02814                         int action = IPV6_PMTUDISC_DONT;
02815                         (void)setsockopt(sock->fd, IPPROTO_IPV6,
02816                                          IPV6_MTU_DISCOVER, &action,
02817                                          sizeof(action));
02818                 }
02819 #endif
02820 #endif /* ISC_PLATFORM_HAVEIPV6 */
02821 #endif /* defined(USE_CMSG) */
02822 
02823 #if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
02824                 /*
02825                  * Turn off Path MTU discovery on IPv4/UDP sockets.
02826                  */
02827                 if (sock->pf == AF_INET) {
02828                         int action = IP_PMTUDISC_DONT;
02829                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_MTU_DISCOVER,
02830                                          &action, sizeof(action));
02831                 }
02832 #endif
02833 #if defined(IP_DONTFRAG)
02834                 /*
02835                  * Turn off Path MTU discovery on IPv4/UDP sockets.
02836                  */
02837                 if (sock->pf == AF_INET) {
02838                         int off = 0;
02839                         (void)setsockopt(sock->fd, IPPROTO_IP, IP_DONTFRAG,
02840                                          &off, sizeof(off));
02841                 }
02842 #endif
02843 
02844 #if defined(SO_RCVBUF)
02845                 optlen = sizeof(size);
02846                 if (getsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
02847                                (void *)&size, &optlen) == 0 && size < rcvbuf) {
02848                         RUNTIME_CHECK(isc_once_do(&rcvbuf_once,
02849                                                   set_rcvbuf) == ISC_R_SUCCESS);
02850                         if (setsockopt(sock->fd, SOL_SOCKET, SO_RCVBUF,
02851                                (void *)&rcvbuf, sizeof(rcvbuf)) == -1) {
02852                                 isc__strerror(errno, strbuf, sizeof(strbuf));
02853                                 UNEXPECTED_ERROR(__FILE__, __LINE__,
02854                                         "setsockopt(%d, SO_RCVBUF, %d) %s: %s",
02855                                         sock->fd, rcvbuf,
02856                                         isc_msgcat_get(isc_msgcat,
02857                                                        ISC_MSGSET_GENERAL,
02858                                                        ISC_MSG_FAILED,
02859                                                        "failed"),
02860                                         strbuf);
02861                         }
02862                 }
02863 #endif
02864         }
02865 #ifdef IPV6_RECVTCLASS
02866         if ((sock->pf == AF_INET6)
02867             && (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_RECVTCLASS,
02868                            (void *)&on, sizeof(on)) < 0)) {
02869                 isc__strerror(errno, strbuf, sizeof(strbuf));
02870                 UNEXPECTED_ERROR(__FILE__, __LINE__,
02871                                  "setsockopt(%d, IPV6_RECVTCLASS) "
02872                                  "%s: %s", sock->fd,
02873                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
02874                                                 ISC_MSG_FAILED, "failed"),
02875                                  strbuf);
02876         }
02877 #endif
02878 #ifdef IP_RECVTOS
02879         if ((sock->pf == AF_INET)
02880             && (setsockopt(sock->fd, IPPROTO_IP, IP_RECVTOS,
02881                            (void *)&on, sizeof(on)) < 0)) {
02882                 isc__strerror(errno, strbuf, sizeof(strbuf));
02883                 UNEXPECTED_ERROR(__FILE__, __LINE__,
02884                                  "setsockopt(%d, IP_RECVTOS) "
02885                                  "%s: %s", sock->fd,
02886                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
02887                                                 ISC_MSG_FAILED, "failed"),
02888                                  strbuf);
02889         }
02890 #endif
02891 #endif /* defined(USE_CMSG) || defined(SO_RCVBUF) */
02892 
02893 setup_done:
02894         inc_stats(manager->stats, sock->statsindex[STATID_OPEN]);
02895         if (sock->active == 0) {
02896                 inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
02897                 sock->active = 1;
02898         }
02899 
02900         return (ISC_R_SUCCESS);
02901 }
02902 
02903 /*
02904  * Create a 'type' socket or duplicate an existing socket, managed
02905  * by 'manager'.  Events will be posted to 'task' and when dispatched
02906  * 'action' will be called with 'arg' as the arg value.  The new
02907  * socket is returned in 'socketp'.
02908  */
02909 static isc_result_t
02910 socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
02911               isc_socket_t **socketp, isc_socket_t *dup_socket)
02912 {
02913         isc__socket_t *sock = NULL;
02914         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
02915         isc_result_t result;
02916         int lockid;
02917 
02918         REQUIRE(VALID_MANAGER(manager));
02919         REQUIRE(socketp != NULL && *socketp == NULL);
02920         REQUIRE(type != isc_sockettype_fdwatch);
02921 
02922         result = allocate_socket(manager, type, &sock);
02923         if (result != ISC_R_SUCCESS)
02924                 return (result);
02925 
02926         switch (sock->type) {
02927         case isc_sockettype_udp:
02928                 sock->statsindex =
02929                         (pf == AF_INET) ? udp4statsindex : udp6statsindex;
02930 #define DCSPPKT(pf) ((pf == AF_INET) ? ISC_NET_DSCPPKTV4 : ISC_NET_DSCPPKTV6)
02931                 sock->pktdscp = (isc_net_probedscp() & DCSPPKT(pf)) != 0;
02932                 break;
02933         case isc_sockettype_tcp:
02934                 sock->statsindex =
02935                         (pf == AF_INET) ? tcp4statsindex : tcp6statsindex;
02936                 break;
02937         case isc_sockettype_unix:
02938                 sock->statsindex = unixstatsindex;
02939                 break;
02940         case isc_sockettype_raw:
02941                 sock->statsindex = rawstatsindex;
02942                 break;
02943         default:
02944                 INSIST(0);
02945         }
02946 
02947         sock->pf = pf;
02948 
02949         result = opensocket(manager, sock, (isc__socket_t *)dup_socket);
02950         if (result != ISC_R_SUCCESS) {
02951                 free_socket(&sock);
02952                 return (result);
02953         }
02954 
02955         sock->common.methods = (isc_socketmethods_t *)&socketmethods;
02956         sock->references = 1;
02957         *socketp = (isc_socket_t *)sock;
02958 
02959         /*
02960          * Note we don't have to lock the socket like we normally would because
02961          * there are no external references to it yet.
02962          */
02963 
02964         lockid = FDLOCK_ID(sock->fd);
02965         LOCK(&manager->fdlock[lockid]);
02966         manager->fds[sock->fd] = sock;
02967         manager->fdstate[sock->fd] = MANAGED;
02968 #ifdef USE_DEVPOLL
02969         INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
02970                sock->manager->fdpollinfo[sock->fd].want_write == 0);
02971 #endif
02972         UNLOCK(&manager->fdlock[lockid]);
02973 
02974         LOCK(&manager->lock);
02975         ISC_LIST_APPEND(manager->socklist, sock, link);
02976 #ifdef USE_SELECT
02977         if (manager->maxfd < sock->fd)
02978                 manager->maxfd = sock->fd;
02979 #endif
02980         UNLOCK(&manager->lock);
02981 
02982         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
02983                    ISC_MSG_CREATED, dup_socket != NULL ? "dupped" : "created");
02984 
02985         return (ISC_R_SUCCESS);
02986 }
02987 
02988 /*%
02989  * Create a new 'type' socket managed by 'manager'.  Events
02990  * will be posted to 'task' and when dispatched 'action' will be
02991  * called with 'arg' as the arg value.  The new socket is returned
02992  * in 'socketp'.
02993  */
02994 isc_result_t
02995 isc__socket_create(isc_socketmgr_t *manager0, int pf, isc_sockettype_t type,
02996                    isc_socket_t **socketp)
02997 {
02998         return (socket_create(manager0, pf, type, socketp, NULL));
02999 }
03000 
03001 /*%
03002  * Duplicate an existing socket.  The new socket is returned
03003  * in 'socketp'.
03004  */
03005 isc_result_t
03006 isc__socket_dup(isc_socket_t *sock0, isc_socket_t **socketp) {
03007         isc__socket_t *sock = (isc__socket_t *)sock0;
03008 
03009         REQUIRE(VALID_SOCKET(sock));
03010         REQUIRE(socketp != NULL && *socketp == NULL);
03011 
03012         return (socket_create((isc_socketmgr_t *) sock->manager,
03013                               sock->pf, sock->type, socketp,
03014                               sock0));
03015 }
03016 
03017 isc_result_t
03018 isc__socket_open(isc_socket_t *sock0) {
03019         isc_result_t result;
03020         isc__socket_t *sock = (isc__socket_t *)sock0;
03021 
03022         REQUIRE(VALID_SOCKET(sock));
03023 
03024         LOCK(&sock->lock);
03025         REQUIRE(sock->references == 1);
03026         REQUIRE(sock->type != isc_sockettype_fdwatch);
03027         UNLOCK(&sock->lock);
03028         /*
03029          * We don't need to retain the lock hereafter, since no one else has
03030          * this socket.
03031          */
03032         REQUIRE(sock->fd == -1);
03033 
03034         result = opensocket(sock->manager, sock, NULL);
03035         if (result != ISC_R_SUCCESS)
03036                 sock->fd = -1;
03037 
03038         if (result == ISC_R_SUCCESS) {
03039                 int lockid = FDLOCK_ID(sock->fd);
03040 
03041                 LOCK(&sock->manager->fdlock[lockid]);
03042                 sock->manager->fds[sock->fd] = sock;
03043                 sock->manager->fdstate[sock->fd] = MANAGED;
03044 #ifdef USE_DEVPOLL
03045                 INSIST(sock->manager->fdpollinfo[sock->fd].want_read == 0 &&
03046                        sock->manager->fdpollinfo[sock->fd].want_write == 0);
03047 #endif
03048                 UNLOCK(&sock->manager->fdlock[lockid]);
03049 
03050 #ifdef USE_SELECT
03051                 LOCK(&sock->manager->lock);
03052                 if (sock->manager->maxfd < sock->fd)
03053                         sock->manager->maxfd = sock->fd;
03054                 UNLOCK(&sock->manager->lock);
03055 #endif
03056         }
03057 
03058         return (result);
03059 }
03060 
03061 /*
03062  * Create a new 'type' socket managed by 'manager'.  Events
03063  * will be posted to 'task' and when dispatched 'action' will be
03064  * called with 'arg' as the arg value.  The new socket is returned
03065  * in 'socketp'.
03066  */
03067 isc_result_t
03068 isc__socket_fdwatchcreate(isc_socketmgr_t *manager0, int fd, int flags,
03069                           isc_sockfdwatch_t callback, void *cbarg,
03070                           isc_task_t *task, isc_socket_t **socketp)
03071 {
03072         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
03073         isc__socket_t *sock = NULL;
03074         isc_result_t result;
03075         int lockid;
03076 
03077         REQUIRE(VALID_MANAGER(manager));
03078         REQUIRE(socketp != NULL && *socketp == NULL);
03079 
03080         result = allocate_socket(manager, isc_sockettype_fdwatch, &sock);
03081         if (result != ISC_R_SUCCESS)
03082                 return (result);
03083 
03084         sock->fd = fd;
03085         sock->fdwatcharg = cbarg;
03086         sock->fdwatchcb = callback;
03087         sock->fdwatchflags = flags;
03088         sock->fdwatchtask = task;
03089         sock->statsindex = fdwatchstatsindex;
03090 
03091         sock->common.methods = (isc_socketmethods_t *)&socketmethods;
03092         sock->references = 1;
03093         *socketp = (isc_socket_t *)sock;
03094 
03095         /*
03096          * Note we don't have to lock the socket like we normally would because
03097          * there are no external references to it yet.
03098          */
03099 
03100         lockid = FDLOCK_ID(sock->fd);
03101         LOCK(&manager->fdlock[lockid]);
03102         manager->fds[sock->fd] = sock;
03103         manager->fdstate[sock->fd] = MANAGED;
03104         UNLOCK(&manager->fdlock[lockid]);
03105 
03106         LOCK(&manager->lock);
03107         ISC_LIST_APPEND(manager->socklist, sock, link);
03108 #ifdef USE_SELECT
03109         if (manager->maxfd < sock->fd)
03110                 manager->maxfd = sock->fd;
03111 #endif
03112         UNLOCK(&manager->lock);
03113 
03114         if (flags & ISC_SOCKFDWATCH_READ)
03115                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
03116         if (flags & ISC_SOCKFDWATCH_WRITE)
03117                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
03118 
03119         socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
03120                    ISC_MSG_CREATED, "fdwatch-created");
03121 
03122         return (ISC_R_SUCCESS);
03123 }
03124 
03125 /*
03126  * Indicate to the manager that it should watch the socket again.
03127  * This can be used to restart watching if the previous event handler
03128  * didn't indicate there was more data to be processed.  Primarily
03129  * it is for writing but could be used for reading if desired
03130  */
03131 
03132 isc_result_t
03133 isc__socket_fdwatchpoke(isc_socket_t *sock0, int flags)
03134 {
03135         isc__socket_t *sock = (isc__socket_t *)sock0;
03136 
03137         REQUIRE(VALID_SOCKET(sock));
03138 
03139         /*
03140          * We check both flags first to allow us to get the lock
03141          * once but only if we need it.
03142          */
03143 
03144         if ((flags & (ISC_SOCKFDWATCH_READ | ISC_SOCKFDWATCH_WRITE)) != 0) {
03145                 LOCK(&sock->lock);
03146                 if (((flags & ISC_SOCKFDWATCH_READ) != 0) &&
03147                     !sock->pending_recv)
03148                         select_poke(sock->manager, sock->fd,
03149                                     SELECT_POKE_READ);
03150                 if (((flags & ISC_SOCKFDWATCH_WRITE) != 0) &&
03151                     !sock->pending_send)
03152                         select_poke(sock->manager, sock->fd,
03153                                     SELECT_POKE_WRITE);
03154                 UNLOCK(&sock->lock);
03155         }
03156 
03157         socket_log(sock, NULL, TRACE, isc_msgcat, ISC_MSGSET_SOCKET,
03158                    ISC_MSG_POKED, "fdwatch-poked flags: %d", flags);
03159 
03160         return (ISC_R_SUCCESS);
03161 }
03162 
03163 /*
03164  * Attach to a socket.  Caller must explicitly detach when it is done.
03165  */
03166 void
03167 isc__socket_attach(isc_socket_t *sock0, isc_socket_t **socketp) {
03168         isc__socket_t *sock = (isc__socket_t *)sock0;
03169 
03170         REQUIRE(VALID_SOCKET(sock));
03171         REQUIRE(socketp != NULL && *socketp == NULL);
03172 
03173         LOCK(&sock->lock);
03174         sock->references++;
03175         UNLOCK(&sock->lock);
03176 
03177         *socketp = (isc_socket_t *)sock;
03178 }
03179 
03180 /*
03181  * Dereference a socket.  If this is the last reference to it, clean things
03182  * up by destroying the socket.
03183  */
03184 void
03185 isc__socket_detach(isc_socket_t **socketp) {
03186         isc__socket_t *sock;
03187         isc_boolean_t kill_socket = ISC_FALSE;
03188 
03189         REQUIRE(socketp != NULL);
03190         sock = (isc__socket_t *)*socketp;
03191         REQUIRE(VALID_SOCKET(sock));
03192 
03193         LOCK(&sock->lock);
03194         REQUIRE(sock->references > 0);
03195         sock->references--;
03196         if (sock->references == 0)
03197                 kill_socket = ISC_TRUE;
03198         UNLOCK(&sock->lock);
03199 
03200         if (kill_socket)
03201                 destroy(&sock);
03202 
03203         *socketp = NULL;
03204 }
03205 
03206 isc_result_t
03207 isc__socket_close(isc_socket_t *sock0) {
03208         isc__socket_t *sock = (isc__socket_t *)sock0;
03209         int fd;
03210         isc__socketmgr_t *manager;
03211 
03212         fflush(stdout);
03213         REQUIRE(VALID_SOCKET(sock));
03214 
03215         LOCK(&sock->lock);
03216 
03217         REQUIRE(sock->references == 1);
03218         REQUIRE(sock->type != isc_sockettype_fdwatch);
03219         REQUIRE(sock->fd >= 0 && sock->fd < (int)sock->manager->maxsocks);
03220 
03221         INSIST(!sock->connecting);
03222         INSIST(!sock->pending_recv);
03223         INSIST(!sock->pending_send);
03224         INSIST(!sock->pending_accept);
03225         INSIST(ISC_LIST_EMPTY(sock->recv_list));
03226         INSIST(ISC_LIST_EMPTY(sock->send_list));
03227         INSIST(ISC_LIST_EMPTY(sock->accept_list));
03228         INSIST(ISC_LIST_EMPTY(sock->connect_list));
03229 
03230         manager = sock->manager;
03231         fd = sock->fd;
03232         sock->fd = -1;
03233         sock->dupped = 0;
03234         memset(sock->name, 0, sizeof(sock->name));
03235         sock->tag = NULL;
03236         sock->listener = 0;
03237         sock->connected = 0;
03238         sock->connecting = 0;
03239         sock->bound = 0;
03240         isc_sockaddr_any(&sock->peer_address);
03241 
03242         UNLOCK(&sock->lock);
03243 
03244         socketclose(manager, sock, fd);
03245 
03246         return (ISC_R_SUCCESS);
03247 }
03248 
03249 /*
03250  * I/O is possible on a given socket.  Schedule an event to this task that
03251  * will call an internal function to do the I/O.  This will charge the
03252  * task with the I/O operation and let our select loop handler get back
03253  * to doing something real as fast as possible.
03254  *
03255  * The socket and manager must be locked before calling this function.
03256  */
03257 static void
03258 dispatch_recv(isc__socket_t *sock) {
03259         intev_t *iev;
03260         isc_socketevent_t *ev;
03261         isc_task_t *sender;
03262 
03263         INSIST(!sock->pending_recv);
03264 
03265         if (sock->type != isc_sockettype_fdwatch) {
03266                 ev = ISC_LIST_HEAD(sock->recv_list);
03267                 if (ev == NULL)
03268                         return;
03269                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
03270                            "dispatch_recv:  event %p -> task %p",
03271                            ev, ev->ev_sender);
03272                 sender = ev->ev_sender;
03273         } else {
03274                 sender = sock->fdwatchtask;
03275         }
03276 
03277         sock->pending_recv = 1;
03278         iev = &sock->readable_ev;
03279 
03280         sock->references++;
03281         iev->ev_sender = sock;
03282         if (sock->type == isc_sockettype_fdwatch)
03283                 iev->ev_action = internal_fdwatch_read;
03284         else
03285                 iev->ev_action = internal_recv;
03286         iev->ev_arg = sock;
03287 
03288         isc_task_send(sender, (isc_event_t **)&iev);
03289 }
03290 
03291 static void
03292 dispatch_send(isc__socket_t *sock) {
03293         intev_t *iev;
03294         isc_socketevent_t *ev;
03295         isc_task_t *sender;
03296 
03297         INSIST(!sock->pending_send);
03298 
03299         if (sock->type != isc_sockettype_fdwatch) {
03300                 ev = ISC_LIST_HEAD(sock->send_list);
03301                 if (ev == NULL)
03302                         return;
03303                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
03304                            "dispatch_send:  event %p -> task %p",
03305                            ev, ev->ev_sender);
03306                 sender = ev->ev_sender;
03307         } else {
03308                 sender = sock->fdwatchtask;
03309         }
03310 
03311         sock->pending_send = 1;
03312         iev = &sock->writable_ev;
03313 
03314         sock->references++;
03315         iev->ev_sender = sock;
03316         if (sock->type == isc_sockettype_fdwatch)
03317                 iev->ev_action = internal_fdwatch_write;
03318         else
03319                 iev->ev_action = internal_send;
03320         iev->ev_arg = sock;
03321 
03322         isc_task_send(sender, (isc_event_t **)&iev);
03323 }
03324 
03325 /*
03326  * Dispatch an internal accept event.
03327  */
03328 static void
03329 dispatch_accept(isc__socket_t *sock) {
03330         intev_t *iev;
03331         isc_socket_newconnev_t *ev;
03332 
03333         INSIST(!sock->pending_accept);
03334 
03335         /*
03336          * Are there any done events left, or were they all canceled
03337          * before the manager got the socket lock?
03338          */
03339         ev = ISC_LIST_HEAD(sock->accept_list);
03340         if (ev == NULL)
03341                 return;
03342 
03343         sock->pending_accept = 1;
03344         iev = &sock->readable_ev;
03345 
03346         sock->references++;  /* keep socket around for this internal event */
03347         iev->ev_sender = sock;
03348         iev->ev_action = internal_accept;
03349         iev->ev_arg = sock;
03350 
03351         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
03352 }
03353 
03354 static void
03355 dispatch_connect(isc__socket_t *sock) {
03356         intev_t *iev;
03357         isc_socket_connev_t *ev;
03358 
03359         iev = &sock->writable_ev;
03360 
03361         ev = ISC_LIST_HEAD(sock->connect_list);
03362         INSIST(ev != NULL); /* XXX */
03363 
03364         INSIST(sock->connecting);
03365 
03366         sock->references++;  /* keep socket around for this internal event */
03367         iev->ev_sender = sock;
03368         iev->ev_action = internal_connect;
03369         iev->ev_arg = sock;
03370 
03371         isc_task_send(ev->ev_sender, (isc_event_t **)&iev);
03372 }
03373 
03374 /*
03375  * Dequeue an item off the given socket's read queue, set the result code
03376  * in the done event to the one provided, and send it to the task it was
03377  * destined for.
03378  *
03379  * If the event to be sent is on a list, remove it before sending.  If
03380  * asked to, send and detach from the socket as well.
03381  *
03382  * Caller must have the socket locked if the event is attached to the socket.
03383  */
03384 static void
03385 send_recvdone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
03386         isc_task_t *task;
03387 
03388         task = (*dev)->ev_sender;
03389 
03390         (*dev)->ev_sender = sock;
03391 
03392         if (ISC_LINK_LINKED(*dev, ev_link))
03393                 ISC_LIST_DEQUEUE(sock->recv_list, *dev, ev_link);
03394 
03395         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
03396             == ISC_SOCKEVENTATTR_ATTACHED)
03397                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
03398         else
03399                 isc_task_send(task, (isc_event_t **)dev);
03400 }
03401 
03402 /*
03403  * See comments for send_recvdone_event() above.
03404  *
03405  * Caller must have the socket locked if the event is attached to the socket.
03406  */
03407 static void
03408 send_senddone_event(isc__socket_t *sock, isc_socketevent_t **dev) {
03409         isc_task_t *task;
03410 
03411         INSIST(dev != NULL && *dev != NULL);
03412 
03413         task = (*dev)->ev_sender;
03414         (*dev)->ev_sender = sock;
03415 
03416         if (ISC_LINK_LINKED(*dev, ev_link))
03417                 ISC_LIST_DEQUEUE(sock->send_list, *dev, ev_link);
03418 
03419         if (((*dev)->attributes & ISC_SOCKEVENTATTR_ATTACHED)
03420             == ISC_SOCKEVENTATTR_ATTACHED)
03421                 isc_task_sendanddetach(&task, (isc_event_t **)dev);
03422         else
03423                 isc_task_send(task, (isc_event_t **)dev);
03424 }
03425 
03426 /*
03427  * See comments for send_recvdone_event() above.
03428  *
03429  * Caller must have the socket locked if the event is attached to the socket.
03430  */
03431 static void
03432 send_connectdone_event(isc__socket_t *sock, isc_socket_connev_t **dev) {
03433         isc_task_t *task;
03434 
03435         INSIST(dev != NULL && *dev != NULL);
03436 
03437         task = (*dev)->ev_sender;
03438         (*dev)->ev_sender = sock;
03439 
03440         if (ISC_LINK_LINKED(*dev, ev_link))
03441                 ISC_LIST_DEQUEUE(sock->connect_list, *dev, ev_link);
03442 
03443         isc_task_sendanddetach(&task, (isc_event_t **)dev);
03444 }
03445 
/*
 * Task action for the internal "readable" event of a listening socket:
 * call accept() to get the new file descriptor and complete the first
 * queued accept request.
 *
 * The socket object for the new connection already hangs off the request
 * (NEWCONNSOCK(dev)); on success it is given the accepted descriptor,
 * registered with the manager, and the done event is sent with
 * ISC_R_SUCCESS.  The task receiving the done event is detached as part
 * of sending it.
 *
 * On entry, the event delivered is the internal readable event, and the
 * first item on the accept_list should be the done event we want to
 * send.  If the list is empty this is a no-op.
 *
 * Outcomes:
 *  - soft failure (see soft_error): the request stays queued and the
 *    watcher is poked to watch the listener again;
 *  - hard failure: the request is completed with a failure result and
 *    the socket object that was waiting for the connection is freed.
 */
static void
internal_accept(isc_task_t *me, isc_event_t *ev) {
        isc__socket_t *sock;
        isc__socketmgr_t *manager;
        isc_socket_newconnev_t *dev;
        isc_task_t *task;
        ISC_SOCKADDR_LEN_T addrlen;
        int fd;
        isc_result_t result = ISC_R_SUCCESS;
        char strbuf[ISC_STRERRORSIZE];
        /* Name of the failing call(s), used only in error messages. */
        const char *err = "accept";

        UNUSED(me);

        sock = ev->ev_sender;
        INSIST(VALID_SOCKET(sock));

        LOCK(&sock->lock);
        socket_log(sock, NULL, TRACE,
                   isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTLOCK,
                   "internal_accept called, locked socket");

        manager = sock->manager;
        INSIST(VALID_MANAGER(manager));

        /*
         * This event is the pending accept that dispatch_accept() flagged;
         * clear the flag now that it is being handled.
         */
        INSIST(sock->listener);
        INSIST(sock->pending_accept == 1);
        sock->pending_accept = 0;

        /*
         * Drop the reference dispatch_accept() took for this event.  If
         * it was the last one, the socket is destroyed instead of being
         * serviced.
         */
        INSIST(sock->references > 0);
        sock->references--;  /* the internal event is done with this socket */
        if (sock->references == 0) {
                UNLOCK(&sock->lock);
                destroy(&sock);
                return;
        }

        /*
         * Get the first item off the accept list.
         * If it is empty, unlock the socket and return.
         */
        dev = ISC_LIST_HEAD(sock->accept_list);
        if (dev == NULL) {
                UNLOCK(&sock->lock);
                return;
        }

        /*
         * Try to accept the new connection.  If the accept fails with
         * EAGAIN or EINTR, simply poke the watcher to watch this socket
         * again.  Also ignore ECONNRESET, which has been reported to
         * be spuriously returned on Linux 2.2.19 although it is not
         * a documented error for accept().  ECONNABORTED has been
         * reported for Solaris 8.  The rest are thrown in not because
         * we have seen them but because they are ignored by other
         * daemons such as BIND 8 and Apache.
         */

        /*
         * The peer address is written straight into the socket object
         * that will represent the new connection.
         */
        addrlen = sizeof(NEWCONNSOCK(dev)->peer_address.type);
        memset(&NEWCONNSOCK(dev)->peer_address.type, 0, addrlen);
        fd = accept(sock->fd, &NEWCONNSOCK(dev)->peer_address.type.sa,
                    (void *)&addrlen);

#ifdef F_DUPFD
        /*
         * Leave a space for stdio to work in.
         *
         * A descriptor below 20 is duplicated to one at or above 20 and
         * the original is closed.  errno is saved around close() so that
         * a failing fcntl() is what the error handling below reports.
         */
        if (fd >= 0 && fd < 20) {
                int new, tmp;
                new = fcntl(fd, F_DUPFD, 20);
                tmp = errno;
                (void)close(fd);
                errno = tmp;
                fd = new;
                err = "accept/fcntl";
        }
#endif

        if (fd < 0) {
                if (SOFT_ERROR(errno))
                        goto soft_error;
                switch (errno) {
                case ENFILE:
                case EMFILE:
                        /* Out of descriptors: log it, then retry later. */
                        isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
                                       isc_msgcat, ISC_MSGSET_SOCKET,
                                       ISC_MSG_TOOMANYFDS,
                                       "%s: too many open file descriptors",
                                       err);
                        goto soft_error;

                case ENOBUFS:
                case ENOMEM:
                case ECONNRESET:
                case ECONNABORTED:
                case EHOSTUNREACH:
                case EHOSTDOWN:
                case ENETUNREACH:
                case ENETDOWN:
                case ECONNREFUSED:
#ifdef EPROTO
                case EPROTO:
#endif
#ifdef ENONET
                case ENONET:
#endif
                        /* Treated as soft errors; retried without logging. */
                        goto soft_error;
                default:
                        break;
                }
                /*
                 * Anything else is a hard failure: log it and complete the
                 * request below with ISC_R_UNEXPECTED.
                 */
                isc__strerror(errno, strbuf, sizeof(strbuf));
                UNEXPECTED_ERROR(__FILE__, __LINE__,
                                 "internal_accept: %s() %s: %s", err,
                                 isc_msgcat_get(isc_msgcat,
                                                ISC_MSGSET_GENERAL,
                                                ISC_MSG_FAILED,
                                                "failed"),
                                 strbuf);
                fd = -1;
                result = ISC_R_UNEXPECTED;
        } else {
                /*
                 * accept() succeeded; sanity-check what it returned.  Each
                 * rejection closes the new descriptor and is handled as a
                 * soft error, leaving the request queued.
                 */
                if (addrlen == 0U) {
                        UNEXPECTED_ERROR(__FILE__, __LINE__,
                                         "internal_accept(): "
                                         "accept() failed to return "
                                         "remote address");

                        (void)close(fd);
                        goto soft_error;
                } else if (NEWCONNSOCK(dev)->peer_address.type.sa.sa_family !=
                           sock->pf)
                {
                        UNEXPECTED_ERROR(__FILE__, __LINE__,
                                         "internal_accept(): "
                                         "accept() returned peer address "
                                         "family %u (expected %u)",
                                         NEWCONNSOCK(dev)->peer_address.
                                         type.sa.sa_family,
                                         sock->pf);
                        (void)close(fd);
                        goto soft_error;
                } else if (fd >= (int)manager->maxsocks) {
                        /*
                         * The descriptor is used further down as an index
                         * into manager->fds[] and manager->fdstate[].
                         */
                        isc_log_iwrite(isc_lctx, ISC_LOGCATEGORY_GENERAL,
                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
                                       isc_msgcat, ISC_MSGSET_SOCKET,
                                       ISC_MSG_TOOMANYFDS,
                                       "accept: "
                                       "file descriptor exceeds limit (%d/%u)",
                                       fd, manager->maxsocks);
                        (void)close(fd);
                        goto soft_error;
                }
        }

        if (fd != -1) {
                NEWCONNSOCK(dev)->peer_address.length = addrlen;
                NEWCONNSOCK(dev)->pf = sock->pf;
        }

        /*
         * Pull off the done event.
         */
        ISC_LIST_UNLINK(sock->accept_list, dev, ev_link);

        /*
         * Poke watcher if there are more pending accepts.
         */
        if (!ISC_LIST_EMPTY(sock->accept_list))
                select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);

        UNLOCK(&sock->lock);

        /*
         * From here on the listener's lock is no longer held.  A failure
         * to make the descriptor non-blocking closes it and turns this
         * into a failed accept carrying make_nonblock()'s result.
         */
        if (fd != -1) {
                result = make_nonblock(fd);
                if (result != ISC_R_SUCCESS) {
                        (void)close(fd);
                        fd = -1;
                }
        }

        /*
         * -1 means the new socket didn't happen.
         */
        if (fd != -1) {
                int lockid = FDLOCK_ID(fd);

                NEWCONNSOCK(dev)->fd = fd;
                NEWCONNSOCK(dev)->bound = 1;
                NEWCONNSOCK(dev)->connected = 1;

                /*
                 * Use minimum mtu if possible.
                 */
                use_min_mtu(NEWCONNSOCK(dev));

                /*
                 * Ensure DSCP settings are inherited across accept.
                 */
                setdscp(NEWCONNSOCK(dev), sock->dscp);

                /*
                 * Save away the remote address
                 */
                dev->address = NEWCONNSOCK(dev)->peer_address;

                /*
                 * Register the new descriptor in the manager's per-fd
                 * tables, under the lock covering that descriptor.
                 */
                LOCK(&manager->fdlock[lockid]);
                manager->fds[fd] = NEWCONNSOCK(dev);
                manager->fdstate[fd] = MANAGED;
                UNLOCK(&manager->fdlock[lockid]);

                LOCK(&manager->lock);

#ifdef USE_SELECT
                if (manager->maxfd < fd)
                        manager->maxfd = fd;
#endif

                socket_log(sock, &NEWCONNSOCK(dev)->peer_address, CREATION,
                           isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_ACCEPTEDCXN,
                           "accepted connection, new socket %p",
                           dev->newsocket);

                ISC_LIST_APPEND(manager->socklist, NEWCONNSOCK(dev), link);

                UNLOCK(&manager->lock);

                inc_stats(manager->stats, sock->statsindex[STATID_ACCEPT]);
                inc_stats(manager->stats, sock->statsindex[STATID_ACTIVE]);
        } else {
                /*
                 * No connection materialized: drop the reference held on
                 * the socket object that was waiting for it and free it.
                 */
                inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
                NEWCONNSOCK(dev)->references--;
                free_socket((isc__socket_t **)&dev->newsocket);
        }

        /*
         * Fill in the done event details and send it off.
         */
        dev->result = result;
        task = dev->ev_sender;
        dev->ev_sender = sock;

        isc_task_sendanddetach(&task, ISC_EVENT_PTR(&dev));
        return;

        /*
         * Soft failure: the request was never unlinked from accept_list,
         * so just ask the watcher to watch the listener again.  Reached
         * with the socket lock still held.
         */
 soft_error:
        select_poke(sock->manager, sock->fd, SELECT_POKE_ACCEPT);
        UNLOCK(&sock->lock);

        inc_stats(manager->stats, sock->statsindex[STATID_ACCEPTFAIL]);
        return;
}
03709 
03710 static void
03711 internal_recv(isc_task_t *me, isc_event_t *ev) {
03712         isc_socketevent_t *dev;
03713         isc__socket_t *sock;
03714 
03715         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
03716 
03717         sock = ev->ev_sender;
03718         INSIST(VALID_SOCKET(sock));
03719 
03720         LOCK(&sock->lock);
03721         socket_log(sock, NULL, IOEVENT,
03722                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
03723                    "internal_recv: task %p got event %p", me, ev);
03724 
03725         INSIST(sock->pending_recv == 1);
03726         sock->pending_recv = 0;
03727 
03728         INSIST(sock->references > 0);
03729         sock->references--;  /* the internal event is done with this socket */
03730         if (sock->references == 0) {
03731                 UNLOCK(&sock->lock);
03732                 destroy(&sock);
03733                 return;
03734         }
03735 
03736         /*
03737          * Try to do as much I/O as possible on this socket.  There are no
03738          * limits here, currently.
03739          */
03740         dev = ISC_LIST_HEAD(sock->recv_list);
03741         while (dev != NULL) {
03742                 switch (doio_recv(sock, dev)) {
03743                 case DOIO_SOFT:
03744                         goto poke;
03745 
03746                 case DOIO_EOF:
03747                         /*
03748                          * read of 0 means the remote end was closed.
03749                          * Run through the event queue and dispatch all
03750                          * the events with an EOF result code.
03751                          */
03752                         do {
03753                                 dev->result = ISC_R_EOF;
03754                                 send_recvdone_event(sock, &dev);
03755                                 dev = ISC_LIST_HEAD(sock->recv_list);
03756                         } while (dev != NULL);
03757                         goto poke;
03758 
03759                 case DOIO_SUCCESS:
03760                 case DOIO_HARD:
03761                         send_recvdone_event(sock, &dev);
03762                         break;
03763                 }
03764 
03765                 dev = ISC_LIST_HEAD(sock->recv_list);
03766         }
03767 
03768  poke:
03769         if (!ISC_LIST_EMPTY(sock->recv_list))
03770                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
03771 
03772         UNLOCK(&sock->lock);
03773 }
03774 
03775 static void
03776 internal_send(isc_task_t *me, isc_event_t *ev) {
03777         isc_socketevent_t *dev;
03778         isc__socket_t *sock;
03779 
03780         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
03781 
03782         /*
03783          * Find out what socket this is and lock it.
03784          */
03785         sock = (isc__socket_t *)ev->ev_sender;
03786         INSIST(VALID_SOCKET(sock));
03787 
03788         LOCK(&sock->lock);
03789         socket_log(sock, NULL, IOEVENT,
03790                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
03791                    "internal_send: task %p got event %p", me, ev);
03792 
03793         INSIST(sock->pending_send == 1);
03794         sock->pending_send = 0;
03795 
03796         INSIST(sock->references > 0);
03797         sock->references--;  /* the internal event is done with this socket */
03798         if (sock->references == 0) {
03799                 UNLOCK(&sock->lock);
03800                 destroy(&sock);
03801                 return;
03802         }
03803 
03804         /*
03805          * Try to do as much I/O as possible on this socket.  There are no
03806          * limits here, currently.
03807          */
03808         dev = ISC_LIST_HEAD(sock->send_list);
03809         while (dev != NULL) {
03810                 switch (doio_send(sock, dev)) {
03811                 case DOIO_SOFT:
03812                         goto poke;
03813 
03814                 case DOIO_HARD:
03815                 case DOIO_SUCCESS:
03816                         send_senddone_event(sock, &dev);
03817                         break;
03818                 }
03819 
03820                 dev = ISC_LIST_HEAD(sock->send_list);
03821         }
03822 
03823  poke:
03824         if (!ISC_LIST_EMPTY(sock->send_list))
03825                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
03826 
03827         UNLOCK(&sock->lock);
03828 }
03829 
03830 static void
03831 internal_fdwatch_write(isc_task_t *me, isc_event_t *ev) {
03832         isc__socket_t *sock;
03833         int more_data;
03834 
03835         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
03836 
03837         /*
03838          * Find out what socket this is and lock it.
03839          */
03840         sock = (isc__socket_t *)ev->ev_sender;
03841         INSIST(VALID_SOCKET(sock));
03842 
03843         LOCK(&sock->lock);
03844         socket_log(sock, NULL, IOEVENT,
03845                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALSEND,
03846                    "internal_fdwatch_write: task %p got event %p", me, ev);
03847 
03848         INSIST(sock->pending_send == 1);
03849 
03850         UNLOCK(&sock->lock);
03851         more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
03852                                       sock->fdwatcharg, ISC_SOCKFDWATCH_WRITE);
03853         LOCK(&sock->lock);
03854 
03855         sock->pending_send = 0;
03856 
03857         INSIST(sock->references > 0);
03858         sock->references--;  /* the internal event is done with this socket */
03859         if (sock->references == 0) {
03860                 UNLOCK(&sock->lock);
03861                 destroy(&sock);
03862                 return;
03863         }
03864 
03865         if (more_data)
03866                 select_poke(sock->manager, sock->fd, SELECT_POKE_WRITE);
03867 
03868         UNLOCK(&sock->lock);
03869 }
03870 
03871 static void
03872 internal_fdwatch_read(isc_task_t *me, isc_event_t *ev) {
03873         isc__socket_t *sock;
03874         int more_data;
03875 
03876         INSIST(ev->ev_type == ISC_SOCKEVENT_INTR);
03877 
03878         /*
03879          * Find out what socket this is and lock it.
03880          */
03881         sock = (isc__socket_t *)ev->ev_sender;
03882         INSIST(VALID_SOCKET(sock));
03883 
03884         LOCK(&sock->lock);
03885         socket_log(sock, NULL, IOEVENT,
03886                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_INTERNALRECV,
03887                    "internal_fdwatch_read: task %p got event %p", me, ev);
03888 
03889         INSIST(sock->pending_recv == 1);
03890 
03891         UNLOCK(&sock->lock);
03892         more_data = (sock->fdwatchcb)(me, (isc_socket_t *)sock,
03893                                       sock->fdwatcharg, ISC_SOCKFDWATCH_READ);
03894         LOCK(&sock->lock);
03895 
03896         sock->pending_recv = 0;
03897 
03898         INSIST(sock->references > 0);
03899         sock->references--;  /* the internal event is done with this socket */
03900         if (sock->references == 0) {
03901                 UNLOCK(&sock->lock);
03902                 destroy(&sock);
03903                 return;
03904         }
03905 
03906         if (more_data)
03907                 select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
03908 
03909         UNLOCK(&sock->lock);
03910 }
03911 
03912 /*
03913  * Process read/writes on each fd here.  Avoid locking
03914  * and unlocking twice if both reads and writes are possible.
03915  */
03916 static void
03917 process_fd(isc__socketmgr_t *manager, int fd, isc_boolean_t readable,
03918            isc_boolean_t writeable)
03919 {
03920         isc__socket_t *sock;
03921         isc_boolean_t unlock_sock;
03922         isc_boolean_t unwatch_read = ISC_FALSE, unwatch_write = ISC_FALSE;
03923         int lockid = FDLOCK_ID(fd);
03924 
03925         /*
03926          * If the socket is going to be closed, don't do more I/O.
03927          */
03928         LOCK(&manager->fdlock[lockid]);
03929         if (manager->fdstate[fd] == CLOSE_PENDING) {
03930                 UNLOCK(&manager->fdlock[lockid]);
03931 
03932                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
03933                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
03934                 return;
03935         }
03936 
03937         sock = manager->fds[fd];
03938         unlock_sock = ISC_FALSE;
03939         if (readable) {
03940                 if (sock == NULL) {
03941                         unwatch_read = ISC_TRUE;
03942                         goto check_write;
03943                 }
03944                 unlock_sock = ISC_TRUE;
03945                 LOCK(&sock->lock);
03946                 if (!SOCK_DEAD(sock)) {
03947                         if (sock->listener)
03948                                 dispatch_accept(sock);
03949                         else
03950                                 dispatch_recv(sock);
03951                 }
03952                 unwatch_read = ISC_TRUE;
03953         }
03954 check_write:
03955         if (writeable) {
03956                 if (sock == NULL) {
03957                         unwatch_write = ISC_TRUE;
03958                         goto unlock_fd;
03959                 }
03960                 if (!unlock_sock) {
03961                         unlock_sock = ISC_TRUE;
03962                         LOCK(&sock->lock);
03963                 }
03964                 if (!SOCK_DEAD(sock)) {
03965                         if (sock->connecting)
03966                                 dispatch_connect(sock);
03967                         else
03968                                 dispatch_send(sock);
03969                 }
03970                 unwatch_write = ISC_TRUE;
03971         }
03972         if (unlock_sock)
03973                 UNLOCK(&sock->lock);
03974 
03975  unlock_fd:
03976         UNLOCK(&manager->fdlock[lockid]);
03977         if (unwatch_read)
03978                 (void)unwatch_fd(manager, fd, SELECT_POKE_READ);
03979         if (unwatch_write)
03980                 (void)unwatch_fd(manager, fd, SELECT_POKE_WRITE);
03981 
03982 }
03983 
03984 #ifdef USE_KQUEUE
03985 static isc_boolean_t
03986 process_fds(isc__socketmgr_t *manager, struct kevent *events, int nevents) {
03987         int i;
03988         isc_boolean_t readable, writable;
03989         isc_boolean_t done = ISC_FALSE;
03990 #ifdef USE_WATCHER_THREAD
03991         isc_boolean_t have_ctlevent = ISC_FALSE;
03992 #endif
03993 
03994         if (nevents == manager->nevents) {
03995                 /*
03996                  * This is not an error, but something unexpected.  If this
03997                  * happens, it may indicate the need for increasing
03998                  * ISC_SOCKET_MAXEVENTS.
03999                  */
04000                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
04001                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
04002                             "maximum number of FD events (%d) received",
04003                             nevents);
04004         }
04005 
04006         for (i = 0; i < nevents; i++) {
04007                 REQUIRE(events[i].ident < manager->maxsocks);
04008 #ifdef USE_WATCHER_THREAD
04009                 if (events[i].ident == (uintptr_t)manager->pipe_fds[0]) {
04010                         have_ctlevent = ISC_TRUE;
04011                         continue;
04012                 }
04013 #endif
04014                 readable = ISC_TF(events[i].filter == EVFILT_READ);
04015                 writable = ISC_TF(events[i].filter == EVFILT_WRITE);
04016                 process_fd(manager, events[i].ident, readable, writable);
04017         }
04018 
04019 #ifdef USE_WATCHER_THREAD
04020         if (have_ctlevent)
04021                 done = process_ctlfd(manager);
04022 #endif
04023 
04024         return (done);
04025 }
04026 #elif defined(USE_EPOLL)
04027 static isc_boolean_t
04028 process_fds(isc__socketmgr_t *manager, struct epoll_event *events, int nevents)
04029 {
04030         int i;
04031         isc_boolean_t done = ISC_FALSE;
04032 #ifdef USE_WATCHER_THREAD
04033         isc_boolean_t have_ctlevent = ISC_FALSE;
04034 #endif
04035 
04036         if (nevents == manager->nevents) {
04037                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
04038                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
04039                             "maximum number of FD events (%d) received",
04040                             nevents);
04041         }
04042 
04043         for (i = 0; i < nevents; i++) {
04044                 REQUIRE(events[i].data.fd < (int)manager->maxsocks);
04045 #ifdef USE_WATCHER_THREAD
04046                 if (events[i].data.fd == manager->pipe_fds[0]) {
04047                         have_ctlevent = ISC_TRUE;
04048                         continue;
04049                 }
04050 #endif
04051                 if ((events[i].events & EPOLLERR) != 0 ||
04052                     (events[i].events & EPOLLHUP) != 0) {
04053                         /*
04054                          * epoll does not set IN/OUT bits on an erroneous
04055                          * condition, so we need to try both anyway.  This is a
04056                          * bit inefficient, but should be okay for such rare
04057                          * events.  Note also that the read or write attempt
04058                          * won't block because we use non-blocking sockets.
04059                          */
04060                         events[i].events |= (EPOLLIN | EPOLLOUT);
04061                 }
04062                 process_fd(manager, events[i].data.fd,
04063                            (events[i].events & EPOLLIN) != 0,
04064                            (events[i].events & EPOLLOUT) != 0);
04065         }
04066 
04067 #ifdef USE_WATCHER_THREAD
04068         if (have_ctlevent)
04069                 done = process_ctlfd(manager);
04070 #endif
04071 
04072         return (done);
04073 }
04074 #elif defined(USE_DEVPOLL)
04075 static isc_boolean_t
04076 process_fds(isc__socketmgr_t *manager, struct pollfd *events, int nevents) {
04077         int i;
04078         isc_boolean_t done = ISC_FALSE;
04079 #ifdef USE_WATCHER_THREAD
04080         isc_boolean_t have_ctlevent = ISC_FALSE;
04081 #endif
04082 
04083         if (nevents == manager->nevents) {
04084                 manager_log(manager, ISC_LOGCATEGORY_GENERAL,
04085                             ISC_LOGMODULE_SOCKET, ISC_LOG_INFO,
04086                             "maximum number of FD events (%d) received",
04087                             nevents);
04088         }
04089 
04090         for (i = 0; i < nevents; i++) {
04091                 REQUIRE(events[i].fd < (int)manager->maxsocks);
04092 #ifdef USE_WATCHER_THREAD
04093                 if (events[i].fd == manager->pipe_fds[0]) {
04094                         have_ctlevent = ISC_TRUE;
04095                         continue;
04096                 }
04097 #endif
04098                 process_fd(manager, events[i].fd,
04099                            (events[i].events & POLLIN) != 0,
04100                            (events[i].events & POLLOUT) != 0);
04101         }
04102 
04103 #ifdef USE_WATCHER_THREAD
04104         if (have_ctlevent)
04105                 done = process_ctlfd(manager);
04106 #endif
04107 
04108         return (done);
04109 }
04110 #elif defined(USE_SELECT)
04111 static void
04112 process_fds(isc__socketmgr_t *manager, int maxfd, fd_set *readfds,
04113             fd_set *writefds)
04114 {
04115         int i;
04116 
04117         REQUIRE(maxfd <= (int)manager->maxsocks);
04118 
04119         for (i = 0; i < maxfd; i++) {
04120 #ifdef USE_WATCHER_THREAD
04121                 if (i == manager->pipe_fds[0] || i == manager->pipe_fds[1])
04122                         continue;
04123 #endif /* USE_WATCHER_THREAD */
04124                 process_fd(manager, i, FD_ISSET(i, readfds),
04125                            FD_ISSET(i, writefds));
04126         }
04127 }
04128 #endif
04129 
04130 #ifdef USE_WATCHER_THREAD
04131 static isc_boolean_t
04132 process_ctlfd(isc__socketmgr_t *manager) {
04133         int msg, fd;
04134 
04135         for (;;) {
04136                 select_readmsg(manager, &fd, &msg);
04137 
04138                 manager_log(manager, IOEVENT,
04139                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
04140                                            ISC_MSG_WATCHERMSG,
04141                                            "watcher got message %d "
04142                                            "for socket %d"), msg, fd);
04143 
04144                 /*
04145                  * Nothing to read?
04146                  */
04147                 if (msg == SELECT_POKE_NOTHING)
04148                         break;
04149 
04150                 /*
04151                  * Handle shutdown message.  We really should
04152                  * jump out of this loop right away, but
04153                  * it doesn't matter if we have to do a little
04154                  * more work first.
04155                  */
04156                 if (msg == SELECT_POKE_SHUTDOWN)
04157                         return (ISC_TRUE);
04158 
04159                 /*
04160                  * This is a wakeup on a socket.  Look
04161                  * at the event queue for both read and write,
04162                  * and decide if we need to watch on it now
04163                  * or not.
04164                  */
04165                 wakeup_socket(manager, fd, msg);
04166         }
04167 
04168         return (ISC_FALSE);
04169 }
04170 
04171 /*
04172  * This is the thread that will loop forever, always in a select or poll
04173  * call.
04174  *
04175  * When select returns something to do, track down what thread gets to do
04176  * this I/O and post the event to it.
04177  */
04178 static isc_threadresult_t
04179 watcher(void *uap) {
04180         isc__socketmgr_t *manager = uap;
04181         isc_boolean_t done;
04182         int cc;
04183 #ifdef USE_KQUEUE
04184         const char *fnname = "kevent()";
04185 #elif defined (USE_EPOLL)
04186         const char *fnname = "epoll_wait()";
04187 #elif defined(USE_DEVPOLL)
04188         isc_result_t result;
04189         const char *fnname = "ioctl(DP_POLL)";
04190         struct dvpoll dvp;
04191         int pass;
04192 #elif defined (USE_SELECT)
04193         const char *fnname = "select()";
04194         int maxfd;
04195         int ctlfd;
04196 #endif
04197         char strbuf[ISC_STRERRORSIZE];
04198 #ifdef ISC_SOCKET_USE_POLLWATCH
04199         pollstate_t pollstate = poll_idle;
04200 #endif
04201 
04202 #if defined (USE_SELECT)
04203         /*
04204          * Get the control fd here.  This will never change.
04205          */
04206         ctlfd = manager->pipe_fds[0];
04207 #endif
04208         done = ISC_FALSE;
04209         while (!done) {
04210                 do {
04211 #ifdef USE_KQUEUE
04212                         cc = kevent(manager->kqueue_fd, NULL, 0,
04213                                     manager->events, manager->nevents, NULL);
04214 #elif defined(USE_EPOLL)
04215                         cc = epoll_wait(manager->epoll_fd, manager->events,
04216                                         manager->nevents, -1);
04217 #elif defined(USE_DEVPOLL)
04218                         /*
04219                          * Re-probe every thousand calls.
04220                          */
04221                         if (manager->calls++ > 1000U) {
04222                                 result = isc_resource_getcurlimit(
04223                                                         isc_resource_openfiles,
04224                                                         &manager->open_max);
04225                                 if (result != ISC_R_SUCCESS)
04226                                         manager->open_max = 64;
04227                                 manager->calls = 0;
04228                         }
04229                         for (pass = 0; pass < 2; pass++) {
04230                                 dvp.dp_fds = manager->events;
04231                                 dvp.dp_nfds = manager->nevents;
04232                                 if (dvp.dp_nfds >= manager->open_max)
04233                                         dvp.dp_nfds = manager->open_max - 1;
04234 #ifndef ISC_SOCKET_USE_POLLWATCH
04235                                 dvp.dp_timeout = -1;
04236 #else
04237                                 if (pollstate == poll_idle)
04238                                         dvp.dp_timeout = -1;
04239                                 else
04240                                         dvp.dp_timeout =
04241                                                  ISC_SOCKET_POLLWATCH_TIMEOUT;
04242 #endif  /* ISC_SOCKET_USE_POLLWATCH */
04243                                 cc = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
04244                                 if (cc == -1 && errno == EINVAL) {
04245                                         /*
04246                                          * {OPEN_MAX} may have dropped.  Look
04247                                          * up the current value and try again.
04248                                          */
04249                                         result = isc_resource_getcurlimit(
04250                                                         isc_resource_openfiles,
04251                                                         &manager->open_max);
04252                                         if (result != ISC_R_SUCCESS)
04253                                                 manager->open_max = 64;
04254                                 } else
04255                                         break;
04256                         }
04257 #elif defined(USE_SELECT)
04258                         LOCK(&manager->lock);
04259                         memmove(manager->read_fds_copy, manager->read_fds,
04260                                 manager->fd_bufsize);
04261                         memmove(manager->write_fds_copy, manager->write_fds,
04262                                 manager->fd_bufsize);
04263                         maxfd = manager->maxfd + 1;
04264                         UNLOCK(&manager->lock);
04265 
04266                         cc = select(maxfd, manager->read_fds_copy,
04267                                     manager->write_fds_copy, NULL, NULL);
04268 #endif  /* USE_KQUEUE */
04269 
04270                         if (cc < 0 && !SOFT_ERROR(errno)) {
04271                                 isc__strerror(errno, strbuf, sizeof(strbuf));
04272                                 FATAL_ERROR(__FILE__, __LINE__,
04273                                             "%s %s: %s", fnname,
04274                                             isc_msgcat_get(isc_msgcat,
04275                                                            ISC_MSGSET_GENERAL,
04276                                                            ISC_MSG_FAILED,
04277                                                            "failed"), strbuf);
04278                         }
04279 
04280 #if defined(USE_DEVPOLL) && defined(ISC_SOCKET_USE_POLLWATCH)
04281                         if (cc == 0) {
04282                                 if (pollstate == poll_active)
04283                                         pollstate = poll_checking;
04284                                 else if (pollstate == poll_checking)
04285                                         pollstate = poll_idle;
04286                         } else if (cc > 0) {
04287                                 if (pollstate == poll_checking) {
04288                                         /*
04289                                          * XXX: We'd like to use a more
04290                                          * verbose log level as it's actually an
04291                                          * unexpected event, but the kernel bug
04292                                          * reportedly happens pretty frequently
04293                                          * (and it can also be a false positive)
04294                                          * so it would be just too noisy.
04295                                          */
04296                                         manager_log(manager,
04297                                                     ISC_LOGCATEGORY_GENERAL,
04298                                                     ISC_LOGMODULE_SOCKET,
04299                                                     ISC_LOG_DEBUG(1),
04300                                                     "unexpected POLL timeout");
04301                                 }
04302                                 pollstate = poll_active;
04303                         }
04304 #endif
04305                 } while (cc < 0);
04306 
04307 #if defined(USE_KQUEUE) || defined (USE_EPOLL) || defined (USE_DEVPOLL)
04308                 done = process_fds(manager, manager->events, cc);
04309 #elif defined(USE_SELECT)
04310                 process_fds(manager, maxfd, manager->read_fds_copy,
04311                             manager->write_fds_copy);
04312 
04313                 /*
04314                  * Process reads on internal, control fd.
04315                  */
04316                 if (FD_ISSET(ctlfd, manager->read_fds_copy))
04317                         done = process_ctlfd(manager);
04318 #endif
04319         }
04320 
04321         manager_log(manager, TRACE, "%s",
04322                     isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
04323                                    ISC_MSG_EXITING, "watcher exiting"));
04324 
04325         return ((isc_threadresult_t)0);
04326 }
04327 #endif /* USE_WATCHER_THREAD */
04328 
04329 void
04330 isc__socketmgr_setreserved(isc_socketmgr_t *manager0, isc_uint32_t reserved) {
04331         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
04332 
04333         REQUIRE(VALID_MANAGER(manager));
04334 
04335         manager->reserved = reserved;
04336 }
04337 
04338 void
04339 isc__socketmgr_maxudp(isc_socketmgr_t *manager0, int maxudp) {
04340         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
04341 
04342         REQUIRE(VALID_MANAGER(manager));
04343 
04344         manager->maxudp = maxudp;
04345 }
04346 
04347 /*
04348  * Create a new socket manager.
04349  */
04350 
04351 static isc_result_t
04352 setup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
04353         isc_result_t result;
04354 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
04355         char strbuf[ISC_STRERRORSIZE];
04356 #endif
04357 
04358 #ifdef USE_KQUEUE
04359         manager->nevents = ISC_SOCKET_MAXEVENTS;
04360         manager->events = isc_mem_get(mctx, sizeof(struct kevent) *
04361                                       manager->nevents);
04362         if (manager->events == NULL)
04363                 return (ISC_R_NOMEMORY);
04364         manager->kqueue_fd = kqueue();
04365         if (manager->kqueue_fd == -1) {
04366                 result = isc__errno2result(errno);
04367                 isc__strerror(errno, strbuf, sizeof(strbuf));
04368                 UNEXPECTED_ERROR(__FILE__, __LINE__,
04369                                  "kqueue %s: %s",
04370                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
04371                                                 ISC_MSG_FAILED, "failed"),
04372                                  strbuf);
04373                 isc_mem_put(mctx, manager->events,
04374                             sizeof(struct kevent) * manager->nevents);
04375                 return (result);
04376         }
04377 
04378 #ifdef USE_WATCHER_THREAD
04379         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
04380         if (result != ISC_R_SUCCESS) {
04381                 close(manager->kqueue_fd);
04382                 isc_mem_put(mctx, manager->events,
04383                             sizeof(struct kevent) * manager->nevents);
04384                 return (result);
04385         }
04386 #endif  /* USE_WATCHER_THREAD */
04387 #elif defined(USE_EPOLL)
04388         manager->nevents = ISC_SOCKET_MAXEVENTS;
04389         manager->events = isc_mem_get(mctx, sizeof(struct epoll_event) *
04390                                       manager->nevents);
04391         if (manager->events == NULL)
04392                 return (ISC_R_NOMEMORY);
04393         manager->epoll_fd = epoll_create(manager->nevents);
04394         if (manager->epoll_fd == -1) {
04395                 result = isc__errno2result(errno);
04396                 isc__strerror(errno, strbuf, sizeof(strbuf));
04397                 UNEXPECTED_ERROR(__FILE__, __LINE__,
04398                                  "epoll_create %s: %s",
04399                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
04400                                                 ISC_MSG_FAILED, "failed"),
04401                                  strbuf);
04402                 isc_mem_put(mctx, manager->events,
04403                             sizeof(struct epoll_event) * manager->nevents);
04404                 return (result);
04405         }
04406 #ifdef USE_WATCHER_THREAD
04407         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
04408         if (result != ISC_R_SUCCESS) {
04409                 close(manager->epoll_fd);
04410                 isc_mem_put(mctx, manager->events,
04411                             sizeof(struct epoll_event) * manager->nevents);
04412                 return (result);
04413         }
04414 #endif  /* USE_WATCHER_THREAD */
04415 #elif defined(USE_DEVPOLL)
04416         manager->nevents = ISC_SOCKET_MAXEVENTS;
04417         result = isc_resource_getcurlimit(isc_resource_openfiles,
04418                                           &manager->open_max);
04419         if (result != ISC_R_SUCCESS)
04420                 manager->open_max = 64;
04421         manager->calls = 0;
04422         manager->events = isc_mem_get(mctx, sizeof(struct pollfd) *
04423                                       manager->nevents);
04424         if (manager->events == NULL)
04425                 return (ISC_R_NOMEMORY);
04426         /*
04427          * Note: fdpollinfo should be able to support all possible FDs, so
04428          * it must have maxsocks entries (not nevents).
04429          */
04430         manager->fdpollinfo = isc_mem_get(mctx, sizeof(pollinfo_t) *
04431                                           manager->maxsocks);
04432         if (manager->fdpollinfo == NULL) {
04433                 isc_mem_put(mctx, manager->events,
04434                             sizeof(struct pollfd) * manager->nevents);
04435                 return (ISC_R_NOMEMORY);
04436         }
04437         memset(manager->fdpollinfo, 0, sizeof(pollinfo_t) * manager->maxsocks);
04438         manager->devpoll_fd = open("/dev/poll", O_RDWR);
04439         if (manager->devpoll_fd == -1) {
04440                 result = isc__errno2result(errno);
04441                 isc__strerror(errno, strbuf, sizeof(strbuf));
04442                 UNEXPECTED_ERROR(__FILE__, __LINE__,
04443                                  "open(/dev/poll) %s: %s",
04444                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
04445                                                 ISC_MSG_FAILED, "failed"),
04446                                  strbuf);
04447                 isc_mem_put(mctx, manager->events,
04448                             sizeof(struct pollfd) * manager->nevents);
04449                 isc_mem_put(mctx, manager->fdpollinfo,
04450                             sizeof(pollinfo_t) * manager->maxsocks);
04451                 return (result);
04452         }
04453 #ifdef USE_WATCHER_THREAD
04454         result = watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
04455         if (result != ISC_R_SUCCESS) {
04456                 close(manager->devpoll_fd);
04457                 isc_mem_put(mctx, manager->events,
04458                             sizeof(struct pollfd) * manager->nevents);
04459                 isc_mem_put(mctx, manager->fdpollinfo,
04460                             sizeof(pollinfo_t) * manager->maxsocks);
04461                 return (result);
04462         }
04463 #endif  /* USE_WATCHER_THREAD */
04464 #elif defined(USE_SELECT)
04465         UNUSED(result);
04466 
04467 #if ISC_SOCKET_MAXSOCKETS > FD_SETSIZE
04468         /*
04469          * Note: this code should also cover the case of MAXSOCKETS <=
04470          * FD_SETSIZE, but we separate the cases to avoid possible portability
04471          * issues regarding howmany() and the actual representation of fd_set.
04472          */
04473         manager->fd_bufsize = howmany(manager->maxsocks, NFDBITS) *
04474                 sizeof(fd_mask);
04475 #else
04476         manager->fd_bufsize = sizeof(fd_set);
04477 #endif
04478 
04479         manager->read_fds = NULL;
04480         manager->read_fds_copy = NULL;
04481         manager->write_fds = NULL;
04482         manager->write_fds_copy = NULL;
04483 
04484         manager->read_fds = isc_mem_get(mctx, manager->fd_bufsize);
04485         if (manager->read_fds != NULL)
04486                 manager->read_fds_copy = isc_mem_get(mctx, manager->fd_bufsize);
04487         if (manager->read_fds_copy != NULL)
04488                 manager->write_fds = isc_mem_get(mctx, manager->fd_bufsize);
04489         if (manager->write_fds != NULL) {
04490                 manager->write_fds_copy = isc_mem_get(mctx,
04491                                                       manager->fd_bufsize);
04492         }
04493         if (manager->write_fds_copy == NULL) {
04494                 if (manager->write_fds != NULL) {
04495                         isc_mem_put(mctx, manager->write_fds,
04496                                     manager->fd_bufsize);
04497                 }
04498                 if (manager->read_fds_copy != NULL) {
04499                         isc_mem_put(mctx, manager->read_fds_copy,
04500                                     manager->fd_bufsize);
04501                 }
04502                 if (manager->read_fds != NULL) {
04503                         isc_mem_put(mctx, manager->read_fds,
04504                                     manager->fd_bufsize);
04505                 }
04506                 return (ISC_R_NOMEMORY);
04507         }
04508         memset(manager->read_fds, 0, manager->fd_bufsize);
04509         memset(manager->write_fds, 0, manager->fd_bufsize);
04510 
04511 #ifdef USE_WATCHER_THREAD
04512         (void)watch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
04513         manager->maxfd = manager->pipe_fds[0];
04514 #else /* USE_WATCHER_THREAD */
04515         manager->maxfd = 0;
04516 #endif /* USE_WATCHER_THREAD */
04517 #endif  /* USE_KQUEUE */
04518 
04519         return (ISC_R_SUCCESS);
04520 }
04521 
04522 static void
04523 cleanup_watcher(isc_mem_t *mctx, isc__socketmgr_t *manager) {
04524 #ifdef USE_WATCHER_THREAD
04525         isc_result_t result;
04526 
04527         result = unwatch_fd(manager, manager->pipe_fds[0], SELECT_POKE_READ);
04528         if (result != ISC_R_SUCCESS) {
04529                 UNEXPECTED_ERROR(__FILE__, __LINE__,
04530                                  "epoll_ctl(DEL) %s",
04531                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
04532                                                 ISC_MSG_FAILED, "failed"));
04533         }
04534 #endif  /* USE_WATCHER_THREAD */
04535 
04536 #ifdef USE_KQUEUE
04537         close(manager->kqueue_fd);
04538         isc_mem_put(mctx, manager->events,
04539                     sizeof(struct kevent) * manager->nevents);
04540 #elif defined(USE_EPOLL)
04541         close(manager->epoll_fd);
04542         isc_mem_put(mctx, manager->events,
04543                     sizeof(struct epoll_event) * manager->nevents);
04544 #elif defined(USE_DEVPOLL)
04545         close(manager->devpoll_fd);
04546         isc_mem_put(mctx, manager->events,
04547                     sizeof(struct pollfd) * manager->nevents);
04548         isc_mem_put(mctx, manager->fdpollinfo,
04549                     sizeof(pollinfo_t) * manager->maxsocks);
04550 #elif defined(USE_SELECT)
04551         if (manager->read_fds != NULL)
04552                 isc_mem_put(mctx, manager->read_fds, manager->fd_bufsize);
04553         if (manager->read_fds_copy != NULL)
04554                 isc_mem_put(mctx, manager->read_fds_copy, manager->fd_bufsize);
04555         if (manager->write_fds != NULL)
04556                 isc_mem_put(mctx, manager->write_fds, manager->fd_bufsize);
04557         if (manager->write_fds_copy != NULL)
04558                 isc_mem_put(mctx, manager->write_fds_copy, manager->fd_bufsize);
04559 #endif  /* USE_KQUEUE */
04560 }
04561 
04562 isc_result_t
04563 isc__socketmgr_create(isc_mem_t *mctx, isc_socketmgr_t **managerp) {
04564         return (isc__socketmgr_create2(mctx, managerp, 0));
04565 }
04566 
04567 isc_result_t
04568 isc__socketmgr_create2(isc_mem_t *mctx, isc_socketmgr_t **managerp,
04569                        unsigned int maxsocks)
04570 {
04571         int i;
04572         isc__socketmgr_t *manager;
04573 #ifdef USE_WATCHER_THREAD
04574         char strbuf[ISC_STRERRORSIZE];
04575 #endif
04576         isc_result_t result;
04577 
04578         REQUIRE(managerp != NULL && *managerp == NULL);
04579 
04580 #ifdef USE_SHARED_MANAGER
04581         if (socketmgr != NULL) {
04582                 /* Don't allow maxsocks to be updated */
04583                 if (maxsocks > 0 && socketmgr->maxsocks != maxsocks)
04584                         return (ISC_R_EXISTS);
04585 
04586                 socketmgr->refs++;
04587                 *managerp = (isc_socketmgr_t *)socketmgr;
04588                 return (ISC_R_SUCCESS);
04589         }
04590 #endif /* USE_SHARED_MANAGER */
04591 
04592         if (maxsocks == 0)
04593                 maxsocks = ISC_SOCKET_MAXSOCKETS;
04594 
04595         manager = isc_mem_get(mctx, sizeof(*manager));
04596         if (manager == NULL)
04597                 return (ISC_R_NOMEMORY);
04598 
04599         /* zero-clear so that necessary cleanup on failure will be easy */
04600         memset(manager, 0, sizeof(*manager));
04601         manager->maxsocks = maxsocks;
04602         manager->reserved = 0;
04603         manager->maxudp = 0;
04604         manager->fds = isc_mem_get(mctx,
04605                                    manager->maxsocks * sizeof(isc__socket_t *));
04606         if (manager->fds == NULL) {
04607                 result = ISC_R_NOMEMORY;
04608                 goto free_manager;
04609         }
04610         manager->fdstate = isc_mem_get(mctx, manager->maxsocks * sizeof(int));
04611         if (manager->fdstate == NULL) {
04612                 result = ISC_R_NOMEMORY;
04613                 goto free_manager;
04614         }
04615         manager->stats = NULL;
04616 
04617         manager->common.methods = &socketmgrmethods;
04618         manager->common.magic = ISCAPI_SOCKETMGR_MAGIC;
04619         manager->common.impmagic = SOCKET_MANAGER_MAGIC;
04620         manager->mctx = NULL;
04621         memset(manager->fds, 0, manager->maxsocks * sizeof(isc_socket_t *));
04622         ISC_LIST_INIT(manager->socklist);
04623         result = isc_mutex_init(&manager->lock);
04624         if (result != ISC_R_SUCCESS)
04625                 goto free_manager;
04626         manager->fdlock = isc_mem_get(mctx, FDLOCK_COUNT * sizeof(isc_mutex_t));
04627         if (manager->fdlock == NULL) {
04628                 result = ISC_R_NOMEMORY;
04629                 goto cleanup_lock;
04630         }
04631         for (i = 0; i < FDLOCK_COUNT; i++) {
04632                 result = isc_mutex_init(&manager->fdlock[i]);
04633                 if (result != ISC_R_SUCCESS) {
04634                         while (--i >= 0)
04635                                 DESTROYLOCK(&manager->fdlock[i]);
04636                         isc_mem_put(mctx, manager->fdlock,
04637                                     FDLOCK_COUNT * sizeof(isc_mutex_t));
04638                         manager->fdlock = NULL;
04639                         goto cleanup_lock;
04640                 }
04641         }
04642 
04643 #ifdef USE_WATCHER_THREAD
04644         if (isc_condition_init(&manager->shutdown_ok) != ISC_R_SUCCESS) {
04645                 UNEXPECTED_ERROR(__FILE__, __LINE__,
04646                                  "isc_condition_init() %s",
04647                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
04648                                                 ISC_MSG_FAILED, "failed"));
04649                 result = ISC_R_UNEXPECTED;
04650                 goto cleanup_lock;
04651         }
04652 
04653         /*
04654          * Create the special fds that will be used to wake up the
04655          * select/poll loop when something internal needs to be done.
04656          */
04657         if (pipe(manager->pipe_fds) != 0) {
04658                 isc__strerror(errno, strbuf, sizeof(strbuf));
04659                 UNEXPECTED_ERROR(__FILE__, __LINE__,
04660                                  "pipe() %s: %s",
04661                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
04662                                                 ISC_MSG_FAILED, "failed"),
04663                                  strbuf);
04664                 result = ISC_R_UNEXPECTED;
04665                 goto cleanup_condition;
04666         }
04667 
04668         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[0]) == ISC_R_SUCCESS);
04669 #if 0
04670         RUNTIME_CHECK(make_nonblock(manager->pipe_fds[1]) == ISC_R_SUCCESS);
04671 #endif
04672 #endif  /* USE_WATCHER_THREAD */
04673 
04674 #ifdef USE_SHARED_MANAGER
04675         manager->refs = 1;
04676 #endif /* USE_SHARED_MANAGER */
04677 
04678         /*
04679          * Set up initial state for the select loop
04680          */
04681         result = setup_watcher(mctx, manager);
04682         if (result != ISC_R_SUCCESS)
04683                 goto cleanup;
04684         memset(manager->fdstate, 0, manager->maxsocks * sizeof(int));
04685 #ifdef USE_WATCHER_THREAD
04686         /*
04687          * Start up the select/poll thread.
04688          */
04689         if (isc_thread_create(watcher, manager, &manager->watcher) !=
04690             ISC_R_SUCCESS) {
04691                 UNEXPECTED_ERROR(__FILE__, __LINE__,
04692                                  "isc_thread_create() %s",
04693                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
04694                                                 ISC_MSG_FAILED, "failed"));
04695                 cleanup_watcher(mctx, manager);
04696                 result = ISC_R_UNEXPECTED;
04697                 goto cleanup;
04698         }
04699 #endif /* USE_WATCHER_THREAD */
04700         isc_mem_attach(mctx, &manager->mctx);
04701 
04702 #ifdef USE_SHARED_MANAGER
04703         socketmgr = manager;
04704 #endif /* USE_SHARED_MANAGER */
04705         *managerp = (isc_socketmgr_t *)manager;
04706 
04707         return (ISC_R_SUCCESS);
04708 
04709 cleanup:
04710 #ifdef USE_WATCHER_THREAD
04711         (void)close(manager->pipe_fds[0]);
04712         (void)close(manager->pipe_fds[1]);
04713 #endif  /* USE_WATCHER_THREAD */
04714 
04715 #ifdef USE_WATCHER_THREAD
04716 cleanup_condition:
04717         (void)isc_condition_destroy(&manager->shutdown_ok);
04718 #endif  /* USE_WATCHER_THREAD */
04719 
04720 
04721 cleanup_lock:
04722         if (manager->fdlock != NULL) {
04723                 for (i = 0; i < FDLOCK_COUNT; i++)
04724                         DESTROYLOCK(&manager->fdlock[i]);
04725         }
04726         DESTROYLOCK(&manager->lock);
04727 
04728 free_manager:
04729         if (manager->fdlock != NULL) {
04730                 isc_mem_put(mctx, manager->fdlock,
04731                             FDLOCK_COUNT * sizeof(isc_mutex_t));
04732         }
04733         if (manager->fdstate != NULL) {
04734                 isc_mem_put(mctx, manager->fdstate,
04735                             manager->maxsocks * sizeof(int));
04736         }
04737         if (manager->fds != NULL) {
04738                 isc_mem_put(mctx, manager->fds,
04739                             manager->maxsocks * sizeof(isc_socket_t *));
04740         }
04741         isc_mem_put(mctx, manager, sizeof(*manager));
04742 
04743         return (result);
04744 }
04745 
04746 isc_result_t
04747 isc_socketmgr_getmaxsockets(isc_socketmgr_t *manager0, unsigned int *nsockp) {
04748         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
04749         REQUIRE(VALID_MANAGER(manager));
04750         REQUIRE(nsockp != NULL);
04751 
04752         *nsockp = manager->maxsocks;
04753 
04754         return (ISC_R_SUCCESS);
04755 }
04756 
04757 void
04758 isc_socketmgr_setstats(isc_socketmgr_t *manager0, isc_stats_t *stats) {
04759         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
04760 
04761         REQUIRE(VALID_MANAGER(manager));
04762         REQUIRE(ISC_LIST_EMPTY(manager->socklist));
04763         REQUIRE(manager->stats == NULL);
04764         REQUIRE(isc_stats_ncounters(stats) == isc_sockstatscounter_max);
04765 
04766         isc_stats_attach(stats, &manager->stats);
04767 }
04768 
04769 void
04770 isc__socketmgr_destroy(isc_socketmgr_t **managerp) {
04771         isc__socketmgr_t *manager;
04772         int i;
04773         isc_mem_t *mctx;
04774 
04775         /*
04776          * Destroy a socket manager.
04777          */
04778 
04779         REQUIRE(managerp != NULL);
04780         manager = (isc__socketmgr_t *)*managerp;
04781         REQUIRE(VALID_MANAGER(manager));
04782 
04783 #ifdef USE_SHARED_MANAGER
04784         manager->refs--;
04785         if (manager->refs > 0) {
04786                 *managerp = NULL;
04787                 return;
04788         }
04789         socketmgr = NULL;
04790 #endif /* USE_SHARED_MANAGER */
04791 
04792         LOCK(&manager->lock);
04793 
04794         /*
04795          * Wait for all sockets to be destroyed.
04796          */
04797         while (!ISC_LIST_EMPTY(manager->socklist)) {
04798 #ifdef USE_WATCHER_THREAD
04799                 manager_log(manager, CREATION, "%s",
04800                             isc_msgcat_get(isc_msgcat, ISC_MSGSET_SOCKET,
04801                                            ISC_MSG_SOCKETSREMAIN,
04802                                            "sockets exist"));
04803                 WAIT(&manager->shutdown_ok, &manager->lock);
04804 #else /* USE_WATCHER_THREAD */
04805                 UNLOCK(&manager->lock);
04806                 isc__taskmgr_dispatch(NULL);
04807                 LOCK(&manager->lock);
04808 #endif /* USE_WATCHER_THREAD */
04809         }
04810 
04811         UNLOCK(&manager->lock);
04812 
04813         /*
04814          * Here, poke our select/poll thread.  Do this by closing the write
04815          * half of the pipe, which will send EOF to the read half.
04816          * This is currently a no-op in the non-threaded case.
04817          */
04818         select_poke(manager, 0, SELECT_POKE_SHUTDOWN);
04819 
04820 #ifdef USE_WATCHER_THREAD
04821         /*
04822          * Wait for thread to exit.
04823          */
04824         if (isc_thread_join(manager->watcher, NULL) != ISC_R_SUCCESS)
04825                 UNEXPECTED_ERROR(__FILE__, __LINE__,
04826                                  "isc_thread_join() %s",
04827                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
04828                                                 ISC_MSG_FAILED, "failed"));
04829 #endif /* USE_WATCHER_THREAD */
04830 
04831         /*
04832          * Clean up.
04833          */
04834         cleanup_watcher(manager->mctx, manager);
04835 
04836 #ifdef USE_WATCHER_THREAD
04837         (void)close(manager->pipe_fds[0]);
04838         (void)close(manager->pipe_fds[1]);
04839         (void)isc_condition_destroy(&manager->shutdown_ok);
04840 #endif /* USE_WATCHER_THREAD */
04841 
04842         for (i = 0; i < (int)manager->maxsocks; i++)
04843                 if (manager->fdstate[i] == CLOSE_PENDING) /* no need to lock */
04844                         (void)close(i);
04845 
04846         isc_mem_put(manager->mctx, manager->fds,
04847                     manager->maxsocks * sizeof(isc__socket_t *));
04848         isc_mem_put(manager->mctx, manager->fdstate,
04849                     manager->maxsocks * sizeof(int));
04850 
04851         if (manager->stats != NULL)
04852                 isc_stats_detach(&manager->stats);
04853 
04854         if (manager->fdlock != NULL) {
04855                 for (i = 0; i < FDLOCK_COUNT; i++)
04856                         DESTROYLOCK(&manager->fdlock[i]);
04857                 isc_mem_put(manager->mctx, manager->fdlock,
04858                             FDLOCK_COUNT * sizeof(isc_mutex_t));
04859         }
04860         DESTROYLOCK(&manager->lock);
04861         manager->common.magic = 0;
04862         manager->common.impmagic = 0;
04863         mctx= manager->mctx;
04864         isc_mem_put(mctx, manager, sizeof(*manager));
04865 
04866         isc_mem_detach(&mctx);
04867 
04868         *managerp = NULL;
04869 
04870 #ifdef USE_SHARED_MANAGER
04871         socketmgr = NULL;
04872 #endif
04873 }
04874 
04875 static isc_result_t
04876 socket_recv(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
04877             unsigned int flags)
04878 {
04879         int io_state;
04880         isc_boolean_t have_lock = ISC_FALSE;
04881         isc_task_t *ntask = NULL;
04882         isc_result_t result = ISC_R_SUCCESS;
04883 
04884         dev->ev_sender = task;
04885 
04886         if (sock->type == isc_sockettype_udp) {
04887                 io_state = doio_recv(sock, dev);
04888         } else {
04889                 LOCK(&sock->lock);
04890                 have_lock = ISC_TRUE;
04891 
04892                 if (ISC_LIST_EMPTY(sock->recv_list))
04893                         io_state = doio_recv(sock, dev);
04894                 else
04895                         io_state = DOIO_SOFT;
04896         }
04897 
04898         switch (io_state) {
04899         case DOIO_SOFT:
04900                 /*
04901                  * We couldn't read all or part of the request right now, so
04902                  * queue it.
04903                  *
04904                  * Attach to socket and to task
04905                  */
04906                 isc_task_attach(task, &ntask);
04907                 dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
04908 
04909                 if (!have_lock) {
04910                         LOCK(&sock->lock);
04911                         have_lock = ISC_TRUE;
04912                 }
04913 
04914                 /*
04915                  * Enqueue the request.  If the socket was previously not being
04916                  * watched, poke the watcher to start paying attention to it.
04917                  */
04918                 if (ISC_LIST_EMPTY(sock->recv_list) && !sock->pending_recv)
04919                         select_poke(sock->manager, sock->fd, SELECT_POKE_READ);
04920                 ISC_LIST_ENQUEUE(sock->recv_list, dev, ev_link);
04921 
04922                 socket_log(sock, NULL, EVENT, NULL, 0, 0,
04923                            "socket_recv: event %p -> task %p",
04924                            dev, ntask);
04925 
04926                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
04927                         result = ISC_R_INPROGRESS;
04928                 break;
04929 
04930         case DOIO_EOF:
04931                 dev->result = ISC_R_EOF;
04932                 /* fallthrough */
04933 
04934         case DOIO_HARD:
04935         case DOIO_SUCCESS:
04936                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
04937                         send_recvdone_event(sock, &dev);
04938                 break;
04939         }
04940 
04941         if (have_lock)
04942                 UNLOCK(&sock->lock);
04943 
04944         return (result);
04945 }
04946 
04947 isc_result_t
04948 isc__socket_recvv(isc_socket_t *sock0, isc_bufferlist_t *buflist,
04949                   unsigned int minimum, isc_task_t *task,
04950                   isc_taskaction_t action, void *arg)
04951 {
04952         isc__socket_t *sock = (isc__socket_t *)sock0;
04953         isc_socketevent_t *dev;
04954         isc__socketmgr_t *manager;
04955         unsigned int iocount;
04956         isc_buffer_t *buffer;
04957 
04958         REQUIRE(VALID_SOCKET(sock));
04959         REQUIRE(buflist != NULL);
04960         REQUIRE(!ISC_LIST_EMPTY(*buflist));
04961         REQUIRE(task != NULL);
04962         REQUIRE(action != NULL);
04963 
04964         manager = sock->manager;
04965         REQUIRE(VALID_MANAGER(manager));
04966 
04967         iocount = isc_bufferlist_availablecount(buflist);
04968         REQUIRE(iocount > 0);
04969 
04970         INSIST(sock->bound);
04971 
04972         dev = allocate_socketevent(manager->mctx, sock,
04973                                    ISC_SOCKEVENT_RECVDONE, action, arg);
04974         if (dev == NULL)
04975                 return (ISC_R_NOMEMORY);
04976 
04977         /*
04978          * UDP sockets are always partial read
04979          */
04980         if (sock->type == isc_sockettype_udp)
04981                 dev->minimum = 1;
04982         else {
04983                 if (minimum == 0)
04984                         dev->minimum = iocount;
04985                 else
04986                         dev->minimum = minimum;
04987         }
04988 
04989         /*
04990          * Move each buffer from the passed in list to our internal one.
04991          */
04992         buffer = ISC_LIST_HEAD(*buflist);
04993         while (buffer != NULL) {
04994                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
04995                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
04996                 buffer = ISC_LIST_HEAD(*buflist);
04997         }
04998 
04999         return (socket_recv(sock, dev, task, 0));
05000 }
05001 
05002 isc_result_t
05003 isc__socket_recv(isc_socket_t *sock0, isc_region_t *region,
05004                  unsigned int minimum, isc_task_t *task,
05005                  isc_taskaction_t action, void *arg)
05006 {
05007         isc__socket_t *sock = (isc__socket_t *)sock0;
05008         isc_socketevent_t *dev;
05009         isc__socketmgr_t *manager;
05010 
05011         REQUIRE(VALID_SOCKET(sock));
05012         REQUIRE(action != NULL);
05013 
05014         manager = sock->manager;
05015         REQUIRE(VALID_MANAGER(manager));
05016 
05017         INSIST(sock->bound);
05018 
05019         dev = allocate_socketevent(manager->mctx, sock,
05020                                    ISC_SOCKEVENT_RECVDONE, action, arg);
05021         if (dev == NULL)
05022                 return (ISC_R_NOMEMORY);
05023 
05024         return (isc__socket_recv2(sock0, region, minimum, task, dev, 0));
05025 }
05026 
05027 isc_result_t
05028 isc__socket_recv2(isc_socket_t *sock0, isc_region_t *region,
05029                   unsigned int minimum, isc_task_t *task,
05030                   isc_socketevent_t *event, unsigned int flags)
05031 {
05032         isc__socket_t *sock = (isc__socket_t *)sock0;
05033 
05034         event->ev_sender = sock;
05035         event->result = ISC_R_UNSET;
05036         ISC_LIST_INIT(event->bufferlist);
05037         event->region = *region;
05038         event->n = 0;
05039         event->offset = 0;
05040         event->attributes = 0;
05041 
05042         /*
05043          * UDP sockets are always partial read.
05044          */
05045         if (sock->type == isc_sockettype_udp)
05046                 event->minimum = 1;
05047         else {
05048                 if (minimum == 0)
05049                         event->minimum = region->length;
05050                 else
05051                         event->minimum = minimum;
05052         }
05053 
05054         return (socket_recv(sock, event, task, flags));
05055 }
05056 
05057 static isc_result_t
05058 socket_send(isc__socket_t *sock, isc_socketevent_t *dev, isc_task_t *task,
05059             isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
05060             unsigned int flags)
05061 {
05062         int io_state;
05063         isc_boolean_t have_lock = ISC_FALSE;
05064         isc_task_t *ntask = NULL;
05065         isc_result_t result = ISC_R_SUCCESS;
05066 
05067         dev->ev_sender = task;
05068 
05069         set_dev_address(address, sock, dev);
05070         if (pktinfo != NULL) {
05071                 dev->attributes |= ISC_SOCKEVENTATTR_PKTINFO;
05072                 dev->pktinfo = *pktinfo;
05073 
05074                 if (!isc_sockaddr_issitelocal(&dev->address) &&
05075                     !isc_sockaddr_islinklocal(&dev->address)) {
05076                         socket_log(sock, NULL, TRACE, isc_msgcat,
05077                                    ISC_MSGSET_SOCKET, ISC_MSG_PKTINFOPROVIDED,
05078                                    "pktinfo structure provided, ifindex %u "
05079                                    "(set to 0)", pktinfo->ipi6_ifindex);
05080 
05081                         /*
05082                          * Set the pktinfo index to 0 here, to let the
05083                          * kernel decide what interface it should send on.
05084                          */
05085                         dev->pktinfo.ipi6_ifindex = 0;
05086                 }
05087         }
05088 
05089         if (sock->type == isc_sockettype_udp)
05090                 io_state = doio_send(sock, dev);
05091         else {
05092                 LOCK(&sock->lock);
05093                 have_lock = ISC_TRUE;
05094 
05095                 if (ISC_LIST_EMPTY(sock->send_list))
05096                         io_state = doio_send(sock, dev);
05097                 else
05098                         io_state = DOIO_SOFT;
05099         }
05100 
05101         switch (io_state) {
05102         case DOIO_SOFT:
05103                 /*
05104                  * We couldn't send all or part of the request right now, so
05105                  * queue it unless ISC_SOCKFLAG_NORETRY is set.
05106                  */
05107                 if ((flags & ISC_SOCKFLAG_NORETRY) == 0) {
05108                         isc_task_attach(task, &ntask);
05109                         dev->attributes |= ISC_SOCKEVENTATTR_ATTACHED;
05110 
05111                         if (!have_lock) {
05112                                 LOCK(&sock->lock);
05113                                 have_lock = ISC_TRUE;
05114                         }
05115 
05116                         /*
05117                          * Enqueue the request.  If the socket was previously
05118                          * not being watched, poke the watcher to start
05119                          * paying attention to it.
05120                          */
05121                         if (ISC_LIST_EMPTY(sock->send_list) &&
05122                             !sock->pending_send)
05123                                 select_poke(sock->manager, sock->fd,
05124                                             SELECT_POKE_WRITE);
05125                         ISC_LIST_ENQUEUE(sock->send_list, dev, ev_link);
05126 
05127                         socket_log(sock, NULL, EVENT, NULL, 0, 0,
05128                                    "socket_send: event %p -> task %p",
05129                                    dev, ntask);
05130 
05131                         if ((flags & ISC_SOCKFLAG_IMMEDIATE) != 0)
05132                                 result = ISC_R_INPROGRESS;
05133                         break;
05134                 }
05135 
05136         case DOIO_HARD:
05137         case DOIO_SUCCESS:
05138                 if ((flags & ISC_SOCKFLAG_IMMEDIATE) == 0)
05139                         send_senddone_event(sock, &dev);
05140                 break;
05141         }
05142 
05143         if (have_lock)
05144                 UNLOCK(&sock->lock);
05145 
05146         return (result);
05147 }
05148 
05149 isc_result_t
05150 isc__socket_send(isc_socket_t *sock, isc_region_t *region,
05151                  isc_task_t *task, isc_taskaction_t action, void *arg)
05152 {
05153         /*
05154          * REQUIRE() checking is performed in isc_socket_sendto().
05155          */
05156         return (isc__socket_sendto(sock, region, task, action, arg, NULL,
05157                                    NULL));
05158 }
05159 
05160 isc_result_t
05161 isc__socket_sendto(isc_socket_t *sock0, isc_region_t *region,
05162                    isc_task_t *task, isc_taskaction_t action, void *arg,
05163                    isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
05164 {
05165         isc__socket_t *sock = (isc__socket_t *)sock0;
05166         isc_socketevent_t *dev;
05167         isc__socketmgr_t *manager;
05168 
05169         REQUIRE(VALID_SOCKET(sock));
05170         REQUIRE(region != NULL);
05171         REQUIRE(task != NULL);
05172         REQUIRE(action != NULL);
05173 
05174         manager = sock->manager;
05175         REQUIRE(VALID_MANAGER(manager));
05176 
05177         INSIST(sock->bound);
05178 
05179         dev = allocate_socketevent(manager->mctx, sock,
05180                                    ISC_SOCKEVENT_SENDDONE, action, arg);
05181         if (dev == NULL)
05182                 return (ISC_R_NOMEMORY);
05183 
05184         dev->region = *region;
05185 
05186         return (socket_send(sock, dev, task, address, pktinfo, 0));
05187 }
05188 
05189 isc_result_t
05190 isc__socket_sendv(isc_socket_t *sock, isc_bufferlist_t *buflist,
05191                   isc_task_t *task, isc_taskaction_t action, void *arg)
05192 {
05193         return (isc__socket_sendtov2(sock, buflist, task, action, arg, NULL,
05194                                      NULL, 0));
05195 }
05196 
05197 isc_result_t
05198 isc__socket_sendtov(isc_socket_t *sock, isc_bufferlist_t *buflist,
05199                     isc_task_t *task, isc_taskaction_t action, void *arg,
05200                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo)
05201 {
05202         return (isc__socket_sendtov2(sock, buflist, task, action, arg, address,
05203                                      pktinfo, 0));
05204 }
05205 
05206 isc_result_t
05207 isc__socket_sendtov2(isc_socket_t *sock0, isc_bufferlist_t *buflist,
05208                      isc_task_t *task, isc_taskaction_t action, void *arg,
05209                      isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
05210                      unsigned int flags)
05211 {
05212         isc__socket_t *sock = (isc__socket_t *)sock0;
05213         isc_socketevent_t *dev;
05214         isc__socketmgr_t *manager;
05215         unsigned int iocount;
05216         isc_buffer_t *buffer;
05217 
05218         REQUIRE(VALID_SOCKET(sock));
05219         REQUIRE(buflist != NULL);
05220         REQUIRE(!ISC_LIST_EMPTY(*buflist));
05221         REQUIRE(task != NULL);
05222         REQUIRE(action != NULL);
05223 
05224         manager = sock->manager;
05225         REQUIRE(VALID_MANAGER(manager));
05226 
05227         iocount = isc_bufferlist_usedcount(buflist);
05228         REQUIRE(iocount > 0);
05229 
05230         dev = allocate_socketevent(manager->mctx, sock,
05231                                    ISC_SOCKEVENT_SENDDONE, action, arg);
05232         if (dev == NULL)
05233                 return (ISC_R_NOMEMORY);
05234 
05235         /*
05236          * Move each buffer from the passed in list to our internal one.
05237          */
05238         buffer = ISC_LIST_HEAD(*buflist);
05239         while (buffer != NULL) {
05240                 ISC_LIST_DEQUEUE(*buflist, buffer, link);
05241                 ISC_LIST_ENQUEUE(dev->bufferlist, buffer, link);
05242                 buffer = ISC_LIST_HEAD(*buflist);
05243         }
05244 
05245         return (socket_send(sock, dev, task, address, pktinfo, flags));
05246 }
05247 
05248 isc_result_t
05249 isc__socket_sendto2(isc_socket_t *sock0, isc_region_t *region,
05250                     isc_task_t *task,
05251                     isc_sockaddr_t *address, struct in6_pktinfo *pktinfo,
05252                     isc_socketevent_t *event, unsigned int flags)
05253 {
05254         isc__socket_t *sock = (isc__socket_t *)sock0;
05255 
05256         REQUIRE(VALID_SOCKET(sock));
05257         REQUIRE((flags & ~(ISC_SOCKFLAG_IMMEDIATE|ISC_SOCKFLAG_NORETRY)) == 0);
05258         if ((flags & ISC_SOCKFLAG_NORETRY) != 0)
05259                 REQUIRE(sock->type == isc_sockettype_udp);
05260         event->ev_sender = sock;
05261         event->result = ISC_R_UNSET;
05262         ISC_LIST_INIT(event->bufferlist);
05263         event->region = *region;
05264         event->n = 0;
05265         event->offset = 0;
05266         event->attributes &= ~ISC_SOCKEVENTATTR_ATTACHED;
05267 
05268         return (socket_send(sock, event, task, address, pktinfo, flags));
05269 }
05270 
05271 void
05272 isc__socket_cleanunix(isc_sockaddr_t *sockaddr, isc_boolean_t active) {
05273 #ifdef ISC_PLATFORM_HAVESYSUNH
05274         int s;
05275         struct stat sb;
05276         char strbuf[ISC_STRERRORSIZE];
05277 
05278         if (sockaddr->type.sa.sa_family != AF_UNIX)
05279                 return;
05280 
05281 #ifndef S_ISSOCK
05282 #if defined(S_IFMT) && defined(S_IFSOCK)
05283 #define S_ISSOCK(mode) ((mode & S_IFMT)==S_IFSOCK)
05284 #elif defined(_S_IFMT) && defined(S_IFSOCK)
05285 #define S_ISSOCK(mode) ((mode & _S_IFMT)==S_IFSOCK)
05286 #endif
05287 #endif
05288 
05289 #ifndef S_ISFIFO
05290 #if defined(S_IFMT) && defined(S_IFIFO)
05291 #define S_ISFIFO(mode) ((mode & S_IFMT)==S_IFIFO)
05292 #elif defined(_S_IFMT) && defined(S_IFIFO)
05293 #define S_ISFIFO(mode) ((mode & _S_IFMT)==S_IFIFO)
05294 #endif
05295 #endif
05296 
05297 #if !defined(S_ISFIFO) && !defined(S_ISSOCK)
05298 #error You need to define S_ISFIFO and S_ISSOCK as appropriate for your platform.  See <sys/stat.h>.
05299 #endif
05300 
05301 #ifndef S_ISFIFO
05302 #define S_ISFIFO(mode) 0
05303 #endif
05304 
05305 #ifndef S_ISSOCK
05306 #define S_ISSOCK(mode) 0
05307 #endif
05308 
05309         if (active) {
05310                 if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
05311                         isc__strerror(errno, strbuf, sizeof(strbuf));
05312                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
05313                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
05314                                       "isc_socket_cleanunix: stat(%s): %s",
05315                                       sockaddr->type.sunix.sun_path, strbuf);
05316                         return;
05317                 }
05318                 if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
05319                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
05320                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
05321                                       "isc_socket_cleanunix: %s: not a socket",
05322                                       sockaddr->type.sunix.sun_path);
05323                         return;
05324                 }
05325                 if (unlink(sockaddr->type.sunix.sun_path) < 0) {
05326                         isc__strerror(errno, strbuf, sizeof(strbuf));
05327                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
05328                                       ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
05329                                       "isc_socket_cleanunix: unlink(%s): %s",
05330                                       sockaddr->type.sunix.sun_path, strbuf);
05331                 }
05332                 return;
05333         }
05334 
05335         s = socket(AF_UNIX, SOCK_STREAM, 0);
05336         if (s < 0) {
05337                 isc__strerror(errno, strbuf, sizeof(strbuf));
05338                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
05339                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
05340                               "isc_socket_cleanunix: socket(%s): %s",
05341                               sockaddr->type.sunix.sun_path, strbuf);
05342                 return;
05343         }
05344 
05345         if (stat(sockaddr->type.sunix.sun_path, &sb) < 0) {
05346                 switch (errno) {
05347                 case ENOENT:    /* We exited cleanly last time */
05348                         break;
05349                 default:
05350                         isc__strerror(errno, strbuf, sizeof(strbuf));
05351                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
05352                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
05353                                       "isc_socket_cleanunix: stat(%s): %s",
05354                                       sockaddr->type.sunix.sun_path, strbuf);
05355                         break;
05356                 }
05357                 goto cleanup;
05358         }
05359 
05360         if (!(S_ISSOCK(sb.st_mode) || S_ISFIFO(sb.st_mode))) {
05361                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
05362                               ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
05363                               "isc_socket_cleanunix: %s: not a socket",
05364                               sockaddr->type.sunix.sun_path);
05365                 goto cleanup;
05366         }
05367 
05368         if (connect(s, (struct sockaddr *)&sockaddr->type.sunix,
05369                     sizeof(sockaddr->type.sunix)) < 0) {
05370                 switch (errno) {
05371                 case ECONNREFUSED:
05372                 case ECONNRESET:
05373                         if (unlink(sockaddr->type.sunix.sun_path) < 0) {
05374                                 isc__strerror(errno, strbuf, sizeof(strbuf));
05375                                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
05376                                               ISC_LOGMODULE_SOCKET,
05377                                               ISC_LOG_WARNING,
05378                                               "isc_socket_cleanunix: "
05379                                               "unlink(%s): %s",
05380                                               sockaddr->type.sunix.sun_path,
05381                                               strbuf);
05382                         }
05383                         break;
05384                 default:
05385                         isc__strerror(errno, strbuf, sizeof(strbuf));
05386                         isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
05387                                       ISC_LOGMODULE_SOCKET, ISC_LOG_WARNING,
05388                                       "isc_socket_cleanunix: connect(%s): %s",
05389                                       sockaddr->type.sunix.sun_path, strbuf);
05390                         break;
05391                 }
05392         }
05393  cleanup:
05394         close(s);
05395 #else
05396         UNUSED(sockaddr);
05397         UNUSED(active);
05398 #endif
05399 }
05400 
05401 isc_result_t
05402 isc__socket_permunix(isc_sockaddr_t *sockaddr, isc_uint32_t perm,
05403                     isc_uint32_t owner, isc_uint32_t group)
05404 {
05405 #ifdef ISC_PLATFORM_HAVESYSUNH
05406         isc_result_t result = ISC_R_SUCCESS;
05407         char strbuf[ISC_STRERRORSIZE];
05408         char path[sizeof(sockaddr->type.sunix.sun_path)];
05409 #ifdef NEED_SECURE_DIRECTORY
05410         char *slash;
05411 #endif
05412 
05413         REQUIRE(sockaddr->type.sa.sa_family == AF_UNIX);
05414         INSIST(strlen(sockaddr->type.sunix.sun_path) < sizeof(path));
05415         strcpy(path, sockaddr->type.sunix.sun_path);
05416 
05417 #ifdef NEED_SECURE_DIRECTORY
05418         slash = strrchr(path, '/');
05419         if (slash != NULL) {
05420                 if (slash != path)
05421                         *slash = '\0';
05422                 else
05423                         strcpy(path, "/");
05424         } else
05425                 strcpy(path, ".");
05426 #endif
05427 
05428         if (chmod(path, perm) < 0) {
05429                 isc__strerror(errno, strbuf, sizeof(strbuf));
05430                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
05431                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
05432                               "isc_socket_permunix: chmod(%s, %d): %s",
05433                               path, perm, strbuf);
05434                 result = ISC_R_FAILURE;
05435         }
05436         if (chown(path, owner, group) < 0) {
05437                 isc__strerror(errno, strbuf, sizeof(strbuf));
05438                 isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
05439                               ISC_LOGMODULE_SOCKET, ISC_LOG_ERROR,
05440                               "isc_socket_permunix: chown(%s, %d, %d): %s",
05441                               path, owner, group,
05442                               strbuf);
05443                 result = ISC_R_FAILURE;
05444         }
05445         return (result);
05446 #else
05447         UNUSED(sockaddr);
05448         UNUSED(perm);
05449         UNUSED(owner);
05450         UNUSED(group);
05451         return (ISC_R_NOTIMPLEMENTED);
05452 #endif
05453 }
05454 
05455 isc_result_t
05456 isc__socket_bind(isc_socket_t *sock0, isc_sockaddr_t *sockaddr,
05457                  unsigned int options) {
05458         isc__socket_t *sock = (isc__socket_t *)sock0;
05459         char strbuf[ISC_STRERRORSIZE];
05460         int on = 1;
05461 
05462         REQUIRE(VALID_SOCKET(sock));
05463 
05464         LOCK(&sock->lock);
05465 
05466         INSIST(!sock->bound);
05467         INSIST(!sock->dupped);
05468 
05469         if (sock->pf != sockaddr->type.sa.sa_family) {
05470                 UNLOCK(&sock->lock);
05471                 return (ISC_R_FAMILYMISMATCH);
05472         }
05473 
05474         /*
05475          * Only set SO_REUSEADDR when we want a specific port.
05476          */
05477 #ifdef AF_UNIX
05478         if (sock->pf == AF_UNIX)
05479                 goto bind_socket;
05480 #endif
05481         if ((options & ISC_SOCKET_REUSEADDRESS) != 0 &&
05482             isc_sockaddr_getport(sockaddr) != (in_port_t)0 &&
05483             setsockopt(sock->fd, SOL_SOCKET, SO_REUSEADDR, (void *)&on,
05484                        sizeof(on)) < 0) {
05485                 UNEXPECTED_ERROR(__FILE__, __LINE__,
05486                                  "setsockopt(%d) %s", sock->fd,
05487                                  isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL,
05488                                                 ISC_MSG_FAILED, "failed"));
05489                 /* Press on... */
05490         }
05491 #ifdef AF_UNIX
05492  bind_socket:
05493 #endif
05494         if (bind(sock->fd, &sockaddr->type.sa, sockaddr->length) < 0) {
05495                 inc_stats(sock->manager->stats,
05496                           sock->statsindex[STATID_BINDFAIL]);
05497 
05498                 UNLOCK(&sock->lock);
05499                 switch (errno) {
05500                 case EACCES:
05501                         return (ISC_R_NOPERM);
05502                 case EADDRNOTAVAIL:
05503                         return (ISC_R_ADDRNOTAVAIL);
05504                 case EADDRINUSE:
05505                         return (ISC_R_ADDRINUSE);
05506                 case EINVAL:
05507                         return (ISC_R_BOUND);
05508                 default:
05509                         isc__strerror(errno, strbuf, sizeof(strbuf));
05510                         UNEXPECTED_ERROR(__FILE__, __LINE__, "bind: %s",
05511                                          strbuf);
05512                         return (ISC_R_UNEXPECTED);
05513                 }
05514         }
05515 
05516         socket_log(sock, sockaddr, TRACE,
05517                    isc_msgcat, ISC_MSGSET_SOCKET, ISC_MSG_BOUND, "bound");
05518         sock->bound = 1;
05519 
05520         UNLOCK(&sock->lock);
05521         return (ISC_R_SUCCESS);
05522 }
05523 
05524 /*
05525  * Enable this only for specific OS versions, and only when they have repaired
05526  * their problems with it.  Until then, this is broken and needs to be
05527  * disabled by default.  See RT22589 for details.
05528  */
05529 #undef ENABLE_ACCEPTFILTER
05530 
05531 isc_result_t
05532 isc__socket_filter(isc_socket_t *sock0, const char *filter) {
05533         isc__socket_t *sock = (isc__socket_t *)sock0;
05534 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
05535         char strbuf[ISC_STRERRORSIZE];
05536         struct accept_filter_arg afa;
05537 #else
05538         UNUSED(sock);
05539         UNUSED(filter);
05540 #endif
05541 
05542         REQUIRE(VALID_SOCKET(sock));
05543 
05544 #if defined(SO_ACCEPTFILTER) && defined(ENABLE_ACCEPTFILTER)
05545         bzero(&afa, sizeof(afa));
05546         strncpy(afa.af_name, filter, sizeof(afa.af_name));
05547         if (setsockopt(sock->fd, SOL_SOCKET, SO_ACCEPTFILTER,
05548                          &afa, sizeof(afa)) == -1) {
05549                 isc__strerror(errno, strbuf, sizeof(strbuf));
05550                 socket_log(sock, NULL, CREATION, isc_msgcat, ISC_MSGSET_SOCKET,
05551                            ISC_MSG_FILTER, "setsockopt(SO_ACCEPTFILTER): %s",
05552                            strbuf);
05553                 return (ISC_R_FAILURE);
05554         }
05555         return (ISC_R_SUCCESS);
05556 #else
05557         return (ISC_R_NOTIMPLEMENTED);
05558 #endif
05559 }
05560 
05561 /*
05562  * Set up to listen on a given socket.  We do this by creating an internal
05563  * event that will be dispatched when the socket has read activity.  The
05564  * watcher will send the internal event to the task when there is a new
05565  * connection.
05566  *
05567  * Unlike in read, we don't preallocate a done event here.  Every time there
05568  * is a new connection we'll have to allocate a new one anyway, so we might
05569  * as well keep things simple rather than having to track them.
05570  */
05571 isc_result_t
05572 isc__socket_listen(isc_socket_t *sock0, unsigned int backlog) {
05573         isc__socket_t *sock = (isc__socket_t *)sock0;
05574         char strbuf[ISC_STRERRORSIZE];
05575 
05576         REQUIRE(VALID_SOCKET(sock));
05577 
05578         LOCK(&sock->lock);
05579 
05580         REQUIRE(!sock->listener);
05581         REQUIRE(sock->bound);
05582         REQUIRE(sock->type == isc_sockettype_tcp ||
05583                 sock->type == isc_sockettype_unix);
05584 
05585         if (backlog == 0)
05586                 backlog = SOMAXCONN;
05587 
05588         if (listen(sock->fd, (int)backlog) < 0) {
05589                 UNLOCK(&sock->lock);
05590                 isc__strerror(errno, strbuf, sizeof(strbuf));
05591 
05592                 UNEXPECTED_ERROR(__FILE__, __LINE__, "listen: %s", strbuf);
05593 
05594                 return (ISC_R_UNEXPECTED);
05595         }
05596 
05597         sock->listener = 1;
05598 
05599         UNLOCK(&sock->lock);
05600         return (ISC_R_SUCCESS);
05601 }
05602 
05603 /*
05604  * This should try to do aggressive accept() XXXMLG
05605  */
05606 isc_result_t
05607 isc__socket_accept(isc_socket_t *sock0,
05608                   isc_task_t *task, isc_taskaction_t action, void *arg)
05609 {
05610         isc__socket_t *sock = (isc__socket_t *)sock0;
05611         isc_socket_newconnev_t *dev;
05612         isc__socketmgr_t *manager;
05613         isc_task_t *ntask = NULL;
05614         isc__socket_t *nsock;
05615         isc_result_t result;
05616         isc_boolean_t do_poke = ISC_FALSE;
05617 
05618         REQUIRE(VALID_SOCKET(sock));
05619         manager = sock->manager;
05620         REQUIRE(VALID_MANAGER(manager));
05621 
05622         LOCK(&sock->lock);
05623 
05624         REQUIRE(sock->listener);
05625 
05626         /*
05627          * Sender field is overloaded here with the task we will be sending
05628          * this event to.  Just before the actual event is delivered the
05629          * actual ev_sender will be touched up to be the socket.
05630          */
05631         dev = (isc_socket_newconnev_t *)
05632                 isc_event_allocate(manager->mctx, task, ISC_SOCKEVENT_NEWCONN,
05633                                    action, arg, sizeof(*dev));
05634         if (dev == NULL) {
05635                 UNLOCK(&sock->lock);
05636                 return (ISC_R_NOMEMORY);
05637         }
05638         ISC_LINK_INIT(dev, ev_link);
05639 
05640         result = allocate_socket(manager, sock->type, &nsock);
05641         if (result != ISC_R_SUCCESS) {
05642                 isc_event_free(ISC_EVENT_PTR(&dev));
05643                 UNLOCK(&sock->lock);
05644                 return (result);
05645         }
05646 
05647         /*
05648          * Attach to socket and to task.
05649          */
05650         isc_task_attach(task, &ntask);
05651         if (isc_task_exiting(ntask)) {
05652                 free_socket(&nsock);
05653                 isc_task_detach(&ntask);
05654                 isc_event_free(ISC_EVENT_PTR(&dev));
05655                 UNLOCK(&sock->lock);
05656                 return (ISC_R_SHUTTINGDOWN);
05657         }
05658         nsock->references++;
05659         nsock->statsindex = sock->statsindex;
05660 
05661         dev->ev_sender = ntask;
05662         dev->newsocket = (isc_socket_t *)nsock;
05663 
05664         /*
05665          * Poke watcher here.  We still have the socket locked, so there
05666          * is no race condition.  We will keep the lock for such a short
05667          * bit of time waking it up now or later won't matter all that much.
05668          */
05669         if (ISC_LIST_EMPTY(sock->accept_list))
05670                 do_poke = ISC_TRUE;
05671 
05672         ISC_LIST_ENQUEUE(sock->accept_list, dev, ev_link);
05673 
05674         if (do_poke)
05675                 select_poke(manager, sock->fd, SELECT_POKE_ACCEPT);
05676 
05677         UNLOCK(&sock->lock);
05678         return (ISC_R_SUCCESS);
05679 }
05680 
05681 isc_result_t
05682 isc__socket_connect(isc_socket_t *sock0, isc_sockaddr_t *addr,
05683                    isc_task_t *task, isc_taskaction_t action, void *arg)
05684 {
05685         isc__socket_t *sock = (isc__socket_t *)sock0;
05686         isc_socket_connev_t *dev;
05687         isc_task_t *ntask = NULL;
05688         isc__socketmgr_t *manager;
05689         int cc;
05690         char strbuf[ISC_STRERRORSIZE];
05691         char addrbuf[ISC_SOCKADDR_FORMATSIZE];
05692 
05693         REQUIRE(VALID_SOCKET(sock));
05694         REQUIRE(addr != NULL);
05695         REQUIRE(task != NULL);
05696         REQUIRE(action != NULL);
05697 
05698         manager = sock->manager;
05699         REQUIRE(VALID_MANAGER(manager));
05700         REQUIRE(addr != NULL);
05701 
05702         if (isc_sockaddr_ismulticast(addr))
05703                 return (ISC_R_MULTICAST);
05704 
05705         LOCK(&sock->lock);
05706 
05707         dev = (isc_socket_connev_t *)isc_event_allocate(manager->mctx, sock,
05708                                                         ISC_SOCKEVENT_CONNECT,
05709                                                         action, arg,
05710                                                         sizeof(*dev));
05711         if (dev == NULL) {
05712                 UNLOCK(&sock->lock);
05713                 return (ISC_R_NOMEMORY);
05714         }
05715         ISC_LINK_INIT(dev, ev_link);
05716 
05717         if (sock->connecting) {
05718                 INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
05719                 goto queue;
05720         }
05721 
05722         if (sock->connected) {
05723                 INSIST(isc_sockaddr_equal(&sock->peer_address, addr));
05724                 dev->result = ISC_R_SUCCESS;
05725                 isc_task_send(task, ISC_EVENT_PTR(&dev));
05726 
05727                 UNLOCK(&sock->lock);
05728 
05729                 return (ISC_R_SUCCESS);
05730         }
05731 
05732         /*
05733          * Try to do the connect right away, as there can be only one
05734          * outstanding, and it might happen to complete.
05735          */
05736         sock->peer_address = *addr;
05737         cc = connect(sock->fd, &addr->type.sa, addr->length);
05738         if (cc < 0) {
05739                 /*
05740                  * HP-UX "fails" to connect a UDP socket and sets errno to
05741                  * EINPROGRESS if it's non-blocking.  We'd rather regard this as
05742                  * a success and let the user detect it if it's really an error
05743                  * at the time of sending a packet on the socket.
05744                  */
05745                 if (sock->type == isc_sockettype_udp && errno == EINPROGRESS) {
05746                         cc = 0;
05747                         goto success;
05748                 }
05749                 if (SOFT_ERROR(errno) || errno == EINPROGRESS)
05750                         goto queue;
05751 
05752                 switch (errno) {
05753 #define ERROR_MATCH(a, b) case a: dev->result = b; goto err_exit;
05754                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
05755                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
05756                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
05757                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
05758                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
05759 #ifdef EHOSTDOWN
05760                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
05761 #endif
05762                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
05763                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
05764                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
05765                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
05766                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
05767 #undef ERROR_MATCH
05768                 }
05769 
05770                 sock->connected = 0;
05771 
05772                 isc__strerror(errno, strbuf, sizeof(strbuf));
05773                 isc_sockaddr_format(addr, addrbuf, sizeof(addrbuf));
05774                 UNEXPECTED_ERROR(__FILE__, __LINE__, "connect(%s) %d/%s",
05775                                  addrbuf, errno, strbuf);
05776 
05777                 UNLOCK(&sock->lock);
05778                 inc_stats(sock->manager->stats,
05779                           sock->statsindex[STATID_CONNECTFAIL]);
05780                 isc_event_free(ISC_EVENT_PTR(&dev));
05781                 return (ISC_R_UNEXPECTED);
05782 
05783         err_exit:
05784                 sock->connected = 0;
05785                 isc_task_send(task, ISC_EVENT_PTR(&dev));
05786 
05787                 UNLOCK(&sock->lock);
05788                 inc_stats(sock->manager->stats,
05789                           sock->statsindex[STATID_CONNECTFAIL]);
05790                 return (ISC_R_SUCCESS);
05791         }
05792 
05793         /*
05794          * If connect completed, fire off the done event.
05795          */
05796  success:
05797         if (cc == 0) {
05798                 sock->connected = 1;
05799                 sock->bound = 1;
05800                 dev->result = ISC_R_SUCCESS;
05801                 isc_task_send(task, ISC_EVENT_PTR(&dev));
05802 
05803                 UNLOCK(&sock->lock);
05804 
05805                 inc_stats(sock->manager->stats,
05806                           sock->statsindex[STATID_CONNECT]);
05807 
05808                 return (ISC_R_SUCCESS);
05809         }
05810 
05811  queue:
05812 
05813         /*
05814          * Attach to task.
05815          */
05816         isc_task_attach(task, &ntask);
05817 
05818         dev->ev_sender = ntask;
05819 
05820         /*
05821          * Poke watcher here.  We still have the socket locked, so there
05822          * is no race condition.  We will keep the lock for such a short
05823          * bit of time waking it up now or later won't matter all that much.
05824          */
05825         if (ISC_LIST_EMPTY(sock->connect_list) && !sock->connecting)
05826                 select_poke(manager, sock->fd, SELECT_POKE_CONNECT);
05827 
05828         sock->connecting = 1;
05829 
05830         ISC_LIST_ENQUEUE(sock->connect_list, dev, ev_link);
05831 
05832         UNLOCK(&sock->lock);
05833         return (ISC_R_SUCCESS);
05834 }
05835 
05836 /*
05837  * Called when a socket with a pending connect() finishes.
05838  */
05839 static void
05840 internal_connect(isc_task_t *me, isc_event_t *ev) {
05841         isc__socket_t *sock;
05842         isc_socket_connev_t *dev;
05843         int cc;
05844         isc_result_t result;
05845         ISC_SOCKADDR_LEN_T optlen;
05846         char strbuf[ISC_STRERRORSIZE];
05847         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
05848 
05849         UNUSED(me);
05850         INSIST(ev->ev_type == ISC_SOCKEVENT_INTW);
05851 
05852         sock = ev->ev_sender;
05853         INSIST(VALID_SOCKET(sock));
05854 
05855         LOCK(&sock->lock);
05856 
05857         /*
05858          * When the internal event was sent the reference count was bumped
05859          * to keep the socket around for us.  Decrement the count here.
05860          */
05861         INSIST(sock->references > 0);
05862         sock->references--;
05863         if (sock->references == 0) {
05864                 UNLOCK(&sock->lock);
05865                 destroy(&sock);
05866                 return;
05867         }
05868 
05869         /*
05870          * Get the first item off the connect list.
05871          * If it is empty, unlock the socket and return.
05872          */
05873         dev = ISC_LIST_HEAD(sock->connect_list);
05874         if (dev == NULL) {
05875                 INSIST(!sock->connecting);
05876                 UNLOCK(&sock->lock);
05877                 return;
05878         }
05879 
05880         INSIST(sock->connecting);
05881         sock->connecting = 0;
05882 
05883         /*
05884          * Get any possible error status here.
05885          */
05886         optlen = sizeof(cc);
05887         if (getsockopt(sock->fd, SOL_SOCKET, SO_ERROR,
05888                        (void *)&cc, (void *)&optlen) != 0)
05889                 cc = errno;
05890         else
05891                 errno = cc;
05892 
05893         if (errno != 0) {
05894                 /*
05895                  * If the error is EAGAIN, just re-select on this
05896                  * fd and pretend nothing strange happened.
05897                  */
05898                 if (SOFT_ERROR(errno) || errno == EINPROGRESS) {
05899                         sock->connecting = 1;
05900                         select_poke(sock->manager, sock->fd,
05901                                     SELECT_POKE_CONNECT);
05902                         UNLOCK(&sock->lock);
05903 
05904                         return;
05905                 }
05906 
05907                 inc_stats(sock->manager->stats,
05908                           sock->statsindex[STATID_CONNECTFAIL]);
05909 
05910                 /*
05911                  * Translate other errors into ISC_R_* flavors.
05912                  */
05913                 switch (errno) {
05914 #define ERROR_MATCH(a, b) case a: result = b; break;
05915                         ERROR_MATCH(EACCES, ISC_R_NOPERM);
05916                         ERROR_MATCH(EADDRNOTAVAIL, ISC_R_ADDRNOTAVAIL);
05917                         ERROR_MATCH(EAFNOSUPPORT, ISC_R_ADDRNOTAVAIL);
05918                         ERROR_MATCH(ECONNREFUSED, ISC_R_CONNREFUSED);
05919                         ERROR_MATCH(EHOSTUNREACH, ISC_R_HOSTUNREACH);
05920 #ifdef EHOSTDOWN
05921                         ERROR_MATCH(EHOSTDOWN, ISC_R_HOSTUNREACH);
05922 #endif
05923                         ERROR_MATCH(ENETUNREACH, ISC_R_NETUNREACH);
05924                         ERROR_MATCH(ENOBUFS, ISC_R_NORESOURCES);
05925                         ERROR_MATCH(EPERM, ISC_R_HOSTUNREACH);
05926                         ERROR_MATCH(EPIPE, ISC_R_NOTCONNECTED);
05927                         ERROR_MATCH(ETIMEDOUT, ISC_R_TIMEDOUT);
05928                         ERROR_MATCH(ECONNRESET, ISC_R_CONNECTIONRESET);
05929 #undef ERROR_MATCH
05930                 default:
05931                         result = ISC_R_UNEXPECTED;
05932                         isc_sockaddr_format(&sock->peer_address, peerbuf,
05933                                             sizeof(peerbuf));
05934                         isc__strerror(errno, strbuf, sizeof(strbuf));
05935                         UNEXPECTED_ERROR(__FILE__, __LINE__,
05936                                          "internal_connect: connect(%s) %s",
05937                                          peerbuf, strbuf);
05938                 }
05939         } else {
05940                 inc_stats(sock->manager->stats,
05941                           sock->statsindex[STATID_CONNECT]);
05942                 result = ISC_R_SUCCESS;
05943                 sock->connected = 1;
05944                 sock->bound = 1;
05945         }
05946 
05947         do {
05948                 dev->result = result;
05949                 send_connectdone_event(sock, &dev);
05950                 dev = ISC_LIST_HEAD(sock->connect_list);
05951         } while (dev != NULL);
05952 
05953         UNLOCK(&sock->lock);
05954 }
05955 
05956 isc_result_t
05957 isc__socket_getpeername(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
05958         isc__socket_t *sock = (isc__socket_t *)sock0;
05959         isc_result_t result;
05960 
05961         REQUIRE(VALID_SOCKET(sock));
05962         REQUIRE(addressp != NULL);
05963 
05964         LOCK(&sock->lock);
05965 
05966         if (sock->connected) {
05967                 *addressp = sock->peer_address;
05968                 result = ISC_R_SUCCESS;
05969         } else {
05970                 result = ISC_R_NOTCONNECTED;
05971         }
05972 
05973         UNLOCK(&sock->lock);
05974 
05975         return (result);
05976 }
05977 
05978 isc_result_t
05979 isc__socket_getsockname(isc_socket_t *sock0, isc_sockaddr_t *addressp) {
05980         isc__socket_t *sock = (isc__socket_t *)sock0;
05981         ISC_SOCKADDR_LEN_T len;
05982         isc_result_t result;
05983         char strbuf[ISC_STRERRORSIZE];
05984 
05985         REQUIRE(VALID_SOCKET(sock));
05986         REQUIRE(addressp != NULL);
05987 
05988         LOCK(&sock->lock);
05989 
05990         if (!sock->bound) {
05991                 result = ISC_R_NOTBOUND;
05992                 goto out;
05993         }
05994 
05995         result = ISC_R_SUCCESS;
05996 
05997         len = sizeof(addressp->type);
05998         if (getsockname(sock->fd, &addressp->type.sa, (void *)&len) < 0) {
05999                 isc__strerror(errno, strbuf, sizeof(strbuf));
06000                 UNEXPECTED_ERROR(__FILE__, __LINE__, "getsockname: %s",
06001                                  strbuf);
06002                 result = ISC_R_UNEXPECTED;
06003                 goto out;
06004         }
06005         addressp->length = (unsigned int)len;
06006 
06007  out:
06008         UNLOCK(&sock->lock);
06009 
06010         return (result);
06011 }
06012 
06013 /*
06014  * Run through the list of events on this socket, and cancel the ones
06015  * queued for task "task" of type "how".  "how" is a bitmask.
06016  */
06017 void
06018 isc__socket_cancel(isc_socket_t *sock0, isc_task_t *task, unsigned int how) {
06019         isc__socket_t *sock = (isc__socket_t *)sock0;
06020 
06021         REQUIRE(VALID_SOCKET(sock));
06022 
06023         /*
06024          * Quick exit if there is nothing to do.  Don't even bother locking
06025          * in this case.
06026          */
06027         if (how == 0)
06028                 return;
06029 
06030         LOCK(&sock->lock);
06031 
06032         /*
06033          * All of these do the same thing, more or less.
06034          * Each will:
06035          *      o If the internal event is marked as "posted" try to
06036          *        remove it from the task's queue.  If this fails, mark it
06037          *        as canceled instead, and let the task clean it up later.
06038          *      o For each I/O request for that task of that type, post
06039          *        its done event with status of "ISC_R_CANCELED".
06040          *      o Reset any state needed.
06041          */
06042         if (((how & ISC_SOCKCANCEL_RECV) == ISC_SOCKCANCEL_RECV)
06043             && !ISC_LIST_EMPTY(sock->recv_list)) {
06044                 isc_socketevent_t      *dev;
06045                 isc_socketevent_t      *next;
06046                 isc_task_t             *current_task;
06047 
06048                 dev = ISC_LIST_HEAD(sock->recv_list);
06049 
06050                 while (dev != NULL) {
06051                         current_task = dev->ev_sender;
06052                         next = ISC_LIST_NEXT(dev, ev_link);
06053 
06054                         if ((task == NULL) || (task == current_task)) {
06055                                 dev->result = ISC_R_CANCELED;
06056                                 send_recvdone_event(sock, &dev);
06057                         }
06058                         dev = next;
06059                 }
06060         }
06061 
06062         if (((how & ISC_SOCKCANCEL_SEND) == ISC_SOCKCANCEL_SEND)
06063             && !ISC_LIST_EMPTY(sock->send_list)) {
06064                 isc_socketevent_t      *dev;
06065                 isc_socketevent_t      *next;
06066                 isc_task_t             *current_task;
06067 
06068                 dev = ISC_LIST_HEAD(sock->send_list);
06069 
06070                 while (dev != NULL) {
06071                         current_task = dev->ev_sender;
06072                         next = ISC_LIST_NEXT(dev, ev_link);
06073 
06074                         if ((task == NULL) || (task == current_task)) {
06075                                 dev->result = ISC_R_CANCELED;
06076                                 send_senddone_event(sock, &dev);
06077                         }
06078                         dev = next;
06079                 }
06080         }
06081 
06082         if (((how & ISC_SOCKCANCEL_ACCEPT) == ISC_SOCKCANCEL_ACCEPT)
06083             && !ISC_LIST_EMPTY(sock->accept_list)) {
06084                 isc_socket_newconnev_t *dev;
06085                 isc_socket_newconnev_t *next;
06086                 isc_task_t             *current_task;
06087 
06088                 dev = ISC_LIST_HEAD(sock->accept_list);
06089                 while (dev != NULL) {
06090                         current_task = dev->ev_sender;
06091                         next = ISC_LIST_NEXT(dev, ev_link);
06092 
06093                         if ((task == NULL) || (task == current_task)) {
06094 
06095                                 ISC_LIST_UNLINK(sock->accept_list, dev,
06096                                                 ev_link);
06097 
06098                                 NEWCONNSOCK(dev)->references--;
06099                                 free_socket((isc__socket_t **)&dev->newsocket);
06100 
06101                                 dev->result = ISC_R_CANCELED;
06102                                 dev->ev_sender = sock;
06103                                 isc_task_sendanddetach(&current_task,
06104                                                        ISC_EVENT_PTR(&dev));
06105                         }
06106 
06107                         dev = next;
06108                 }
06109         }
06110 
06111         if (((how & ISC_SOCKCANCEL_CONNECT) == ISC_SOCKCANCEL_CONNECT)
06112             && !ISC_LIST_EMPTY(sock->connect_list)) {
06113                 isc_socket_connev_t    *dev;
06114                 isc_socket_connev_t    *next;
06115                 isc_task_t             *current_task;
06116 
06117                 INSIST(sock->connecting);
06118                 sock->connecting = 0;
06119 
06120                 dev = ISC_LIST_HEAD(sock->connect_list);
06121 
06122                 while (dev != NULL) {
06123                         current_task = dev->ev_sender;
06124                         next = ISC_LIST_NEXT(dev, ev_link);
06125 
06126                         if ((task == NULL) || (task == current_task)) {
06127                                 dev->result = ISC_R_CANCELED;
06128                                 send_connectdone_event(sock, &dev);
06129                         }
06130                         dev = next;
06131                 }
06132         }
06133 
06134         UNLOCK(&sock->lock);
06135 }
06136 
06137 isc_sockettype_t
06138 isc__socket_gettype(isc_socket_t *sock0) {
06139         isc__socket_t *sock = (isc__socket_t *)sock0;
06140 
06141         REQUIRE(VALID_SOCKET(sock));
06142 
06143         return (sock->type);
06144 }
06145 
06146 isc_boolean_t
06147 isc__socket_isbound(isc_socket_t *sock0) {
06148         isc__socket_t *sock = (isc__socket_t *)sock0;
06149         isc_boolean_t val;
06150 
06151         REQUIRE(VALID_SOCKET(sock));
06152 
06153         LOCK(&sock->lock);
06154         val = ((sock->bound) ? ISC_TRUE : ISC_FALSE);
06155         UNLOCK(&sock->lock);
06156 
06157         return (val);
06158 }
06159 
06160 void
06161 isc__socket_ipv6only(isc_socket_t *sock0, isc_boolean_t yes) {
06162         isc__socket_t *sock = (isc__socket_t *)sock0;
06163 #if defined(IPV6_V6ONLY)
06164         int onoff = yes ? 1 : 0;
06165 #else
06166         UNUSED(yes);
06167         UNUSED(sock);
06168 #endif
06169 
06170         REQUIRE(VALID_SOCKET(sock));
06171         INSIST(!sock->dupped);
06172 
06173 #ifdef IPV6_V6ONLY
06174         if (sock->pf == AF_INET6) {
06175                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_V6ONLY,
06176                                (void *)&onoff, sizeof(int)) < 0) {
06177                         char strbuf[ISC_STRERRORSIZE];
06178                         isc__strerror(errno, strbuf, sizeof(strbuf));
06179                         UNEXPECTED_ERROR(__FILE__, __LINE__,
06180                                          "setsockopt(%d, IPV6_V6ONLY) "
06181                                          "%s: %s", sock->fd,
06182                                          isc_msgcat_get(isc_msgcat,
06183                                                         ISC_MSGSET_GENERAL,
06184                                                         ISC_MSG_FAILED,
06185                                                         "failed"),
06186                                          strbuf);
06187                 }
06188         }
06189         FIX_IPV6_RECVPKTINFO(sock);     /* AIX */
06190 #endif
06191 }
06192 
06193 static void
06194 setdscp(isc__socket_t *sock, isc_dscp_t dscp) {
06195 #if defined(IP_TOS) || defined(IPV6_TCLASS)
06196         int value = dscp << 2;
06197 #endif
06198 
06199         sock->dscp = dscp;
06200 
06201 #ifdef IP_TOS
06202         if (sock->pf == AF_INET) {
06203                 if (setsockopt(sock->fd, IPPROTO_IP, IP_TOS,
06204                                (void *)&value, sizeof(value)) < 0) {
06205                         char strbuf[ISC_STRERRORSIZE];
06206                         isc__strerror(errno, strbuf, sizeof(strbuf));
06207                         UNEXPECTED_ERROR(__FILE__, __LINE__,
06208                                          "setsockopt(%d, IP_TOS, %.02x) "
06209                                          "%s: %s", sock->fd, value >> 2,
06210                                          isc_msgcat_get(isc_msgcat,
06211                                                         ISC_MSGSET_GENERAL,
06212                                                         ISC_MSG_FAILED,
06213                                                         "failed"),
06214                                          strbuf);
06215                 }
06216         }
06217 #endif
06218 #ifdef IPV6_TCLASS
06219         if (sock->pf == AF_INET6) {
06220                 if (setsockopt(sock->fd, IPPROTO_IPV6, IPV6_TCLASS,
06221                                (void *)&value, sizeof(value)) < 0) {
06222                         char strbuf[ISC_STRERRORSIZE];
06223                         isc__strerror(errno, strbuf, sizeof(strbuf));
06224                         UNEXPECTED_ERROR(__FILE__, __LINE__,
06225                                          "setsockopt(%d, IPV6_TCLASS, %.02x) "
06226                                          "%s: %s", sock->fd, dscp >> 2,
06227                                          isc_msgcat_get(isc_msgcat,
06228                                                         ISC_MSGSET_GENERAL,
06229                                                         ISC_MSG_FAILED,
06230                                                         "failed"),
06231                                          strbuf);
06232                 }
06233         }
06234 #endif
06235 }
06236 
06237 void
06238 isc__socket_dscp(isc_socket_t *sock0, isc_dscp_t dscp) {
06239         isc__socket_t *sock = (isc__socket_t *)sock0;
06240 
06241         REQUIRE(VALID_SOCKET(sock));
06242         REQUIRE(dscp < 0x40);
06243 
06244 #if !defined(IP_TOS) && !defined(IPV6_TCLASS)
06245         UNUSED(dscp);
06246 #else
06247         if (dscp < 0)
06248                 return;
06249 
06250         /* The DSCP value must not be changed once it has been set. */
06251         if (isc_dscp_check_value != -1)
06252                 INSIST(dscp == isc_dscp_check_value);
06253 #endif
06254 
06255 
06256 #ifdef notyet
06257         REQUIRE(!sock->dupped);
06258 #endif
06259 
06260         setdscp(sock, dscp);
06261 }
06262 
06263 isc_socketevent_t *
06264 isc_socket_socketevent(isc_mem_t *mctx, void *sender,
06265                         isc_eventtype_t eventtype, isc_taskaction_t action,
06266                         void *arg)
06267 {
06268         return (allocate_socketevent(mctx, sender, eventtype, action, arg));
06269 }
06270 
06271 #ifndef USE_WATCHER_THREAD
06272 /*
06273  * In our assumed scenario, we can simply use a single static object.
06274  * XXX: this is not true if the application uses multiple threads with
06275  *      'multi-context' mode.  Fixing this is a future TODO item.
06276  */
06277 static isc_socketwait_t swait_private;
06278 
06279 int
06280 isc__socketmgr_waitevents(isc_socketmgr_t *manager0, struct timeval *tvp,
06281                           isc_socketwait_t **swaitp)
06282 {
06283         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
06284         int n;
06285 #ifdef USE_KQUEUE
06286         struct timespec ts, *tsp;
06287 #endif
06288 #ifdef USE_EPOLL
06289         int timeout;
06290 #endif
06291 #ifdef USE_DEVPOLL
06292         isc_result_t result;
06293         int pass;
06294         struct dvpoll dvp;
06295 #endif
06296 
06297         REQUIRE(swaitp != NULL && *swaitp == NULL);
06298 
06299 #ifdef USE_SHARED_MANAGER
06300         if (manager == NULL)
06301                 manager = socketmgr;
06302 #endif
06303         if (manager == NULL)
06304                 return (0);
06305 
06306 #ifdef USE_KQUEUE
06307         if (tvp != NULL) {
06308                 ts.tv_sec = tvp->tv_sec;
06309                 ts.tv_nsec = tvp->tv_usec * 1000;
06310                 tsp = &ts;
06311         } else
06312                 tsp = NULL;
06313         swait_private.nevents = kevent(manager->kqueue_fd, NULL, 0,
06314                                        manager->events, manager->nevents,
06315                                        tsp);
06316         n = swait_private.nevents;
06317 #elif defined(USE_EPOLL)
06318         if (tvp != NULL)
06319                 timeout = tvp->tv_sec * 1000 + (tvp->tv_usec + 999) / 1000;
06320         else
06321                 timeout = -1;
06322         swait_private.nevents = epoll_wait(manager->epoll_fd,
06323                                            manager->events,
06324                                            manager->nevents, timeout);
06325         n = swait_private.nevents;
06326 #elif defined(USE_DEVPOLL)
06327         /*
06328          * Re-probe every thousand calls.
06329          */
06330         if (manager->calls++ > 1000U) {
06331                 result = isc_resource_getcurlimit(isc_resource_openfiles,
06332                                                   &manager->open_max);
06333                 if (result != ISC_R_SUCCESS)
06334                         manager->open_max = 64;
06335                 manager->calls = 0;
06336         }
06337         for (pass = 0; pass < 2; pass++) {
06338                 dvp.dp_fds = manager->events;
06339                 dvp.dp_nfds = manager->nevents;
06340                 if (dvp.dp_nfds >= manager->open_max)
06341                         dvp.dp_nfds = manager->open_max - 1;
06342                 if (tvp != NULL) {
06343                         dvp.dp_timeout = tvp->tv_sec * 1000 +
06344                                 (tvp->tv_usec + 999) / 1000;
06345                 } else
06346                         dvp.dp_timeout = -1;
06347                 n = ioctl(manager->devpoll_fd, DP_POLL, &dvp);
06348                 if (n == -1 && errno == EINVAL) {
06349                         /*
06350                          * {OPEN_MAX} may have dropped.  Look
06351                          * up the current value and try again.
06352                          */
06353                         result = isc_resource_getcurlimit(
06354                                                         isc_resource_openfiles,
06355                                                         &manager->open_max);
06356                         if (result != ISC_R_SUCCESS)
06357                                 manager->open_max = 64;
06358                 } else
06359                         break;
06360         }
06361         swait_private.nevents = n;
06362 #elif defined(USE_SELECT)
06363         memmove(manager->read_fds_copy, manager->read_fds, manager->fd_bufsize);
06364         memmove(manager->write_fds_copy, manager->write_fds,
06365                 manager->fd_bufsize);
06366 
06367         swait_private.readset = manager->read_fds_copy;
06368         swait_private.writeset = manager->write_fds_copy;
06369         swait_private.maxfd = manager->maxfd + 1;
06370 
06371         n = select(swait_private.maxfd, swait_private.readset,
06372                    swait_private.writeset, NULL, tvp);
06373 #endif
06374 
06375         *swaitp = &swait_private;
06376         return (n);
06377 }
06378 
06379 isc_result_t
06380 isc__socketmgr_dispatch(isc_socketmgr_t *manager0, isc_socketwait_t *swait) {
06381         isc__socketmgr_t *manager = (isc__socketmgr_t *)manager0;
06382 
06383         REQUIRE(swait == &swait_private);
06384 
06385 #ifdef USE_SHARED_MANAGER
06386         if (manager == NULL)
06387                 manager = socketmgr;
06388 #endif
06389         if (manager == NULL)
06390                 return (ISC_R_NOTFOUND);
06391 
06392 #if defined(USE_KQUEUE) || defined(USE_EPOLL) || defined(USE_DEVPOLL)
06393         (void)process_fds(manager, manager->events, swait->nevents);
06394         return (ISC_R_SUCCESS);
06395 #elif defined(USE_SELECT)
06396         process_fds(manager, swait->maxfd, swait->readset, swait->writeset);
06397         return (ISC_R_SUCCESS);
06398 #endif
06399 }
06400 #endif /* USE_WATCHER_THREAD */
06401 
06402 void
06403 isc__socket_setname(isc_socket_t *socket0, const char *name, void *tag) {
06404         isc__socket_t *sock = (isc__socket_t *)socket0;
06405 
06406         /*
06407          * Name 'sock'.
06408          */
06409 
06410         REQUIRE(VALID_SOCKET(sock));
06411 
06412         LOCK(&sock->lock);
06413         memset(sock->name, 0, sizeof(sock->name));
06414         strncpy(sock->name, name, sizeof(sock->name) - 1);
06415         sock->tag = tag;
06416         UNLOCK(&sock->lock);
06417 }
06418 
06419 const char *
06420 isc__socket_getname(isc_socket_t *socket0) {
06421         isc__socket_t *sock = (isc__socket_t *)socket0;
06422 
06423         return (sock->name);
06424 }
06425 
06426 void *
06427 isc__socket_gettag(isc_socket_t *socket0) {
06428         isc__socket_t *sock = (isc__socket_t *)socket0;
06429 
06430         return (sock->tag);
06431 }
06432 
06433 isc_result_t
06434 isc__socket_register(void) {
06435         return (isc_socket_register(isc__socketmgr_create));
06436 }
06437 
06438 int
06439 isc__socket_getfd(isc_socket_t *socket0) {
06440         isc__socket_t *sock = (isc__socket_t *)socket0;
06441 
06442         return ((short) sock->fd);
06443 }
06444 
06445 #if defined(HAVE_LIBXML2) || defined(HAVE_JSON)
06446 static const char *
06447 _socktype(isc_sockettype_t type)
06448 {
06449         if (type == isc_sockettype_udp)
06450                 return ("udp");
06451         else if (type == isc_sockettype_tcp)
06452                 return ("tcp");
06453         else if (type == isc_sockettype_unix)
06454                 return ("unix");
06455         else if (type == isc_sockettype_fdwatch)
06456                 return ("fdwatch");
06457         else
06458                 return ("not-initialized");
06459 }
06460 #endif
06461 
06462 #ifdef HAVE_LIBXML2
06463 #define TRY0(a) do { xmlrc = (a); if (xmlrc < 0) goto error; } while(0)
06464 int
06465 isc_socketmgr_renderxml(isc_socketmgr_t *mgr0, xmlTextWriterPtr writer) {
06466         isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
06467         isc__socket_t *sock = NULL;
06468         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
06469         isc_sockaddr_t addr;
06470         ISC_SOCKADDR_LEN_T len;
06471         int xmlrc;
06472 
06473         LOCK(&mgr->lock);
06474 
06475 #ifdef USE_SHARED_MANAGER
06476         TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "references"));
06477         TRY0(xmlTextWriterWriteFormatString(writer, "%d", mgr->refs));
06478         TRY0(xmlTextWriterEndElement(writer));
06479 #endif  /* USE_SHARED_MANAGER */
06480 
06481         TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "sockets"));
06482         sock = ISC_LIST_HEAD(mgr->socklist);
06483         while (sock != NULL) {
06484                 LOCK(&sock->lock);
06485                 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "socket"));
06486 
06487                 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "id"));
06488                 TRY0(xmlTextWriterWriteFormatString(writer, "%p", sock));
06489                 TRY0(xmlTextWriterEndElement(writer));
06490 
06491                 if (sock->name[0] != 0) {
06492                         TRY0(xmlTextWriterStartElement(writer,
06493                                                        ISC_XMLCHAR "name"));
06494                         TRY0(xmlTextWriterWriteFormatString(writer, "%s",
06495                                                             sock->name));
06496                         TRY0(xmlTextWriterEndElement(writer)); /* name */
06497                 }
06498 
06499                 TRY0(xmlTextWriterStartElement(writer,
06500                                                ISC_XMLCHAR "references"));
06501                 TRY0(xmlTextWriterWriteFormatString(writer, "%d",
06502                                                     sock->references));
06503                 TRY0(xmlTextWriterEndElement(writer));
06504 
06505                 TRY0(xmlTextWriterWriteElement(writer, ISC_XMLCHAR "type",
06506                                           ISC_XMLCHAR _socktype(sock->type)));
06507 
06508                 if (sock->connected) {
06509                         isc_sockaddr_format(&sock->peer_address, peerbuf,
06510                                             sizeof(peerbuf));
06511                         TRY0(xmlTextWriterWriteElement(writer,
06512                                                   ISC_XMLCHAR "peer-address",
06513                                                   ISC_XMLCHAR peerbuf));
06514                 }
06515 
06516                 len = sizeof(addr);
06517                 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
06518                         isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
06519                         TRY0(xmlTextWriterWriteElement(writer,
06520                                                   ISC_XMLCHAR "local-address",
06521                                                   ISC_XMLCHAR peerbuf));
06522                 }
06523 
06524                 TRY0(xmlTextWriterStartElement(writer, ISC_XMLCHAR "states"));
06525                 if (sock->pending_recv)
06526                         TRY0(xmlTextWriterWriteElement(writer,
06527                                                 ISC_XMLCHAR "state",
06528                                                 ISC_XMLCHAR "pending-receive"));
06529                 if (sock->pending_send)
06530                         TRY0(xmlTextWriterWriteElement(writer,
06531                                                   ISC_XMLCHAR "state",
06532                                                   ISC_XMLCHAR "pending-send"));
06533                 if (sock->pending_accept)
06534                         TRY0(xmlTextWriterWriteElement(writer,
06535                                                  ISC_XMLCHAR "state",
06536                                                  ISC_XMLCHAR "pending_accept"));
06537                 if (sock->listener)
06538                         TRY0(xmlTextWriterWriteElement(writer,
06539                                                        ISC_XMLCHAR "state",
06540                                                        ISC_XMLCHAR "listener"));
06541                 if (sock->connected)
06542                         TRY0(xmlTextWriterWriteElement(writer,
06543                                                      ISC_XMLCHAR "state",
06544                                                      ISC_XMLCHAR "connected"));
06545                 if (sock->connecting)
06546                         TRY0(xmlTextWriterWriteElement(writer,
06547                                                     ISC_XMLCHAR "state",
06548                                                     ISC_XMLCHAR "connecting"));
06549                 if (sock->bound)
06550                         TRY0(xmlTextWriterWriteElement(writer,
06551                                                        ISC_XMLCHAR "state",
06552                                                        ISC_XMLCHAR "bound"));
06553 
06554                 TRY0(xmlTextWriterEndElement(writer)); /* states */
06555 
06556                 TRY0(xmlTextWriterEndElement(writer)); /* socket */
06557 
06558                 UNLOCK(&sock->lock);
06559                 sock = ISC_LIST_NEXT(sock, link);
06560         }
06561         TRY0(xmlTextWriterEndElement(writer)); /* sockets */
06562 
06563  error:
06564         if (sock != NULL)
06565                 UNLOCK(&sock->lock);
06566 
06567         UNLOCK(&mgr->lock);
06568 
06569         return (xmlrc);
06570 }
06571 #endif /* HAVE_LIBXML2 */
06572 
06573 #ifdef HAVE_JSON
06574 #define CHECKMEM(m) do { \
06575         if (m == NULL) { \
06576                 result = ISC_R_NOMEMORY;\
06577                 goto error;\
06578         } \
06579 } while(0)
06580 
06581 isc_result_t
06582 isc_socketmgr_renderjson(isc_socketmgr_t *mgr0, json_object *stats) {
06583         isc_result_t result = ISC_R_SUCCESS;
06584         isc__socketmgr_t *mgr = (isc__socketmgr_t *)mgr0;
06585         isc__socket_t *sock = NULL;
06586         char peerbuf[ISC_SOCKADDR_FORMATSIZE];
06587         isc_sockaddr_t addr;
06588         ISC_SOCKADDR_LEN_T len;
06589         json_object *obj, *array = json_object_new_array();
06590 
06591         CHECKMEM(array);
06592 
06593         LOCK(&mgr->lock);
06594 
06595 #ifdef USE_SHARED_MANAGER
06596         obj = json_object_new_int(mgr->refs);
06597         CHECKMEM(obj);
06598         json_object_object_add(stats, "references", obj);
06599 #endif  /* USE_SHARED_MANAGER */
06600 
06601         sock = ISC_LIST_HEAD(mgr->socklist);
06602         while (sock != NULL) {
06603                 json_object *states, *entry = json_object_new_object();
06604                 char buf[255];
06605 
06606                 CHECKMEM(entry);
06607                 json_object_array_add(array, entry);
06608 
06609                 LOCK(&sock->lock);
06610 
06611                 sprintf(buf, "%p", sock);
06612                 obj = json_object_new_string(buf);
06613                 CHECKMEM(obj);
06614                 json_object_object_add(entry, "id", obj);
06615 
06616                 if (sock->name[0] != 0) {
06617                         obj = json_object_new_string(sock->name);
06618                         CHECKMEM(obj);
06619                         json_object_object_add(entry, "name", obj);
06620                 }
06621 
06622                 obj = json_object_new_int(sock->references);
06623                 CHECKMEM(obj);
06624                 json_object_object_add(entry, "references", obj);
06625 
06626                 obj = json_object_new_string(_socktype(sock->type));
06627                 CHECKMEM(obj);
06628                 json_object_object_add(entry, "type", obj);
06629 
06630                 if (sock->connected) {
06631                         isc_sockaddr_format(&sock->peer_address, peerbuf,
06632                                             sizeof(peerbuf));
06633                         obj = json_object_new_string(peerbuf);
06634                         CHECKMEM(obj);
06635                         json_object_object_add(entry, "peer-address", obj);
06636                 }
06637 
06638                 len = sizeof(addr);
06639                 if (getsockname(sock->fd, &addr.type.sa, (void *)&len) == 0) {
06640                         isc_sockaddr_format(&addr, peerbuf, sizeof(peerbuf));
06641                         obj = json_object_new_string(peerbuf);
06642                         CHECKMEM(obj);
06643                         json_object_object_add(entry, "local-address", obj);
06644                 }
06645 
06646                 states = json_object_new_array();
06647                 CHECKMEM(states);
06648                 json_object_object_add(entry, "states", states);
06649 
06650                 if (sock->pending_recv) {
06651                         obj = json_object_new_string("pending-receive");
06652                         CHECKMEM(obj);
06653                         json_object_array_add(states, obj);
06654                 }
06655 
06656                 if (sock->pending_send) {
06657                         obj = json_object_new_string("pending-send");
06658                         CHECKMEM(obj);
06659                         json_object_array_add(states, obj);
06660                 }
06661 
06662                 if (sock->pending_accept) {
06663                         obj = json_object_new_string("pending-accept");
06664                         CHECKMEM(obj);
06665                         json_object_array_add(states, obj);
06666                 }
06667 
06668                 if (sock->listener) {
06669                         obj = json_object_new_string("listener");
06670                         CHECKMEM(obj);
06671                         json_object_array_add(states, obj);
06672                 }
06673 
06674                 if (sock->connected) {
06675                         obj = json_object_new_string("connected");
06676                         CHECKMEM(obj);
06677                         json_object_array_add(states, obj);
06678                 }
06679 
06680                 if (sock->connecting) {
06681                         obj = json_object_new_string("connecting");
06682                         CHECKMEM(obj);
06683                         json_object_array_add(states, obj);
06684                 }
06685 
06686                 if (sock->bound) {
06687                         obj = json_object_new_string("bound");
06688                         CHECKMEM(obj);
06689                         json_object_array_add(states, obj);
06690                 }
06691 
06692                 UNLOCK(&sock->lock);
06693                 sock = ISC_LIST_NEXT(sock, link);
06694         }
06695 
06696         json_object_object_add(stats, "sockets", array);
06697         array = NULL;
06698         result = ISC_R_SUCCESS;
06699 
06700  error:
06701         if (array != NULL)
06702                 json_object_put(array);
06703 
06704         if (sock != NULL)
06705                 UNLOCK(&sock->lock);
06706 
06707         UNLOCK(&mgr->lock);
06708 
06709         return (result);
06710 }
06711 #endif /* HAVE_JSON */
06712 
06713 #include "../socket_api.c"

Generated on Tue Apr 28 17:41:06 2015 by Doxygen 1.5.4 for BIND9 Internals 9.11.0pre-alpha