possible sendfile() patch

Yaoping Ruan yruan at cs.princeton.edu
Fri Aug 15 13:03:27 PDT 2003


Some time ago we posted work on improving Web server performance on
FreeBSD, and got quite a bit of valuable feedback from the list members.
From the discussion, we realized it would be great if part of the work,
the sendfile() optimization, could be merged into the mainline code. You
may want to try out the patch attached at the end of this message.

The patch was made on 4.6.2 release. To facilitate measurement, we added
some sysctl elements to monitor and control the sendfile buffers. We
understand there are some changes in the latest version of sendfile. It
is quite easy to adapt to the current release because nothing
fundamentally conflicts. We are ready to make any updates if the merge
could be taken.

Our measurements with the Flash Web server show the optimization has
20-25% throughput benefit for in-memory small files. A macrobenchmark
workload involving disk access and having file size range from 100 bytes
to 1MB yields 5-10% throughput improvement.

Here's a brief summary of the optimizations:
1. Cache the mapping between VM pages and the physical map. Free the LRU
tail if sendfile bufs are in short supply or a timeout occurs.
2. Avoid disk IO by setting the flag (the last parameter of sendfile())
to SF_NONIO .
3. Pack header into the body packets using mbuf cluster space.
4. Use timeout mechanism for cached buffers. It also serves as switch
between caching/non-caching for measurement purpose.


************* added sysctl elements *************

> sysctl kern.sendfile
kern.sendfile.nsfbufs: 20480
kern.sendfile.numsfpages: 20480
kern.sendfile.lrusfpages: 20474
kern.sendfile.sf_cache_timeout_sec: 30

nsfbufs is the number of sendfile buffers set at booting time.
numsfpages is the number of cached file pages, also the number of
sendfile buffers.
lrusfpages is the number of free sendfile buffers.
sf_cache_timeout_sec is the timeout in seconds. The timeout starts from
the last call of sendfile. The default value is 5 minutes. Setting it to
0 disables the caching, so one may compare performance with caching
enabled and disabled. Note that this is for testing purposes only, and
one may _not_ change the value while sendfile is still in use.

************* sendfile patch (based on 4.6.2 release) ************

Index: kern/kern_mib.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/kern_mib.c,v
retrieving revision 1.29.2.4
diff -c -r1.29.2.4 kern_mib.c
*** kern/kern_mib.c 30 Jul 2001 23:28:00 -0000 1.29.2.4
--- kern/kern_mib.c 15 Aug 2003 19:38:03 -0000
***************
*** 46,51 ****
--- 46,52 ----
  #include <sys/sysctl.h>
  #include <sys/proc.h>
  #include <sys/jail.h>
+ #include <sys/mbuf.h>
  #include <machine/smp.h>

  SYSCTL_NODE(, 0,   sysctl, CTLFLAG_RW, 0,
***************
*** 248,250 ****
--- 249,269 ----
  #include <sys/conf.h>
  SYSCTL_INT(_debug_sizeof, OID_AUTO, specinfo, CTLFLAG_RD,
      0, sizeof(struct specinfo), "sizeof(struct specinfo)");
+
+ /* support for sendfile statistics */
+ SYSCTL_NODE(_kern, KERN_SENDFILE, sendfile, CTLFLAG_RW, 0, "sendfile
statistics");
+
+ #define KIPC_SNDF_NSFBUFS       1       /* number of sendfile bufs */
+ #define KIPC_SNDF_NSFPGS        2       /* number of sendfile cached
pages */
+ #define KIPC_SNDF_NLRUPGS       3       /* number of sendfile free
pages */
+ #define KIPC_SNDF_TOSEC         4
+
+ SYSCTL_INT(_kern_sendfile, KSNDF_NSFBUFS, nsfbufs, CTLFLAG_RD,
+     &nsfbufs, 0, "");
+ SYSCTL_INT(_kern_sendfile, KSNDF_NSFPGS, numsfpages, CTLFLAG_RD,
+     &numsfpages, 0, "");
+ SYSCTL_INT(_kern_sendfile, KSNDF_NLRUPGS, lrusfpages, CTLFLAG_RD,
+     &lrusfpages, 0, "");
+ SYSCTL_INT(_kern_sendfile, KSNDF_TOSEC, sf_cache_timeout_sec,
CTLFLAG_RW, &sf_cache_timeout_sec, 0, "");
+ /* end */
+
Index: kern/uipc_syscalls.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.65.2.9.6.1
diff -c -r1.65.2.9.6.1 uipc_syscalls.c
*** kern/uipc_syscalls.c 13 Aug 2002 12:12:41 -0000 1.65.2.9.6.1
--- kern/uipc_syscalls.c 15 Aug 2003 19:38:07 -0000
***************
*** 59,64 ****
--- 59,68 ----
  #include <sys/vnode.h>
  #include <sys/lock.h>
  #include <sys/mount.h>
+ #include <sys/syslog.h>
+ #include <sys/sysctl.h>
+ #include <sys/uio.h>
+ #include <sys/queue.h>
  #ifdef KTRACE
  #include <sys/ktrace.h>
  #endif
***************
*** 74,79 ****
--- 78,86 ----
  static struct sf_buf *sf_buf_alloc(void);
  static void sf_buf_ref(caddr_t addr, u_int size);
  static void sf_buf_free(caddr_t addr, u_int size);
+ static void sf_buf_timeout(void *arg);
+ static struct sf_buf *sf_page_lookup(vm_page_t pg);
+ static void sf_cache_insert(vm_page_t pg, struct sf_buf *sf);

  static int sendit __P((struct proc *p, int s, struct msghdr *mp, int
flags));
  static int recvit __P((struct proc *p, int s, struct msghdr *mp,
***************
*** 1422,1427 ****
--- 1429,1487 ----
   return(error);
  }

+
+ /*** support for sendfile optimization ***/
+
+ /* global variables exported by sysctl:
+  * lrusfpages: # of sendfile buffers in LRU list (available free
buffer list)
+  * numsfpages: # of sendfile buffers cached
+  * sf_cache_timeout_sec: timeout after the last sendfile system call
in seconds
+  *             Any positive number enables the caching
+  *             Set to 0 disables the sendfile buffer caching (default
setting)
+  */
+
+ int numsfpages;
+ int lrusfpages;
+ int sf_cache_timeout_sec;
+
+ typedef struct sf_buf *sf_buf_p;
+ static sf_buf_p *sfcBins;
+ static struct sf_buf *sfc_head, *sfc_tail;
+ static struct callout sf_timeout;
+ static int sf_tticks, sf_prev_tsec;
+
+ #define address_hash(key)  \
+   ((((unsigned long)(key) >> 3)) & (nsfbufs - 1))
+
+ /*
+   ((((unsigned int)(key) >> 3)  * 2654435761) & SFC_MASK)
+ */
+
+ #define sf_buf_LRU_remove(ent)               \
+ {                                            \
+   lrusfpages--; \
+   if ((ent)->sf_prev)                        \
+     (ent)->sf_prev->sf_next = (ent)->sf_next;\
+   else                                       \
+     sfc_head = (ent)->sf_next;               \
+   if ((ent)->sf_next)                        \
+     (ent)->sf_next->sf_prev = (ent)->sf_prev;\
+   else                                       \
+     sfc_tail = (ent)->sf_prev;               \
+ }
+
+ #define sf_buf_LRU_head_insert(ent) \
+ {                                   \
+   lrusfpages++; \
+   (ent)->sf_prev = NULL;            \
+   (ent)->sf_next = sfc_head;        \
+   if (sfc_head)                     \
+     sfc_head->sf_prev = (ent);      \
+   else                              \
+     sfc_tail = (ent);               \
+   sfc_head = (ent);                 \
+ }
+
  /*
   * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you
prefer. :-))
   * XXX - The sf_buf functions are currently private to sendfile(2), so
have
***************
*** 1432,1437 ****
--- 1492,1498 ----
  sf_buf_init(void *arg)
  {
   int i;
+  struct sf_buf *sf;

   SLIST_INIT(&sf_freelist);
   sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE);
***************
*** 1439,1524 ****
   bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
   for (i = 0; i < nsfbufs; i++) {
    sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
!   SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list);
   }
  }

  /*
!  * Get an sf_buf from the freelist. Will block if none are available.
   */
  static struct sf_buf *
  sf_buf_alloc()
  {
   struct sf_buf *sf;
   int s;
   int error;

   s = splimp();
!  while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) {
    sf_buf_alloc_want = 1;
!   error = tsleep(&sf_freelist, PVM|PCATCH, "sfbufa", 0);
    if (error)
     break;
   }
   if (sf != NULL) {
!   SLIST_REMOVE_HEAD(&sf_freelist, free_list);
!   sf->refcnt = 1;
   }
   splx(s);
   return (sf);
  }

- #define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >>
PAGE_SHIFT])
  static void
! sf_buf_ref(caddr_t addr, u_int size)
  {
   struct sf_buf *sf;

!  sf = dtosf(addr);
!  if (sf->refcnt == 0)
!   panic("sf_buf_ref: referencing a free sf_buf");
!  sf->refcnt++;
  }

! /*
!  * Lose a reference to an sf_buf. When none left, detach mapped page
!  * and release resources back to the system.
!  *
!  * Must be called at splimp.
   */
  static void
! sf_buf_free(caddr_t addr, u_int size)
  {
!  struct sf_buf *sf;
!  struct vm_page *m;
!  int s;

!  sf = dtosf(addr);
!  if (sf->refcnt == 0)
!   panic("sf_buf_free: freeing free sf_buf");
!  sf->refcnt--;
!  if (sf->refcnt == 0) {
!   pmap_qremove((vm_offset_t)addr, 1);
!   m = sf->m;
!   s = splvm();
!   vm_page_unwire(m, 0);
!   /*
!    * Check for the object going away on us. This can
!    * happen since we don't hold a reference to it.
!    * If so, we're responsible for freeing the page.
!    */
!   if (m->wire_count == 0 && m->object == NULL)
!    vm_page_free(m);
!   splx(s);
!   sf->m = NULL;
!   SLIST_INSERT_HEAD(&sf_freelist, sf, free_list);
!   if (sf_buf_alloc_want) {
!    sf_buf_alloc_want = 0;
!    wakeup(&sf_freelist);
!   }
!  }
  }

  /*
   * sendfile(2).
   * int sendfile(int fd, int s, off_t offset, size_t nbytes,
--- 1500,1790 ----
   bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf));
   for (i = 0; i < nsfbufs; i++) {
    sf_bufs[i].kva = sf_base + i * PAGE_SIZE;
!
!   /* create the LRU list*/
!   sf = &sf_bufs[i];
!   sf_buf_LRU_head_insert(sf);
!   sf->sf_isOnLRU = 1;
!   callout_init(&sf_timeout);
   }
+
+  /* allocate number of nsfbufs cache entries (hash bins) */
+  sfcBins = malloc(nsfbufs * sizeof(sf_buf_p), M_TEMP, M_NOWAIT);
+  bzero(sfcBins, nsfbufs * sizeof(sf_buf_p));
+
+  sf_cache_timeout_sec = 300;
+ }
+
+ #define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >>
PAGE_SHIFT])
+ static void
+ sf_buf_ref(caddr_t addr, u_int size)
+ {
+  struct sf_buf *sf;
+
+  sf = dtosf(addr);
+  if (sf->refcnt == 0) {
+    panic("sf_buf_ref: referencing a free sf_buf");
+  }
+  sf->refcnt++;
+
+  return;
+ }
+
+ /* Remove pmap entry, free wired pages. Called by
+  * 1. sf_buf_free if sf_buf cache is NOT enabled
+  * 2. sf_buf_alloc if the page on the LRU tail needs to be freed
+  * 3. sf_buf_timeout
+  */
+ static void
+ sf_page_free(struct sf_buf *sf)
+ {
+   struct vm_page *m;
+   int s;
+
+   numsfpages--;
+
+   pmap_qremove((vm_offset_t)sf->kva, 1);
+   m = sf->m;
+   s = splvm();
+   vm_page_unwire(m, 0);
+   /*
+    * Check for the object going away on us. This can
+    * happen since we don't hold a reference to it.
+    * If so, we're responsible for freeing the page.
+    */
+   if (m->wire_count == 0 && m->object == NULL)
+     vm_page_free(m);
+   splx(s);
+   sf->m = NULL;
  }

  /*
!  * Lose a reference to an sf_buf. When none left, detach mapped page
!  * and release resources back to the system.
!  *
!  * Must be called at splimp.
!  */
! static void
! sf_buf_free(caddr_t addr, u_int size)
! {
!  struct sf_buf *sf;
!
!  sf = dtosf(addr);
!  if (sf->refcnt == 0)
!    panic("sf_buf_free: freeing free sf_buf");
!
!  sf->refcnt--;
!
!  /* Don't free the page if caching is enabled */
!  if (sf_cache_timeout_sec && (sf->refcnt == 1) ) {
!    sf_buf_LRU_head_insert(sf);
!    sf->sf_isOnLRU = 1;
!    if (sf_buf_alloc_want) {
!      sf_buf_alloc_want = 0;
!      wakeup(sfc_tail);
!    }
!    return;
!  }
!
!  /* Code reaches here only when caching is NOT enabled */
!  if (sf->refcnt == 0) {
!
!    sf_page_free(sf);
!
!    /* Also use LRU list for the free buffer list */
!    if (!sf->sf_isOnLRU) {
!      sf_buf_LRU_head_insert(sf);
!      sf->sf_isOnLRU = 1;
!      if (sf_buf_alloc_want) {
!        sf_buf_alloc_want = 0;
!        wakeup(sfc_tail);
!      }
!    }
!  }
!
!  return;
! }
!
! /* When caching is NOT enabled (sf_cache_timeout_sec == 0):
!  *   Grab the LRU list tail and simply return.
!  * When caching is enabled (sf_cache_timeout_sec > 0):
!  *   Free tail from the LRU list
!  *   Remove from the hash entry
!  *   Free the wired page
   */
  static struct sf_buf *
  sf_buf_alloc()
  {
   struct sf_buf *sf;
   int s;
+  int hashBin;
+  vm_page_t pg;
   int error;

+  numsfpages ++;
+
   s = splimp();
!  while ((sf = sfc_tail) == NULL) {
    sf_buf_alloc_want = 1;
!
!   /* this should rarely happen when caching is enabled */
!   log(LOG_INFO, "sfbufa");
!
!   error = tsleep(sfc_tail, PVM|PCATCH, "sfbufa", 0);
    if (error)
     break;
   }
+
   if (sf != NULL) {
!    sf_buf_LRU_remove(sf);
!
!    /* Buffers in the list should have refcnt 1 or 0 */
!    if (sf->refcnt > 1)
!      panic("sf_cached_buf_allo: refcnt > 1");
!
!    /* need to free */
!    if (sf->refcnt == 1) {
!      struct sf_buf *walk;
!
!      /* remove from Hash entry */
!      pg = sf->m;
!      hashBin = address_hash(pg);
!
!      for (walk = sfcBins[hashBin]; walk; walk = walk->sf_nextHash) {
!        if (walk == sf) {
!
!   if (walk == sfcBins[hashBin]) {
!     if (walk->sf_nextHash)
!       walk->sf_nextHash->sf_prevHash = NULL;
!     sfcBins[hashBin] = walk->sf_nextHash;
!   } else {
!     if (walk->sf_nextHash)
!       walk->sf_nextHash->sf_prevHash = walk->sf_prevHash;
!     *walk->sf_prevHash = walk->sf_nextHash;
!   }
!
!   break;
!        }
!      }
!
!      /* free from pmap and unwire the page */
!      sf->refcnt --;
!      sf_page_free(sf);
!    }
!
!    sf->refcnt = 1;
!    sf->sf_isOnLRU = 0;
   }
+
   splx(s);
   return (sf);
  }

  static void
! sf_buf_timeout(void *arg)
  {
   struct sf_buf *sf;

!  /* Some buffers are still in using, haven't been called by
sf_buf_free
!          * Wait a while then come back again
!          */
!  if (lrusfpages != nsfbufs) {
!    log(LOG_INFO, "sendfile timeout delayed\n");
!    callout_reset(&sf_timeout, 10000, sf_buf_timeout, (void *) 0);
!    return;
!  }
!
!  for (sf = sfc_tail; sf; sf = sf->sf_prev) {
!
!    /* Clear link fields for hash entry*/
!    sf->sf_nextHash = NULL;
!    sf->sf_prevHash = NULL;
!
!    if (sf->refcnt == 0)
!      continue;
!
!    sf->refcnt--;
!
!    /* free wired pages */
!    if (sf->refcnt == 0)
!      sf_page_free(sf);
!    else
!      panic("sf_buf_timeout: refcnt not 0");
!  }
!
!  /* need to clear hash bins */
!  bzero(sfcBins, nsfbufs * sizeof(sf_buf_p));
!
!  return;
  }

! /* Return cached sf_buf, bring it to the LRU head
!  * The target sf_buf could be in LRU list already
!  * or still an active one. So we need to increase refcnt in advance
   */
+ static struct sf_buf *
+ sf_page_lookup(vm_page_t pg)
+ {
+   int hashBin;
+   struct sf_buf *walk, *ret = NULL;
+   int s;
+
+   if (!sf_cache_timeout_sec)
+     return (NULL);
+
+   s = splimp();
+
+   hashBin = address_hash(pg);
+
+   for (walk = sfcBins[hashBin]; walk; walk = walk->sf_nextHash) {
+     if (walk->m == pg) {
+       /* move to LRU head */
+       if (walk->sf_isOnLRU) {
+  sf_buf_LRU_remove(walk);
+       }
+
+       walk->sf_isOnLRU = 0;
+
+       /* increase ref_cnt in advance to avoid being freed */
+       walk->refcnt++;
+
+       ret = walk;
+       break;
+     }
+   }
+
+   splx(s);
+
+   return (ret);
+ }
+
+ /* This is a newly wired page, put into cache entry */
  static void
! sf_cache_insert(vm_page_t pg, struct sf_buf *sf)
  {
!   int hashBin;

!   if (!sf_cache_timeout_sec)
!     return;
!
!   hashBin = address_hash(pg);
!
!   /* insert into Hash entry */
!   sf->sf_prevHash = &(sfcBins[hashBin]);
!   sf->sf_nextHash = sfcBins[hashBin];
!
!   if (sfcBins[hashBin])
!     (sfcBins[hashBin])->sf_prevHash = &sf->sf_nextHash;
!   sfcBins[hashBin] = sf;
!
!   /* increase ref_cnt in advance to avoid being freed */
!   sf->refcnt++;
!
!   return;
  }

+ /*** end ***/
+
  /*
   * sendfile(2).
   * int sendfile(int fd, int s, off_t offset, size_t nbytes,
***************
*** 1529,1534 ****
--- 1795,1805 ----
   * nbytes == 0. Optionally add a header and/or trailer to the socket
   * output. If specified, write the total number of bytes sent into
*sbytes.
   */
+
+ /*** if flags is specified to 1, don't initiate IO if the page is not
valid
+  *** but return 999 instead
+  ***/
+
  int
  sendfile(struct proc *p, struct sendfile_args *uap)
  {
***************
*** 1537,1543 ****
   struct vnode *vp;
   struct vm_object *obj;
   struct socket *so;
!  struct mbuf *m;
   struct sf_buf *sf;
   struct vm_page *pg;
   struct writev_args nuap;
--- 1808,1814 ----
   struct vnode *vp;
   struct vm_object *obj;
   struct socket *so;
!  struct mbuf *m, *hm = NULL;
   struct sf_buf *sf;
   struct vm_page *pg;
   struct writev_args nuap;
***************
*** 1545,1555 ****
   off_t off, xfsize, sbytes = 0;
   int error = 0, s;

   vp = NULL;
!  /*
!   * Do argument checking. Must be a regular file in, stream
!   * type and connected socket out, positive offset.
!   */
   fp = holdfp(fdp, uap->fd, FREAD);
   if (fp == NULL) {
    error = EBADF;
--- 1816,1825 ----
   off_t off, xfsize, sbytes = 0;
   int error = 0, s;

+  int headSent = 0, head_len = 0;
+
   vp = NULL;
!
   fp = holdfp(fdp, uap->fd, FREAD);
   if (fp == NULL) {
    error = EBADF;
***************
*** 1583,1631 ****
    goto done;
   }

-  /*
-   * If specified, get the pointer to the sf_hdtr struct for
-   * any headers/trailers.
-   */
   if (uap->hdtr != NULL) {
    error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
    if (error)
     goto done;
-   /*
-    * Send any headers. Wimp out and use writev(2).
-    */
-   if (hdtr.headers != NULL) {
-    nuap.fd = uap->s;
-    nuap.iovp = hdtr.headers;
-    nuap.iovcnt = hdtr.hdr_cnt;
-    error = writev(p, &nuap);
-    if (error)
-     goto done;
-    sbytes += p->p_retval[0];
-   }
   }

-  /*
-   * Protect against multiple writers to the socket.
-   */
   (void) sblock(&so->so_snd, M_WAITOK);

!  /*
!   * Loop through the pages in the file, starting with the requested
!   * offset. Get a file page (do I/O if necessary), map the file page
!   * into an sf_buf, attach an mbuf header to the sf_buf, and queue
!   * it on the socket.
!   */
   for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
    vm_pindex_t pindex;
    vm_offset_t pgoff;

    pindex = OFF_TO_IDX(off);
  retry_lookup:
-   /*
-    * Calculate the amount to transfer. Not to exceed a page,
-    * the EOF, or the passed in nbytes.
-    */
    xfsize = obj->un_pager.vnp.vnp_size - off;
    if (xfsize > PAGE_SIZE)
     xfsize = PAGE_SIZE;
--- 1853,1886 ----
    goto done;
   }

   if (uap->hdtr != NULL) {
    error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
    if (error)
     goto done;
   }

   (void) sblock(&so->so_snd, M_WAITOK);

!  /* finally timeout setup could be moved into sf_buf_init */
!  if (sf_cache_timeout_sec != sf_prev_tsec) {
!    struct timeval tv;
!
!    tv.tv_sec = sf_cache_timeout_sec;
!    tv.tv_usec = 0;
!
!    sf_tticks = tvtohz(&tv);
!    sf_prev_tsec = sf_cache_timeout_sec;
!  }
!
!  if (sf_cache_timeout_sec)
!    callout_reset(&sf_timeout, sf_tticks, sf_buf_timeout, (void *) 0);
!
   for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
    vm_pindex_t pindex;
    vm_offset_t pgoff;

    pindex = OFF_TO_IDX(off);
  retry_lookup:
    xfsize = obj->un_pager.vnp.vnp_size - off;
    if (xfsize > PAGE_SIZE)
     xfsize = PAGE_SIZE;
***************
*** 1636,1645 ****
     xfsize = uap->nbytes - sbytes;
    if (xfsize <= 0)
     break;
-   /*
-    * Optimize the non-blocking case by looking at the socket space
-    * before going to the extra work of constituting the sf_buf.
-    */
    if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
     if (so->so_state & SS_CANTSENDMORE)
      error = EPIPE;
--- 1891,1896 ----
***************
*** 1647,1689 ****
      error = EAGAIN;
     sbunlock(&so->so_snd);
     goto done;
!   }
!   /*
!    * Attempt to look up the page.
!    *
!    * Allocate if not found
!    *
!    * Wait and loop if busy.
!    */
    pg = vm_page_lookup(obj, pindex);

!   if (pg == NULL) {
!    pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL);
!    if (pg == NULL) {
!     VM_WAIT;
!     goto retry_lookup;
!    }
!    vm_page_wakeup(pg);
    } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
!    goto retry_lookup;
    }

-   /*
-    * Wire the page so it does not get ripped out from under
-    * us.
-    */
-
    vm_page_wire(pg);

!   /*
!    * If page is not valid for what we need, initiate I/O
     */

    if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
     struct uio auio;
     struct iovec aiov;
     int bsize;
-
     /*
      * Ensure that our page is still around when the I/O
      * completes.
--- 1898,1936 ----
      error = EAGAIN;
     sbunlock(&so->so_snd);
     goto done;
!                 }
!
!   /* modified part begin */
!
!   /* look up in the hot page cache */
!
    pg = vm_page_lookup(obj, pindex);

!                 if (pg == NULL) {
!
!       if (!(pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL))) {
!         VM_WAIT;
!         goto retry_lookup;
!       }
!       vm_page_wakeup(pg);
    } else if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) {
!       goto retry_lookup;
!   } else if ((sf = sf_page_lookup(pg))) {
!       goto begin_send;
    }

    vm_page_wire(pg);

!   /* If page is not valid for what we need
!    * initiate I/O if flag is 0
!    * return 999 if flag is SF_NONIO
     */

    if (!pg->valid || !vm_page_is_valid(pg, pgoff, xfsize)) {
+     if (uap->flags == 0) {
     struct uio auio;
     struct iovec aiov;
     int bsize;
     /*
      * Ensure that our page is still around when the I/O
      * completes.
***************
*** 1724,1736 ****
      sbunlock(&so->so_snd);
      goto done;
     }
!   }
!

!   /*
!    * Get a sendfile buf. We usually wait as long as necessary,
!    * but this wait can be interrupted.
!    */
    if ((sf = sf_buf_alloc()) == NULL) {
     s = splvm();
     vm_page_unwire(pg, 0);
--- 1971,1992 ----
      sbunlock(&so->so_snd);
      goto done;
     }
!     }
!     else if (uap->flags == SF_NONIO) {
!    vm_page_unwire(pg, 0);
!    if (pg->wire_count == 0 && pg->valid == 0 &&
!        pg->busy == 0 && !(pg->flags & PG_BUSY) &&
!        pg->hold_count == 0) {
!      vm_page_busy(pg);
!      vm_page_free(pg);
!    }
!    sbunlock(&so->so_snd);
!    error = 999;
!    goto done;
!     }
!   } /* page is not valid */

!     /* Get a sendfile buf */
    if ((sf = sf_buf_alloc()) == NULL) {
     s = splvm();
     vm_page_unwire(pg, 0);
***************
*** 1742,1788 ****
     goto done;
    }

-
-   /*
-    * Allocate a kernel virtual page and insert the physical page
-    * into it.
-    */
-
    sf->m = pg;
    pmap_qenter(sf->kva, &pg, 1);
!   /*
!    * Get an mbuf header and set it up as having external storage.
!    */
    MGETHDR(m, M_WAIT, MT_DATA);
    if (m == NULL) {
     error = ENOBUFS;
-    sf_buf_free((void *)sf->kva, PAGE_SIZE);
     sbunlock(&so->so_snd);
     goto done;
    }
    m->m_ext.ext_free = sf_buf_free;
    m->m_ext.ext_ref = sf_buf_ref;
    m->m_ext.ext_buf = (void *)sf->kva;
    m->m_ext.ext_size = PAGE_SIZE;
    m->m_data = (char *) sf->kva + pgoff;
    m->m_flags |= M_EXT;
!   m->m_pkthdr.len = m->m_len = xfsize;
!   /*
!    * Add the buffer to the socket buffer chain.
!    */
    s = splnet();
  retry_space:
-   /*
-    * Make sure that the socket is still able to take more data.
-    * CANTSENDMORE being true usually means that the connection
-    * was closed. so_error is true when an error was sensed after
-    * a previous send.
-    * The state is checked after the page mapping and buffer
-    * allocation above since those operations may block and make
-    * any socket checks stale. From this point forward, nothing
-    * blocks before the pru_send (or more accurately, any blocking
-    * results in a loop back to here to re-check).
-    */
    if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
     if (so->so_state & SS_CANTSENDMORE) {
      error = EPIPE;
--- 1998,2070 ----
     goto done;
    }

    sf->m = pg;
    pmap_qenter(sf->kva, &pg, 1);
!
!   sf_cache_insert(pg, sf);
!
!   /* get mbuf and chain into the socket */
! begin_send:
    MGETHDR(m, M_WAIT, MT_DATA);
    if (m == NULL) {
     error = ENOBUFS;
     sbunlock(&so->so_snd);
     goto done;
    }
+   /* only decrease ref count, but not free page */
+
    m->m_ext.ext_free = sf_buf_free;
    m->m_ext.ext_ref = sf_buf_ref;
    m->m_ext.ext_buf = (void *)sf->kva;
    m->m_ext.ext_size = PAGE_SIZE;
    m->m_data = (char *) sf->kva + pgoff;
    m->m_flags |= M_EXT;
!   m->m_len = xfsize;
!
!   /* assemble header into the first packet as mbuf chain */
!   if (uap->hdtr != NULL &&
!       hdtr.headers != NULL &&
!       !headSent) {
!     int i;
!     struct iovec *iov;
!
!                MGETHDR(hm, M_WAIT, MT_DATA);
!     if (hm == NULL) {
!       error = ENOBUFS;
!       m_freem(m);
!       sbunlock(&so->so_snd);
!       goto done;
!     }
!
!     /* get a cluster for header space */
!     MCLGET(hm, M_WAIT);
!     if ((hm->m_flags & M_EXT) == 0) {
!       m_freem(m);
!       sbunlock(&so->so_snd);
!       goto done;
!     }
!
!     /* any limit for cluster size 2048 ??? */
!     for (i = 0; i < hdtr.hdr_cnt; i++) {
!       iov = &(uap->hdtr->headers[i]);
!       error = copyin(iov->iov_base,
!        (char *)(hm->m_data + head_len),
!        iov->iov_len);
!       head_len += iov->iov_len;
!     }
!
!     if (head_len > 2048)
!       log(LOG_INFO, "sendfile header length > 2048");
!
!     hm->m_pkthdr.len = head_len + xfsize;
!     hm->m_len = head_len;
!     hm->m_next = m;
!   }
!   else
!     m->m_pkthdr.len = xfsize;
!
    s = splnet();
  retry_space:
    if ((so->so_state & SS_CANTSENDMORE) || so->so_error) {
     if (so->so_state & SS_CANTSENDMORE) {
      error = EPIPE;
***************
*** 1790,1849 ****
      error = so->so_error;
      so->so_error = 0;
     }
!    m_freem(m);
     sbunlock(&so->so_snd);
     splx(s);
     goto done;
    }
!   /*
!    * Wait for socket space to become available. We do this just
!    * after checking the connection state above in order to avoid
!    * a race condition with sbwait().
!    */
    if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
     if (so->so_state & SS_NBIO) {
!     m_freem(m);
      sbunlock(&so->so_snd);
      splx(s);
      error = EAGAIN;
      goto done;
     }
     error = sbwait(&so->so_snd);
-    /*
-     * An error from sbwait usually indicates that we've
-     * been interrupted by a signal. If we've sent anything
-     * then return bytes sent, otherwise return the error.
-     */
     if (error) {
!     m_freem(m);
      sbunlock(&so->so_snd);
      splx(s);
      goto done;
     }
     goto retry_space;
    }
!   error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
    splx(s);
    if (error) {
     sbunlock(&so->so_snd);
     goto done;
    }
   }
   sbunlock(&so->so_snd);

-  /*
-   * Send trailers. Wimp out and use writev(2).
-   */
   if (uap->hdtr != NULL && hdtr.trailers != NULL) {
     nuap.fd = uap->s;
     nuap.iovp = hdtr.trailers;
     nuap.iovcnt = hdtr.trl_cnt;
     error = writev(p, &nuap);
!    if (error)
      goto done;
     sbytes += p->p_retval[0];
   }
-
  done:
   if (uap->sbytes != NULL) {
    copyout(&sbytes, uap->sbytes, sizeof(off_t));
--- 2072,2149 ----
      error = so->so_error;
      so->so_error = 0;
     }
!    /*
!    if (hm && !headSent)
!      m_freem(hm);
!    else
!    */
!      m_freem(m);
     sbunlock(&so->so_snd);
     splx(s);
     goto done;
    }
!
    if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
     if (so->so_state & SS_NBIO) {
!      /*
!     if (hm && !headSent)
!       m_freem(hm);
!     else
!      */
!       m_freem(m);
      sbunlock(&so->so_snd);
      splx(s);
      error = EAGAIN;
      goto done;
     }
     error = sbwait(&so->so_snd);
     if (error) {
!      /*
!     if (hm && !headSent)
!       m_freem(hm);
!     else
!      */
!       m_freem(m);
      sbunlock(&so->so_snd);
      splx(s);
      goto done;
     }
     goto retry_space;
    }
!
!   if (uap->hdtr != NULL &&
!       hdtr.headers != NULL &&
!       !headSent)
!     error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, hm, 0, 0, p);

!   else
!     error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p);
!
    splx(s);
    if (error) {
     sbunlock(&so->so_snd);
     goto done;
    }
+
+   if (uap->hdtr != NULL &&
+       hdtr.headers != NULL &&
+       !headSent) {
+     sbytes += head_len;
+     headSent = 1;
+   }
+
   }
   sbunlock(&so->so_snd);

   if (uap->hdtr != NULL && hdtr.trailers != NULL) {
     nuap.fd = uap->s;
     nuap.iovp = hdtr.trailers;
     nuap.iovcnt = hdtr.trl_cnt;
     error = writev(p, &nuap);
!    if (error) {
      goto done;
+    }
     sbytes += p->p_retval[0];
   }
  done:
   if (uap->sbytes != NULL) {
    copyout(&sbytes, uap->sbytes, sizeof(off_t));
***************
*** 1852,1856 ****
--- 2152,2157 ----
    vrele(vp);
   if (fp)
    fdrop(fp, p);
+
   return (error);
  }
Index: sys/socketvar.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/socketvar.h,v
retrieving revision 1.46.2.8
diff -c -r1.46.2.8 socketvar.h
*** sys/socketvar.h 1 May 2002 03:26:32 -0000 1.46.2.8
--- sys/socketvar.h 15 Aug 2003 19:38:09 -0000
***************
*** 274,280 ****
--- 274,288 ----
   int  refcnt;  /* reference count */
   struct  vm_page *m; /* currently mapped page */
   vm_offset_t kva;  /* va of mapping */
+   struct sf_buf *sf_next;               /* next in LRU */
+   struct sf_buf *sf_prev;               /* prev in LRU */
+   struct sf_buf *sf_nextHash;           /* next in hash entry */
+   struct sf_buf **sf_prevHash;          /* prev in hash entry */
+   int sf_isOnLRU;
  };
+
+ /* sendfile flags */
+ #define SF_NONIO 1

  struct accept_filter {
   char accf_name[16];
Index: sys/sysctl.h
===================================================================
RCS file: /home/ncvs/src/sys/sys/sysctl.h,v
retrieving revision 1.81.2.8
diff -c -r1.81.2.8 sysctl.h
*** sys/sysctl.h 17 Mar 2002 11:08:38 -0000 1.81.2.8
--- sys/sysctl.h 15 Aug 2003 19:38:10 -0000
***************
*** 331,337 ****
  #define KERN_PS_STRINGS  32 /* int: address of PS_STRINGS */
  #define KERN_USRSTACK  33 /* int: address of USRSTACK */
  #define KERN_LOGSIGEXIT  34 /* int: do we log sigexit procs? */
! #define KERN_MAXID  35      /* number of valid kern ids */

  #define CTL_KERN_NAMES { \
   { 0, 0 }, \
--- 331,338 ----
  #define KERN_PS_STRINGS  32 /* int: address of PS_STRINGS */
  #define KERN_USRSTACK  33 /* int: address of USRSTACK */
  #define KERN_LOGSIGEXIT  34 /* int: do we log sigexit procs? */
! #define KERN_SENDFILE           35      /* sendfile statistics */
! #define KERN_MAXID  36      /* number of valid kern ids */

  #define CTL_KERN_NAMES { \
   { 0, 0 }, \
***************
*** 369,374 ****
--- 370,376 ----
   { "ps_strings", CTLTYPE_INT }, \
   { "usrstack", CTLTYPE_INT }, \
   { "logsigexit", CTLTYPE_INT }, \
+         { "sendfile", CTLTYPE_NODE }, \
  }

  /*
***************
*** 402,407 ****
--- 404,418 ----
  #define KIPC_MAX_DATALEN 7 /* int: max length of data? */
  #define KIPC_MBSTAT  8 /* struct: mbuf usage statistics */
  #define KIPC_NMBCLUSTERS 9 /* int: maximum mbuf clusters */
+
+ #define KSNDF_NSFBUFS       1       /* number of sendfile bufs */
+ #define KSNDF_NSFPGS        2       /* number of sendfile cached pages
*/
+ #define KSNDF_NLRUPGS       3       /* number of sendfile free pages
*/
+ #define KSNDF_TOSEC         4       /* sendfile cache timeout in
seconds */
+
+ extern int numsfpages;
+ extern int lrusfpages;
+ extern int sf_cache_timeout_sec;

  /*
   * CTL_HW identifiers





More information about the freebsd-hackers mailing list