svn commit: r268535 - in head/sys: kern sys

Adrian Chadd adrian at freebsd.org
Fri Jul 11 19:48:41 UTC 2014


I'm so glad this finally made it in!

Thanks!


-a


On 11 July 2014 12:40, Gleb Smirnoff <glebius at freebsd.org> wrote:
> Author: glebius
> Date: Fri Jul 11 19:40:50 2014
> New Revision: 268535
> URL: http://svnweb.freebsd.org/changeset/base/268535
>
> Log:
>   Improve reference counting of EXT_SFBUF pages attached to mbufs.
>
>   o Do not use UMA refcount zone. The problem with this zone is that
>     several refcounting words (16 on amd64) share the same cache line,
>     and issueing atomic(9) updates on them creates cache line contention.
>     Also, allocating and freeing them is extra CPU cycles.
>     Instead, refcount the page directly via vm_page_wire() and the sfbuf
>     via sf_buf_alloc(sf_buf_page(sf)) [1].
>
>   o Call refcounting/freeing function for EXT_SFBUF via direct function
>     call, instead of function pointer. This removes barrier for CPU
>     branch predictor.
>
>   o Do not cleanup the mbuf to be freed in mb_free_ext(), merely to
>     satisfy assertion in mb_dtor_mbuf(). Remove the assertion from
>     mb_dtor_mbuf(). Use bcopy() instead of manual assignments to
>     copy m_ext in mb_dupcl().
>
>   [1] This has some problems for now. Using sf_buf_alloc() merely to
>       increase refcount is expensive, and is broken on sparc64. To be
>       fixed.
>
>   Sponsored by: Netflix
>   Sponsored by: Nginx, Inc.
>
> Modified:
>   head/sys/kern/kern_mbuf.c
>   head/sys/kern/uipc_mbuf.c
>   head/sys/kern/uipc_syscalls.c
>   head/sys/sys/mbuf.h
>   head/sys/sys/sf_buf.h
>
> Modified: head/sys/kern/kern_mbuf.c
> ==============================================================================
> --- head/sys/kern/kern_mbuf.c   Fri Jul 11 17:31:40 2014        (r268534)
> +++ head/sys/kern/kern_mbuf.c   Fri Jul 11 19:40:50 2014        (r268535)
> @@ -449,7 +449,6 @@ mb_dtor_mbuf(void *mem, int size, void *
>
>         if ((m->m_flags & M_PKTHDR) && !SLIST_EMPTY(&m->m_pkthdr.tags))
>                 m_tag_delete_chain(m, NULL);
> -       KASSERT((m->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
>         KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));
>  #ifdef INVARIANTS
>         trash_dtor(mem, size, arg);
>
> Modified: head/sys/kern/uipc_mbuf.c
> ==============================================================================
> --- head/sys/kern/uipc_mbuf.c   Fri Jul 11 17:31:40 2014        (r268534)
> +++ head/sys/kern/uipc_mbuf.c   Fri Jul 11 19:40:50 2014        (r268535)
> @@ -287,19 +287,31 @@ m_extadd(struct mbuf *mb, caddr_t buf, u
>  void
>  mb_free_ext(struct mbuf *m)
>  {
> -       int skipmbuf;
> +       int freembuf;
>
> -       KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
> -       KASSERT(m->m_ext.ext_cnt != NULL, ("%s: ext_cnt not set", __func__));
> +       KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
>
>         /*
> -        * check if the header is embedded in the cluster
> +        * Check if the header is embedded in the cluster.
>          */
> -       skipmbuf = (m->m_flags & M_NOFREE);
> +       freembuf = (m->m_flags & M_NOFREE) ? 0 : 1;
> +
> +       switch (m->m_ext.ext_type) {
> +       case EXT_SFBUF:
> +               sf_ext_free(m->m_ext.ext_arg1, m->m_ext.ext_arg2);
> +               break;
> +       default:
> +               KASSERT(m->m_ext.ext_cnt != NULL,
> +                   ("%s: no refcounting pointer on %p", __func__, m));
> +               /*
> +                * Free attached storage if this mbuf is the only
> +                * reference to it.
> +                */
> +               if (*(m->m_ext.ext_cnt) != 1) {
> +                       if (atomic_fetchadd_int(m->m_ext.ext_cnt, -1) != 1)
> +                               break;
> +               }
>
> -       /* Free attached storage if this mbuf is the only reference to it. */
> -       if (*(m->m_ext.ext_cnt) == 1 ||
> -           atomic_fetchadd_int(m->m_ext.ext_cnt, -1) == 1) {
>                 switch (m->m_ext.ext_type) {
>                 case EXT_PACKET:        /* The packet zone is special. */
>                         if (*(m->m_ext.ext_cnt) == 0)
> @@ -318,7 +330,6 @@ mb_free_ext(struct mbuf *m)
>                 case EXT_JUMBO16:
>                         uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
>                         break;
> -               case EXT_SFBUF:
>                 case EXT_NET_DRV:
>                 case EXT_MOD_TYPE:
>                 case EXT_DISPOSABLE:
> @@ -337,23 +348,9 @@ mb_free_ext(struct mbuf *m)
>                                 ("%s: unknown ext_type", __func__));
>                 }
>         }
> -       if (skipmbuf)
> -               return;
>
> -       /*
> -        * Free this mbuf back to the mbuf zone with all m_ext
> -        * information purged.
> -        */
> -       m->m_ext.ext_buf = NULL;
> -       m->m_ext.ext_free = NULL;
> -       m->m_ext.ext_arg1 = NULL;
> -       m->m_ext.ext_arg2 = NULL;
> -       m->m_ext.ext_cnt = NULL;
> -       m->m_ext.ext_size = 0;
> -       m->m_ext.ext_type = 0;
> -       m->m_ext.ext_flags = 0;
> -       m->m_flags &= ~M_EXT;
> -       uma_zfree(zone_mbuf, m);
> +       if (freembuf)
> +               uma_zfree(zone_mbuf, m);
>  }
>
>  /*
> @@ -363,22 +360,24 @@ mb_free_ext(struct mbuf *m)
>  static void
>  mb_dupcl(struct mbuf *n, struct mbuf *m)
>  {
> -       KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
> -       KASSERT(m->m_ext.ext_cnt != NULL, ("%s: ext_cnt not set", __func__));
> -       KASSERT((n->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
>
> -       if (*(m->m_ext.ext_cnt) == 1)
> -               *(m->m_ext.ext_cnt) += 1;
> -       else
> -               atomic_add_int(m->m_ext.ext_cnt, 1);
> -       n->m_ext.ext_buf = m->m_ext.ext_buf;
> -       n->m_ext.ext_free = m->m_ext.ext_free;
> -       n->m_ext.ext_arg1 = m->m_ext.ext_arg1;
> -       n->m_ext.ext_arg2 = m->m_ext.ext_arg2;
> -       n->m_ext.ext_size = m->m_ext.ext_size;
> -       n->m_ext.ext_cnt = m->m_ext.ext_cnt;
> -       n->m_ext.ext_type = m->m_ext.ext_type;
> -       n->m_ext.ext_flags = m->m_ext.ext_flags;
> +       KASSERT(m->m_flags & M_EXT, ("%s: M_EXT not set on %p", __func__, m));
> +       KASSERT(!(n->m_flags & M_EXT), ("%s: M_EXT set on %p", __func__, n));
> +
> +       switch (m->m_ext.ext_type) {
> +       case EXT_SFBUF:
> +               sf_ext_ref(m->m_ext.ext_arg1, m->m_ext.ext_arg2);
> +               break;
> +       default:
> +               KASSERT(m->m_ext.ext_cnt != NULL,
> +                   ("%s: no refcounting pointer on %p", __func__, m));
> +               if (*(m->m_ext.ext_cnt) == 1)
> +                       *(m->m_ext.ext_cnt) += 1;
> +               else
> +                       atomic_add_int(m->m_ext.ext_cnt, 1);
> +       }
> +
> +       bcopy(&m->m_ext, &n->m_ext, sizeof(m->m_ext));
>         n->m_flags |= M_EXT;
>         n->m_flags |= m->m_flags & M_RDONLY;
>  }
>
> Modified: head/sys/kern/uipc_syscalls.c
> ==============================================================================
> --- head/sys/kern/uipc_syscalls.c       Fri Jul 11 17:31:40 2014        (r268534)
> +++ head/sys/kern/uipc_syscalls.c       Fri Jul 11 19:40:50 2014        (r268535)
> @@ -1983,32 +1983,56 @@ filt_sfsync(struct knote *kn, long hint)
>         return (ret);
>  }
>
> +/*
> + * Add more references to a vm_page + sf_buf + sendfile_sync.
> + */
> +void
> +sf_ext_ref(void *arg1, void *arg2)
> +{
> +       struct sf_buf *sf = arg1;
> +       struct sendfile_sync *sfs = arg2;
> +       vm_page_t pg = sf_buf_page(sf);
> +
> +       /* XXXGL: there should be sf_buf_ref() */
> +       sf_buf_alloc(sf_buf_page(sf), SFB_NOWAIT);
> +
> +       vm_page_lock(pg);
> +       vm_page_wire(pg);
> +       vm_page_unlock(pg);
> +
> +       if (sfs != NULL) {
> +               mtx_lock(&sfs->mtx);
> +               KASSERT(sfs->count > 0, ("Sendfile sync botchup count == 0"));
> +               sfs->count++;
> +               mtx_unlock(&sfs->mtx);
> +       }
> +}
>
>  /*
>   * Detach mapped page and release resources back to the system.
>   */
>  void
> -sf_buf_mext(struct mbuf *mb, void *addr, void *args)
> +sf_ext_free(void *arg1, void *arg2)
>  {
> -       vm_page_t m;
> -       struct sendfile_sync *sfs;
> +       struct sf_buf *sf = arg1;
> +       struct sendfile_sync *sfs = arg2;
> +       vm_page_t pg = sf_buf_page(sf);
> +
> +       sf_buf_free(sf);
>
> -       m = sf_buf_page(args);
> -       sf_buf_free(args);
> -       vm_page_lock(m);
> -       vm_page_unwire(m, PQ_INACTIVE);
> +       vm_page_lock(pg);
> +       vm_page_unwire(pg, PQ_INACTIVE);
>         /*
>          * Check for the object going away on us. This can
>          * happen since we don't hold a reference to it.
>          * If so, we're responsible for freeing the page.
>          */
> -       if (m->wire_count == 0 && m->object == NULL)
> -               vm_page_free(m);
> -       vm_page_unlock(m);
> -       if (addr != NULL) {
> -               sfs = addr;
> +       if (pg->wire_count == 0 && pg->object == NULL)
> +               vm_page_free(pg);
> +       vm_page_unlock(pg);
> +
> +       if (sfs != NULL)
>                 sf_sync_deref(sfs);
> -       }
>  }
>
>  /*
> @@ -2124,7 +2148,7 @@ sf_sync_alloc(uint32_t flags)
>  /*
>   * Take a reference to a sfsync instance.
>   *
> - * This has to map 1:1 to free calls coming in via sf_buf_mext(),
> + * This has to map 1:1 to free calls coming in via sf_ext_free(),
>   * so typically this will be referenced once for each mbuf allocated.
>   */
>  void
> @@ -3062,17 +3086,19 @@ retry_space:
>                         m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
>                         if (m0 == NULL) {
>                                 error = (mnw ? EAGAIN : ENOBUFS);
> -                               sf_buf_mext(NULL, NULL, sf);
> -                               break;
> -                       }
> -                       if (m_extadd(m0, (caddr_t )sf_buf_kva(sf), PAGE_SIZE,
> -                           sf_buf_mext, sfs, sf, M_RDONLY, EXT_SFBUF,
> -                           (mnw ? M_NOWAIT : M_WAITOK)) != 0) {
> -                               error = (mnw ? EAGAIN : ENOBUFS);
> -                               sf_buf_mext(NULL, NULL, sf);
> -                               m_freem(m0);
> +                               sf_ext_free(sf, NULL);
>                                 break;
>                         }
> +                       /*
> +                        * Attach EXT_SFBUF external storage.
> +                        */
> +                       m0->m_ext.ext_buf = (caddr_t )sf_buf_kva(sf);
> +                       m0->m_ext.ext_size = PAGE_SIZE;
> +                       m0->m_ext.ext_arg1 = sf;
> +                       m0->m_ext.ext_arg2 = sfs;
> +                       m0->m_ext.ext_type = EXT_SFBUF;
> +                       m0->m_ext.ext_flags = 0;
> +                       m0->m_flags |= (M_EXT|M_RDONLY);
>                         m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
>                         m0->m_len = xfsize;
>
>
> Modified: head/sys/sys/mbuf.h
> ==============================================================================
> --- head/sys/sys/mbuf.h Fri Jul 11 17:31:40 2014        (r268534)
> +++ head/sys/sys/mbuf.h Fri Jul 11 19:40:50 2014        (r268535)
> @@ -374,6 +374,12 @@ struct mbuf {
>      "\30EXT_FLAG_EXP4"
>
>  /*
> + * External reference/free functions.
> + */
> +void sf_ext_ref(void *, void *);
> +void sf_ext_free(void *, void *);
> +
> +/*
>   * Flags indicating checksum, segmentation and other offload work to be
>   * done, or already done, by hardware or lower layers.  It is split into
>   * separate inbound and outbound flags.
>
> Modified: head/sys/sys/sf_buf.h
> ==============================================================================
> --- head/sys/sys/sf_buf.h       Fri Jul 11 17:31:40 2014        (r268534)
> +++ head/sys/sys/sf_buf.h       Fri Jul 11 19:40:50 2014        (r268535)
> @@ -52,7 +52,6 @@ struct sfstat {                               /* sendfile statistic
>  #include <machine/sf_buf.h>
>  #include <sys/systm.h>
>  #include <sys/counter.h>
> -struct mbuf;   /* for sf_buf_mext() */
>
>  extern counter_u64_t sfstat[sizeof(struct sfstat) / sizeof(uint64_t)];
>  #define        SFSTAT_ADD(name, val)   \
> @@ -60,7 +59,4 @@ extern counter_u64_t sfstat[sizeof(struc
>         (val))
>  #define        SFSTAT_INC(name)        SFSTAT_ADD(name, 1)
>  #endif /* _KERNEL */
> -
> -void   sf_buf_mext(struct mbuf *mb, void *addr, void *args);
> -
>  #endif /* !_SYS_SF_BUF_H_ */
>


More information about the svn-src-head mailing list