[TMPFS] patch for FreeBSD 8.2-RELEASE

Maurizio Vairani maurizio.vairani at cloverinformatica.it
Mon Oct 17 08:24:59 UTC 2011


Hi list,

Gleb Kurtsou in this thread 
http://lists.freebsd.org/pipermail/freebsd-fs/2011-October/012650.html 
proposes a patch for solving the well-known TMPFS problem: the reported 
free space drops to zero when ZFS consumes the kernel memory and there 
isn't enough free swap space.

Unfortunately, the patch is not directly applicable to FreeBSD 
8.2-RELEASE, so I have modified the source code using Gleb's patch as a 
reference, then recompiled and installed the new driver. I have been 
testing it for a week on my AMD64 server with 16G of RAM, reducing the 
swap space from 28G to 8G, 4G or none, and it seems the problem is solved.

Regards
-Maurizio

/sys/fs/tmpfs/tmpfs.h
===================================================================
--- tmpfs.h.orig    2010-12-21 18:09:00.000000000 +0100 (v 1.17.2.2.2.1)
+++ tmpfs.h    2011-10-13 15:16:26.900043000 +0200         (working copy)
@@ -304,10 +304,30 @@

  #define TMPFS_NODE_LOCK(node) mtx_lock(&(node)->tn_interlock)
  #define TMPFS_NODE_UNLOCK(node) mtx_unlock(&(node)->tn_interlock)
-#define        TMPFS_NODE_MTX(node) (&(node)->tn_interlock)
+#define TMPFS_NODE_MTX(node) (&(node)->tn_interlock)
+
+#ifdef INVARIANTS
+#define TMPFS_ASSERT_LOCKED(node) do {                    \
+        MPASS(node != NULL);                    \
+        MPASS(node->tn_vnode != NULL);                \
+        if (!VOP_ISLOCKED(node->tn_vnode) &&            \
+            !mtx_owned(TMPFS_NODE_MTX(node)))            \
+            panic("tmpfs: node is not locked: %p", node);    \
+    } while (0)
+#define TMPFS_ASSERT_ELOCKED(node) do {                    \
+        MPASS((node) != NULL);                    \
+        MPASS((node)->tn_vnode != NULL);            \
+        mtx_assert(TMPFS_NODE_MTX(node), MA_OWNED);        \
+        ASSERT_VOP_LOCKED((node)->tn_vnode, "tmpfs");        \
+    } while (0)
+#else
+#define TMPFS_ASSERT_LOCKED(node) (void)0
+#define TMPFS_ASSERT_ELOCKED(node) (void)0
+#endif

  #define TMPFS_VNODE_ALLOCATING    1
  #define TMPFS_VNODE_WANT    2
+#define TMPFS_VNODE_DOOMED    4
  /* 
--------------------------------------------------------------------- */

  /*
@@ -467,65 +487,30 @@
   * Memory management stuff.
   */

-/* Amount of memory pages to reserve for the system (e.g., to not use by
- * tmpfs).
- * XXX: Should this be tunable through sysctl, for instance? */
-#define TMPFS_PAGES_RESERVED (4 * 1024 * 1024 / PAGE_SIZE)
-
  /*
- * Returns information about the number of available memory pages,
- * including physical and virtual ones.
- *
- * If 'total' is TRUE, the value returned is the total amount of memory
- * pages configured for the system (either in use or free).
- * If it is FALSE, the value returned is the amount of free memory pages.
- *
- * Remember to remove TMPFS_PAGES_RESERVED from the returned value to avoid
- * excessive memory usage.
- *
+ * Number of reserved swap pages should not be lower than
+ * swap_pager_almost_full high water mark.
   */
+#define TMPFS_SWAP_MINRESERVED        1024
+
  static __inline size_t
-tmpfs_mem_info(void)
+tmpfs_pages_max(struct tmpfs_mount *tmp)
  {
-    size_t size;
-
-    size = swap_pager_avail + cnt.v_free_count + cnt.v_inactive_count;
-    size -= size > cnt.v_wire_count ? cnt.v_wire_count : size;
-    return size;
+    return (tmp->tm_pages_max);
  }

-/* Returns the maximum size allowed for a tmpfs file system.  This macro
- * must be used instead of directly retrieving the value from tm_pages_max.
- * The reason is that the size of a tmpfs file system is dynamic: it lets
- * the user store files as long as there is enough free memory (including
- * physical memory and swap space).  Therefore, the amount of memory to be
- * used is either the limit imposed by the user during mount time or the
- * amount of available memory, whichever is lower.  To avoid consuming all
- * the memory for a given mount point, the system will always reserve a
- * minimum of TMPFS_PAGES_RESERVED pages, which is also taken into account
- * by this macro (see above). */
  static __inline size_t
-TMPFS_PAGES_MAX(struct tmpfs_mount *tmp)
+tmpfs_pages_used(struct tmpfs_mount *tmp)
  {
-    size_t freepages;
-
-    freepages = tmpfs_mem_info();
-    freepages -= freepages < TMPFS_PAGES_RESERVED ?
-        freepages : TMPFS_PAGES_RESERVED;
-
-    return MIN(tmp->tm_pages_max, freepages + tmp->tm_pages_used);
+    const size_t node_size = sizeof(struct tmpfs_node) +
+        sizeof(struct tmpfs_dirent);
+    size_t meta_pages;
+
+    meta_pages = howmany((uintmax_t)tmp->tm_nodes_inuse * node_size,
+        PAGE_SIZE);
+    return (meta_pages + tmp->tm_pages_used);
  }

-/* Returns the available space for the given file system. */
-#define TMPFS_META_PAGES(tmp) (howmany((tmp)->tm_nodes_inuse * 
(sizeof(struct tmpfs_node) \
-                + sizeof(struct tmpfs_dirent)), PAGE_SIZE))
-#define TMPFS_FILE_PAGES(tmp) ((tmp)->tm_pages_used)
-
-#define TMPFS_PAGES_AVAIL(tmp) (TMPFS_PAGES_MAX(tmp) > \
-            TMPFS_META_PAGES(tmp)+TMPFS_FILE_PAGES(tmp)? \
-            TMPFS_PAGES_MAX(tmp) - TMPFS_META_PAGES(tmp) \
-            - TMPFS_FILE_PAGES(tmp):0)
-
  #endif

  /* 
--------------------------------------------------------------------- */

/sys/fs/tmpfs/tmpfs_subr.c
===================================================================
--- tmpfs_subr.c.orig    2010-12-21 18:09:00.000000000 +0100 (v 
1.23.2.2.2.1)
+++ tmpfs_subr.c    2011-10-06 14:31:26.007163000 +0200     (working copy)
@@ -41,6 +41,7 @@
  #include <sys/priv.h>
  #include <sys/proc.h>
  #include <sys/stat.h>
+#include <sys/sysctl.h>
  #include <sys/systm.h>
  #include <sys/vnode.h>
  #include <sys/vmmeter.h>
@@ -55,6 +56,60 @@
  #include <fs/tmpfs/tmpfs_fifoops.h>
  #include <fs/tmpfs/tmpfs_vnops.h>

+static long tmpfs_swap_reserved = TMPFS_SWAP_MINRESERVED * 2;
+
+SYSCTL_NODE(_vfs, OID_AUTO, tmpfs, CTLFLAG_RW, 0, "tmpfs memory file 
system");
+
+static int
+sysctl_swap_reserved(SYSCTL_HANDLER_ARGS)
+{
+    int error;
+    long pages, bytes;
+
+    pages = *(long *)arg1;
+    bytes = pages * PAGE_SIZE;
+
+    error = sysctl_handle_long(oidp, &bytes, 0, req);
+    if (error || !req->newptr)
+        return (error);
+
+    pages = bytes / PAGE_SIZE;
+    if (pages < TMPFS_SWAP_MINRESERVED)
+        return (EINVAL);
+
+    *(long *)arg1 = pages;
+    return (0);
+}
+
+SYSCTL_PROC(_vfs_tmpfs, OID_AUTO, swap_reserved, CTLTYPE_LONG|CTLFLAG_RW,
+ &tmpfs_swap_reserved, 0, sysctl_swap_reserved, "L", "reserved swap 
space");
+
+static __inline size_t
+tmpfs_pages_avail(struct tmpfs_mount *tmp, size_t req_pages)
+{
+    vm_ooffset_t avail;
+
+    if (tmpfs_pages_max(tmp) < tmpfs_pages_used(tmp) + req_pages)
+        return (0);
+
+    if (!vm_page_count_target())
+        return (1);
+
+    /*
+     * Fail if pagedaemon wasn't able to free desired number of pages and
+     * we are running out of swap.
+     */
+    avail = swap_pager_avail - vm_paging_target() - req_pages;
+    if (avail < tmpfs_swap_reserved) {    /* avail is signed */
+        printf("tmpfs: low memory: available %jd, "
+            "paging target %d, requested %zd\n",
+            (intmax_t)swap_pager_avail, vm_paging_target(), req_pages);
+        return (0);
+    }
+
+    return (1);
+}
+
  /* 
--------------------------------------------------------------------- */

  /*
@@ -95,6 +150,8 @@

      if (tmp->tm_nodes_inuse > tmp->tm_nodes_max)
          return (ENOSPC);
+    if (tmpfs_pages_avail(tmp, 1) == 0)
+        return (ENOSPC);

      nnode = (struct tmpfs_node *)uma_zalloc_arg(
                  tmp->tm_node_pool, tmp, M_WAITOK);
@@ -882,7 +939,7 @@
      newpages = round_page(newsize) / PAGE_SIZE;

      if (newpages > oldpages &&
-        newpages - oldpages > TMPFS_PAGES_AVAIL(tmp)) {
+        tmpfs_pages_avail(tmp, newpages - oldpages) == 0) {
          error = ENOSPC;
          goto out;
      }


/sys/fs/tmpfs/tmpfs_vfsops.c
===================================================================
--- tmpfs_vfsops.c.orig    2010-12-21 18:09:00.000000000 +0100 (v 
1.21.2.1.6.1)
+++ tmpfs_vfsops.c    2011-10-07 14:10:15.137747000 +0200     (working copy)
@@ -85,53 +85,6 @@

  #define SWI_MAXMIB    3

-static u_int
-get_swpgtotal(void)
-{
-    struct xswdev xsd;
-    char *sname = "vm.swap_info";
-    int soid[SWI_MAXMIB], oid[2];
-    u_int unswdev, total, dmmax, nswapdev;
-    size_t mibi, len;
-
-    total = 0;
-
-    len = sizeof(dmmax);
-    if (kernel_sysctlbyname(curthread, "vm.dmmax", &dmmax, &len,
-                NULL, 0, NULL, 0) != 0)
-        return total;
-
-    len = sizeof(nswapdev);
-    if (kernel_sysctlbyname(curthread, "vm.nswapdev",
- &nswapdev, &len,
-                NULL, 0, NULL, 0) != 0)
-        return total;
-
-    mibi = (SWI_MAXMIB - 1) * sizeof(int);
-    oid[0] = 0;
-    oid[1] = 3;
-
-    if (kernel_sysctl(curthread, oid, 2,
-            soid, &mibi, (void *)sname, strlen(sname),
-            NULL, 0) != 0)
-        return total;
-
-    mibi = (SWI_MAXMIB - 1);
-    for (unswdev = 0; unswdev < nswapdev; ++unswdev) {
-        soid[mibi] = unswdev;
-        len = sizeof(struct xswdev);
-        if (kernel_sysctl(curthread,
-                soid, mibi + 1, &xsd, &len, NULL, 0,
-                NULL, 0) != 0)
-            return total;
-        if (len == sizeof(struct xswdev))
-            total += (xsd.xsw_nblks - dmmax);
-    }
-
-    /* Not Reached */
-    return total;
-}
-
  /* 
--------------------------------------------------------------------- */
  static int
  tmpfs_node_ctor(void *mem, int size, void *arg, int flags)
@@ -179,14 +132,13 @@
  static int
  tmpfs_mount(struct mount *mp)
  {
+    const size_t nodes_per_page = howmany(PAGE_SIZE,
+        sizeof(struct tmpfs_dirent) + sizeof(struct tmpfs_node));
      struct tmpfs_mount *tmp;
      struct tmpfs_node *root;
-    size_t pages, mem_size;
-    ino_t nodes;
+    u_quad_t pages;
+    u_quad_t nodes_max, size_max, maxfilesize;
      int error;
-    /* Size counters. */
-    ino_t    nodes_max;
-    size_t    size_max;

      /* Root node attributes. */
      uid_t    root_uid;
@@ -223,42 +175,55 @@
      if (mp->mnt_cred->cr_ruid != 0 ||
          vfs_scanopt(mp->mnt_optnew, "mode", "%ho", &root_mode) != 1)
          root_mode = va.va_mode;
-    if (vfs_scanopt(mp->mnt_optnew, "inodes", "%d", &nodes_max) != 1)
+    if (vfs_scanopt(mp->mnt_optnew, "inodes", "%qu", &nodes_max) != 1)
          nodes_max = 0;
      if (vfs_scanopt(mp->mnt_optnew, "size", "%qu", &size_max) != 1)
          size_max = 0;
-
-    /* Do not allow mounts if we do not have enough memory to preserve
-     * the minimum reserved pages. */
-    mem_size = cnt.v_free_count + cnt.v_inactive_count + get_swpgtotal();
-    mem_size -= mem_size > cnt.v_wire_count ? cnt.v_wire_count : mem_size;
-    if (mem_size < TMPFS_PAGES_RESERVED)
+    if (vfs_scanopt(mp->mnt_optnew, "maxfilesize", "%qu", &maxfilesize) 
!= 0)
+        maxfilesize = 0;
+    /*
+     * XXX Deny mounts if pagedaemon wasn't able to recovery desired
+     * number of pages.
+     */
+    if (vm_page_count_target())
          return ENOSPC;

      /* Get the maximum number of memory pages this file system is
       * allowed to use, based on the maximum size the user passed in
-     * the mount structure.  A value of zero is treated as if the
-     * maximum available space was requested. */
-    if (size_max < PAGE_SIZE || size_max >= SIZE_MAX)
-        pages = SIZE_MAX;
+     * the mount structure. Use half of RAM by default. */
+    if (size_max < PAGE_SIZE*4 || size_max > SIZE_MAX - PAGE_SIZE)
+        pages = cnt.v_page_count / 2;
      else
          pages = howmany(size_max, PAGE_SIZE);
      MPASS(pages > 0);
+    MPASS(pages < SIZE_MAX);

-    if (nodes_max <= 3)
-        nodes = 3 + pages * PAGE_SIZE / 1024;
+    if (pages < SIZE_MAX / PAGE_SIZE)
+        size_max = pages * PAGE_SIZE;
      else
-        nodes = nodes_max;
-    MPASS(nodes >= 3);
+        size_max = SIZE_MAX;
+
+    if (nodes_max <= 3) {
+        if (pages < UINT32_MAX / nodes_per_page)
+            nodes_max = pages * nodes_per_page;
+         else
+            nodes_max = UINT32_MAX;
+    }
+    if (nodes_max > UINT32_MAX)
+        nodes_max = UINT32_MAX;
+    MPASS(nodes_max >= 3);
+
+    if (maxfilesize < PAGE_SIZE || maxfilesize > size_max)
+        maxfilesize = size_max;

      /* Allocate the tmpfs mount structure and fill it. */
      tmp = (struct tmpfs_mount *)malloc(sizeof(struct tmpfs_mount),
          M_TMPFSMNT, M_WAITOK | M_ZERO);

      mtx_init(&tmp->allnode_lock, "tmpfs allnode lock", NULL, MTX_DEF);
-    tmp->tm_nodes_max = nodes;
+    tmp->tm_nodes_max = nodes_max;
      tmp->tm_nodes_inuse = 0;
-    tmp->tm_maxfilesize = (u_int64_t)(cnt.v_page_count + 
get_swpgtotal()) * PAGE_SIZE;
+    tmp->tm_maxfilesize = maxfilesize;
      LIST_INIT(&tmp->tm_nodes_used);

      tmp->tm_pages_max = pages;
@@ -427,22 +392,23 @@
  static int
  tmpfs_statfs(struct mount *mp, struct statfs *sbp)
  {
-    fsfilcnt_t freenodes;
      struct tmpfs_mount *tmp;
+    size_t used;

      tmp = VFS_TO_TMPFS(mp);

      sbp->f_iosize = PAGE_SIZE;
      sbp->f_bsize = PAGE_SIZE;

-    sbp->f_blocks = TMPFS_PAGES_MAX(tmp);
-    sbp->f_bavail = sbp->f_bfree = TMPFS_PAGES_AVAIL(tmp);
-
-    freenodes = MIN(tmp->tm_nodes_max - tmp->tm_nodes_inuse,
-        TMPFS_PAGES_AVAIL(tmp) * PAGE_SIZE / sizeof(struct tmpfs_node));
-
-    sbp->f_files = freenodes + tmp->tm_nodes_inuse;
-    sbp->f_ffree = freenodes;
+    sbp->f_blocks = tmpfs_pages_max(tmp);
+    used = tmpfs_pages_used(tmp);
+    if (tmpfs_pages_max(tmp) <= used)
+        sbp->f_bavail = 0;
+    else
+        sbp->f_bavail = tmpfs_pages_max(tmp) - used;
+    sbp->f_bfree = sbp->f_bavail;
+    sbp->f_files = tmp->tm_nodes_max;
+    sbp->f_ffree = tmp->tm_nodes_max - tmp->tm_nodes_inuse;
      /* sbp->f_owner = tmp->tn_uid; */

      return 0;




More information about the freebsd-fs mailing list