mmap() question

Konstantin Belousov kostikbel at gmail.com
Wed Oct 8 17:01:59 UTC 2014


On Wed, Oct 08, 2014 at 07:14:45PM +0400, Dmitry Sivachenko wrote:
> On 12 Oct 2013, at 13:59, Konstantin Belousov <kostikbel at gmail.com> wrote:
> > 
> > I was not able to reproduce the situation locally. I even tried to start
> > a lot of threads accessing the mapped regions, to try to outrun the
> > pagedaemon. The user threads sleep on the disk read, while the pagedaemon
> > has a lot of time to rebalance the queues. It might be a case where an SSD
> > indeed makes a difference.
> > 
> > Still, I see how this situation could appear. The code which triggers
> > OOM never fires if there is free space in the swap, so the
> > absence of swap is a necessary condition to trigger the bug.  Next, the OOM
> > calculation does not account for the possibility that almost all pages on
> > the queues can be reused. It just fires if free pages are depleted too
> > much or the free target cannot be reached.
> > 
> > IMO one of the possible solutions is to account for the queued pages in
> > addition to the swap space.  This is not entirely accurate, since some
> > pages on the queues cannot be reused, at least transiently.  The most
> > precise algorithm would count the held and busy pages globally and
> > subtract this count from the queue lengths, but that is probably too costly.
> > 
> > Instead, I think we could rely on the numbers which are counted by
> > pagedaemon threads during the passes.  Due to the transient nature of the
> > pagedaemon failures, this should be fine.
> > 
> > Below is the prototype patch, against HEAD.  It does not apply to
> > stable; please use a HEAD kernel for testing.
> 
> 
> 
> Hello,
> 
> Any chance to commit this patch?
> 
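
For reference, the earlier idea could be restated in rough code along the
lines of the sketch below.  This is an illustration only: the helper and the
queue comparison are made up, the swap_pager_avail < 64 test is the existing
one from vm_pageout_mightbe_oom(), and the queue counters are the ones
vm_pageout.c already maintains.

static boolean_t
vm_pageout_queues_exhausted(struct vm_domain *vmd)
{
	long queued;

	/* Pages still sitting on the queues could be laundered or reused. */
	queued = vmd->vmd_pagequeues[PQ_INACTIVE].pq_cnt +
	    vmd->vmd_pagequeues[PQ_ACTIVE].pq_cnt;

	/* Only consider OOM when both swap and the queues are depleted. */
	return (swap_pager_avail < 64 && queued < vm_pageout_page_count);
}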

I wrote something different after a discussion with Alan; instead, it
measures the pagedaemon's progress.  Unfortunately, I have some issues with
auto-tuning the detection.  Could you try the patch below?  Possibly, you
would need to change vm.pageout_oom_seq (either lower or raise it, depending
on the load) to get the tuning right.  The bigger the value, the longer the
pagedaemon tries to make progress before declaring OOM.
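
With the patch applied, the threshold is an ordinary read-write sysctl, so
it can be adjusted on a running system, e.g. "sysctl vm.pageout_oom_seq=120"
(120 is only an example value).  The default of 24 means that the pagedaemon
must complete 24 consecutive scans with a page shortage and no progress
before the OOM killer is even considered; smaller values make OOM fire
sooner.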

diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index f12b76c..251cf7b 100644
--- a/sys/vm/vm_page.h
+++ b/sys/vm/vm_page.h
@@ -227,6 +227,7 @@ struct vm_domain {
 	long vmd_segs;	/* bitmask of the segments */
 	boolean_t vmd_oom;
 	int vmd_pass;	/* local pagedaemon pass */
+	int vmd_oom_seq;
 	struct vm_page vmd_marker; /* marker for pagedaemon private use */
 };
 
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index ca9d7f9..aa23eb8 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -120,7 +120,8 @@ static void vm_pageout(void);
 static void vm_pageout_init(void);
 static int vm_pageout_clean(vm_page_t);
 static void vm_pageout_scan(struct vm_domain *vmd, int pass);
-static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass);
+static void vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
+    int starting_page_shortage);
 
 SYSINIT(pagedaemon_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, vm_pageout_init,
     NULL);
@@ -157,6 +158,7 @@ int vm_pages_needed;		/* Event on which pageout daemon sleeps */
 int vm_pageout_deficit;		/* Estimated number of pages deficit */
 int vm_pageout_pages_needed;	/* flag saying that the pageout daemon needs pages */
 int vm_pageout_wakeup_thresh;
+static int vm_pageout_oom_seq = 24;
 
 #if !defined(NO_SWAPPING)
 static int vm_pageout_req_swapout;	/* XXX */
@@ -216,6 +218,10 @@ static int pageout_lock_miss;
 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
 	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");
 
+SYSCTL_INT(_vm, OID_AUTO, pageout_oom_seq,
+	CTLFLAG_RW, &vm_pageout_oom_seq, 0,
+	"back-to-back calls to oom detector to start OOM",
+
 #define VM_PAGEOUT_PAGE_COUNT 16
 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;
 
@@ -912,7 +918,8 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 	vm_page_t m, next;
 	struct vm_pagequeue *pq;
 	vm_object_t object;
-	int act_delta, addl_page_shortage, deficit, maxscan, page_shortage;
+	int act_delta, addl_page_shortage, deficit, maxscan;
+	int page_shortage, starting_page_shortage;
 	int vnodes_skipped = 0;
 	int maxlaunder;
 	int lockmode;
@@ -954,6 +961,7 @@ vm_pageout_scan(struct vm_domain *vmd, int pass)
 		page_shortage = vm_paging_target() + deficit;
 	} else
 		page_shortage = deficit = 0;
+	starting_page_shortage = page_shortage;
 
 	/*
 	 * maxlaunder limits the number of dirty pages we flush per scan.
@@ -1329,6 +1337,15 @@ relock_queues:
 		(void)speedup_syncer();
 
 	/*
+	 * If we are critically low on one of RAM or swap and low on
+	 * the other, kill the largest process.  However, we avoid
+	 * doing this on the first pass in order to give ourselves a
+	 * chance to flush out dirty vnode-backed pages and to allow
+	 * active pages to be moved to the inactive queue and reclaimed.
+	 */
+	vm_pageout_mightbe_oom(vmd, page_shortage, starting_page_shortage);
+
+	/*
 	 * Compute the number of pages we want to try to move from the
 	 * active queue to the inactive queue.
 	 */
@@ -1437,15 +1454,6 @@ relock_queues:
 		}
 	}
 #endif
-
-	/*
-	 * If we are critically low on one of RAM or swap and low on
-	 * the other, kill the largest process.  However, we avoid
-	 * doing this on the first pass in order to give ourselves a
-	 * chance to flush out dirty vnode-backed pages and to allow
-	 * active pages to be moved to the inactive queue and reclaimed.
-	 */
-	vm_pageout_mightbe_oom(vmd, pass);
 }
 
 static int vm_pageout_oom_vote;
@@ -1456,18 +1464,36 @@ static int vm_pageout_oom_vote;
  * failed to reach free target is premature.
  */
 static void
-vm_pageout_mightbe_oom(struct vm_domain *vmd, int pass)
+vm_pageout_mightbe_oom(struct vm_domain *vmd, int page_shortage,
+    int starting_page_shortage)
 {
 	int old_vote;
 
-	if (pass <= 1 || !((swap_pager_avail < 64 && vm_page_count_min()) ||
-	    (swap_pager_full && vm_paging_target() > 0))) {
+	if (starting_page_shortage <= 0 || starting_page_shortage !=
+	    page_shortage) {
+#if 0
+		if (vmd->vmd_oom_seq != 0)
+			printf("CLR oom_seq %d ps %d sps %d\n", vmd->vmd_oom_seq, page_shortage, starting_page_shortage);
+#endif
+		vmd->vmd_oom_seq = 0;
+	} else
+		vmd->vmd_oom_seq++;
+	if (vmd->vmd_oom_seq < vm_pageout_oom_seq) {
 		if (vmd->vmd_oom) {
 			vmd->vmd_oom = FALSE;
 			atomic_subtract_int(&vm_pageout_oom_vote, 1);
 		}
 		return;
 	}
+#if 0
+printf("OOM oom_seq %d ps %d sps %d\n", vmd->vmd_oom_seq, page_shortage, starting_page_shortage);
+#endif
+
+	/*
+	 * Do not follow the call sequence until OOM condition is
+	 * cleared.
+	 */
+	vmd->vmd_oom_seq = 0;
 
 	if (vmd->vmd_oom)
 		return;

