svn commit: r183918 - in user/netchild/linuxaio/src/sys: amd64/conf amd64/linux32 compat compat/linux conf i386/conf i386/linux kern modules modules/aio modules/linuxaio pc98/conf sys

Alexander Leidinger netchild at FreeBSD.org
Wed Oct 15 14:42:35 UTC 2008


Author: netchild
Date: Wed Oct 15 14:42:34 2008
New Revision: 183918
URL: http://svn.freebsd.org/changeset/base/183918

Log:
  The most recent version (not from this year...) of Intron's Linux AIO
  compatibility shim.
  
  The major flaws of the previous version should be fixed, but this needs
  to be verified.
  
  Submitted by:	Li, Xiao <intron at intron.ac>

Added:
  user/netchild/linuxaio/src/sys/compat/
  user/netchild/linuxaio/src/sys/compat/linux/
  user/netchild/linuxaio/src/sys/compat/linux/linux_aio.c   (contents, props changed)
  user/netchild/linuxaio/src/sys/compat/linux/linux_aio.h   (contents, props changed)
  user/netchild/linuxaio/src/sys/modules/linuxaio/
  user/netchild/linuxaio/src/sys/modules/linuxaio/Makefile   (contents, props changed)
Modified:
  user/netchild/linuxaio/src/sys/amd64/conf/NOTES
  user/netchild/linuxaio/src/sys/amd64/linux32/linux.h
  user/netchild/linuxaio/src/sys/amd64/linux32/linux32_dummy.c
  user/netchild/linuxaio/src/sys/amd64/linux32/syscalls.master
  user/netchild/linuxaio/src/sys/conf/files.i386
  user/netchild/linuxaio/src/sys/conf/files.pc98
  user/netchild/linuxaio/src/sys/conf/options.amd64
  user/netchild/linuxaio/src/sys/conf/options.i386
  user/netchild/linuxaio/src/sys/conf/options.pc98
  user/netchild/linuxaio/src/sys/i386/conf/NOTES
  user/netchild/linuxaio/src/sys/i386/linux/linux.h
  user/netchild/linuxaio/src/sys/i386/linux/linux_dummy.c
  user/netchild/linuxaio/src/sys/i386/linux/syscalls.master
  user/netchild/linuxaio/src/sys/kern/vfs_aio.c
  user/netchild/linuxaio/src/sys/modules/Makefile
  user/netchild/linuxaio/src/sys/modules/aio/Makefile
  user/netchild/linuxaio/src/sys/pc98/conf/NOTES
  user/netchild/linuxaio/src/sys/sys/aio.h

Modified: user/netchild/linuxaio/src/sys/amd64/conf/NOTES
==============================================================================
--- user/netchild/linuxaio/src/sys/amd64/conf/NOTES	Wed Oct 15 14:38:35 2008	(r183917)
+++ user/netchild/linuxaio/src/sys/amd64/conf/NOTES	Wed Oct 15 14:42:34 2008	(r183918)
@@ -443,16 +443,19 @@ options 	COMPAT_IA32
 # Enable Linux ABI emulation
 #XXX#options 	COMPAT_LINUX
 
-# Enable 32-bit Linux ABI emulation (requires COMPAT_43 and COMPAT_IA32)
+# Enable 32-bit Linux ABI emulation (requires COMPAT_IA32)
 options 	COMPAT_LINUX32
 
 # Enable the linux-like proc filesystem support (requires COMPAT_LINUX32
 # and PSEUDOFS)
 options 	LINPROCFS
 
-#Enable the linux-like sys filesystem support (requires COMPAT_LINUX32
+# Enable the linux-like sys filesystem support (requires COMPAT_LINUX32
 # and PSEUDOFS)
-options		LINSYSFS
+options 	LINSYSFS
+
+# Enable the linux aio support (requires COMPAT_LINUX32 and VFS_AIO)
+options 	LINUXAIO
 
 #
 # SysVR4 ABI emulation

Modified: user/netchild/linuxaio/src/sys/amd64/linux32/linux.h
==============================================================================
--- user/netchild/linuxaio/src/sys/amd64/linux32/linux.h	Wed Oct 15 14:38:35 2008	(r183917)
+++ user/netchild/linuxaio/src/sys/amd64/linux32/linux.h	Wed Oct 15 14:42:34 2008	(r183918)
@@ -880,6 +880,8 @@ typedef int l_mqd_t;
 	(LINUX_CLONE_VM | LINUX_CLONE_FS | LINUX_CLONE_FILES |	\
 	LINUX_CLONE_SIGHAND | LINUX_CLONE_THREAD)
 
+#include <compat/linux/linux_aio.h>
+
 /* robust futexes */
 struct linux_robust_list {
 	l_uintptr_t			next;

Modified: user/netchild/linuxaio/src/sys/amd64/linux32/linux32_dummy.c
==============================================================================
--- user/netchild/linuxaio/src/sys/amd64/linux32/linux32_dummy.c	Wed Oct 15 14:38:35 2008	(r183917)
+++ user/netchild/linuxaio/src/sys/amd64/linux32/linux32_dummy.c	Wed Oct 15 14:42:34 2008	(r183918)
@@ -98,6 +98,11 @@ DUMMY(migrate_pages);
 DUMMY(pselect6);
 DUMMY(ppoll);
 DUMMY(unshare);
+DUMMY(io_setup);
+DUMMY(io_destroy);
+DUMMY(io_getevents);
+DUMMY(io_submit);
+DUMMY(io_cancel);
 DUMMY(splice);
 DUMMY(sync_file_range);
 DUMMY(tee);

Modified: user/netchild/linuxaio/src/sys/amd64/linux32/syscalls.master
==============================================================================
--- user/netchild/linuxaio/src/sys/amd64/linux32/syscalls.master	Wed Oct 15 14:38:35 2008	(r183917)
+++ user/netchild/linuxaio/src/sys/amd64/linux32/syscalls.master	Wed Oct 15 14:42:34 2008	(r183918)
@@ -413,11 +413,11 @@
 					l_ulong *user_mask_ptr); }
 243	AUE_NULL	STD	{ int linux_set_thread_area(struct l_user_desc *desc); }
 244	AUE_NULL	UNIMPL	linux_get_thread_area
-245	AUE_NULL	UNIMPL	linux_io_setup
-246	AUE_NULL	UNIMPL	linux_io_destroy
-247	AUE_NULL	UNIMPL	linux_io_getevents
-248	AUE_NULL	UNIMPL	inux_io_submit
-249	AUE_NULL	UNIMPL	linux_io_cancel
+245	AUE_NULL	STD	{ int linux_io_setup(l_uint nr_reqs, linux_aio_context_t *ctxp); }
+246	AUE_NULL	STD	{ int linux_io_destroy(linux_aio_context_t ctx); }
+247	AUE_NULL	STD	{ int linux_io_getevents(linux_aio_context_t ctx_id, l_long min_nr, l_long nr, struct linux_io_event *events, struct l_timespec *timeout); }
+248	AUE_NULL	STD	{ int linux_io_submit(linux_aio_context_t ctx_id, l_long nr, struct linux_iocb **iocbpp); }
+249	AUE_NULL	STD	{ int linux_io_cancel(linux_aio_context_t ctx_id, struct linux_iocb *iocb, struct linux_io_event *result); }
 250	AUE_NULL	STD	{ int linux_fadvise64(void); }
 251	AUE_NULL	UNIMPL
 252	AUE_EXIT	STD	{ int linux_exit_group(int error_code); }

Added: user/netchild/linuxaio/src/sys/compat/linux/linux_aio.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ user/netchild/linuxaio/src/sys/compat/linux/linux_aio.c	Wed Oct 15 14:42:34 2008	(r183918)
@@ -0,0 +1,1254 @@
+/*-
+ * Copyright (c) 2006 Li, Xiao <intron at intron.ac>.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_compat.h"
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysent.h>
+#include <sys/proc.h>
+#include <sys/errno.h>
+#include <sys/eventhandler.h>
+#include <sys/aio.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <sys/queue.h>
+#include <vm/uma.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#include <sys/sysproto.h>
+
+#ifdef COMPAT_LINUX32
+#include <machine/../linux32/linux.h>
+#include <machine/../linux32/linux32_proto.h>
+#else
+#include <machine/../linux/linux.h>
+#include <machine/../linux/linux_proto.h>
+#endif
+
+#define	LINUX_AIO_DEBUG
+
+/*
+ * Linux Kernel Implementation of Asynchronous I/O
+ */
+
+#ifdef	LINUX_AIO_DEBUG
+
+/* Print arguments of syscall */
+#define	DARGPRINTF(fmt, ...)	printf("linux(%ld): %s("fmt")\n",	\
+	(long)td->td_proc->p_pid, __func__, __VA_ARGS__)
+/* Print message in syscall function */
+#define	DPRINTF(fmt, ...)	printf(LMSG("%s(): " fmt),		\
+	__func__, __VA_ARGS__)
+/* Print message in non-syscall function, the one more "P" means "private" */
+#define	DPPRINTF(fmt, ...)	printf("linux(): %s(): " fmt "\n",	\
+	__func__, __VA_ARGS__)
+
+#else
+
+#define	DARGPRINTF(fmt, ...)
+#define	DPRINTF(fmt, ...)
+#define	DPPRINTF(fmt, ...)
+
+#endif
+
+/*
+ *                             DATA STRUCTURE HIERARCHY
+ *
+ *                   +--------------------+      +--------------------+
+ * context_list ---> |       context      | ---> |       context      | ---> ...
+ *             SLIST |(owned by a process)|      |(owned by a process)|
+ *                   |                    |      |                    |
+ *                   | ctx_req            |      | ctx_req            |
+ *                   +----|---------------+      +----|---------------+
+ *                        |  STAILQ                   |  STAILQ
+ *                        v                           v
+ *                    +------------+              +------------+
+ *                    |   request  |              |   request  |
+ *                    |            |              |            |
+ *                    |.req_pbsd   |              |.req_pbsd   |
+ *                    |.req_porig  |              |.req_porig  |
+ *                    |.req_linux  |              |.req_linux  |
+ *                    |            |              |            |
+ *                    +------------+              +------------+
+ *                        |                           |
+ *                        v                           v
+ *                    +------------+              +------------+
+ *                    |   request  |              |   request  |
+ *                    |            |              |            |
+ *                    |.req_pbsd   |              |.req_pbsd   |
+ *                    |.req_porig  |              |.req_porig  |
+ *                    |.req_linux  |              |.req_linux  |
+ *                    |            |              |            |
+ *                    +------------+              +------------+
+ *                        |                           |
+ *                        v                           v
+ *                       ...                         ...
+ */
+
+struct linux_aio_context;
+
+/*
+ * One queued Linux AIO request.  Each request is linked into the ctx_req
+ * queue of the context it belongs to (see the hierarchy diagram above).
+ */
+struct linux_aio_request {
+	struct aiocb        *req_pbsd;  /* Userland clone for FreeBSD */
+	struct linux_iocb   *req_porig; /* Userland original control block */
+	struct linux_iocb   req_linux;  /* Copy of original control block */
+	STAILQ_ENTRY(linux_aio_request)	req_ctx_entry; /* Linkage in ctx_req */
+};
+
+/*
+ * One Linux AIO context as created by linux_io_setup().  All contexts of
+ * all processes live on the global linux_aio_context_list.
+ */
+struct linux_aio_context {
+	struct sx	ctx_sx;       /* Taken via LINUX_AIO_CTX_LOCK() */
+	pid_t		ctx_pid;      /* PID of the process that created it */
+	struct linux_aio_ring *ctx_pring; /*
+					   * User-space "ring"; its address
+					   * doubles as the Linux context id
+					   * (see LINUX_AIO_CTX_MATCH)
+					   */
+	int		ctx_nreq_max; /* Maximum request number */
+	int		ctx_nreq_cur; /* Current request number */
+	STAILQ_HEAD(,linux_aio_request)	ctx_req; /* Queued requests */
+	SLIST_ENTRY(linux_aio_context) ctx_list_entry; /* Global list linkage */
+};
+static SLIST_HEAD(,linux_aio_context) linux_aio_context_list;
+
+/*
+ * Append a request to the tail of a context's request queue and increment
+ * the context's current request count.
+ *
+ * NOTE: this expands to a bare brace block rather than do { } while (0),
+ * so "MACRO(...);" is a block followed by an empty statement.  Do not use
+ * it as the unbraced body of an "if" that has an "else".
+ */
+#define	LINUX_AIO_REQ_HOOK(pctx, preq)		{			\
+	STAILQ_INSERT_TAIL(&((pctx)->ctx_req), (preq), req_ctx_entry);	\
+	(pctx)->ctx_nreq_cur ++;					\
+}
+
+/*
+ * Remove a request from a context's request queue and decrement the
+ * context's current request count.  The same brace-block caveat as for
+ * LINUX_AIO_REQ_HOOK applies.
+ */
+#define	LINUX_AIO_REQ_UNHOOK(pctx, preq) 	{			\
+	STAILQ_REMOVE(&((pctx)->ctx_req), (preq), linux_aio_request,	\
+			req_ctx_entry);					\
+	(pctx)->ctx_nreq_cur --;					\
+}
+
+#define	LINUX_AIO_REQ_FOREACH(pctx, preq)				\
+	STAILQ_FOREACH((preq), &((pctx)->ctx_req), req_ctx_entry)
+
+#define	LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq)			\
+	STAILQ_FOREACH_SAFE((preq), &((pctx)->ctx_req), req_ctx_entry,	\
+			(ptmpreq))
+
+#define	LINUX_AIO_CTX_LOCK(pctx)	sx_xlock(&((pctx)->ctx_sx))
+
+#define	LINUX_AIO_CTX_UNLOCK(pctx)	sx_unlock(&((pctx)->ctx_sx))
+
+#define	LINUX_AIO_CTX_HOOK(pctx)					\
+	SLIST_INSERT_HEAD(&linux_aio_context_list, (pctx), ctx_list_entry)
+
+#define	LINUX_AIO_CTX_UNHOOK(pctx)					\
+	SLIST_REMOVE(&linux_aio_context_list, (pctx),			\
+			linux_aio_context, ctx_list_entry)
+
+#define	LINUX_AIO_CTX_FOREACH(pctx)					\
+	SLIST_FOREACH((pctx), &linux_aio_context_list, ctx_list_entry)
+
+#define	LINUX_AIO_CTX_FOREACH_SAFE(pctx, ptmpctx)			\
+	SLIST_FOREACH_SAFE((pctx), &linux_aio_context_list,		\
+			ctx_list_entry, (ptmpctx))
+
+#define	LINUX_AIO_CTX_MATCH(pctx, ctxid, pid)				\
+	((linux_aio_context_t)(pctx)->ctx_pring == (ctxid)		\
+		&& (pctx)->ctx_pid == (pid))
+
+static struct mtx linux_aio_context_list_mtx;
+
+#define	LINUX_AIO_CTX_LIST_LOCK()	mtx_lock(&linux_aio_context_list_mtx)
+
+#define	LINUX_AIO_CTX_LIST_UNLOCK()	mtx_unlock(&linux_aio_context_list_mtx)
+
+/*
+ * The following two macros are substantially identical to the two macros
+ * AIO_(UN)LOCK in /sys/kern/vfs_aio.c. Thus, the mutex must be unlocked
+ * before calling functions of FreeBSD native AIO module.
+ *
+ * XXX
+ * I ASSUME the member "kaio_mtx" is the first element of "struct kaioinfo".
+ */
+#define	LINUX_AIO_LOCK(p)	{					\
+	if ((p)->p_aioinfo == NULL)					\
+		aio_init_aioinfo(p);					\
+	mtx_lock((struct mtx *)((p)->p_aioinfo));			\
+}
+
+#define	LINUX_AIO_UNLOCK(p)	{					\
+	if ((p)->p_aioinfo == NULL)					\
+		aio_init_aioinfo(p);					\
+	mtx_unlock((struct mtx *)((p)->p_aioinfo));			\
+}
+
+static uma_zone_t linux_aio_context_zone, linux_aio_request_zone;
+
+static eventhandler_tag linux_aio_exit_tag;
+
+/*
+ * To backup pointers to the dummy implementation of these
+ * system calls faked by the macro DUMMY() in linux_dummy.c.
+ */
+#define	PREPARE_DUMMY_SYSCALL_BACKUP(s)					\
+	static sy_call_t *p_dummy_linux_ ## s
+#define	SHOW_REAL_SYSCALL(s)	{					\
+	p_dummy_linux_ ## s = linux_sysent[LINUX_SYS_linux_ ## s].sy_call; \
+	linux_sysent[LINUX_SYS_linux_ ## s].sy_call = 			\
+		(sy_call_t *)(linux_ ## s);				\
+}
+/*
+ * The concept of "scope": the functions linux_io_xxx defined in this file
+ * always mask/screen/override/prevent homonymous functions defined in
+ * any other files.
+ */
+#define	RESTORE_DUMMY_SYSCALL(s)	{				\
+	linux_sysent[LINUX_SYS_linux_ ## s].sy_call = p_dummy_linux_ ## s; \
+}
+
+PREPARE_DUMMY_SYSCALL_BACKUP(io_setup);
+PREPARE_DUMMY_SYSCALL_BACKUP(io_destroy);
+PREPARE_DUMMY_SYSCALL_BACKUP(io_getevents);
+PREPARE_DUMMY_SYSCALL_BACKUP(io_submit);
+PREPARE_DUMMY_SYSCALL_BACKUP(io_cancel);
+
+/*
+ * Substantially defined in linux_sysent.c.
+ * Also declared in linux_sysvec.c.
+ */
+extern struct sysent linux_sysent[LINUX_SYS_MAXSYSCALL];
+
+/*
+ * Check that the user-space range [p, p + s) can be both read and written.
+ * The range is walked in chunks: each chunk is copied into a kernel bounce
+ * buffer and then copied straight back to the same user address.
+ *
+ * Returns 0 if every chunk round-trips, otherwise the first error reported
+ * by copyin() or copyout().  A zero-length range is trivially valid.
+ */
+static int user_mem_rw_verify(void *p, size_t s)
+{
+	char chunk[256];
+	char *uaddr = (char *)p;
+	size_t off, len;
+	int error;
+
+	for (off = 0; off < s; off += sizeof(chunk)) {
+		len = MIN(sizeof(chunk), s - off);
+
+		/* Readable? */
+		error = copyin(uaddr + off, chunk, len);
+		if (error != 0)
+			return (error);
+
+		/* Writable?  Write back exactly what was just read. */
+		error = copyout(chunk, uaddr + off, len);
+		if (error != 0)
+			return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * Allocate s bytes in the address space of the calling process by issuing
+ * an anonymous, private, read/write mmap() on its behalf.
+ *
+ * On success the new user-space address is stored in *pp and 0 is returned;
+ * otherwise the mmap() error is returned and *pp is left untouched.
+ * td->td_retval[0] is saved and restored around the call, so the value the
+ * enclosing syscall has placed there is not disturbed.
+ */
+static int user_malloc(struct thread *td, void **pp, size_t s)
+{
+	struct mmap_args uap;
+	register_t saved_retval;
+	int error;
+
+	saved_retval = td->td_retval[0];
+
+	uap.addr = NULL;
+	uap.len = s;
+	uap.prot = PROT_READ | PROT_WRITE;
+	uap.flags = MAP_PRIVATE | MAP_ANON;
+	uap.fd = -1;
+	uap.pad = 0;
+	uap.pos = 0;
+
+	error = mmap(td, &uap);
+	if (error == 0) {
+		/* mmap() hands the mapped address back in td_retval[0]. */
+		*pp = (void *)td->td_retval[0];
+		DPPRINTF("%lu bytes allocated at %p", (unsigned long)s, *pp);
+	}
+
+	td->td_retval[0] = saved_retval;
+
+	return (error);
+}
+
+/*
+ * Release s bytes at user-space address p (as obtained from user_malloc())
+ * by issuing munmap() on behalf of the calling process.
+ *
+ * Returns the munmap() result.  td->td_retval[0] is saved and restored
+ * around the call.
+ */
+static int user_free(struct thread *td, void *p, size_t s)
+{
+	struct munmap_args uap;
+	register_t saved_retval;
+	int error;
+
+	saved_retval = td->td_retval[0];
+
+	uap.addr = p;
+	uap.len = s;
+	error = munmap(td, &uap);
+
+	td->td_retval[0] = saved_retval;
+	DPPRINTF("%lu bytes at %p", (unsigned long)s, p);
+
+	return (error);
+}
+
+#ifdef	LINUX_AIO_DEBUG
+
+/*
+ * Debug helper: print the interesting fields of a FreeBSD aiocb.
+ *
+ * piocb      - control block to dump
+ * isuserland - non-zero if piocb is a user-space address that has to be
+ *              fetched with copyin() first; zero if it is directly
+ *              dereferenceable
+ *
+ * If the copyin() fails only the header line (tagged "Failure") is printed.
+ */
+static void linux_aio_dump_freebsd_aiocb(struct aiocb *piocb, int isuserland)
+{
+	struct aiocb kcopy, *src;
+	int error;
+
+	src = piocb;
+	error = 0;
+	if (isuserland) {
+		error = copyin(piocb, &kcopy, sizeof(kcopy));
+		src = &kcopy;
+	}
+
+	DPPRINTF("Dump struct aiocb (%p, %s): %s",
+			piocb, (isuserland?"userland":"kernel"),
+			(error?"Failure":""));
+	if (error)
+		return;
+
+	DPPRINTF("aio_fildes: %d", src->aio_fildes);
+	DPPRINTF("aio_offset: %lu", (unsigned long) src->aio_offset);
+	DPPRINTF("aio_buf: %p", src->aio_buf);
+	DPPRINTF("aio_nbytes: %lu", (unsigned long) src->aio_nbytes);
+	DPPRINTF("aio_lio_opcode: %d", src->aio_lio_opcode);
+	DPPRINTF("aio_reqprio: %d", src->aio_reqprio);
+	DPPRINTF("aio_sigevent.sigev_notify: %d",
+			src->aio_sigevent.sigev_notify);
+	DPPRINTF("aio_sigevent.sigev_signo: %d",
+			src->aio_sigevent.sigev_signo);
+}
+
+#define	DUMP_FREEBSD_AIOCB(p, isu)    linux_aio_dump_freebsd_aiocb((p), (isu));
+
+#define	DUMP_TIMESPEC(f, t ,a)						\
+	DPRINTF("%s%ld second + %ld nanosecond%s",			\
+			(f), (long)(t)->tv_sec, (long)(t)->tv_nsec, (a));
+
+#else /* ! LINUX_AIO_DEBUG */
+
+#define	DUMP_FREEBSD_AIOCB(p, isu)
+#define	DUMP_TIMESPEC(f, t, a)
+
+#endif /* LINUX_AIO_DEBUG */
+
+/*
+ * Translate a Linux I/O control block into a FreeBSD aiocb.
+ *
+ * plnx - kernel copy of the Linux control block (input)
+ * pbsd - FreeBSD control block to fill in (output); it is zeroed first
+ *
+ * Returns 0 on success, or EINVAL if the Linux opcode has no FreeBSD
+ * counterpart.  In the error case *pbsd has been zeroed and only its file
+ * descriptor, offset, buffer and length fields have been filled in.
+ */
+static int iocb_reformat(struct linux_iocb *plnx, struct aiocb *pbsd)
+{
+	bzero(pbsd, sizeof(*pbsd));
+
+	/* Fields that carry over directly */
+	pbsd->aio_fildes = plnx->aio_fildes;
+	pbsd->aio_offset = plnx->aio_offset;
+	pbsd->aio_buf = (void *)(unsigned long) plnx->aio_buf;
+	pbsd->aio_nbytes = plnx->aio_nbytes;
+
+	/* Map the Linux opcode onto the FreeBSD LIO opcode */
+	switch (plnx->aio_lio_opcode) {
+	case LINUX_IOCB_CMD_PREAD:
+		pbsd->aio_lio_opcode = LIO_READ;
+		break;
+	case LINUX_IOCB_CMD_PWRITE:
+		pbsd->aio_lio_opcode = LIO_WRITE;
+		break;
+	case LINUX_IOCB_CMD_FSYNC:
+	case LINUX_IOCB_CMD_FDSYNC:
+		pbsd->aio_lio_opcode = LIO_SYNC;
+		break;
+#if 0
+	case LINUX_IOCB_CMD_PREADX:
+		break;
+	case LINUX_IOCB_CMD_POLL:
+		break;
+#endif
+	case LINUX_IOCB_CMD_NOOP:
+		pbsd->aio_lio_opcode = LIO_NOP;
+		break;
+	default:
+		DPPRINTF("Unsupported aio_lio_opcode: %u",
+				(unsigned)plnx->aio_lio_opcode);
+		return (EINVAL);
+	}
+
+	pbsd->aio_reqprio = plnx->aio_reqprio;
+
+	/* Completion is polled for, so no signal is ever delivered */
+	pbsd->aio_sigevent.sigev_notify = SIGEV_NONE;
+	pbsd->aio_sigevent.sigev_signo = 0;
+
+	return (0);
+}
+
+/*
+ * Linux system call io_setup(2).
+ *
+ * Create a new AIO context able to hold args->nr_reqs requests for the
+ * calling process and store its id through args->ctxp.  The id is the
+ * user-space address of a freshly mapped "ring" header.
+ *
+ * Returns 0 on success; EINVAL if nr_reqs is not positive or *ctxp is not
+ * zero on entry; ENOMEM if the request would exceed the per-process or
+ * system-wide AIO queue limits; or the error from validating/reading
+ * args->ctxp or from mapping the ring.
+ */
+int linux_io_setup(struct thread *td, struct linux_io_setup_args *args)
+{
+	struct proc *p;
+	struct linux_aio_ring *pring, ring;
+	struct linux_aio_context *pctx = NULL, *ptmpctx;
+	linux_aio_context_t ctx_id;
+	int nerr = 0, nr, nrall, nq, arg_nr_reqs;
+
+	DARGPRINTF("%u, %p", args->nr_reqs, args->ctxp);
+
+	/* Signed integer is a little safer than unsigned */
+	arg_nr_reqs = args->nr_reqs;
+	if (arg_nr_reqs <= 0)
+		return (EINVAL);
+
+	if (arg_nr_reqs > max_aio_queue_per_proc
+			|| arg_nr_reqs > max_aio_queue_count) {
+		printf(LMSG("linux_io_setup(): Please increase sysctls "
+			       "vfs.aio.max_aio_queue_per_proc "
+			       "and/or vfs.aio.max_aio_queue. "));
+		return (ENOMEM);
+	}
+
+	nerr = user_mem_rw_verify(args->ctxp, sizeof(*(args->ctxp)));
+	if (nerr != 0)
+		return (nerr);
+
+	/*
+	 * The range was just verified, but the process may have changed its
+	 * mappings in between; never test ctx_id unless it was really read.
+	 */
+	nerr = copyin(args->ctxp, &ctx_id, sizeof(ctx_id));
+	if (nerr != 0)
+		return (nerr);
+	if (ctx_id != 0) /* "Not initialized", described by io_setup(2) */
+		return (EINVAL);
+
+	p = td->td_proc;
+
+	/* Get a new "ring" */
+	nerr = user_malloc(td, (void **)&pring, sizeof(*pring));
+	if (nerr != 0)
+		return (nerr);
+
+	/*
+	 * Get a new context.  This is done before taking the list mutex
+	 * because an M_WAITOK allocation may sleep.
+	 */
+	pctx = uma_zalloc(linux_aio_context_zone, M_WAITOK);
+
+	LINUX_AIO_CTX_LIST_LOCK();
+
+	/* Count request capacity of all contexts belonging to this process */
+	nr = 0;
+	nrall = 0;
+	nq = 0;
+	LINUX_AIO_CTX_FOREACH(ptmpctx) {
+		if (ptmpctx->ctx_pid == p->p_pid) {
+			nr += ptmpctx->ctx_nreq_max;
+			nq ++;
+		}
+		nrall += ptmpctx->ctx_nreq_max;
+	}
+	DPRINTF("%d queues of %d requests totally allocated for this process, "
+			"%d requests' total capacity for the whole system",
+		nq, nr, nrall);
+
+	/* Check whether there are enough resources for requested queue */
+	if (arg_nr_reqs > max_aio_queue_per_proc - nr
+			|| arg_nr_reqs > max_aio_queue_count - nrall) {
+		printf(LMSG("linux_io_setup(): "
+			       "Please increase sysctls "
+			       "vfs.aio.max_aio_queue_per_proc "
+			       "and/or vfs.aio.max_aio_queue. "
+			       "Besides %d queues of %d requests totally "
+			       "for this process, and %d requests' queues "
+			       "totally for the whole system, "
+			       "this Linux application needs one more "
+			       "AIO queue of %d requests' capacity."),
+			nq, nr, nrall, arg_nr_reqs);
+		LINUX_AIO_CTX_LIST_UNLOCK();
+		DPRINTF("Free context %p", pctx);
+		uma_zfree(linux_aio_context_zone, pctx);
+		user_free(td, pring, sizeof(*pring));
+		return (ENOMEM);
+	}
+
+	/* Initialize the new context */
+	sx_init(&(pctx->ctx_sx), "linux_aio_context");
+	pctx->ctx_pid = p->p_pid;
+	pctx->ctx_pring = pring;
+	pctx->ctx_nreq_max = arg_nr_reqs;
+	pctx->ctx_nreq_cur = 0;
+	STAILQ_INIT(&(pctx->ctx_req));
+
+	/* Hook the new context to global context list */
+	LINUX_AIO_CTX_HOOK(pctx);
+
+	LINUX_AIO_CTX_LIST_UNLOCK();
+
+	/* Initialize the new "ring" */
+	DPRINTF("initialize the \"ring\" %p", pring);
+	bzero(&ring, sizeof(ring));
+	ring.ring_id = 1;
+	ring.ring_nr = arg_nr_reqs;
+	ring.ring_head = 0;
+	ring.ring_tail = 1;
+	ring.ring_magic = LINUX_AIO_RING_MAGIC;
+	ring.ring_compat_features = LINUX_AIO_RING_COMPAT_FEATURES;
+	ring.ring_incompat_features = LINUX_AIO_RING_INCOMPAT_FEATURES;
+	ring.ring_header_length = sizeof(ring);
+	copyout(&ring, pring, sizeof(ring)); /* It has been hooked before */
+
+	/* Substantial return value */
+	ctx_id = (linux_aio_context_t)pctx->ctx_pring;
+	copyout(&ctx_id, args->ctxp, sizeof(ctx_id));
+	DPRINTF("returned context: %lx -> %p", (unsigned long)ctx_id, pctx);
+
+	return (nerr);
+}
+
+/*
+ * Linux system call io_destroy(2).
+ *
+ * Tear down the AIO context args->ctx of the calling process: remove it
+ * from the global context list, cancel every request still queued on it,
+ * release the per-request kernel and user-space memory, unmap the ring and
+ * free the context itself.
+ *
+ * Returns 0 on success or EINVAL if the calling process owns no context
+ * with that id.
+ */
+int linux_io_destroy(struct thread *td, struct linux_io_destroy_args *args)
+{
+	int nerr = 0;
+	struct proc *p;
+	struct linux_aio_context *pctx;
+	struct linux_aio_request *preq, *ptmpreq;
+	struct aio_cancel_args cancelargs;
+	struct aio_return_args aioretargs;
+
+	DARGPRINTF("%lx", (unsigned long)args->ctx);
+
+	p = td->td_proc;
+
+	/*
+	 * Locking:
+	 *
+	 * LINUX_AIO_LOCK(p);   <----------------+
+	 * ...                                   |
+	 *     LINUX_AIO_CTX_LIST_LOCK();   <--+ |
+	 *     ...                             | |
+	 *     LINUX_AIO_CTX_LIST_UNLOCK(); <--+ |
+	 * ...                                   |
+	 * LINUX_AIO_CTX_LOCK(pctx);   <---------|---+
+	 * LINUX_AIO_UNLOCK(p); <----------------+   |
+	 * ...                                       |
+	 * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+
+	 */
+
+	LINUX_AIO_LOCK(p);
+
+	/*
+	 * Find the context in the context list and unhook it in the same
+	 * critical section: the global list must not be modified without
+	 * holding its mutex.
+	 */
+	LINUX_AIO_CTX_LIST_LOCK();
+	LINUX_AIO_CTX_FOREACH(pctx) {
+		if (LINUX_AIO_CTX_MATCH(pctx, args->ctx, p->p_pid))
+			break;
+	}
+	if (pctx != NULL)
+		LINUX_AIO_CTX_UNHOOK(pctx);
+	LINUX_AIO_CTX_LIST_UNLOCK();
+
+	/* Unable to find the context */
+	if (pctx == NULL) {
+		LINUX_AIO_UNLOCK(p);
+		return (EINVAL);
+	}
+
+	DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx, pctx);
+	DPRINTF("Unhook context %p", pctx);
+
+	LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */
+	LINUX_AIO_UNLOCK(p);      /* XXX Interlaced, seamless */
+
+	/* Real cleanup */
+	LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) {
+		DPRINTF("Cancel request (Linux: %p, FreeBSD: %p)",
+				preq->req_porig, preq->req_pbsd);
+
+		/* Cancel FreeBSD native clone */
+		cancelargs.fd = preq->req_linux.aio_fildes;
+		cancelargs.aiocbp = preq->req_pbsd;
+		aio_cancel(td, &cancelargs);
+		DPRINTF("aio_cancel() returned %ld", (long)td->td_retval[0]);
+		if (td->td_retval[0] == AIO_NOTCANCELED)
+			printf(LMSG("linux_io_destroy(): Asynchronous IO "
+					"request (Linux: %p, FreeBSD: %p) "
+					"cannot be cancelled. "
+					"***** Both User Space "
+					"and Kernel Memory Leaked! *****"),
+				preq->req_porig, preq->req_pbsd);
+
+		LINUX_AIO_REQ_UNHOOK(pctx, preq);
+
+		if (td->td_retval[0] == AIO_ALLDONE) {
+			/* Already finished: reap its status */
+			aioretargs.aiocbp = preq->req_pbsd;
+			aio_return(td, &aioretargs);
+			DPRINTF("aio_return(%p) returned %ld",
+					aioretargs.aiocbp,
+					(long)td->td_retval[0]);
+
+			/* Restore the cancel status tested below */
+			td->td_retval[0] = AIO_ALLDONE;
+		}
+
+		/* Free user space clone of the request */
+		if (td->td_retval[0] != AIO_NOTCANCELED) /*
+							 * XXX How to avoid
+							 * memory leak here?
+							 */
+			user_free(td, preq->req_pbsd,
+					sizeof(*(preq->req_pbsd)));
+
+		/* Free kernel structure of the request */
+		uma_zfree(linux_aio_request_zone, preq);
+
+		td->td_retval[0] = 0;
+	}
+
+	LINUX_AIO_CTX_UNLOCK(pctx);
+
+	sx_destroy(&(pctx->ctx_sx));
+
+	/* Free the "ring" */
+	DPRINTF("free the \"ring\" %p", pctx->ctx_pring);
+	user_free(td, pctx->ctx_pring, sizeof(*pctx->ctx_pring));
+
+	/* Free destroyed context */
+	uma_zfree(linux_aio_context_zone, pctx);
+
+	return (nerr);
+}
+
+/*
+ * Linux system call io_getevents(2).
+ *
+ * Harvest up to args->nr completed requests of the context args->ctx_id
+ * into the user array args->events.  After each harvesting pass the call
+ * returns if the queue is empty, the array is full, at least args->min_nr
+ * events were collected, or the optional timeout has run out; otherwise it
+ * waits in aio_suspend() and tries again.  The number of events stored is
+ * passed back in td->td_retval[0]; if a timeout was supplied, the time
+ * remaining is written back to it.
+ *
+ * Returns 0 on success; EINVAL if nr <= 0, min_nr < 0 or the calling
+ * process owns no such context; otherwise the first error from validating
+ * or reading the user arguments or from allocating/releasing the
+ * user-space scratch areas handed to aio_suspend().
+ */
+int linux_io_getevents(struct thread *td, struct linux_io_getevents_args *args)
+{
+	int i, j, nerr = 0, ferr;
+	struct proc *p;
+	struct l_timespec l_timeout;
+	struct timespec timeout, *u_ptimeout, t1, t2;
+	struct linux_aio_context *pctx;
+	struct linux_aio_request *preq, *ptmpreq;
+	struct linux_io_event evt;
+	struct aio_return_args aioretargs;
+	struct aio_error_args aioerrargs;
+	register_t aio_ret, aio_err;
+	struct aiocb ** u_aiocbp;
+	struct aio_suspend_args aiosusargs;
+
+	DARGPRINTF("%lx, %ld, %ld, %p, %p",
+			(unsigned long) args->ctx_id,
+			(long)args->min_nr, (long)args->nr,
+			args->events, args->timeout);
+
+	if (args->nr <= 0)
+		return (EINVAL);
+
+	if (args->min_nr < 0)
+		return (EINVAL);
+
+	nerr = user_mem_rw_verify(args->events,
+			sizeof(*(args->events)) * args->nr);
+	if (nerr != 0)
+		return (nerr);
+
+	/* "timeout" is only ever valid when args->timeout != NULL */
+	if (args->timeout != NULL) {
+		nerr = copyin(args->timeout, &l_timeout, sizeof(l_timeout));
+		if (nerr != 0)
+			return (nerr);
+		timeout.tv_sec = l_timeout.tv_sec;
+		timeout.tv_nsec = l_timeout.tv_nsec;
+		DUMP_TIMESPEC("User specified timeout: ", &timeout, "");
+	}
+
+	p = td->td_proc;
+
+	/*
+	 * Locking:
+	 *
+	 * LINUX_AIO_LOCK(p);   <----------------+
+	 * ...                                   |
+	 *     LINUX_AIO_CTX_LIST_LOCK();   <--+ |
+	 *     ...                             | |
+	 *     LINUX_AIO_CTX_LIST_UNLOCK(); <--+ |
+	 * ...                                   |
+	 * LINUX_AIO_CTX_LOCK(pctx);   <---------|---+
+	 * LINUX_AIO_UNLOCK(p); <----------------+   |
+	 * ...                                       |
+	 * LINUX_AIO_CTX_UNLOCK(pctx); <-------------+
+	 */
+
+	LINUX_AIO_LOCK(p);
+
+	/* Find the context in context list */
+	LINUX_AIO_CTX_LIST_LOCK();
+	LINUX_AIO_CTX_FOREACH(pctx) {
+		if (LINUX_AIO_CTX_MATCH(pctx, args->ctx_id, p->p_pid))
+			break;
+	}
+	LINUX_AIO_CTX_LIST_UNLOCK();
+
+	/* Unable to find the context */
+	if (pctx == NULL) {
+		LINUX_AIO_UNLOCK(p);
+		return (EINVAL);
+	}
+
+	DPRINTF("Found the context: %lx -> %p", (unsigned long)args->ctx_id, pctx);
+
+	LINUX_AIO_CTX_LOCK(pctx); /* XXX Interlaced, seamless */
+	LINUX_AIO_UNLOCK(p);      /* XXX Interlaced, seamless */
+
+	if (STAILQ_EMPTY(&(pctx->ctx_req))) {
+		td->td_retval[0] = 0; /* No queued request */
+		DPRINTF("No request in queue (context: %p) at all, "
+				"return directly", pctx);
+	} else { /* Deal with the request queue */
+		i = 0; /*
+			* This variable's value will be the return value
+			* of linux_io_getevents()
+			*/
+
+		/*
+		 * aio_suspend() takes user-space pointers, so both the
+		 * aiocb pointer array and the timespec handed to it are
+		 * placed in scratch memory mapped into the process.
+		 */
+		nerr = user_malloc(td, (void **)&u_aiocbp,
+				sizeof(*u_aiocbp) * pctx->ctx_nreq_max);
+		if (nerr != 0)
+			goto skip_substantial_0;
+
+		nerr = user_malloc(td, (void **)&u_ptimeout,
+				sizeof(*u_ptimeout));
+		if (nerr != 0)
+			goto skip_substantial_1;
+
+		for (i = 0;i < args->nr;) {
+
+			/* Collecting finished requests and waiting for queued requests */
+
+			LINUX_AIO_REQ_FOREACH_SAFE(pctx, preq, ptmpreq) {
+
+				/* Collect all finished requests */
+
+				if (i >= args->nr) /* Full */
+					break;
+
+				aioerrargs.aiocbp = preq->req_pbsd;
+				aio_error(td, &aioerrargs);
+				aio_ret = td->td_retval[0];
+				td->td_retval[0] = 0;
+
+				DPRINTF("aio_error(%p) (Linux: %p) "
+						"returned %ld%s",
+					aioerrargs.aiocbp,
+					preq->req_porig,
+					(long)aio_ret,
+					aio_ret == EINPROGRESS ?
+						"(EINPROGRESS)" : "" );
+
+				if (aio_ret == EINPROGRESS)
+					continue;
+
+				/* Done */
+				LINUX_AIO_REQ_UNHOOK(pctx, preq);
+
+				aioretargs.aiocbp = preq->req_pbsd;
+				aio_err = aio_return(td, &aioretargs);
+				aio_ret = td->td_retval[0];
+				td->td_retval[0] = 0;
+
+				DPRINTF("aio_return(%p) (Linux: %p) "
+						"returned %ld, errno=%ld",
+					aioretargs.aiocbp,
+					preq->req_porig,
+					(long)aio_ret,
+					(long)aio_err);
+
+				evt.data = preq->req_linux.aio_data;
+				evt.obj = (uint64_t)(unsigned long)
+					preq->req_porig;
+				if (aio_ret >= 0) {
+					/* Normal return (success) */
+					evt.res = aio_ret;
+				} else { /* Error code (failure) */
+					/*
+					 * Translate FreeBSD error code
+					 * to Linux's
+					 */
+					evt.res =
+					      p->p_sysent->sv_errtbl[aio_err];
+				}
+				DPRINTF("context %p (Linux: %p): "
+						"io_event.res=%lld",
+					preq->req_pbsd,
+					preq->req_porig,
+					(long long)evt.res);
+				evt.res2 = 0;
+
+				copyout(&evt, &(args->events[i]), sizeof(evt));
+
+				uma_zfree(linux_aio_request_zone, preq);
+
+				i ++;
+			} /* End of collecting all finished requests */
+
+			if (STAILQ_EMPTY(&(pctx->ctx_req))) {
+				/* No request remained in this context */
+				DPRINTF("returning(context %p): "
+						"request queue is empty",
+					pctx);
+				break;
+			}
+
+			if (i >= args->nr) { /* Full */
+				DPRINTF("returning(context %p): user space "
+						"event array is full",
+					pctx);
+				break;
+			}
+
+			if (i >= args->min_nr) {
+				/* Met the minimum requirement */
+				DPRINTF("returning(context %p): "
+						"met the minimum requirement",
+					pctx);
+				break;
+			}
+
+			if (args->timeout != NULL) {
+				if (! timespecisset(&timeout)) { /* Timed out */
+					DPRINTF("returning(context %p): "
+							"no time remaining",
+						pctx);
+					break;
+				}
+			}
+
+			if (args->timeout != NULL) {
+				nanouptime(&t1); /* Time before aio_suspend() */
+				DUMP_TIMESPEC("T1: ", &t1,
+					" (uptime before calling aio_suspend())");
+			}
+
+			/* Prepare arguments for aio_suspend() */
+			j = 0;
+			LINUX_AIO_REQ_FOREACH(pctx, preq) {
+				copyout(&(preq->req_pbsd), &(u_aiocbp[j]),
+					sizeof(preq->req_pbsd));
+				j ++;
+			}
+			MPASS(j == pctx->ctx_nreq_cur);
+			aiosusargs.aiocbp = u_aiocbp;
+			aiosusargs.nent = j;
+
+			if (args->timeout != NULL) {
+				copyout(&timeout, u_ptimeout, sizeof(timeout));
+				aiosusargs.timeout = u_ptimeout;
+				DUMP_TIMESPEC("Time remained: ", &timeout, "");
+			} else {
+				aiosusargs.timeout = NULL;
+			}
+
+			aio_err = aio_suspend(td, &aiosusargs);
+			DPRINTF("aio_suspend(%p, %d, %p) returned %ld",
+					aiosusargs.aiocbp, aiosusargs.nent,
+					aiosusargs.timeout, (long)aio_err);
+
+			if (args->timeout != NULL) {
+				nanouptime(&t2); /* Time after aio_suspend() */
+				DUMP_TIMESPEC("T2: ", &t2,
+					" (uptime after calling aio_suspend())");
+				timespecsub(&t2, &t1); /*
+							* Time spent by
+							* aio_suspend()
+							*/
+				DUMP_TIMESPEC("T_delta: ", &t2,
+					" (time spent by calling aio_suspend())");
+				if (timespeccmp(&t2, &timeout, >=)) {
+					timespecclear(&timeout); /* Timed out */
+				} else {
+					timespecsub(&timeout, &t2);
+					/* Time remaining */
+				}
+				DUMP_TIMESPEC("Time remained: ", &timeout, "");
+			}
+
+			if (aio_err == EAGAIN) { /* Timed out */
+				DPRINTF("returning(context %p): "
+						"timed out after calling aio_suspend()",
+					pctx);
+				break;
+			}
+		} /*
+		   * End of collecting finished requests
+		   * and waiting for queued requests
+		   */
+
+		/*
+		 * Report the remaining time back to the caller.  Without a
+		 * user-supplied timeout there is nothing to write back and
+		 * "timeout" was never initialized, so skip it entirely.
+		 */
+		if (args->timeout != NULL) {
+			l_timeout.tv_sec = timeout.tv_sec;
+			l_timeout.tv_nsec = timeout.tv_nsec;
+			copyout(&l_timeout, args->timeout, sizeof(l_timeout));
+			/* No matter whether successfully or not */
+		}
+
+		/*
+		 * Release the scratch areas.  Keep the first error seen so
+		 * that a failed allocation above is not masked by a
+		 * successful user_free() here.
+		 */
+		ferr = user_free(td, u_ptimeout, sizeof(*u_ptimeout));
+		if (nerr == 0)
+			nerr = ferr;
+skip_substantial_1:
+		ferr = user_free(td, u_aiocbp,
+				sizeof(*u_aiocbp) * pctx->ctx_nreq_max);
+		if (nerr == 0)
+			nerr = ferr;
+skip_substantial_0:
+		/* Publish the number of harvested events to the caller */
+		td->td_retval[0] = i;
+		DPRINTF("%d requests are unhooked from the context %p", i, pctx);
+	} /* End of dealing with request queue */
+
+	LINUX_AIO_CTX_UNLOCK(pctx);
+
+	return (nerr);
+}
+
+/* Linux system call io_submit(2) */
+int linux_io_submit(struct thread *td, struct linux_io_submit_args *args)
+{
+	int i, nerr = 0;
+	struct proc *p;
+	struct linux_aio_context *pctx;
+	struct linux_aio_request req, *preq;
+	struct linux_iocb *porig;
+	struct aiocb iocb, *piocb;
+
+	DARGPRINTF("%lx, %ld, %p", (unsigned long)args->ctx_id, (long)args->nr, args->iocbpp);
+
+	if (args->nr <= 0)
+		return (EINVAL);
+

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***


More information about the svn-src-user mailing list