git: 022ca2fc7fe0 - main - Add aio_writev and aio_readv

Alan Somers asomers at FreeBSD.org
Sun Jan 3 02:59:05 UTC 2021


The branch main has been updated by asomers:

URL: https://cgit.FreeBSD.org/src/commit/?id=022ca2fc7fe08d51f33a1d23a9be49e6d132914e

commit 022ca2fc7fe08d51f33a1d23a9be49e6d132914e
Author:     Alan Somers <asomers at FreeBSD.org>
AuthorDate: 2021-01-02 23:34:20 +0000
Commit:     Alan Somers <asomers at FreeBSD.org>
CommitDate: 2021-01-03 02:57:58 +0000

    Add aio_writev and aio_readv
    
    POSIX AIO is great, but it lacks vectored I/O functions. This commit
    fixes that shortcoming by adding aio_writev and aio_readv. They aren't
    part of the standard, but they're an obvious extension. They work just
    like their synchronous equivalents pwritev and preadv.
    
    It isn't yet possible to use vectored aiocbs with lio_listio, but that
    could be added in the future.
    
    Reviewed by:    jhb, kib, bcr
    Relnotes:       yes
    Differential Revision: https://reviews.freebsd.org/D27743
---
 lib/libc/sys/Makefile.inc             |   2 +
 lib/libc/sys/Symbol.map               |   2 +
 lib/libc/sys/aio_error.2              |   6 +-
 lib/libc/sys/aio_read.2               |  63 ++-
 lib/libc/sys/aio_return.2             |   4 +-
 lib/libc/sys/aio_write.2              |  63 ++-
 share/man/man4/aio.4                  |   4 +-
 sys/bsm/audit_kevents.h               |   2 +
 sys/compat/freebsd32/freebsd32_misc.c |   2 +-
 sys/compat/freebsd32/freebsd32_util.h |   2 +
 sys/compat/freebsd32/syscalls.master  |   6 +-
 sys/kern/capabilities.conf            |   2 +
 sys/kern/sys_socket.c                 |  35 +-
 sys/kern/syscalls.master              |  12 +-
 sys/kern/vfs_aio.c                    | 497 +++++++++++++++--------
 sys/sys/aio.h                         |  22 +-
 tests/sys/aio/aio_test.c              | 739 +++++++++++++++++++++++++++++++---
 17 files changed, 1171 insertions(+), 292 deletions(-)

diff --git a/lib/libc/sys/Makefile.inc b/lib/libc/sys/Makefile.inc
index d43a59719563..82d16fb81b6b 100644
--- a/lib/libc/sys/Makefile.inc
+++ b/lib/libc/sys/Makefile.inc
@@ -354,6 +354,8 @@ MAN+=	sctp_generic_recvmsg.2 \
 	write.2 \
 	_umtx_op.2
 
+MLINKS+=aio_read.2 aio_readv.2
+MLINKS+=aio_write.2 aio_writev.2
 MLINKS+=accept.2 accept4.2
 MLINKS+=access.2 eaccess.2 \
 	access.2 faccessat.2
diff --git a/lib/libc/sys/Symbol.map b/lib/libc/sys/Symbol.map
index 9f0d3749ac01..847dd9cca987 100644
--- a/lib/libc/sys/Symbol.map
+++ b/lib/libc/sys/Symbol.map
@@ -403,6 +403,8 @@ FBSD_1.5 {
 
 FBSD_1.6 {
 	__sysctlbyname;
+	aio_readv;
+	aio_writev;
 	close_range;
 	copy_file_range;
 	fhlink;
diff --git a/lib/libc/sys/aio_error.2 b/lib/libc/sys/aio_error.2
index 030914616121..1ec6505a64aa 100644
--- a/lib/libc/sys/aio_error.2
+++ b/lib/libc/sys/aio_error.2
@@ -24,7 +24,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd June 2, 1999
+.Dd January 2, 2021
 .Dt AIO_ERROR 2
 .Os
 .Sh NAME
@@ -52,7 +52,9 @@ is returned.
 If the request has completed unsuccessfully the error
 status is returned as described in
 .Xr read 2 ,
+.Xr readv 2 ,
 .Xr write 2 ,
+.Xr writev 2 ,
 or
 .Xr fsync 2 .
 On failure,
@@ -76,9 +78,11 @@ does not reference an outstanding asynchronous I/O request.
 .Sh SEE ALSO
 .Xr aio_cancel 2 ,
 .Xr aio_read 2 ,
+.Xr aio_readv 2 ,
 .Xr aio_return 2 ,
 .Xr aio_suspend 2 ,
 .Xr aio_write 2 ,
+.Xr aio_writev 2 ,
 .Xr fsync 2 ,
 .Xr read 2 ,
 .Xr write 2 ,
diff --git a/lib/libc/sys/aio_read.2 b/lib/libc/sys/aio_read.2
index bbf96cc89890..0327ef1f747b 100644
--- a/lib/libc/sys/aio_read.2
+++ b/lib/libc/sys/aio_read.2
@@ -24,11 +24,12 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd August 19, 2016
+.Dd January 2, 2021
 .Dt AIO_READ 2
 .Os
 .Sh NAME
-.Nm aio_read
+.Nm aio_read ,
+.Nm aio_readv
 .Nd asynchronous read from a file (REALTIME)
 .Sh LIBRARY
 .Lb libc
@@ -36,21 +37,42 @@
 .In aio.h
 .Ft int
 .Fn aio_read "struct aiocb *iocb"
+.In sys/uio.h
+.Ft int
+.Fn aio_readv "struct aiocb *iocb"
 .Sh DESCRIPTION
 The
 .Fn aio_read
-system call allows the calling process to read
-.Fa iocb->aio_nbytes
+and
+.Fn aio_readv
+system calls allow the calling process to read
 from the descriptor
 .Fa iocb->aio_fildes
 beginning at the offset
-.Fa iocb->aio_offset
-into the buffer pointed to by
-.Fa iocb->aio_buf .
-The call returns immediately after the read request has
+.Fa iocb->aio_offset .
+.Fn aio_read
+will read
+.Fa iocb->aio_nbytes
+into the buffer pointed to by
+.Fa iocb->aio_buf ,
+whereas
+.Fn aio_readv
+reads the data into the
+.Fa iocb->aio_iovcnt
+buffers specified by the members of the
+.Fa iocb->aio_iov
+array.
+Both syscalls return immediately after the read request has
 been enqueued to the descriptor; the read may or may not have
 completed at the time the call returns.
 .Pp
+For
+.Fn aio_readv
+the
+.Fa iovec
+structure is defined in
+.Xr readv 2 .
+.Pp
 If _POSIX_PRIORITIZED_IO is defined, and the descriptor supports it,
 then the enqueued operation is submitted at a priority equal to that
 of the calling process minus
@@ -61,7 +83,9 @@ The
 argument
 is ignored by the
 .Fn aio_read
-system call.
+and
+.Fn aio_readv
+system calls.
 .Pp
 The
 .Fa iocb
@@ -108,16 +132,22 @@ is past the offset maximum for
 .Fa iocb->aio_fildes ,
 no I/O will occur.
 .Sh RETURN VALUES
-.Rv -std aio_read
+.Rv -std aio_read aio_readv
 .Sh DIAGNOSTICS
 None.
 .Sh ERRORS
 The
 .Fn aio_read
-system call will fail if:
+and
+.Fn aio_readv
+system calls will fail if:
 .Bl -tag -width Er
 .It Bq Er EAGAIN
 The request was not queued because of system resource limitations.
+.It Bq Er EFAULT
+Part of
+.Fa aio_iov
+points outside the process's allocated address space.
 .It Bq Er EINVAL
 The asynchronous notification method in
 .Fa iocb->aio_sigevent.sigev_notify
@@ -130,10 +160,14 @@ are unsafe and unsafe asynchronous I/O operations are disabled.
 .Pp
 The following conditions may be synchronously detected when the
 .Fn aio_read
+or
+.Fn aio_readv
 system call is made, or asynchronously, at any time thereafter.
 If they
 are detected at call time,
 .Fn aio_read
+or
+.Fn aio_readv
 returns -1 and sets
 .Va errno
 appropriately; otherwise the
@@ -207,11 +241,18 @@ The
 system call is expected to conform to the
 .St -p1003.1
 standard.
+The
+.Fn aio_readv
+system call is a FreeBSD extension, and should not be used in portable code.
 .Sh HISTORY
 The
 .Fn aio_read
 system call first appeared in
 .Fx 3.0 .
+The
+.Fn aio_readv
+system call first appeared in
+.Fx 13.0 .
 .Sh AUTHORS
 This
 manual page was written by
diff --git a/lib/libc/sys/aio_return.2 b/lib/libc/sys/aio_return.2
index df558734ed41..d94fcc7eba62 100644
--- a/lib/libc/sys/aio_return.2
+++ b/lib/libc/sys/aio_return.2
@@ -24,7 +24,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd March 21, 2016
+.Dd January 2, 2021
 .Dt AIO_RETURN 2
 .Os
 .Sh NAME
@@ -55,7 +55,9 @@ returns something other than
 If the asynchronous I/O request has completed, the status is returned
 as described in
 .Xr read 2 ,
+.Xr readv 2 ,
 .Xr write 2 ,
+.Xr writev 2 ,
 or
 .Xr fsync 2 .
 Otherwise,
diff --git a/lib/libc/sys/aio_write.2 b/lib/libc/sys/aio_write.2
index a3268e50ea90..601515b0e7b0 100644
--- a/lib/libc/sys/aio_write.2
+++ b/lib/libc/sys/aio_write.2
@@ -24,11 +24,12 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd August 19, 2016
+.Dd January 2, 2021
 .Dt AIO_WRITE 2
 .Os
 .Sh NAME
-.Nm aio_write
+.Nm aio_write ,
+.Nm aio_writev
 .Nd asynchronous write to a file (REALTIME)
 .Sh LIBRARY
 .Lb libc
@@ -36,28 +37,48 @@
 .In aio.h
 .Ft int
 .Fn aio_write "struct aiocb *iocb"
+.In sys/uio.h
+.Ft int
+.Fn aio_writev "struct aiocb *iocb"
 .Sh DESCRIPTION
 The
 .Fn aio_write
-system call allows the calling process to write
-.Fa iocb->aio_nbytes
-from the buffer pointed to by
-.Fa iocb->aio_buf
+and
+.Fn aio_writev
+system calls allow the calling process to write
 to the descriptor
 .Fa iocb->aio_fildes .
-The call returns immediately after the write request has been enqueued
+.Fn aio_write
+will write
+.Fa iocb->aio_nbytes
+from the buffer pointed to by
+.Fa iocb->aio_buf ,
+whereas
+.Fn aio_writev
+gathers the data from the
+.Fa iocb->aio_iovcnt
+buffers specified by the members of the
+.Fa iocb->aio_iov
+array.
+Both syscalls return immediately after the write request has been enqueued
 to the descriptor; the write may or may not have completed at the time
 the call returns.
 If the request could not be enqueued, generally due
 to invalid arguments, the call returns without having enqueued the
 request.
 .Pp
+For
+.Fn aio_writev
+the
+.Fa iovec
+structure is defined in
+.Xr writev 2 .
+.Pp
 If
 .Dv O_APPEND
 is set for
 .Fa iocb->aio_fildes ,
-.Fn aio_write
-operations append to the file in the same order as the calls were
+write operations append to the file in the same order as the calls were
 made.
 If
 .Dv O_APPEND
@@ -103,6 +124,8 @@ The asynchronous I/O control buffer
 .Fa iocb
 should be zeroed before the
 .Fn aio_write
+or
+.Fn aio_writev
 system call to avoid passing bogus context information to the kernel.
 .Pp
 Modifications of the Asynchronous I/O Control Block structure or the
@@ -114,14 +137,20 @@ is past the offset maximum for
 .Fa iocb->aio_fildes ,
 no I/O will occur.
 .Sh RETURN VALUES
-.Rv -std aio_write
+.Rv -std aio_write aio_writev
 .Sh ERRORS
 The
 .Fn aio_write
-system call will fail if:
+and
+.Fn aio_writev
+system calls will fail if:
 .Bl -tag -width Er
 .It Bq Er EAGAIN
 The request was not queued because of system resource limitations.
+.It Bq Er EFAULT
+Part of
+.Fa aio_iov
+points outside the process's allocated address space.
 .It Bq Er EINVAL
 The asynchronous notification method in
 .Fa iocb->aio_sigevent.sigev_notify
@@ -134,10 +163,14 @@ are unsafe and unsafe asynchronous I/O operations are disabled.
 .Pp
 The following conditions may be synchronously detected when the
 .Fn aio_write
+or
+.Fn aio_writev
 system call is made, or asynchronously, at any time thereafter.
 If they
 are detected at call time,
 .Fn aio_write
+or
+.Fn aio_writev
 returns -1 and sets
 .Va errno
 appropriately; otherwise the
@@ -203,11 +236,19 @@ system call
 is expected to conform to the
 .St -p1003.1
 standard.
+.Pp
+The
+.Fn aio_writev
+system call is a FreeBSD extension, and should not be used in portable code.
 .Sh HISTORY
 The
 .Fn aio_write
 system call first appeared in
 .Fx 3.0 .
+The
+.Fn aio_writev
+system call first appeared in
+.Fx 13.0 .
 .Sh AUTHORS
 This manual page was written by
 .An Wes Peters Aq Mt wes at softweyr.com .
diff --git a/share/man/man4/aio.4 b/share/man/man4/aio.4
index 0ea728499d13..513a5728defc 100644
--- a/share/man/man4/aio.4
+++ b/share/man/man4/aio.4
@@ -27,7 +27,7 @@
 .\"
 .\" $FreeBSD$
 .\"
-.Dd June 22, 2017
+.Dd January 2, 2021
 .Dt AIO 4
 .Os
 .Sh NAME
@@ -215,10 +215,12 @@ as described in
 .Xr aio_cancel 2 ,
 .Xr aio_error 2 ,
 .Xr aio_read 2 ,
+.Xr aio_readv 2 ,
 .Xr aio_return 2 ,
 .Xr aio_suspend 2 ,
 .Xr aio_waitcomplete 2 ,
 .Xr aio_write 2 ,
+.Xr aio_writev 2 ,
 .Xr lio_listio 2 ,
 .Xr sigevent 3 ,
 .Xr sysctl 8
diff --git a/sys/bsm/audit_kevents.h b/sys/bsm/audit_kevents.h
index 5b37329078a1..eeb928ecafdc 100644
--- a/sys/bsm/audit_kevents.h
+++ b/sys/bsm/audit_kevents.h
@@ -660,6 +660,8 @@
 #define	AUE_REALPATHAT		43264	/* FreeBSD-specific. */
 #define	AUE_CLOSERANGE		43265	/* FreeBSD-specific. */
 #define	AUE_SPECIALFD		43266	/* FreeBSD-specific. */
+#define	AUE_AIO_WRITEV		43267	/* FreeBSD-specific. */
+#define	AUE_AIO_READV		43268	/* FreeBSD-specific. */
 
 /*
  * Darwin BSM uses a number of AUE_O_* definitions, which are aliased to the
diff --git a/sys/compat/freebsd32/freebsd32_misc.c b/sys/compat/freebsd32/freebsd32_misc.c
index 62fab95c68d1..14afd433d9f1 100644
--- a/sys/compat/freebsd32/freebsd32_misc.c
+++ b/sys/compat/freebsd32/freebsd32_misc.c
@@ -1070,7 +1070,7 @@ freebsd32_ptrace(struct thread *td, struct freebsd32_ptrace_args *uap)
 	return (error);
 }
 
-static int
+int
 freebsd32_copyinuio(struct iovec32 *iovp, u_int iovcnt, struct uio **uiop)
 {
 	struct iovec32 iov32;
diff --git a/sys/compat/freebsd32/freebsd32_util.h b/sys/compat/freebsd32/freebsd32_util.h
index a66038d4d36a..b126fbde0857 100644
--- a/sys/compat/freebsd32/freebsd32_util.h
+++ b/sys/compat/freebsd32/freebsd32_util.h
@@ -116,6 +116,8 @@ int	freebsd32_copyout_strings(struct image_params *imgp,
 	    uintptr_t *stack_base);
 int	freebsd32_copyiniov(struct iovec32 *iovp, u_int iovcnt,
 	    struct iovec **iov, int error);
+int	freebsd32_copyinuio(struct iovec32 *iovp, u_int iovcnt,
+	    struct uio **uiop);
 void	freebsd32_rusage_out(const struct rusage *s, struct rusage32 *s32);
 
 struct image_args;
diff --git a/sys/compat/freebsd32/syscalls.master b/sys/compat/freebsd32/syscalls.master
index f4339795781a..ca0db9a76b1e 100644
--- a/sys/compat/freebsd32/syscalls.master
+++ b/sys/compat/freebsd32/syscalls.master
@@ -493,8 +493,10 @@
 257	AUE_LIO_LISTIO	STD	{ int freebsd32_lio_listio(int mode, \
 				    struct aiocb32 * const *acb_list, \
 				    int nent, struct sigevent32 *sig); }
-258	AUE_NULL	UNIMPL	nosys
-259	AUE_NULL	UNIMPL	nosys
+258	AUE_AIO_WRITEV	STD	{ int freebsd32_aio_writev( \
+				    struct aiocb32 *aiocbp); }
+259	AUE_AIO_READV	STD	{ int freebsd32_aio_readv( \
+				    struct aiocb32 *aiocbp); }
 260	AUE_NULL	UNIMPL	nosys
 261	AUE_NULL	UNIMPL	nosys
 262	AUE_NULL	UNIMPL	nosys
diff --git a/sys/kern/capabilities.conf b/sys/kern/capabilities.conf
index 3d552255d823..602ec7088fc6 100644
--- a/sys/kern/capabilities.conf
+++ b/sys/kern/capabilities.conf
@@ -100,6 +100,8 @@ aio_return
 aio_suspend
 aio_waitcomplete
 aio_write
+aio_writev
+aio_readv
 
 ##
 ## audit(2) is a global operation, submitting to the global trail, but it is
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
index 0fe200c119d2..18803b6a5ac0 100644
--- a/sys/kern/sys_socket.c
+++ b/sys/kern/sys_socket.c
@@ -600,9 +600,7 @@ soaio_process_job(struct socket *so, struct sockbuf *sb, struct kaiocb *job)
 	struct ucred *td_savedcred;
 	struct thread *td;
 	struct file *fp;
-	struct uio uio;
-	struct iovec iov;
-	size_t cnt, done;
+	size_t cnt, done, job_total_nbytes;
 	long ru_before;
 	int error, flags;
 
@@ -614,16 +612,11 @@ retry:
 	td_savedcred = td->td_ucred;
 	td->td_ucred = job->cred;
 
+	job_total_nbytes = job->uiop->uio_resid + job->aio_done;
 	done = job->aio_done;
-	cnt = job->uaiocb.aio_nbytes - done;
-	iov.iov_base = (void *)((uintptr_t)job->uaiocb.aio_buf + done);
-	iov.iov_len = cnt;
-	uio.uio_iov = &iov;
-	uio.uio_iovcnt = 1;
-	uio.uio_offset = 0;
-	uio.uio_resid = cnt;
-	uio.uio_segflg = UIO_USERSPACE;
-	uio.uio_td = td;
+	cnt = job->uiop->uio_resid;
+	job->uiop->uio_offset = 0;
+	job->uiop->uio_td = td;
 	flags = MSG_NBIO;
 
 	/*
@@ -633,26 +626,26 @@ retry:
 	 */
 
 	if (sb == &so->so_rcv) {
-		uio.uio_rw = UIO_READ;
 		ru_before = td->td_ru.ru_msgrcv;
 #ifdef MAC
 		error = mac_socket_check_receive(fp->f_cred, so);
 		if (error == 0)
 
 #endif
-			error = soreceive(so, NULL, &uio, NULL, NULL, &flags);
+			error = soreceive(so, NULL, job->uiop, NULL, NULL,
+			    &flags);
 		if (td->td_ru.ru_msgrcv != ru_before)
 			job->msgrcv = 1;
 	} else {
 		if (!TAILQ_EMPTY(&sb->sb_aiojobq))
 			flags |= MSG_MORETOCOME;
-		uio.uio_rw = UIO_WRITE;
 		ru_before = td->td_ru.ru_msgsnd;
 #ifdef MAC
 		error = mac_socket_check_send(fp->f_cred, so);
 		if (error == 0)
 #endif
-			error = sosend(so, NULL, &uio, NULL, NULL, flags, td);
+			error = sosend(so, NULL, job->uiop, NULL, NULL, flags,
+			    td);
 		if (td->td_ru.ru_msgsnd != ru_before)
 			job->msgsnd = 1;
 		if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
@@ -662,7 +655,7 @@ retry:
 		}
 	}
 
-	done += cnt - uio.uio_resid;
+	done += cnt - job->uiop->uio_resid;
 	job->aio_done = done;
 	td->td_ucred = td_savedcred;
 
@@ -676,7 +669,7 @@ retry:
 		 * been made, requeue this request at the head of the
 		 * queue to try again when the socket is ready.
 		 */
-		MPASS(done != job->uaiocb.aio_nbytes);
+		MPASS(done != job_total_nbytes);
 		SOCKBUF_LOCK(sb);
 		if (done == 0 || !(so->so_state & SS_NBIO)) {
 			empty_results++;
@@ -782,10 +775,10 @@ soo_aio_cancel(struct kaiocb *job)
 
 	so = job->fd_file->f_data;
 	opcode = job->uaiocb.aio_lio_opcode;
-	if (opcode == LIO_READ)
+	if (opcode == LIO_READ || opcode == LIO_READV)
 		sb = &so->so_rcv;
 	else {
-		MPASS(opcode == LIO_WRITE);
+		MPASS(opcode == LIO_WRITE || opcode == LIO_WRITEV);
 		sb = &so->so_snd;
 	}
 
@@ -817,9 +810,11 @@ soo_aio_queue(struct file *fp, struct kaiocb *job)
 
 	switch (job->uaiocb.aio_lio_opcode) {
 	case LIO_READ:
+	case LIO_READV:
 		sb = &so->so_rcv;
 		break;
 	case LIO_WRITE:
+	case LIO_WRITEV:
 		sb = &so->so_snd;
 		break;
 	default:
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index b7ea5e939635..aaa0a1277461 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -1477,7 +1477,17 @@
 		    _In_opt_ struct sigevent *sig
 		);
 	}
-258-271	AUE_NULL	UNIMPL	nosys
+258	AUE_AIO_WRITEV	STD {
+		int aio_writev(
+		    _Inout_ struct aiocb *aiocbp
+		);
+	}
+259	AUE_AIO_READV	STD {
+		int aio_readv(
+		    _Inout_ struct aiocb *aiocbp
+		);
+	}
+260-271	AUE_NULL	UNIMPL	nosys
 272	AUE_O_GETDENTS	COMPAT11 {
 		int getdents(
 		    int fd,
diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c
index 37e19557d807..d83c9d725e68 100644
--- a/sys/kern/vfs_aio.c
+++ b/sys/kern/vfs_aio.c
@@ -292,7 +292,7 @@ struct kaioinfo {
  * Different ABIs provide their own operations.
  */
 struct aiocb_ops {
-	int	(*aio_copyin)(struct aiocb *ujob, struct aiocb *kjob);
+	int	(*aio_copyin)(struct aiocb *ujob, struct kaiocb *kjob, int ty);
 	long	(*fetch_status)(struct aiocb *ujob);
 	long	(*fetch_error)(struct aiocb *ujob);
 	int	(*store_status)(struct aiocb *ujob, long status);
@@ -307,6 +307,7 @@ static struct mtx aio_job_mtx;
 static TAILQ_HEAD(,kaiocb) aio_jobs;			/* (c) Async job list */
 static struct unrhdr *aiod_unr;
 
+static void	aio_biocleanup(struct bio *bp);
 void		aio_init_aioinfo(struct proc *p);
 static int	aio_onceonly(void);
 static int	aio_free_entry(struct kaiocb *job);
@@ -559,6 +560,8 @@ aio_free_entry(struct kaiocb *job)
 	if (job->fd_file)
 		fdrop(job->fd_file, curthread);
 	crfree(job->cred);
+	if (job->uiop != &job->uio)
+		free(job->uiop, M_IOV);
 	uma_zfree(aiocb_zone, job);
 	AIO_LOCK(ki);
 
@@ -754,36 +757,29 @@ aio_process_rw(struct kaiocb *job)
 	struct thread *td;
 	struct aiocb *cb;
 	struct file *fp;
-	struct uio auio;
-	struct iovec aiov;
 	ssize_t cnt;
 	long msgsnd_st, msgsnd_end;
 	long msgrcv_st, msgrcv_end;
 	long oublock_st, oublock_end;
 	long inblock_st, inblock_end;
-	int error;
+	int error, opcode;
 
 	KASSERT(job->uaiocb.aio_lio_opcode == LIO_READ ||
-	    job->uaiocb.aio_lio_opcode == LIO_WRITE,
+	    job->uaiocb.aio_lio_opcode == LIO_READV ||
+	    job->uaiocb.aio_lio_opcode == LIO_WRITE ||
+	    job->uaiocb.aio_lio_opcode == LIO_WRITEV,
 	    ("%s: opcode %d", __func__, job->uaiocb.aio_lio_opcode));
 
 	aio_switch_vmspace(job);
 	td = curthread;
 	td_savedcred = td->td_ucred;
 	td->td_ucred = job->cred;
+	job->uiop->uio_td = td;
 	cb = &job->uaiocb;
 	fp = job->fd_file;
 
-	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
-	aiov.iov_len = cb->aio_nbytes;
-
-	auio.uio_iov = &aiov;
-	auio.uio_iovcnt = 1;
-	auio.uio_offset = cb->aio_offset;
-	auio.uio_resid = cb->aio_nbytes;
-	cnt = cb->aio_nbytes;
-	auio.uio_segflg = UIO_USERSPACE;
-	auio.uio_td = td;
+	opcode = job->uaiocb.aio_lio_opcode;
+	cnt = job->uiop->uio_resid;
 
 	msgrcv_st = td->td_ru.ru_msgrcv;
 	msgsnd_st = td->td_ru.ru_msgsnd;
@@ -794,17 +790,16 @@ aio_process_rw(struct kaiocb *job)
 	 * aio_aqueue() acquires a reference to the file that is
 	 * released in aio_free_entry().
 	 */
-	if (cb->aio_lio_opcode == LIO_READ) {
-		auio.uio_rw = UIO_READ;
-		if (auio.uio_resid == 0)
+	if (opcode == LIO_READ || opcode == LIO_READV) {
+		if (job->uiop->uio_resid == 0)
 			error = 0;
 		else
-			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+			error = fo_read(fp, job->uiop, fp->f_cred, FOF_OFFSET,
+			    td);
 	} else {
 		if (fp->f_type == DTYPE_VNODE)
 			bwillwrite();
-		auio.uio_rw = UIO_WRITE;
-		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+		error = fo_write(fp, job->uiop, fp->f_cred, FOF_OFFSET, td);
 	}
 	msgrcv_end = td->td_ru.ru_msgrcv;
 	msgsnd_end = td->td_ru.ru_msgsnd;
@@ -816,17 +811,18 @@ aio_process_rw(struct kaiocb *job)
 	job->inblock = inblock_end - inblock_st;
 	job->outblock = oublock_end - oublock_st;
 
-	if ((error) && (auio.uio_resid != cnt)) {
+	if (error != 0 && job->uiop->uio_resid != cnt) {
 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
 			error = 0;
-		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
+		if (error == EPIPE &&
+		    (opcode == LIO_WRITE || opcode == LIO_WRITEV)) {
 			PROC_LOCK(job->userproc);
 			kern_psignal(job->userproc, SIGPIPE);
 			PROC_UNLOCK(job->userproc);
 		}
 	}
 
-	cnt -= auio.uio_resid;
+	cnt -= job->uiop->uio_resid;
 	td->td_ucred = td_savedcred;
 	if (error)
 		aio_complete(job, -1, error);
@@ -1210,21 +1206,23 @@ aio_qbio(struct proc *p, struct kaiocb *job)
 {
 	struct aiocb *cb;
 	struct file *fp;
-	struct bio *bp;
 	struct buf *pbuf;
 	struct vnode *vp;
 	struct cdevsw *csw;
 	struct cdev *dev;
 	struct kaioinfo *ki;
-	struct vm_page **pages;
-	int error, npages, poff, ref;
+	struct bio **bios = NULL;
+	off_t offset;
+	int bio_cmd, error, i, iovcnt, opcode, poff, ref;
 	vm_prot_t prot;
+	bool use_unmapped;
 
 	cb = &job->uaiocb;
 	fp = job->fd_file;
+	opcode = cb->aio_lio_opcode;
 
-	if (!(cb->aio_lio_opcode == LIO_WRITE ||
-	    cb->aio_lio_opcode == LIO_READ))
+	if (!(opcode == LIO_WRITE || opcode == LIO_WRITEV ||
+	    opcode == LIO_READ || opcode == LIO_READV))
 		return (-1);
 	if (fp == NULL || fp->f_type != DTYPE_VNODE)
 		return (-1);
@@ -1234,8 +1232,21 @@ aio_qbio(struct proc *p, struct kaiocb *job)
 		return (-1);
 	if (vp->v_bufobj.bo_bsize == 0)
 		return (-1);
-	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
+
+	bio_cmd = opcode == LIO_WRITE || opcode == LIO_WRITEV ? BIO_WRITE :
+	    BIO_READ;
+	iovcnt = job->uiop->uio_iovcnt;
+	if (iovcnt > max_buf_aio)
 		return (-1);
+	for (i = 0; i < iovcnt; i++) {
+		if (job->uiop->uio_iov[i].iov_len % vp->v_bufobj.bo_bsize != 0)
+			return (-1);
+		if (job->uiop->uio_iov[i].iov_len > maxphys) {
+			error = -1;
+			return (-1);
+		}
+	}
+	offset = cb->aio_offset;
 
 	ref = 0;
 	csw = devvn_refthread(vp, &dev, &ref);
@@ -1246,89 +1257,106 @@ aio_qbio(struct proc *p, struct kaiocb *job)
 		error = -1;
 		goto unref;
 	}
-	if (cb->aio_nbytes > dev->si_iosize_max) {
+	if (job->uiop->uio_resid > dev->si_iosize_max) {
 		error = -1;
 		goto unref;
 	}
 
 	ki = p->p_aioinfo;
-	poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
-	if ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed) {
-		if (cb->aio_nbytes > maxphys) {
-			error = -1;
-			goto unref;
-		}
+	job->error = 0;
 
-		pbuf = NULL;
-		pages = malloc(sizeof(vm_page_t) * (atop(round_page(
-		    cb->aio_nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO);
-	} else {
-		if (cb->aio_nbytes > maxphys) {
-			error = -1;
-			goto unref;
-		}
-		if (ki->kaio_buffer_count >= max_buf_aio) {
+	use_unmapped = (dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed;
+	if (!use_unmapped) {
+		AIO_LOCK(ki);
+		if (ki->kaio_buffer_count + iovcnt > max_buf_aio) {
+			AIO_UNLOCK(ki);
 			error = EAGAIN;
 			goto unref;
 		}
-
-		pbuf = uma_zalloc(pbuf_zone, M_WAITOK);
-		BUF_KERNPROC(pbuf);
-		AIO_LOCK(ki);
-		ki->kaio_buffer_count++;
+		ki->kaio_buffer_count += iovcnt;
 		AIO_UNLOCK(ki);
-		pages = pbuf->b_pages;
-	}
-	bp = g_alloc_bio();
-
-	bp->bio_length = cb->aio_nbytes;
-	bp->bio_bcount = cb->aio_nbytes;
-	bp->bio_done = aio_biowakeup;
-	bp->bio_offset = cb->aio_offset;
-	bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
-	bp->bio_dev = dev;
-	bp->bio_caller1 = job;
-	bp->bio_caller2 = pbuf;
-
-	prot = VM_PROT_READ;
-	if (cb->aio_lio_opcode == LIO_READ)
-		prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
-	npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
-	    (vm_offset_t)cb->aio_buf, bp->bio_length, prot, pages,
-	    atop(maxphys) + 1);
-	if (npages < 0) {
-		error = EFAULT;
-		goto doerror;
 	}
-	if (pbuf != NULL) {
-		pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages);
-		bp->bio_data = pbuf->b_data + poff;
-		atomic_add_int(&num_buf_aio, 1);
-		pbuf->b_npages = npages;
-	} else {
-		bp->bio_ma = pages;
-		bp->bio_ma_n = npages;
-		bp->bio_ma_offset = poff;
-		bp->bio_data = unmapped_buf;
-		bp->bio_flags |= BIO_UNMAPPED;
-		atomic_add_int(&num_unmapped_aio, 1);
+
+	bios = malloc(sizeof(struct bio *) * iovcnt, M_TEMP, M_WAITOK);
+	atomic_store_int(&job->nbio, iovcnt);
+	for (i = 0; i < iovcnt; i++) {
+		struct vm_page** pages;
+		struct bio *bp;
+		void *buf;
+		size_t nbytes;
+		int npages;
+
+		buf = job->uiop->uio_iov[i].iov_base;
+		nbytes = job->uiop->uio_iov[i].iov_len;
+
+		bios[i] = g_alloc_bio();
+		bp = bios[i];
+
+		poff = (vm_offset_t)buf & PAGE_MASK;
+		if (use_unmapped) {
+			pbuf = NULL;
+			pages = malloc(sizeof(vm_page_t) * (atop(round_page(
+			    nbytes)) + 1), M_TEMP, M_WAITOK | M_ZERO);
+		} else {
+			pbuf = uma_zalloc(pbuf_zone, M_WAITOK);
+			BUF_KERNPROC(pbuf);
+			pages = pbuf->b_pages;
+		}
+
+		bp->bio_length = nbytes;
+		bp->bio_bcount = nbytes;
+		bp->bio_done = aio_biowakeup;
+		bp->bio_offset = offset;
+		bp->bio_cmd = bio_cmd;
+		bp->bio_dev = dev;
+		bp->bio_caller1 = job;
+		bp->bio_caller2 = pbuf;
+
+		prot = VM_PROT_READ;
+		if (opcode == LIO_READ || opcode == LIO_READV)
+			prot |= VM_PROT_WRITE;	/* Less backwards than it looks */
+		npages = vm_fault_quick_hold_pages(&curproc->p_vmspace->vm_map,
+		    (vm_offset_t)buf, bp->bio_length, prot, pages,
+		    atop(maxphys) + 1);
+		if (npages < 0) {
+			if (pbuf != NULL)
+				uma_zfree(pbuf_zone, pbuf);
+			else
+				free(pages, M_TEMP);
+			error = EFAULT;
+			g_destroy_bio(bp);
+			i--;
+			goto destroy_bios;
+		}
+		if (pbuf != NULL) {
+			pmap_qenter((vm_offset_t)pbuf->b_data, pages, npages);
+			bp->bio_data = pbuf->b_data + poff;
+			pbuf->b_npages = npages;
+			atomic_add_int(&num_buf_aio, 1);
+		} else {
+			bp->bio_ma = pages;
+			bp->bio_ma_n = npages;
+			bp->bio_ma_offset = poff;
+			bp->bio_data = unmapped_buf;
+			bp->bio_flags |= BIO_UNMAPPED;
+			atomic_add_int(&num_unmapped_aio, 1);
+		}
+
+		offset += nbytes;
 	}
 
 	/* Perform transfer. */
-	csw->d_strategy(bp);
+	for (i = 0; i < iovcnt; i++)
+		csw->d_strategy(bios[i]);
+	free(bios, M_TEMP);
+
 	dev_relthread(dev, ref);
 	return (0);
 
-doerror:
-	if (pbuf != NULL) {
-		AIO_LOCK(ki);
-		ki->kaio_buffer_count--;
-		AIO_UNLOCK(ki);
-		uma_zfree(pbuf_zone, pbuf);
-	} else {
-		free(pages, M_TEMP);
-	}
-	g_destroy_bio(bp);
+destroy_bios:
+	for (; i >= 0; i--)
+		aio_biocleanup(bios[i]);
+	free(bios, M_TEMP);
 unref:
 	dev_relthread(dev, ref);
 	return (error);
@@ -1362,25 +1390,39 @@ convert_old_sigevent(struct osigevent *osig, struct sigevent *nsig)
 }
 
 static int
-aiocb_copyin_old_sigevent(struct aiocb *ujob, struct aiocb *kjob)
+aiocb_copyin_old_sigevent(struct aiocb *ujob, struct kaiocb *kjob,
+    int type __unused)
 {
*** 1510 LINES SKIPPED ***


More information about the dev-commits-src-all mailing list