git: 69d94f4c7608 - main - Add tarfs, a filesystem backed by tarballs.
Date: Thu, 02 Feb 2023 17:20:13 UTC
The branch main has been updated by des:
URL: https://cgit.FreeBSD.org/src/commit/?id=69d94f4c7608e41505996559367450706e91fbb8
commit 69d94f4c7608e41505996559367450706e91fbb8
Author: Dag-Erling Smørgrav <des@FreeBSD.org>
AuthorDate: 2023-02-02 17:18:41 +0000
Commit: Dag-Erling Smørgrav <des@FreeBSD.org>
CommitDate: 2023-02-02 17:19:29 +0000
Add tarfs, a filesystem backed by tarballs.
Sponsored by: Juniper Networks, Inc.
Sponsored by: Klara, Inc.
Reviewed by: pauamma, imp
Differential Revision: https://reviews.freebsd.org/D37753
---
etc/mtree/BSD.tests.dist | 2 +
share/man/man5/Makefile | 1 +
share/man/man5/tarfs.5 | 103 ++++
sys/conf/files | 4 +
sys/conf/options | 4 +
sys/fs/tarfs/tarfs.h | 254 +++++++++
sys/fs/tarfs/tarfs_dbg.h | 65 +++
sys/fs/tarfs/tarfs_io.c | 727 +++++++++++++++++++++++
sys/fs/tarfs/tarfs_subr.c | 603 ++++++++++++++++++++
sys/fs/tarfs/tarfs_vfsops.c | 1173 ++++++++++++++++++++++++++++++++++++++
sys/fs/tarfs/tarfs_vnops.c | 642 +++++++++++++++++++++
sys/kern/subr_witness.c | 6 +
sys/modules/Makefile | 1 +
sys/modules/tarfs/Makefile | 23 +
tests/sys/fs/Makefile | 1 +
tests/sys/fs/tarfs/Makefile | 10 +
tests/sys/fs/tarfs/mktar.c | 238 ++++++++
tests/sys/fs/tarfs/tarfs_test.sh | 54 ++
18 files changed, 3911 insertions(+)
diff --git a/etc/mtree/BSD.tests.dist b/etc/mtree/BSD.tests.dist
index 0d05ecaf06fc..b4b18997b7f9 100644
--- a/etc/mtree/BSD.tests.dist
+++ b/etc/mtree/BSD.tests.dist
@@ -757,6 +757,8 @@
fs
fusefs
..
+ tarfs
+ ..
tmpfs
..
..
diff --git a/share/man/man5/Makefile b/share/man/man5/Makefile
index 2d49d981c2f9..f6e91e4ed00b 100644
--- a/share/man/man5/Makefile
+++ b/share/man/man5/Makefile
@@ -70,6 +70,7 @@ MAN= acct.5 \
style.Makefile.5 \
style.mdoc.5 \
sysctl.conf.5 \
+ tarfs.5 \
tmpfs.5 \
unionfs.5
diff --git a/share/man/man5/tarfs.5 b/share/man/man5/tarfs.5
new file mode 100644
index 000000000000..b25131c323c1
--- /dev/null
+++ b/share/man/man5/tarfs.5
@@ -0,0 +1,103 @@
+.\"-
+.\" SPDX-License-Identifier: BSD-2-Clause
+.\"
+.\" Copyright (c) 2022 Klara, Inc.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\" notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\" notice, this list of conditions and the following disclaimer in the
+.\" documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd February 2, 2023
+.Dt TARFS 5
+.Os
+.Sh NAME
+.Nm tarfs
+.Nd tarball filesystem
+.Sh SYNOPSIS
+To compile this driver into the kernel, place the following line in
+your kernel configuration file:
+.Bd -ragged -offset indent
+.Cd "options TARFS"
+.Ed
+.Pp
+Alternatively, to load the driver as a module at boot time, place the
+following line in
+.Xr loader.conf 5 :
+.Bd -literal -offset indent
+tarfs_load="YES"
+.Ed
+.Sh DESCRIPTION
+The
+.Nm
+driver implements a read-only filesystem backed by a
+.Xr tar 5
+file.
+Currently, only POSIX archives, optionally compressed with
+.Xr zstd 1 ,
+are supported.
+.Pp
+The preferred I/O size for
+.Nm
+filesystems can be adjusted using the
+.Va vfs.tarfs.ioshift
+sysctl setting and tunable.
+Setting it to 0 will reset it to its default value.
+Note that changes to this setting only apply to filesystems mounted
+after the change.
+.Sh DIAGNOSTICS
+If enabled by the
+.Dv TARFS_DEBUG
+kernel option, the
+.Va vfs.tarfs.debug
+sysctl setting can be used to control debugging output from the
+.Nm
+driver.
+Debugging output for individual sections of the driver can be enabled
+by adding together the relevant values from the table below.
+.Bl -column Value Description
+.It 0x01 Ta Memory allocations
+.It 0x02 Ta Checksum calculations
+.It 0x04 Ta Filesystem operations (vfsops)
+.It 0x08 Ta Path lookups
+.It 0x10 Ta File operations (vnops)
+.It 0x20 Ta General I/O
+.It 0x40 Ta Decompression
+.It 0x80 Ta Decompression index
+.It 0x100 Ta Sparse file mapping
+.El
+.Sh SEE ALSO
+.Xr tar 1 ,
+.Xr zstd 1 ,
+.Xr fstab 5 ,
+.Xr tar 5 ,
+.Xr mount 8 ,
+.Xr sysctl 8
+.Sh HISTORY
+.An -nosplit
+The
+.Nm
+driver was developed by
+.An Stephen J. Kiernan Aq Mt stevek@FreeBSD.org
+and
+.An Dag-Erling Smørgrav Aq Mt des@FreeBSD.org
+for Juniper Networks and Klara Systems.
+This manual page was written by
+.An Dag-Erling Smørgrav Aq Mt des@FreeBSD.org
+for Juniper Networks and Klara Systems.
diff --git a/sys/conf/files b/sys/conf/files
index 6cb4abcd9223..08966a9b46e4 100644
--- a/sys/conf/files
+++ b/sys/conf/files
@@ -3615,6 +3615,10 @@ fs/smbfs/smbfs_smb.c optional smbfs
fs/smbfs/smbfs_subr.c optional smbfs
fs/smbfs/smbfs_vfsops.c optional smbfs
fs/smbfs/smbfs_vnops.c optional smbfs
+fs/tarfs/tarfs_io.c optional tarfs compile-with "${NORMAL_C} -I$S/contrib/zstd/lib/freebsd"
+fs/tarfs/tarfs_subr.c optional tarfs
+fs/tarfs/tarfs_vfsops.c optional tarfs
+fs/tarfs/tarfs_vnops.c optional tarfs
fs/udf/osta.c optional udf
fs/udf/udf_iconv.c optional udf_iconv
fs/udf/udf_vfsops.c optional udf
diff --git a/sys/conf/options b/sys/conf/options
index 1f5003507539..3b2be66ba602 100644
--- a/sys/conf/options
+++ b/sys/conf/options
@@ -265,6 +265,7 @@ NULLFS opt_dontuse.h
PROCFS opt_dontuse.h
PSEUDOFS opt_dontuse.h
SMBFS opt_dontuse.h
+TARFS opt_dontuse.h
TMPFS opt_dontuse.h
UDF opt_dontuse.h
UNIONFS opt_dontuse.h
@@ -273,6 +274,9 @@ ZFS opt_dontuse.h
# Pseudofs debugging
PSEUDOFS_TRACE opt_pseudofs.h
+# Tarfs debugging
+TARFS_DEBUG opt_tarfs.h
+
# In-kernel GSS-API
KGSSAPI opt_kgssapi.h
KGSSAPI_DEBUG opt_kgssapi.h
diff --git a/sys/fs/tarfs/tarfs.h b/sys/fs/tarfs/tarfs.h
new file mode 100644
index 000000000000..dffd60ee6d8a
--- /dev/null
+++ b/sys/fs/tarfs/tarfs.h
@@ -0,0 +1,254 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _FS_TARFS_TARFS_H_
+#define _FS_TARFS_TARFS_H_
+
+#ifndef _KERNEL
+#error Should only be included by kernel
+#endif
+
+MALLOC_DECLARE(M_TARFSMNT);
+MALLOC_DECLARE(M_TARFSNODE);
+MALLOC_DECLARE(M_TARFSNAME);
+
+#ifdef SYSCTL_DECL
+SYSCTL_DECL(_vfs_tarfs);
+#endif
+
+struct componentname;
+struct mount;
+struct vnode;
+
+/*
+ * Internal representation of a tarfs file system node.
+ */
+struct tarfs_node {
+ TAILQ_ENTRY(tarfs_node) entries; /* list linkage; presumably on tarfs_mount.allnodes -- confirm */
+ TAILQ_ENTRY(tarfs_node) dirents; /* list linkage; presumably on the parent's dir.dirhead -- confirm */
+
+ struct mtx lock; /* taken by TARFS_NODE_LOCK() / TARFS_NODE_UNLOCK() */
+
+ struct vnode *vnode; /* NOTE(review): presumably the vnode currently attached to this node -- confirm */
+ struct tarfs_mount *tmp; /* mount this node belongs to */
+ enum vtype type; /* vnode type; selects the active member of the union below */
+ ino_t ino; /* inode number */
+ off_t offset; /* NOTE(review): presumably where the entry's data starts in the archive -- confirm */
+ size_t size; /* NOTE(review): presumably the logical file size -- confirm */
+ size_t physize; /* NOTE(review): presumably the size occupied in the archive -- confirm */
+ char *name; /* entry name */
+ size_t namelen; /* length of name */
+
+ /* Node attributes */
+ uid_t uid;
+ gid_t gid;
+ mode_t mode;
+ unsigned int flags; /* NOTE(review): presumably file flags, cf. tarfs_strtofflags() -- confirm */
+ nlink_t nlink;
+ struct timespec atime;
+ struct timespec mtime;
+ struct timespec ctime;
+ struct timespec birthtime;
+ unsigned long gen; /* generation number; struct tarfs_fid carries a matching field */
+
+ /* Block map */
+ size_t nblk; /* NOTE(review): presumably the number of entries in blk -- confirm */
+ struct tarfs_blk *blk; /* sparse file block map entries (struct tarfs_blk) */
+
+ struct tarfs_node *parent; /* parent node */
+ union {
+ /* VDIR */
+ struct {
+ TAILQ_HEAD(, tarfs_node) dirhead; /* list of child nodes */
+ off_t lastcookie; /* NOTE(review): looks like a cache for tarfs_lookup_dir() -- confirm */
+ struct tarfs_node *lastnode; /* NOTE(review): presumably the node matching lastcookie -- confirm */
+ } dir;
+
+ /* VLNK */
+ struct {
+ char *name; /* link target */
+ size_t namelen; /* length of link target */
+ } link;
+
+ /* VBLK or VCHR */
+ dev_t rdev;
+
+ /* VREG */
+ struct tarfs_node *other; /* NOTE(review): presumably the node a hard link refers to -- confirm */
+ };
+};
+
+/*
+ * Entry in sparse file block map.
+ */
+struct tarfs_blk {
+ off_t i; /* input (physical) offset */
+ off_t o; /* output (logical) offset */
+ size_t l; /* length */
+};
+
+/*
+ * Decompression buffer.
+ */
+#define TARFS_ZBUF_SIZE 1048576
+struct tarfs_zbuf {
+ u_char buf[TARFS_ZBUF_SIZE];
+ size_t off; /* offset of contents */
+ size_t len; /* length of contents */
+};
+
+/*
+ * Internal representation of a tarfs mount point.
+ */
+struct tarfs_mount {
+ TAILQ_HEAD(, tarfs_node) allnodes; /* list of tarfs nodes; presumably every node of this mount -- confirm */
+ struct mtx allnode_lock; /* taken by TARFS_ALLNODES_LOCK() / TARFS_ALLNODES_UNLOCK() */
+
+ struct tarfs_node *root; /* root node */
+ struct vnode *vp; /* vnode of the tar file; raw reads in tarfs_io_read() go here */
+ struct mount *vfs; /* NOTE(review): presumably the struct mount this belongs to -- confirm */
+ ino_t ino; /* NOTE(review): presumably an inode number allocation cursor -- confirm */
+ struct unrhdr *ino_unr; /* NOTE(review): presumably a unit-number allocator for inode numbers -- confirm */
+ size_t iosize; /* NOTE(review): presumably the preferred I/O size for this mount -- confirm */
+ size_t nblocks;
+ size_t nfiles;
+ time_t mtime; /* default mtime for directories */
+
+ struct tarfs_zio *zio; /* decompression layer state */
+ struct vnode *znode; /* when non-NULL, non-raw reads in tarfs_io_read() go through this vnode */
+};
+
+struct tarfs_zio {
+ struct tarfs_mount *tmp; /* mount this decompression state belongs to */
+
+ /* decompression state */
+#ifdef ZSTDIO
+ struct tarfs_zstd *zstd; /* decompression state (zstd) */
+#endif
+ off_t ipos; /* current input position */
+ off_t opos; /* current output position */
+
+ /* index of compression frames */
+ unsigned int curidx; /* current index position */
+ unsigned int nidx; /* number of index entries */
+ unsigned int szidx; /* index capacity */
+ struct tarfs_idx { off_t i, o; } *idx; /* per entry: input (i) and output (o) offset */
+};
+
+struct tarfs_fid {
+ u_short len; /* length of data in bytes */
+ u_short data0; /* force alignment */
+ ino_t ino;
+ unsigned long gen;
+};
+
+/*
+ * Locking helpers.  The NODE macros take a pointer to a struct tarfs_node
+ * and operate on its lock member; the ALLNODES macros take a pointer to a
+ * struct tarfs_mount and operate on its allnode_lock member.  Each macro
+ * expands its own argument rather than relying on a variable of a
+ * particular name being in scope at the call site.
+ */
+#define TARFS_NODE_LOCK(tnp) \
+	mtx_lock(&(tnp)->lock)
+#define TARFS_NODE_UNLOCK(tnp) \
+	mtx_unlock(&(tnp)->lock)
+#define TARFS_ALLNODES_LOCK(tmp) \
+	mtx_lock(&(tmp)->allnode_lock)
+#define TARFS_ALLNODES_UNLOCK(tmp) \
+	mtx_unlock(&(tmp)->allnode_lock)
+
+/*
+ * Data and metadata within tar files are aligned on 512-byte boundaries,
+ * to match the block size of the magnetic tapes they were originally
+ * intended for.
+ */
+#define TARFS_BSHIFT 9 /* log2 of the tar block size */
+#define TARFS_BLOCKSIZE (size_t)(1U << TARFS_BSHIFT) /* 512 bytes */
+#define TARFS_BLKOFF(l) ((l) % TARFS_BLOCKSIZE) /* offset of l within its block */
+#define TARFS_BLKNUM(l) ((l) >> TARFS_BSHIFT) /* number of the block containing l */
+#define TARFS_SZ2BLKS(sz) (((sz) + TARFS_BLOCKSIZE - 1) / TARFS_BLOCKSIZE) /* blocks needed for sz bytes, rounded up */
+
+/*
+ * Our preferred I/O size.
+ */
+extern unsigned int tarfs_ioshift; /* NOTE(review): presumably log2 of the preferred I/O size (vfs.tarfs.ioshift) -- confirm */
+#define TARFS_IOSHIFT_MIN TARFS_BSHIFT /* one tar block */
+#define TARFS_IOSHIFT_DEFAULT PAGE_SHIFT /* one page */
+#define TARFS_IOSHIFT_MAX PAGE_SHIFT /* one page */
+
+#define TARFS_ROOTINO ((ino_t)3) /* NOTE(review): presumably the root directory's inode number -- confirm */
+#define TARFS_ZIOINO ((ino_t)4) /* file id reported by tarfs_zgetattr() for the zio node */
+#define TARFS_MININO ((ino_t)65535) /* NOTE(review): presumably the lowest inode number given to archive entries -- confirm */
+
+#define TARFS_COOKIE_DOT 0 /* NOTE(review): presumably the directory cookie for "." -- confirm */
+#define TARFS_COOKIE_DOTDOT 1 /* NOTE(review): presumably the directory cookie for ".." -- confirm */
+#define TARFS_COOKIE_EOF OFF_MAX /* NOTE(review): presumably marks the end of a directory -- confirm */
+
+#define TARFS_ZIO_NAME ".tar" /* NOTE(review): presumably the name associated with the zio node -- confirm */
+#define TARFS_ZIO_NAMELEN (sizeof(TARFS_ZIO_NAME) - 1) /* length without the terminating NUL */
+
+extern struct vop_vector tarfs_vnodeops;
+
+/*
+ * Returns the tarfs mount data hanging off mp.  Both mp and its mnt_data
+ * are asserted to be non-NULL.
+ */
+static inline struct tarfs_mount *
+MP_TO_TARFS_MOUNT(struct mount *mp)
+{
+	struct tarfs_mount *data;
+
+	MPASS(mp != NULL && mp->mnt_data != NULL);
+	data = mp->mnt_data;
+	return (data);
+}
+
+/*
+ * Returns the tarfs node hanging off vp.  Both vp and its v_data are
+ * asserted to be non-NULL.
+ */
+static inline struct tarfs_node *
+VP_TO_TARFS_NODE(struct vnode *vp)
+{
+	struct tarfs_node *data;
+
+	MPASS(vp != NULL && vp->v_data != NULL);
+	data = vp->v_data;
+	return (data);
+}
+
+int tarfs_alloc_node(struct tarfs_mount *tmp, const char *name,
+ size_t namelen, enum vtype type, off_t off, size_t sz,
+ time_t mtime, uid_t uid, gid_t gid, mode_t mode,
+ unsigned int flags, const char *linkname, dev_t rdev,
+ struct tarfs_node *parent, struct tarfs_node **node);
+int tarfs_load_blockmap(struct tarfs_node *tnp, size_t realsize);
+void tarfs_dump_tree(struct tarfs_node *tnp);
+void tarfs_free_node(struct tarfs_node *tnp);
+struct tarfs_node *
+ tarfs_lookup_dir(struct tarfs_node *tnp, off_t cookie);
+struct tarfs_node *
+ tarfs_lookup_node(struct tarfs_node *tnp, struct tarfs_node *f,
+ struct componentname *cnp);
+void tarfs_print_node(struct tarfs_node *tnp);
+int tarfs_read_file(struct tarfs_node *tnp, size_t len, struct uio *uiop);
+
+int tarfs_io_init(struct tarfs_mount *tmp);
+int tarfs_io_fini(struct tarfs_mount *tmp);
+int tarfs_io_read(struct tarfs_mount *tmp, bool raw,
+ struct uio *uiop);
+ssize_t tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw,
+ void *buf, off_t off, size_t len);
+unsigned int
+ tarfs_strtofflags(const char *str, char **end);
+
+#endif /* _FS_TARFS_TARFS_H_ */
diff --git a/sys/fs/tarfs/tarfs_dbg.h b/sys/fs/tarfs/tarfs_dbg.h
new file mode 100644
index 000000000000..45d11d679719
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_dbg.h
@@ -0,0 +1,65 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _FS_TARFS_TARFS_DBG_H_
+#define _FS_TARFS_TARFS_DBG_H_
+
+#ifndef _KERNEL
+#error Should only be included by kernel
+#endif
+
+#ifdef TARFS_DEBUG
+extern int tarfs_debug; /* bitmask of TARFS_DEBUG_* categories tested by TARFS_DPF() */
+
+#define TARFS_DEBUG_ALLOC 0x01 /* memory allocations */
+#define TARFS_DEBUG_CHECKSUM 0x02 /* checksum calculations */
+#define TARFS_DEBUG_FS 0x04 /* filesystem operations (vfsops) */
+#define TARFS_DEBUG_LOOKUP 0x08 /* path lookups */
+#define TARFS_DEBUG_VNODE 0x10 /* file operations (vnops) */
+#define TARFS_DEBUG_IO 0x20 /* general I/O */
+#define TARFS_DEBUG_ZIO 0x40 /* decompression */
+#define TARFS_DEBUG_ZIDX 0x80 /* decompression index */
+#define TARFS_DEBUG_MAP 0x100 /* sparse file mapping */
+
+#define TARFS_DPF(category, fmt, ...) /* printf if TARFS_DEBUG_<category> is set in tarfs_debug */ \
+ do { \
+ if ((tarfs_debug & TARFS_DEBUG_##category) != 0) \
+ printf(fmt, ## __VA_ARGS__); \
+ } while (0)
+#define TARFS_DPF_IFF(category, cond, fmt, ...) /* as TARFS_DPF, but only when cond also holds */ \
+ do { \
+ if ((cond) \
+ && (tarfs_debug & TARFS_DEBUG_##category) != 0) \
+ printf(fmt, ## __VA_ARGS__); \
+ } while (0)
+#else /* !TARFS_DEBUG: both macros expand to nothing */
+#define TARFS_DPF(category, fmt, ...) /* arguments are not evaluated */
+#define TARFS_DPF_IFF(category, cond, fmt, ...) /* cond and arguments are not evaluated */
+#endif /* TARFS_DEBUG */
+
+#endif /* _FS_TARFS_TARFS_DBG_H_ */
diff --git a/sys/fs/tarfs/tarfs_io.c b/sys/fs/tarfs/tarfs_io.c
new file mode 100644
index 000000000000..b957ac11ff51
--- /dev/null
+++ b/sys/fs/tarfs/tarfs_io.c
@@ -0,0 +1,727 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause
+ *
+ * Copyright (c) 2013 Juniper Networks, Inc.
+ * Copyright (c) 2022-2023 Klara, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "opt_tarfs.h"
+#include "opt_zstdio.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/counter.h>
+#include <sys/bio.h>
+#include <sys/buf.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/uio.h>
+#include <sys/vnode.h>
+
+#ifdef ZSTDIO
+#define ZSTD_STATIC_LINKING_ONLY
+#include <contrib/zstd/lib/zstd.h>
+#endif
+
+#include <fs/tarfs/tarfs.h>
+#include <fs/tarfs/tarfs_dbg.h>
+
+#ifdef TARFS_DEBUG
+SYSCTL_NODE(_vfs_tarfs, OID_AUTO, zio, CTLFLAG_RD, 0,
+ "Tar filesystem decompression layer");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_inflated);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, inflated, CTLFLAG_RD,
+ &tarfs_zio_inflated, "Amount of compressed data inflated.");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_consumed);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, consumed, CTLFLAG_RD,
+ &tarfs_zio_consumed, "Amount of compressed data consumed.");
+COUNTER_U64_DEFINE_EARLY(tarfs_zio_bounced);
+SYSCTL_COUNTER_U64(_vfs_tarfs_zio, OID_AUTO, bounced, CTLFLAG_RD,
+ &tarfs_zio_bounced, "Amount of decompressed data bounced.");
+
+/*
+ * Handler for the zio "reset" sysctl.  A read always yields zero.  On a
+ * write the new value is copied in and then ignored; the write itself
+ * zeroes the inflated, consumed and bounced counters.  Returns 0 on
+ * success or the error from SYSCTL_OUT() / SYSCTL_IN().
+ */
+static int
+tarfs_sysctl_handle_zio_reset(SYSCTL_HANDLER_ARGS)
+{
+	unsigned int val = 0;
+	int error;
+
+	error = SYSCTL_OUT(req, &val, sizeof(val));
+	if (error != 0)
+		return (error);
+
+	/* nothing more to do for a plain read */
+	if (req->newptr == NULL)
+		return (0);
+
+	error = SYSCTL_IN(req, &val, sizeof(val));
+	if (error != 0)
+		return (error);
+
+	counter_u64_zero(tarfs_zio_inflated);
+	counter_u64_zero(tarfs_zio_consumed);
+	counter_u64_zero(tarfs_zio_bounced);
+	return (0);
+}
+
+SYSCTL_PROC(_vfs_tarfs_zio, OID_AUTO, reset,
+ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW,
+ NULL, 0, tarfs_sysctl_handle_zio_reset, "IU",
+ "Reset compression counters.");
+#endif
+
+MALLOC_DEFINE(M_TARFSZSTATE, "tarfs zstate", "tarfs decompression state");
+MALLOC_DEFINE(M_TARFSZBUF, "tarfs zbuf", "tarfs decompression buffers");
+
+#define XZ_MAGIC (uint8_t[]){ 0xfd, 0x37, 0x7a, 0x58, 0x5a }
+#define ZLIB_MAGIC (uint8_t[]){ 0x1f, 0x8b, 0x08 }
+#define ZSTD_MAGIC (uint8_t[]){ 0x28, 0xb5, 0x2f, 0xfd }
+
+#ifdef ZSTDIO
+struct tarfs_zstd {
+ ZSTD_DStream *zds;
+};
+#endif
+
+/* XXX review use of curthread / uio_td / td_cred */
+
+/*
+ * Performs the read described by uiop against the archive.  If raw is
+ * set, or the mount has no decompression vnode, the backing file is read
+ * directly while holding a shared vnode lock and a read range lock that
+ * covers the request.  Otherwise the request is handed to the
+ * decompression vnode, which is locked exclusively for the duration.
+ * Returns 0 on success and a positive errno value on failure.
+ */
+int
+tarfs_io_read(struct tarfs_mount *tmp, bool raw, struct uio *uiop)
+{
+	const off_t start = uiop->uio_offset;
+	const size_t count = uiop->uio_resid;
+	void *cookie;
+	int error;
+
+	if (!raw && tmp->znode != NULL) {
+		/* decompressed stream */
+		error = vn_lock(tmp->znode, LK_EXCLUSIVE);
+		if (error == 0) {
+			error = VOP_READ(tmp->znode, uiop,
+			    IO_DIRECT | IO_NODELOCKED,
+			    uiop->uio_td->td_ucred);
+			VOP_UNLOCK(tmp->znode);
+		}
+	} else {
+		/* original file */
+		cookie = vn_rangelock_rlock(tmp->vp, start, start + count);
+		error = vn_lock(tmp->vp, LK_SHARED);
+		if (error == 0) {
+			error = VOP_READ(tmp->vp, uiop,
+			    IO_DIRECT | IO_NODELOCKED,
+			    uiop->uio_td->td_ucred);
+			VOP_UNLOCK(tmp->vp);
+		}
+		vn_rangelock_unlock(tmp->vp, cookie);
+	}
+	TARFS_DPF(IO, "%s(%zu, %zu) = %d (resid %zd)\n", __func__,
+	    (size_t)start, count, error, uiop->uio_resid);
+	return (error);
+}
+
+/*
+ * Convenience wrapper around tarfs_io_read() for callers that hold a
+ * kernel buffer rather than a uio.  Reads up to len bytes starting at
+ * offset off into buf on behalf of curthread; raw is passed through
+ * unchanged.  Returns the number of bytes read (0 at end of file, or when
+ * len is 0), or a negative errno value on failure.
+ */
+ssize_t
+tarfs_io_read_buf(struct tarfs_mount *tmp, bool raw,
+    void *buf, off_t off, size_t len)
+{
+	struct iovec iov;
+	struct uio uio;
+	ssize_t nread;
+	int error;
+
+	if (len == 0) {
+		TARFS_DPF(IO, "%s(%zu, %zu) null\n", __func__,
+		    (size_t)off, len);
+		return (0);
+	}
+
+	/* describe buf as a single-segment kernel-space read request */
+	iov.iov_base = buf;
+	iov.iov_len = len;
+	uio.uio_iov = &iov;
+	uio.uio_iovcnt = 1;
+	uio.uio_offset = off;
+	uio.uio_resid = len;
+	uio.uio_segflg = UIO_SYSSPACE;
+	uio.uio_rw = UIO_READ;
+	uio.uio_td = curthread;
+
+	error = tarfs_io_read(tmp, raw, &uio);
+	if (error != 0) {
+		TARFS_DPF(IO, "%s(%zu, %zu) error %d\n", __func__,
+		    (size_t)off, len, error);
+		return (-error);
+	}
+
+	/* len is known to be non-zero here, so no progress means EOF */
+	nread = len - uio.uio_resid;
+	if (nread == 0) {
+		TARFS_DPF(IO, "%s(%zu, %zu) eof\n", __func__,
+		    (size_t)off, len);
+	} else {
+		TARFS_DPF(IO, "%s(%zu, %zu) read %zd | %*D\n", __func__,
+		    (size_t)off, len, nread,
+		    (int)(nread > 8 ? 8 : nread), (uint8_t *)buf, " ");
+	}
+	return (nread);
+}
+
+#ifdef ZSTDIO
+/*
+ * Allocation hooks in the form expected by ZSTD_customMem.  Memory is
+ * taken from and returned to the M_TARFSZSTATE malloc type; the opaque
+ * cookie is not used.
+ */
+static void *
+tarfs_zstate_alloc(void *opaque, size_t size)
+{
+	void *mem;
+
+	(void)opaque;
+	mem = malloc(size, M_TARFSZSTATE, M_WAITOK);
+	return (mem);
+}
+
+static void
+tarfs_zstate_free(void *opaque, void *address)
+{
+
+	(void)opaque;
+	free(address, M_TARFSZSTATE);
+}
+
+static ZSTD_customMem tarfs_zstd_mem = {
+	.customAlloc = tarfs_zstate_alloc,
+	.customFree = tarfs_zstate_free,
+	.opaque = NULL,
+};
+#endif /* ZSTDIO */
+
+/*
+ * Updates the decompression frame index, recording the current input and
+ * output offsets in a new index entry, and growing the index if
+ * necessary.
+ *
+ * curidx is advanced on every call.  An entry is written only when curidx
+ * moves past the entries recorded so far; otherwise the entry already at
+ * curidx is expected to hold (i, o), which the trailing assertions check.
+ *
+ * NOTE(review): growth is by doubling szidx, which assumes szidx is
+ * non-zero on entry; its initial value is set outside this function.
+ */
+static void
+tarfs_zio_update_index(struct tarfs_zio *zio, off_t i, off_t o)
+{
+
+ if (++zio->curidx >= zio->nidx) { /* stepped past the last recorded entry */
+ if (++zio->nidx > zio->szidx) { /* new entry does not fit: grow */
+ zio->szidx *= 2;
+ zio->idx = realloc(zio->idx,
+ zio->szidx * sizeof(*zio->idx),
+ M_TARFSZSTATE, M_ZERO | M_WAITOK); /* M_WAITOK: result is not checked */
+ TARFS_DPF(ALLOC, "%s: resized zio index\n", __func__);
+ }
+ zio->idx[zio->curidx].i = i;
+ zio->idx[zio->curidx].o = o;
+ TARFS_DPF(ZIDX, "%s: index %u = i %zu o %zu\n", __func__,
+ zio->curidx, (size_t)zio->idx[zio->curidx].i,
+ (size_t)zio->idx[zio->curidx].o);
+ }
+ MPASS(zio->idx[zio->curidx].i == i); /* holds for new and pre-existing entries alike */
+ MPASS(zio->idx[zio->curidx].o == o);
+}
+
+/*
+ * VOP_ACCESS for zio node.
+ *
+ * Only a request for exactly VREAD is considered; it is answered by
+ * asking the underlying tar file's vnode, under a shared lock.  Any other
+ * access mode fails with EPERM.
+ */
+static int
+tarfs_zaccess(struct vop_access_args *ap)
+{
+	struct tarfs_zio *zio = ap->a_vp->v_data;
+	struct tarfs_mount *tmp = zio->tmp;
+	accmode_t accmode = ap->a_accmode;
+	int error;
+
+	if (accmode != VREAD) {
+		error = EPERM;
+	} else {
+		error = vn_lock(tmp->vp, LK_SHARED);
+		if (error == 0) {
+			error = VOP_ACCESS(tmp->vp, accmode, ap->a_cred,
+			    ap->a_td);
+			VOP_UNLOCK(tmp->vp);
+		}
+	}
+	TARFS_DPF(ZIO, "%s(%d) = %d\n", __func__, accmode, error);
+	return (error);
+}
+
+/*
+ * VOP_GETATTR for zio node.
+ *
+ * Ownership, mode, timestamps and byte count are copied from the
+ * underlying tar file, which is queried under a shared lock.  The type,
+ * link count, file id, filesystem id, block size, size and birth time are
+ * filled in locally; the size is the output offset of the last entry in
+ * the decompression index.
+ */
+static int
+tarfs_zgetattr(struct vop_getattr_args *ap)
+{
+	struct vnode *zvp = ap->a_vp;
+	struct vattr *vap = ap->a_vap;
+	struct tarfs_zio *zio = zvp->v_data;
+	struct tarfs_mount *tmp = zio->tmp;
+	struct vattr tarva;
+	int error;
+
+	VATTR_NULL(vap);
+	error = vn_lock(tmp->vp, LK_SHARED);
+	if (error != 0)
+		goto out;
+	error = VOP_GETATTR(tmp->vp, &tarva, ap->a_cred);
+	VOP_UNLOCK(tmp->vp);
+	if (error != 0)
+		goto out;
+
+	/* attributes specific to the zio node */
+	vap->va_type = VREG;
+	vap->va_nlink = 1;
+	vap->va_fsid = zvp->v_mount->mnt_stat.f_fsid.val[0];
+	vap->va_fileid = TARFS_ZIOINO;
+	vap->va_size = zio->idx[zio->nidx - 1].o;
+	vap->va_blocksize = zvp->v_mount->mnt_stat.f_iosize;
+	vap->va_birthtime = tmp->root->birthtime;
+
+	/* attributes taken over from the tar file */
+	vap->va_mode = tarva.va_mode;
+	vap->va_uid = tarva.va_uid;
+	vap->va_gid = tarva.va_gid;
+	vap->va_atime = tarva.va_atime;
+	vap->va_ctime = tarva.va_ctime;
+	vap->va_mtime = tarva.va_mtime;
+	vap->va_bytes = tarva.va_bytes;
+out:
+	TARFS_DPF(ZIO, "%s() = %d\n", __func__, error);
+	return (error);
+}
+
+#ifdef ZSTDIO
+/*
+ * VOP_READ for zio node, zstd edition.
+ */
+static int
+tarfs_zread_zstd(struct tarfs_zio *zio, struct uio *uiop)
+{
+ void *ibuf = NULL, *obuf = NULL, *rl = NULL;
+ struct uio auio;
+ struct iovec aiov;
+ struct tarfs_mount *tmp = zio->tmp;
+ struct tarfs_zstd *zstd = zio->zstd;
+ struct thread *td = curthread;
+ ZSTD_inBuffer zib;
+ ZSTD_outBuffer zob;
+ off_t zsize;
+ off_t ipos, opos;
+ size_t ilen, olen;
+ size_t zerror;
+ off_t off = uiop->uio_offset;
+ size_t len = uiop->uio_resid;
+ size_t resid = uiop->uio_resid;
+ size_t bsize;
+ int error;
+ bool reset = false;
+
+ /* do we have to rewind? */
+ if (off < zio->opos) {
+ while (zio->curidx > 0 && off < zio->idx[zio->curidx].o)
+ zio->curidx--;
+ reset = true;
+ }
+ /* advance to the nearest index entry */
+ if (off > zio->opos) {
+ // XXX maybe do a binary search instead
+ while (zio->curidx < zio->nidx - 1 &&
+ off >= zio->idx[zio->curidx + 1].o) {
+ zio->curidx++;
+ reset = true;
+ }
+ }
+ /* reset the decompression stream if needed */
+ if (reset) {
+ zio->ipos = zio->idx[zio->curidx].i;
+ zio->opos = zio->idx[zio->curidx].o;
+ ZSTD_resetDStream(zstd->zds);
+ TARFS_DPF(ZIDX, "%s: skipping to index %u = i %zu o %zu\n", __func__,
+ zio->curidx, (size_t)zio->ipos, (size_t)zio->opos);
+ } else {
+ TARFS_DPF(ZIDX, "%s: continuing at i %zu o %zu\n", __func__,
+ (size_t)zio->ipos, (size_t)zio->opos);
+ }
+
+ /*
+ * Set up a temporary buffer for compressed data. Use the size
+ * recommended by the zstd library; this is usually 128 kB, but
+ * just in case, make sure it's a multiple of the page size and no
+ * larger than MAXBSIZE.
+ */
+ bsize = roundup(ZSTD_CStreamOutSize(), PAGE_SIZE);
+ if (bsize > MAXBSIZE)
+ bsize = MAXBSIZE;
+ ibuf = malloc(bsize, M_TEMP, M_WAITOK);
+ zib.src = NULL;
+ zib.size = 0;
+ zib.pos = 0;
+
+ /*
+ * Set up the decompression buffer. If the target is not in
+ * kernel space, we will have to set up a bounce buffer.
+ *
+ * TODO: to avoid using a bounce buffer, map destination pages
+ * using vm_fault_quick_hold_pages().
+ */
+ MPASS(zio->opos <= off);
+ MPASS(uiop->uio_iovcnt == 1);
+ MPASS(uiop->uio_iov->iov_len >= len);
+ if (uiop->uio_segflg == UIO_SYSSPACE) {
+ zob.dst = uiop->uio_iov->iov_base;
+ } else {
+ TARFS_DPF(ALLOC, "%s: allocating %zu-byte bounce buffer\n",
+ __func__, len);
+ zob.dst = obuf = malloc(len, M_TEMP, M_WAITOK);
+ }
+ zob.size = len;
+ zob.pos = 0;
+
+ /* lock tarball */
+ rl = vn_rangelock_rlock(tmp->vp, zio->ipos, OFF_MAX);
+ error = vn_lock(tmp->vp, LK_SHARED);
+ if (error != 0) {
+ goto fail_unlocked;
+ }
+ /* check size */
+ error = vn_getsize_locked(tmp->vp, &zsize, td->td_ucred);
+ if (error != 0) {
+ goto fail;
+ }
+ if (zio->ipos >= zsize) {
+ /* beyond EOF */
+ goto fail;
+ }
+
+ while (resid > 0) {
+ if (zib.pos == zib.size) {
+ /* request data from the underlying file */
+ aiov.iov_base = ibuf;
+ aiov.iov_len = bsize;
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = zio->ipos;
+ auio.uio_segflg = UIO_SYSSPACE;
+ auio.uio_rw = UIO_READ;
+ auio.uio_resid = aiov.iov_len;
+ auio.uio_td = td;
+ error = VOP_READ(tmp->vp, &auio,
+ IO_DIRECT | IO_NODELOCKED,
+ td->td_ucred);
+ if (error != 0)
+ goto fail;
+ TARFS_DPF(ZIO, "%s: req %zu+%zu got %zu+%zu\n", __func__,
+ (size_t)zio->ipos, bsize,
+ (size_t)zio->ipos, bsize - auio.uio_resid);
+ zib.src = ibuf;
+ zib.size = bsize - auio.uio_resid;
+ zib.pos = 0;
+ }
+ MPASS(zib.pos <= zib.size);
+ if (zib.pos == zib.size) {
+ TARFS_DPF(ZIO, "%s: end of file after i %zu o %zu\n", __func__,
+ (size_t)zio->ipos, (size_t)zio->opos);
+ goto fail;
+ }
+ if (zio->opos < off) {
+ /* to be discarded */
+ zob.size = min(off - zio->opos, len);
+ zob.pos = 0;
*** 3111 LINES SKIPPED ***