svn commit: r346175 - head/usr.bin/sort

Conrad Meyer cem at FreeBSD.org
Sat Apr 13 04:42:19 UTC 2019


Author: cem
Date: Sat Apr 13 04:42:17 2019
New Revision: 346175
URL: https://svnweb.freebsd.org/changeset/base/346175

Log:
  sort(1): Memoize MD5 computation to reduce repeated computation
  
  Experimentally, reduces sort -R time of a 148160 line corpus from about
  3.15s to about 0.93s on this particular system.
  
  There's probably room for improvement using some digest other than md5, but
  I don't want to look at sort(1) anymore.  Some discussion of other possible
  improvements in the Test Plan section of the Differential.
  
  PR:		230792
  Reviewed by:	jhb (earlier version)
  Differential Revision:	https://reviews.freebsd.org/D19885

Modified:
  head/usr.bin/sort/coll.c
  head/usr.bin/sort/coll.h
  head/usr.bin/sort/sort.c

Modified: head/usr.bin/sort/coll.c
==============================================================================
--- head/usr.bin/sort/coll.c	Sat Apr 13 04:03:18 2019	(r346174)
+++ head/usr.bin/sort/coll.c	Sat Apr 13 04:42:17 2019	(r346175)
@@ -981,6 +981,15 @@ hnumcoll(struct key_value *kv1, struct key_value *kv2,
 	return (numcoll_impl(kv1, kv2, offset, true));
 }
 
+/* Use hint space to memoize md5 computations, at least. */
+static void
+randomcoll_init_hint(struct key_value *kv, void *hash)
+{
+
+	memcpy(kv->hint->v.Rh.cached, hash, sizeof(kv->hint->v.Rh.cached));
+	kv->hint->status = HS_INITIALIZED;
+}
+
 /*
  * Implements random sort (-R).
  */
@@ -991,6 +1000,7 @@ randomcoll(struct key_value *kv1, struct key_value *kv
 	struct bwstring *s1, *s2;
 	MD5_CTX ctx1, ctx2;
 	unsigned char hash1[MD5_DIGEST_LENGTH], hash2[MD5_DIGEST_LENGTH];
+	int cmp;
 
 	s1 = kv1->k;
 	s2 = kv2->k;
@@ -1003,6 +1013,14 @@ randomcoll(struct key_value *kv1, struct key_value *kv
 	if (s1 == s2)
 		return (0);
 
+	if (kv1->hint->status == HS_INITIALIZED &&
+	    kv2->hint->status == HS_INITIALIZED) {
+		cmp = memcmp(kv1->hint->v.Rh.cached,
+		    kv2->hint->v.Rh.cached, sizeof(kv1->hint->v.Rh.cached));
+		if (cmp != 0)
+			return (cmp);
+	}
+
 	memcpy(&ctx1, &md5_ctx, sizeof(MD5_CTX));
 	memcpy(&ctx2, &md5_ctx, sizeof(MD5_CTX));
 
@@ -1011,6 +1029,11 @@ randomcoll(struct key_value *kv1, struct key_value *kv
 
 	MD5Final(hash1, &ctx1);
 	MD5Final(hash2, &ctx2);
+
+	if (kv1->hint->status == HS_UNINITIALIZED)
+		randomcoll_init_hint(kv1, hash1);
+	if (kv2->hint->status == HS_UNINITIALIZED)
+		randomcoll_init_hint(kv2, hash2);
 
 	return (memcmp(hash1, hash2, sizeof(hash1)));
 }

Modified: head/usr.bin/sort/coll.h
==============================================================================
--- head/usr.bin/sort/coll.h	Sat Apr 13 04:03:18 2019	(r346174)
+++ head/usr.bin/sort/coll.h	Sat Apr 13 04:42:17 2019	(r346175)
@@ -65,6 +65,17 @@ struct M_hint
 };
 
 /*
+ * Sort hint data for -R
+ *
+ * This stores the first 12 bytes of the digest rather than the full output to
+ * avoid increasing the size of the 'key_hint' object via the 'v' union.
+ */
+struct R_hint
+{
+	unsigned char		 cached[12];
+};
+
+/*
  * Status of a sort hint object
  */
 typedef enum
@@ -83,6 +94,7 @@ struct key_hint
 		struct n_hint		nh;
 		struct g_hint		gh;
 		struct M_hint		Mh;
+		struct R_hint		Rh;
 	}			v;
 };
 

Modified: head/usr.bin/sort/sort.c
==============================================================================
--- head/usr.bin/sort/sort.c	Sat Apr 13 04:03:18 2019	(r346174)
+++ head/usr.bin/sort/sort.c	Sat Apr 13 04:42:17 2019	(r346175)
@@ -583,6 +583,7 @@ set_sort_modifier(struct sort_mods *sm, int c)
 		break;
 	case 'R':
 		sm->Rflag = true;
+		need_hint = true;
 		need_random = true;
 		break;
 	case 'M':


More information about the svn-src-head mailing list