bin/78562: Add numerical sorting option to join(1)

Dmitrij Tejblum tejblum at yandex-team.ru
Mon Mar 7 21:20:03 GMT 2005


>Number:         78562
>Category:       bin
>Synopsis:       Add numerical sorting option to join(1)
>Confidential:   no
>Severity:       non-critical
>Priority:       low
>Responsible:    freebsd-bugs
>State:          open
>Quarter:        
>Keywords:       
>Date-Required:
>Class:          change-request
>Submitter-Id:   current-users
>Arrival-Date:   Mon Mar 07 21:20:02 GMT 2005
>Closed-Date:
>Last-Modified:
>Originator:     Dmitrij Tejblum
>Release:        FreeBSD 5.4-PRERELEASE i386
>Organization:
>Environment:

>Description:
join(1) requires its input files to be lexicographically sorted. Thus, if
someone has numerically sorted files, they must re-sort them in lexicographic
order first, which is inconvenient.

Linux (GNU) join(1) has an -n option to handle numerically sorted files.

>How-To-Repeat:

>Fix:

--- join.c	Mon Mar  7 23:14:49 2005
+++ join.c	Mon Mar  7 23:52:19 2005
@@ -105,7 +105,8 @@
 static wchar_t default_tabchar[] = L" \t";
 wchar_t *tabchar = default_tabchar;/* delimiter characters (-t) */
 
-int  cmp(LINE *, u_long, LINE *, u_long);
+int  cmp(LINE *, u_long, LINE *, u_long, int);
+int  cmpnum(long long, long long);
 void fieldarg(char *);
 void joinlines(INPUT *, INPUT *);
 int  mbscoll(const char *, const char *);
@@ -114,7 +115,7 @@
 void outfield(LINE *, u_long, int);
 void outoneline(INPUT *, LINE *);
 void outtwoline(INPUT *, LINE *, INPUT *, LINE *);
-void slurp(INPUT *);
+void slurp(INPUT *, int);
 wchar_t *towcs(const char *);
 void usage(void);
 
@@ -122,7 +123,7 @@
 main(int argc, char *argv[])
 {
 	INPUT *F1, *F2;
-	int aflag, ch, cval, vflag;
+	int aflag, ch, cval, nflag, vflag;
 	char *end;
 
 	setlocale(LC_ALL, "");
@@ -130,9 +131,9 @@
 	F1 = &input1;
 	F2 = &input2;
 
-	aflag = vflag = 0;
+	aflag = nflag = vflag = 0;
 	obsolete(argv);
-	while ((ch = getopt(argc, argv, "\01a:e:j:1:2:o:t:v:")) != -1) {
+	while ((ch = getopt(argc, argv, "\01na:e:j:1:2:o:t:v:")) != -1) {
 		switch (ch) {
 		case '\01':		/* See comment in obsolete(). */
 			aflag = 1;
@@ -180,6 +181,9 @@
 			--F1->joinf;
 			--F2->joinf;
 			break;
+		case 'n':
+			nflag = 1;
+			break;
 		case 'o':
 			fieldarg(optarg);
 			break;
@@ -234,26 +238,26 @@
 	if (F1->fp == stdin && F2->fp == stdin)
 		errx(1, "only one input file may be stdin");
 
-	slurp(F1);
-	slurp(F2);
+	slurp(F1, nflag);
+	slurp(F2, nflag);
 	while (F1->setcnt && F2->setcnt) {
-		cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf);
+		cval = cmp(F1->set, F1->joinf, F2->set, F2->joinf, nflag);
 		if (cval == 0) {
 			/* Oh joy, oh rapture, oh beauty divine! */
 			if (joinout)
 				joinlines(F1, F2);
-			slurp(F1);
-			slurp(F2);
+			slurp(F1, nflag);
+			slurp(F2, nflag);
 		} else if (cval < 0) {
 			/* File 1 takes the lead... */
 			if (F1->unpair)
 				joinlines(F1, NULL);
-			slurp(F1);
+			slurp(F1, nflag);
 		} else {
 			/* File 2 takes the lead... */
 			if (F2->unpair)
 				joinlines(F2, NULL);
-			slurp(F2);
+			slurp(F2, nflag);
 		}
 	}
 
@@ -264,18 +268,18 @@
 	if (F1->unpair)
 		while (F1->setcnt) {
 			joinlines(F1, NULL);
-			slurp(F1);
+			slurp(F1, nflag);
 		}
 	if (F2->unpair)
 		while (F2->setcnt) {
 			joinlines(F2, NULL);
-			slurp(F2);
+			slurp(F2, nflag);
 		}
 	exit(0);
 }
 
 void
-slurp(INPUT *F)
+slurp(INPUT *F, int nflag)
 {
 	LINE *lp, *lastlp, tmp;
 	size_t len;
@@ -355,7 +359,7 @@
 		}
 
 		/* See if the join field value has changed. */
-		if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf)) {
+		if (lastlp != NULL && cmp(lp, F->joinf, lastlp, F->joinf, nflag)) {
 			F->pushbool = 1;
 			F->pushback = F->setcnt;
 			break;
@@ -393,13 +397,25 @@
 }
 
 int
-cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2)
+cmpnum(long long a, long long b)
+{
+	if (a < b)
+		return (-1);
+	else if (a == b)
+		return 0;
+	else
+		return 1; 
+}
+
+int
+cmp(LINE *lp1, u_long fieldno1, LINE *lp2, u_long fieldno2, int nflag)
 {
 	if (lp1->fieldcnt <= fieldno1)
 		return (lp2->fieldcnt <= fieldno2 ? 0 : 1);
 	if (lp2->fieldcnt <= fieldno2)
 		return (-1);
-	return (mbscoll(lp1->fields[fieldno1], lp2->fields[fieldno2]));
+	return (nflag ? cmpnum(atoll(lp1->fields[fieldno1]), atoll(lp2->fields[fieldno2])):
+	    mbscoll(lp1->fields[fieldno1], lp2->fields[fieldno2]));
 }
 
 int
@@ -664,6 +680,6 @@
 	(void)fprintf(stderr, "%s %s\n%s\n",
 	    "usage: join [-a fileno | -v fileno ] [-e string] [-1 field]",
 	    "[-2 field]",
-		"            [-o list] [-t char] file1 file2");
+		"            [-o list] [-n] [-t char] file1 file2");
 	exit(1);
 }
--- join.1	Mon Mar  7 23:39:17 2005
+++ join.1	Tue Mar  8 00:00:27 2005
@@ -50,6 +50,7 @@
 .Op Fl o Ar list
 .Bk -words
 .Ek
+.Op Fl n
 .Op Fl t Ar char
 .Op Fl \&1 Ar field
 .Op Fl \&2 Ar field
@@ -93,6 +94,8 @@
 .It Fl e Ar string
 Replace empty output fields with
 .Ar string .
+.It Fl n
+Assume numerically sorted input files.
 .It Fl o Ar list
 The
 .Fl o
@@ -158,6 +161,13 @@
 without the
 .Fl b
 option.
+When the option
+.Fl n
+is used, the files to be joined should be ordered as with
+.Xr sort 1
+with
+.Fl n
+option.
 .Pp
 If one of the arguments
 .Ar file1
@@ -211,6 +221,11 @@
 .Nm
 command conforms to
 .St -p1003.1-2001 .
+The
+.Fl n
+option is a non-standard
+.Fx
+extension.
 .Sh SEE ALSO
 .Xr awk 1 ,
 .Xr comm 1 ,

>Release-Note:
>Audit-Trail:
>Unformatted:


More information about the freebsd-bugs mailing list