Serious 'tr' bug, patch for review included
Andrey Chernov
ache at nagual.pp.ru
Thu Jul 31 17:44:13 PDT 2003
This patch addresses two problems.
The first one is relatively minor: according to our own manpage, the upper and
lower classes must be sorted, but currently they are not.
The second one is serious:
tr '[:lower:]' '[:upper:]'
(and vice versa) currently works only if the upper and lower classes
have exactly the same number of elements. When that is not true, as in
many ISO8859-x locales, which have a larger number of lowercase letters,
tr may do nasty things. The patch is complex because the whole conversion
string needs to be processed each time an l->u or u->l conversion occurs,
not a single character at a time as in the previous variant.
See this page
http://www.opengroup.org/onlinepubs/007908799/xcu/tr.html
for a detailed description of the desired tr behaviour in such cases.
Please test this patch on your system & locale and report any strange
things to me.
diff -u ./extern.h /usr/src/usr.bin/tr/extern.h
--- ./extern.h Fri Jun 14 19:56:52 2002
+++ /usr/src/usr.bin/tr/extern.h Fri Aug 1 04:19:36 2003
@@ -40,7 +40,8 @@
typedef struct {
enum { STRING1, STRING2 } which;
- enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
+ enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE,
+ SET, SET_UPPER, SET_LOWER } state;
int cnt; /* character count */
int lastch; /* last character */
int equiv[NCHARS]; /* equivalence set */
@@ -49,3 +50,5 @@
} STR;
int next(STR *);
+int charcoll(const void *, const void *);
+
diff -u ./str.c /usr/src/usr.bin/tr/str.c
--- ./str.c Fri Jul 5 13:28:13 2002
+++ /usr/src/usr.bin/tr/str.c Fri Aug 1 04:22:11 2003
@@ -106,6 +106,8 @@
}
return (1);
case SET:
+ case SET_UPPER:
+ case SET_LOWER:
if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
s->state = NORMAL;
return (next(s));
@@ -194,7 +196,7 @@
{
int cnt, (*func)(int);
CLASS *cp, tmp;
- int *p;
+ int *p, n;
tmp.name = s->str;
if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
@@ -208,10 +210,18 @@
if ((func)(cnt))
*p++ = cnt;
*p = OOBCH;
+ n = p - cp->set;
s->cnt = 0;
- s->state = SET;
s->set = cp->set;
+ if (strcmp(s->str, "upper") == 0)
+ s->state = SET_UPPER;
+ else if (strcmp(s->str, "lower") == 0) {
+ s->state = SET_LOWER;
+ } else
+ s->state = SET;
+ if ((s->state == SET_LOWER || s->state == SET_UPPER) && n > 1)
+ mergesort(s->set, n, sizeof(*(s->set)), charcoll);
}
static int
diff -u ./tr.c /usr/src/usr.bin/tr/tr.c
--- ./tr.c Thu Sep 5 03:29:07 2002
+++ /usr/src/usr.bin/tr/tr.c Fri Aug 1 04:32:01 2003
@@ -101,8 +101,9 @@
STR s1 = { STRING1, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
STR s2 = { STRING2, NORMAL, 0, OOBCH, { 0, OOBCH }, NULL, NULL };
-static int charcoll(const void *, const void *);
static void setup(int *, char *, STR *, int, int);
+static void process_upper(int);
+static void process_lower(int);
static void usage(void);
int
@@ -110,7 +111,7 @@
{
static int collorder[NCHARS], tmpmap[NCHARS];
int ch, cnt, lastch, *p;
- int Cflag, cflag, dflag, sflag, isstring2;
+ int Cflag, cflag, dflag, sflag, isstring2, do_upper, do_lower;
(void)setlocale(LC_ALL, "");
@@ -224,19 +225,67 @@
if (!next(&s2))
errx(1, "empty string2");
- ch = s2.lastch;
+ do_upper = do_lower = 0;
/* If string2 runs out of characters, use the last one specified. */
- if (sflag)
- while (next(&s1)) {
- string1[s1.lastch] = ch = s2.lastch;
- string2[ch] = 1;
- (void)next(&s2);
- }
- else
- while (next(&s1)) {
- string1[s1.lastch] = ch = s2.lastch;
- (void)next(&s2);
+ while (next(&s1)) {
+ if (s1.state == SET_LOWER &&
+ s2.state == SET_UPPER) {
+ if (do_lower) {
+ process_lower(sflag);
+ do_lower = 0;
+ }
+ do_upper = 1;
+ } else if (s1.state == SET_UPPER &&
+ s2.state == SET_LOWER) {
+ if (do_upper) {
+ process_upper(sflag);
+ do_upper = 0;
+ }
+ do_lower = 1;
+ } else {
+ if (do_lower) {
+ /* Skip until aligned */
+ if (s1.state == SET_UPPER) {
+ do {
+ if (!next(&s1))
+ goto endloop;
+ } while (s1.state == SET_UPPER);
+ } else if (s2.state == SET_LOWER) {
+ do {
+ if (!next(&s2))
+ break;
+ } while (s2.state == SET_LOWER);
+ }
+ process_lower(sflag);
+ do_lower = 0;
+ } else if (do_upper) {
+ /* Skip until aligned */
+ if (s1.state == SET_LOWER) {
+ do {
+ if (!next(&s1))
+ goto endloop;
+ } while (s1.state == SET_LOWER);
+ } else if (s2.state == SET_UPPER) {
+ do {
+ if (!next(&s2))
+ break;
+ } while (s2.state == SET_UPPER);
+ }
+ process_upper(sflag);
+ do_upper = 0;
+ }
+ string1[s1.lastch] = s2.lastch;
+ if (sflag)
+ string2[s2.lastch] = 1;
}
+ (void)next(&s2);
+ }
+endloop:
+ if (do_lower)
+ process_lower(sflag);
+ else if (do_upper)
+ process_upper(sflag);
+ /* End of upper & lower special processing */
if (cflag || Cflag) {
s2.str = argv[1];
@@ -294,15 +343,55 @@
string[cnt] = !string[cnt] && ISCHAR(cnt);
}
-static int
+int
charcoll(const void *a, const void *b)
{
- char sa[2], sb[2];
+ static char sa[2], sb[2];
sa[0] = *(const int *)a;
sb[0] = *(const int *)b;
- sa[1] = sb[1] = '\0';
return (strcoll(sa, sb));
+}
+
+
+/*
+ * For -s result will contain only those characters defined
+ * as the second characters in each of the toupper or tolower
+ * pairs.
+ */
+
+static void
+process_upper(int sflag)
+{
+ int cnt, ch;
+
+ for (cnt = 0; cnt < NCHARS; cnt++) {
+ ch = string1[cnt];
+ if (ch == OOBCH) /* [Cc]flag */
+ ch = cnt;
+ if (islower(ch)) {
+ string1[cnt] = ch = toupper(ch);
+ if (sflag && isupper(ch))
+ string2[ch] = 1;
+ }
+ }
+}
+
+static void
+process_lower(int sflag)
+{
+ int cnt, ch;
+
+ for (cnt = 0; cnt < NCHARS; cnt++) {
+ ch = string1[cnt];
+ if (ch == OOBCH) /* [Cc]flag */
+ ch = cnt;
+ if (isupper(ch)) {
+ string1[cnt] = ch = tolower(ch);
+ if (sflag && islower(ch))
+ string2[ch] = 1;
+ }
+ }
}
static void
More information about the freebsd-current
mailing list