aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--coreutils/tr.c235
1 files changed, 131 insertions, 104 deletions
diff --git a/coreutils/tr.c b/coreutils/tr.c
index cd31b8550..bf6fa2996 100644
--- a/coreutils/tr.c
+++ b/coreutils/tr.c
@@ -16,24 +16,31 @@
16 * Licensed under GPLv2 or later, see file LICENSE in this tarball for details. 16 * Licensed under GPLv2 or later, see file LICENSE in this tarball for details.
17 */ 17 */
18/* http://www.opengroup.org/onlinepubs/009695399/utilities/tr.html 18/* http://www.opengroup.org/onlinepubs/009695399/utilities/tr.html
19 * TODO: xdigit, graph, print 19 * TODO: graph, print
20 */ 20 */
21#include "libbb.h" 21#include "libbb.h"
22 22
23#define ASCII 0377 23enum {
24 ASCII = 256,
25 /* string buffer needs to be at least as big as the whole "alphabet".
26 * BUFSIZ == ASCII is ok, but we will realloc in expand
27 * even for smallest patterns, let's avoid that by using *2:
28 */
29 TR_BUFSIZ = (BUFSIZ > ASCII*2) ? BUFSIZ : ASCII*2,
30};
24 31
25static void map(char *pvector, 32static void map(char *pvector,
26 unsigned char *string1, unsigned string1_len, 33 char *string1, unsigned string1_len,
27 unsigned char *string2, unsigned string2_len) 34 char *string2, unsigned string2_len)
28{ 35{
29 char last = '0'; 36 char last = '0';
30 unsigned i, j; 37 unsigned i, j;
31 38
32 for (j = 0, i = 0; i < string1_len; i++) { 39 for (j = 0, i = 0; i < string1_len; i++) {
33 if (string2_len <= j) 40 if (string2_len <= j)
34 pvector[string1[i]] = last; 41 pvector[(unsigned char)(string1[i])] = last;
35 else 42 else
36 pvector[string1[i]] = last = string2[j++]; 43 pvector[(unsigned char)(string1[i])] = last = string2[j++];
37 } 44 }
38} 45}
39 46
@@ -43,139 +50,155 @@ static void map(char *pvector,
43 * Character classes, e.g. [:upper:] ==> A...Z 50 * Character classes, e.g. [:upper:] ==> A...Z
44 * Equiv classes, e.g. [=A=] ==> A (hmmmmmmm?) 51 * Equiv classes, e.g. [=A=] ==> A (hmmmmmmm?)
45 */ 52 */
46static unsigned expand(const char *arg, char *buffer) 53static unsigned expand(const char *arg, char **buffer_p)
47{ 54{
48 char *buffer_start = buffer; 55 char *buffer = *buffer_p;
56 unsigned pos = 0;
57 unsigned size = TR_BUFSIZ;
49 unsigned i; /* can't be unsigned char: must be able to hold 256 */ 58 unsigned i; /* can't be unsigned char: must be able to hold 256 */
50 unsigned char ac; 59 unsigned char ac;
51 60
52 while (*arg) { 61 while (*arg) {
62 if (pos + ASCII > size) {
63 size += ASCII;
64 *buffer_p = buffer = xrealloc(buffer, size);
65 }
53 if (*arg == '\\') { 66 if (*arg == '\\') {
54 arg++; 67 arg++;
55 *buffer++ = bb_process_escape_sequence(&arg); 68 buffer[pos++] = bb_process_escape_sequence(&arg);
56 continue; 69 continue;
57 } 70 }
58 if (arg[1] == '-') { /* "0-9..." */ 71 if (arg[1] == '-') { /* "0-9..." */
59 ac = arg[2]; 72 ac = arg[2];
60 if (ac == '\0') { /* "0-": copy verbatim */ 73 if (ac == '\0') { /* "0-": copy verbatim */
61 *buffer++ = *arg++; /* copy '0' */ 74 buffer[pos++] = *arg++; /* copy '0' */
62 continue; /* next iter will copy '-' and stop */ 75 continue; /* next iter will copy '-' and stop */
63 } 76 }
64 i = *arg; 77 i = (unsigned char) *arg;
65 while (i <= ac) /* ok: i is unsigned _int_ */ 78 while (i <= ac) /* ok: i is unsigned _int_ */
66 *buffer++ = i++; 79 buffer[pos++] = i++;
67 arg += 3; /* skip 0-9 */ 80 arg += 3; /* skip 0-9 */
68 continue; 81 continue;
69 } 82 }
70 if (*arg == '[') { /* "[xyz..." */ 83 if ((ENABLE_FEATURE_TR_CLASSES || ENABLE_FEATURE_TR_EQUIV)
84 && *arg == '['
85 ) {
71 arg++; 86 arg++;
72 i = *arg++; 87 i = (unsigned char) *arg++;
73 /* "[xyz...", i=x, arg points to y */ 88 /* "[xyz...". i=x, arg points to y */
74 if (ENABLE_FEATURE_TR_CLASSES && i == ':') { 89 if (ENABLE_FEATURE_TR_CLASSES && i == ':') { /* [:class:] */
75#define CLO ":]\0" 90#define CLO ":]\0"
76 static const char classes[] ALIGN1 = 91 static const char classes[] ALIGN1 =
77 "alpha"CLO "alnum"CLO "digit"CLO 92 "alpha"CLO "alnum"CLO "digit"CLO
78 "lower"CLO "upper"CLO "space"CLO 93 "lower"CLO "upper"CLO "space"CLO
79 "blank"CLO "punct"CLO "cntrl"CLO 94 "blank"CLO "punct"CLO "cntrl"CLO
80 "xdigit"CLO; 95 "xdigit"CLO;
81#define CLASS_invalid 0 /* we increment the retval */ 96 enum {
82#define CLASS_alpha 1 97 CLASS_invalid = 0, /* we increment the retval */
83#define CLASS_alnum 2 98 CLASS_alpha = 1,
84#define CLASS_digit 3 99 CLASS_alnum = 2,
85#define CLASS_lower 4 100 CLASS_digit = 3,
86#define CLASS_upper 5 101 CLASS_lower = 4,
87#define CLASS_space 6 102 CLASS_upper = 5,
88#define CLASS_blank 7 103 CLASS_space = 6,
89#define CLASS_punct 8 104 CLASS_blank = 7,
90#define CLASS_cntrl 9 105 CLASS_punct = 8,
91#define CLASS_xdigit 10 106 CLASS_cntrl = 9,
92//#define CLASS_graph 11 107 CLASS_xdigit = 10,
93//#define CLASS_print 12 108 //CLASS_graph = 11,
109 //CLASS_print = 12,
110 };
94 smalluint j; 111 smalluint j;
95 { 112 char *tmp;
96 /* xdigit needs 8, not 7 */ 113
97 char *tmp = xstrndup(arg, 7 + (arg[0]=='x')); 114 /* xdigit needs 8, not 7 */
98 j = index_in_strings(classes, tmp) + 1; 115 i = 7 + (arg[0] == 'x');
99 free(tmp); 116 tmp = xstrndup(arg, i);
100 } 117 j = index_in_strings(classes, tmp) + 1;
118 free(tmp);
119
120 if (j == CLASS_invalid)
121 goto skip_bracket;
122
123 arg += i;
101 if (j == CLASS_alnum || j == CLASS_digit || j == CLASS_xdigit) { 124 if (j == CLASS_alnum || j == CLASS_digit || j == CLASS_xdigit) {
102 for (i = '0'; i <= '9'; i++) 125 for (i = '0'; i <= '9'; i++)
103 *buffer++ = i; 126 buffer[pos++] = i;
104 } 127 }
105 if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_upper) { 128 if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_upper) {
106 for (i = 'A'; i <= 'Z'; i++) 129 for (i = 'A'; i <= 'Z'; i++)
107 *buffer++ = i; 130 buffer[pos++] = i;
108 } 131 }
109 if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_lower) { 132 if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_lower) {
110 for (i = 'a'; i <= 'z'; i++) 133 for (i = 'a'; i <= 'z'; i++)
111 *buffer++ = i; 134 buffer[pos++] = i;
112 } 135 }
113 if (j == CLASS_space || j == CLASS_blank) { 136 if (j == CLASS_space || j == CLASS_blank) {
114 *buffer++ = '\t'; 137 buffer[pos++] = '\t';
115 if (j == CLASS_space) { 138 if (j == CLASS_space) {
116 *buffer++ = '\n'; 139 buffer[pos++] = '\n';
117 *buffer++ = '\v'; 140 buffer[pos++] = '\v';
118 *buffer++ = '\f'; 141 buffer[pos++] = '\f';
119 *buffer++ = '\r'; 142 buffer[pos++] = '\r';
120 } 143 }
121 *buffer++ = ' '; 144 buffer[pos++] = ' ';
122 } 145 }
123 if (j == CLASS_punct || j == CLASS_cntrl) { 146 if (j == CLASS_punct || j == CLASS_cntrl) {
124 for (i = '\0'; i <= ASCII; i++) 147 for (i = '\0'; i < ASCII; i++) {
125 if ((j == CLASS_punct && isprint(i) && !isalnum(i) && !isspace(i)) 148 if ((j == CLASS_punct && isprint(i) && !isalnum(i) && !isspace(i))
126 || (j == CLASS_cntrl && iscntrl(i))) 149 || (j == CLASS_cntrl && iscntrl(i))
127 *buffer++ = i; 150 ) {
151 buffer[pos++] = i;
152 }
153 }
128 } 154 }
129 if (j == CLASS_xdigit) { 155 if (j == CLASS_xdigit) {
130 for (i = 'A'; i <= 'F'; i++) { 156 for (i = 'A'; i <= 'F'; i++) {
131 *buffer++ = i; 157 buffer[pos + 6] = i | 0x20;
132 *buffer++ = i | 0x20; 158 buffer[pos++] = i;
133 } 159 }
160 pos += 6;
134 } 161 }
135 if (j == CLASS_invalid) { 162 continue;
136 *buffer++ = '[';
137 *buffer++ = ':';
138 continue;
139 }
140 break;
141 } 163 }
142 /* "[xyz...", i=x, arg points to y */ 164 /* "[xyz...", i=x, arg points to y */
143 if (ENABLE_FEATURE_TR_EQUIV && i == '=') { /* [=CHAR=] */ 165 if (ENABLE_FEATURE_TR_EQUIV && i == '=') { /* [=CHAR=] */
144 *buffer++ = *arg; /* copy CHAR */ 166 buffer[pos++] = *arg; /* copy CHAR */
145 if (!*arg || arg[1] != '=' || arg[2] != ']') 167 if (!arg[0] || arg[1] != '=' || arg[2] != ']')
146 bb_show_usage(); 168 bb_show_usage();
147 arg += 3; /* skip CHAR=] */ 169 arg += 3; /* skip CHAR=] */
148 continue; 170 continue;
149 } 171 }
150 /* The rest of [xyz... cases is treated as normal 172 /* The rest of "[xyz..." cases is treated as normal
151 * string, '[' has no special meaning here: 173 * string, "[" has no special meaning here:
152 * tr "[a-z]" "[A-Z]" can be written as tr "a-z" "A-Z", 174 * tr "[a-z]" "[A-Z]" can be written as tr "a-z" "A-Z",
153 * also try tr "[a-z]" "_A-Z+" and you'll see that 175 * also try tr "[a-z]" "_A-Z+" and you'll see that
154 * [] is not special here. 176 * [] is not special here.
155 */ 177 */
156 *buffer++ = '['; 178 skip_bracket:
157 arg--; /* points to x */ 179 arg -= 2; /* points to "[" in "[xyz..." */
158 continue;
159 } 180 }
160 *buffer++ = *arg++; 181 buffer[pos++] = *arg++;
161 } 182 }
162 return (buffer - buffer_start); 183 return pos;
163} 184}
164 185
186/* NB: buffer is guaranteed to be at least TR_BUFSIZ
187 * (which is >= ASCII) big.
188 */
165static int complement(char *buffer, int buffer_len) 189static int complement(char *buffer, int buffer_len)
166{ 190{
167 int ch, j, len; 191 int len;
168 char conv[ASCII + 2]; 192 char conv[ASCII];
193 unsigned char ch;
169 194
170 len = 0; 195 len = 0;
171 for (ch = '\0'; ch <= ASCII; ch++) { 196 ch = '\0';
172 for (j = 0; j < buffer_len; j++) 197 while (1) {
173 if (buffer[j] == ch) 198 if (memchr(buffer, ch, buffer_len) == NULL)
174 goto next_ch; 199 conv[len++] = ch;
175 /* Didn't find it */ 200 if (++ch == '\0')
176 conv[len++] = (char) ch; 201 break;
177 next_ch:
178 continue;
179 } 202 }
180 memcpy(buffer, conv, len); 203 memcpy(buffer, conv, len);
181 return len; 204 return len;
@@ -190,54 +213,56 @@ int tr_main(int argc UNUSED_PARAM, char **argv)
190 size_t in_index, out_index; 213 size_t in_index, out_index;
191 unsigned last = UCHAR_MAX + 1; /* not equal to any char */ 214 unsigned last = UCHAR_MAX + 1; /* not equal to any char */
192 unsigned char coded, c; 215 unsigned char coded, c;
193 unsigned char *output = xmalloc(BUFSIZ); 216 char *str1 = xmalloc(TR_BUFSIZ);
194 char *vector = xzalloc((ASCII+1) * 3); 217 char *str2 = xmalloc(TR_BUFSIZ);
195 char *invec = vector + (ASCII+1); 218 int str2_length;
196 char *outvec = vector + (ASCII+1) * 2; 219 int str1_length;
220 char *vector = xzalloc(ASCII * 3);
221 char *invec = vector + ASCII;
222 char *outvec = vector + ASCII * 2;
197 223
198#define TR_OPT_complement (1 << 0) 224#define TR_OPT_complement (1 << 0)
199#define TR_OPT_delete (1 << 1) 225#define TR_OPT_delete (1 << 1)
200#define TR_OPT_squeeze_reps (1 << 2) 226#define TR_OPT_squeeze_reps (1 << 2)
201 227
202 flags = getopt32(argv, "+cds"); /* '+': stop at first non-option */ 228 for (i = 0; i < ASCII; i++) {
203 argv += optind;
204
205 for (i = 0; i <= ASCII; i++) {
206 vector[i] = i; 229 vector[i] = i;
207 /*invec[i] = outvec[i] = FALSE; - done by xzalloc */ 230 /*invec[i] = outvec[i] = FALSE; - done by xzalloc */
208 } 231 }
209 232
210#define tr_buf bb_common_bufsiz1 233 opt_complementary = "-1";
211 if (*argv != NULL) { 234 flags = getopt32(argv, "+cds"); /* '+': stop at first non-option */
212 int output_length = 0; 235 argv += optind;
213 int input_length;
214 236
215 input_length = expand(*argv++, tr_buf); 237 str1_length = expand(*argv++, &str1);
216 if (flags & TR_OPT_complement) 238 str2_length = 0;
217 input_length = complement(tr_buf, input_length); 239 if (flags & TR_OPT_complement)
218 if (*argv) { 240 str1_length = complement(str1, str1_length);
219 if (argv[0][0] == '\0') 241 if (*argv) {
220 bb_error_msg_and_die("STRING2 cannot be empty"); 242 if (argv[0][0] == '\0')
221 output_length = expand(*argv, (char *)output); 243 bb_error_msg_and_die("STRING2 cannot be empty");
222 map(vector, (unsigned char *)tr_buf, input_length, output, output_length); 244 str2_length = expand(*argv, &str2);
223 } 245 map(vector, str1, str1_length,
224 for (i = 0; i < input_length; i++) 246 str2, str2_length);
225 invec[(unsigned char)tr_buf[i]] = TRUE;
226 for (i = 0; i < output_length; i++)
227 outvec[output[i]] = TRUE;
228 } 247 }
248 for (i = 0; i < str1_length; i++)
249 invec[(unsigned char)(str1[i])] = TRUE;
250 for (i = 0; i < str2_length; i++)
251 outvec[(unsigned char)(str2[i])] = TRUE;
229 252
230 goto start_from; 253 goto start_from;
231 254
255 /* In this loop, str1 space is reused as input buffer,
256 * str2 - as output one. */
232 for (;;) { 257 for (;;) {
233 /* If we're out of input, flush output and read more input. */ 258 /* If we're out of input, flush output and read more input. */
234 if ((ssize_t)in_index == read_chars) { 259 if ((ssize_t)in_index == read_chars) {
235 if (out_index) { 260 if (out_index) {
236 xwrite(STDOUT_FILENO, (char *)output, out_index); 261 xwrite(STDOUT_FILENO, str2, out_index);
237 start_from: 262 start_from:
238 out_index = 0; 263 out_index = 0;
239 } 264 }
240 read_chars = safe_read(STDIN_FILENO, tr_buf, BUFSIZ); 265 read_chars = safe_read(STDIN_FILENO, str1, TR_BUFSIZ);
241 if (read_chars <= 0) { 266 if (read_chars <= 0) {
242 if (read_chars < 0) 267 if (read_chars < 0)
243 bb_perror_msg_and_die(bb_msg_read_error); 268 bb_perror_msg_and_die(bb_msg_read_error);
@@ -245,14 +270,16 @@ int tr_main(int argc UNUSED_PARAM, char **argv)
245 } 270 }
246 in_index = 0; 271 in_index = 0;
247 } 272 }
248 c = tr_buf[in_index++]; 273 c = str1[in_index++];
249 if ((flags & TR_OPT_delete) && invec[c]) 274 if ((flags & TR_OPT_delete) && invec[c])
250 continue; 275 continue;
251 coded = vector[c]; 276 coded = vector[c];
252 if ((flags & TR_OPT_squeeze_reps) && last == coded 277 if ((flags & TR_OPT_squeeze_reps) && last == coded
253 && (invec[c] || outvec[coded])) 278 && (invec[c] || outvec[coded])
279 ) {
254 continue; 280 continue;
255 output[out_index++] = last = coded; 281 }
282 str2[out_index++] = last = coded;
256 } 283 }
257 284
258 return EXIT_SUCCESS; 285 return EXIT_SUCCESS;