diff options
-rw-r--r-- | coreutils/tr.c | 235 |
1 files changed, 131 insertions, 104 deletions
diff --git a/coreutils/tr.c b/coreutils/tr.c index cd31b8550..bf6fa2996 100644 --- a/coreutils/tr.c +++ b/coreutils/tr.c | |||
@@ -16,24 +16,31 @@ | |||
16 | * Licensed under GPLv2 or later, see file LICENSE in this tarball for details. | 16 | * Licensed under GPLv2 or later, see file LICENSE in this tarball for details. |
17 | */ | 17 | */ |
18 | /* http://www.opengroup.org/onlinepubs/009695399/utilities/tr.html | 18 | /* http://www.opengroup.org/onlinepubs/009695399/utilities/tr.html |
19 | * TODO: xdigit, graph, print | 19 | * TODO: graph, print |
20 | */ | 20 | */ |
21 | #include "libbb.h" | 21 | #include "libbb.h" |
22 | 22 | ||
23 | #define ASCII 0377 | 23 | enum { |
24 | ASCII = 256, | ||
25 | /* string buffer needs to be at least as big as the whole "alphabet". | ||
26 | * BUFSIZ == ASCII is ok, but we will realloc in expand | ||
27 | * even for smallest patterns, let's avoid that by using *2: | ||
28 | */ | ||
29 | TR_BUFSIZ = (BUFSIZ > ASCII*2) ? BUFSIZ : ASCII*2, | ||
30 | }; | ||
24 | 31 | ||
25 | static void map(char *pvector, | 32 | static void map(char *pvector, |
26 | unsigned char *string1, unsigned string1_len, | 33 | char *string1, unsigned string1_len, |
27 | unsigned char *string2, unsigned string2_len) | 34 | char *string2, unsigned string2_len) |
28 | { | 35 | { |
29 | char last = '0'; | 36 | char last = '0'; |
30 | unsigned i, j; | 37 | unsigned i, j; |
31 | 38 | ||
32 | for (j = 0, i = 0; i < string1_len; i++) { | 39 | for (j = 0, i = 0; i < string1_len; i++) { |
33 | if (string2_len <= j) | 40 | if (string2_len <= j) |
34 | pvector[string1[i]] = last; | 41 | pvector[(unsigned char)(string1[i])] = last; |
35 | else | 42 | else |
36 | pvector[string1[i]] = last = string2[j++]; | 43 | pvector[(unsigned char)(string1[i])] = last = string2[j++]; |
37 | } | 44 | } |
38 | } | 45 | } |
39 | 46 | ||
@@ -43,139 +50,155 @@ static void map(char *pvector, | |||
43 | * Character classes, e.g. [:upper:] ==> A...Z | 50 | * Character classes, e.g. [:upper:] ==> A...Z |
44 | * Equiv classes, e.g. [=A=] ==> A (hmmmmmmm?) | 51 | * Equiv classes, e.g. [=A=] ==> A (hmmmmmmm?) |
45 | */ | 52 | */ |
46 | static unsigned expand(const char *arg, char *buffer) | 53 | static unsigned expand(const char *arg, char **buffer_p) |
47 | { | 54 | { |
48 | char *buffer_start = buffer; | 55 | char *buffer = *buffer_p; |
56 | unsigned pos = 0; | ||
57 | unsigned size = TR_BUFSIZ; | ||
49 | unsigned i; /* can't be unsigned char: must be able to hold 256 */ | 58 | unsigned i; /* can't be unsigned char: must be able to hold 256 */ |
50 | unsigned char ac; | 59 | unsigned char ac; |
51 | 60 | ||
52 | while (*arg) { | 61 | while (*arg) { |
62 | if (pos + ASCII > size) { | ||
63 | size += ASCII; | ||
64 | *buffer_p = buffer = xrealloc(buffer, size); | ||
65 | } | ||
53 | if (*arg == '\\') { | 66 | if (*arg == '\\') { |
54 | arg++; | 67 | arg++; |
55 | *buffer++ = bb_process_escape_sequence(&arg); | 68 | buffer[pos++] = bb_process_escape_sequence(&arg); |
56 | continue; | 69 | continue; |
57 | } | 70 | } |
58 | if (arg[1] == '-') { /* "0-9..." */ | 71 | if (arg[1] == '-') { /* "0-9..." */ |
59 | ac = arg[2]; | 72 | ac = arg[2]; |
60 | if (ac == '\0') { /* "0-": copy verbatim */ | 73 | if (ac == '\0') { /* "0-": copy verbatim */ |
61 | *buffer++ = *arg++; /* copy '0' */ | 74 | buffer[pos++] = *arg++; /* copy '0' */ |
62 | continue; /* next iter will copy '-' and stop */ | 75 | continue; /* next iter will copy '-' and stop */ |
63 | } | 76 | } |
64 | i = *arg; | 77 | i = (unsigned char) *arg; |
65 | while (i <= ac) /* ok: i is unsigned _int_ */ | 78 | while (i <= ac) /* ok: i is unsigned _int_ */ |
66 | *buffer++ = i++; | 79 | buffer[pos++] = i++; |
67 | arg += 3; /* skip 0-9 */ | 80 | arg += 3; /* skip 0-9 */ |
68 | continue; | 81 | continue; |
69 | } | 82 | } |
70 | if (*arg == '[') { /* "[xyz..." */ | 83 | if ((ENABLE_FEATURE_TR_CLASSES || ENABLE_FEATURE_TR_EQUIV) |
84 | && *arg == '[' | ||
85 | ) { | ||
71 | arg++; | 86 | arg++; |
72 | i = *arg++; | 87 | i = (unsigned char) *arg++; |
73 | /* "[xyz...", i=x, arg points to y */ | 88 | /* "[xyz...". i=x, arg points to y */ |
74 | if (ENABLE_FEATURE_TR_CLASSES && i == ':') { | 89 | if (ENABLE_FEATURE_TR_CLASSES && i == ':') { /* [:class:] */ |
75 | #define CLO ":]\0" | 90 | #define CLO ":]\0" |
76 | static const char classes[] ALIGN1 = | 91 | static const char classes[] ALIGN1 = |
77 | "alpha"CLO "alnum"CLO "digit"CLO | 92 | "alpha"CLO "alnum"CLO "digit"CLO |
78 | "lower"CLO "upper"CLO "space"CLO | 93 | "lower"CLO "upper"CLO "space"CLO |
79 | "blank"CLO "punct"CLO "cntrl"CLO | 94 | "blank"CLO "punct"CLO "cntrl"CLO |
80 | "xdigit"CLO; | 95 | "xdigit"CLO; |
81 | #define CLASS_invalid 0 /* we increment the retval */ | 96 | enum { |
82 | #define CLASS_alpha 1 | 97 | CLASS_invalid = 0, /* we increment the retval */ |
83 | #define CLASS_alnum 2 | 98 | CLASS_alpha = 1, |
84 | #define CLASS_digit 3 | 99 | CLASS_alnum = 2, |
85 | #define CLASS_lower 4 | 100 | CLASS_digit = 3, |
86 | #define CLASS_upper 5 | 101 | CLASS_lower = 4, |
87 | #define CLASS_space 6 | 102 | CLASS_upper = 5, |
88 | #define CLASS_blank 7 | 103 | CLASS_space = 6, |
89 | #define CLASS_punct 8 | 104 | CLASS_blank = 7, |
90 | #define CLASS_cntrl 9 | 105 | CLASS_punct = 8, |
91 | #define CLASS_xdigit 10 | 106 | CLASS_cntrl = 9, |
92 | //#define CLASS_graph 11 | 107 | CLASS_xdigit = 10, |
93 | //#define CLASS_print 12 | 108 | //CLASS_graph = 11, |
109 | //CLASS_print = 12, | ||
110 | }; | ||
94 | smalluint j; | 111 | smalluint j; |
95 | { | 112 | char *tmp; |
96 | /* xdigit needs 8, not 7 */ | 113 | |
97 | char *tmp = xstrndup(arg, 7 + (arg[0]=='x')); | 114 | /* xdigit needs 8, not 7 */ |
98 | j = index_in_strings(classes, tmp) + 1; | 115 | i = 7 + (arg[0] == 'x'); |
99 | free(tmp); | 116 | tmp = xstrndup(arg, i); |
100 | } | 117 | j = index_in_strings(classes, tmp) + 1; |
118 | free(tmp); | ||
119 | |||
120 | if (j == CLASS_invalid) | ||
121 | goto skip_bracket; | ||
122 | |||
123 | arg += i; | ||
101 | if (j == CLASS_alnum || j == CLASS_digit || j == CLASS_xdigit) { | 124 | if (j == CLASS_alnum || j == CLASS_digit || j == CLASS_xdigit) { |
102 | for (i = '0'; i <= '9'; i++) | 125 | for (i = '0'; i <= '9'; i++) |
103 | *buffer++ = i; | 126 | buffer[pos++] = i; |
104 | } | 127 | } |
105 | if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_upper) { | 128 | if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_upper) { |
106 | for (i = 'A'; i <= 'Z'; i++) | 129 | for (i = 'A'; i <= 'Z'; i++) |
107 | *buffer++ = i; | 130 | buffer[pos++] = i; |
108 | } | 131 | } |
109 | if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_lower) { | 132 | if (j == CLASS_alpha || j == CLASS_alnum || j == CLASS_lower) { |
110 | for (i = 'a'; i <= 'z'; i++) | 133 | for (i = 'a'; i <= 'z'; i++) |
111 | *buffer++ = i; | 134 | buffer[pos++] = i; |
112 | } | 135 | } |
113 | if (j == CLASS_space || j == CLASS_blank) { | 136 | if (j == CLASS_space || j == CLASS_blank) { |
114 | *buffer++ = '\t'; | 137 | buffer[pos++] = '\t'; |
115 | if (j == CLASS_space) { | 138 | if (j == CLASS_space) { |
116 | *buffer++ = '\n'; | 139 | buffer[pos++] = '\n'; |
117 | *buffer++ = '\v'; | 140 | buffer[pos++] = '\v'; |
118 | *buffer++ = '\f'; | 141 | buffer[pos++] = '\f'; |
119 | *buffer++ = '\r'; | 142 | buffer[pos++] = '\r'; |
120 | } | 143 | } |
121 | *buffer++ = ' '; | 144 | buffer[pos++] = ' '; |
122 | } | 145 | } |
123 | if (j == CLASS_punct || j == CLASS_cntrl) { | 146 | if (j == CLASS_punct || j == CLASS_cntrl) { |
124 | for (i = '\0'; i <= ASCII; i++) | 147 | for (i = '\0'; i < ASCII; i++) { |
125 | if ((j == CLASS_punct && isprint(i) && !isalnum(i) && !isspace(i)) | 148 | if ((j == CLASS_punct && isprint(i) && !isalnum(i) && !isspace(i)) |
126 | || (j == CLASS_cntrl && iscntrl(i))) | 149 | || (j == CLASS_cntrl && iscntrl(i)) |
127 | *buffer++ = i; | 150 | ) { |
151 | buffer[pos++] = i; | ||
152 | } | ||
153 | } | ||
128 | } | 154 | } |
129 | if (j == CLASS_xdigit) { | 155 | if (j == CLASS_xdigit) { |
130 | for (i = 'A'; i <= 'F'; i++) { | 156 | for (i = 'A'; i <= 'F'; i++) { |
131 | *buffer++ = i; | 157 | buffer[pos + 6] = i | 0x20; |
132 | *buffer++ = i | 0x20; | 158 | buffer[pos++] = i; |
133 | } | 159 | } |
160 | pos += 6; | ||
134 | } | 161 | } |
135 | if (j == CLASS_invalid) { | 162 | continue; |
136 | *buffer++ = '['; | ||
137 | *buffer++ = ':'; | ||
138 | continue; | ||
139 | } | ||
140 | break; | ||
141 | } | 163 | } |
142 | /* "[xyz...", i=x, arg points to y */ | 164 | /* "[xyz...", i=x, arg points to y */ |
143 | if (ENABLE_FEATURE_TR_EQUIV && i == '=') { /* [=CHAR=] */ | 165 | if (ENABLE_FEATURE_TR_EQUIV && i == '=') { /* [=CHAR=] */ |
144 | *buffer++ = *arg; /* copy CHAR */ | 166 | buffer[pos++] = *arg; /* copy CHAR */ |
145 | if (!*arg || arg[1] != '=' || arg[2] != ']') | 167 | if (!arg[0] || arg[1] != '=' || arg[2] != ']') |
146 | bb_show_usage(); | 168 | bb_show_usage(); |
147 | arg += 3; /* skip CHAR=] */ | 169 | arg += 3; /* skip CHAR=] */ |
148 | continue; | 170 | continue; |
149 | } | 171 | } |
150 | /* The rest of [xyz... cases is treated as normal | 172 | /* The rest of "[xyz..." cases is treated as normal |
151 | * string, '[' has no special meaning here: | 173 | * string, "[" has no special meaning here: |
152 | * tr "[a-z]" "[A-Z]" can be written as tr "a-z" "A-Z", | 174 | * tr "[a-z]" "[A-Z]" can be written as tr "a-z" "A-Z", |
153 | * also try tr "[a-z]" "_A-Z+" and you'll see that | 175 | * also try tr "[a-z]" "_A-Z+" and you'll see that |
154 | * [] is not special here. | 176 | * [] is not special here. |
155 | */ | 177 | */ |
156 | *buffer++ = '['; | 178 | skip_bracket: |
157 | arg--; /* points to x */ | 179 | arg -= 2; /* points to "[" in "[xyz..." */ |
158 | continue; | ||
159 | } | 180 | } |
160 | *buffer++ = *arg++; | 181 | buffer[pos++] = *arg++; |
161 | } | 182 | } |
162 | return (buffer - buffer_start); | 183 | return pos; |
163 | } | 184 | } |
164 | 185 | ||
186 | /* NB: buffer is guaranteed to be at least TR_BUFSIZ | ||
187 | * (which is >= ASCII) big. | ||
188 | */ | ||
165 | static int complement(char *buffer, int buffer_len) | 189 | static int complement(char *buffer, int buffer_len) |
166 | { | 190 | { |
167 | int ch, j, len; | 191 | int len; |
168 | char conv[ASCII + 2]; | 192 | char conv[ASCII]; |
193 | unsigned char ch; | ||
169 | 194 | ||
170 | len = 0; | 195 | len = 0; |
171 | for (ch = '\0'; ch <= ASCII; ch++) { | 196 | ch = '\0'; |
172 | for (j = 0; j < buffer_len; j++) | 197 | while (1) { |
173 | if (buffer[j] == ch) | 198 | if (memchr(buffer, ch, buffer_len) == NULL) |
174 | goto next_ch; | 199 | conv[len++] = ch; |
175 | /* Didn't find it */ | 200 | if (++ch == '\0') |
176 | conv[len++] = (char) ch; | 201 | break; |
177 | next_ch: | ||
178 | continue; | ||
179 | } | 202 | } |
180 | memcpy(buffer, conv, len); | 203 | memcpy(buffer, conv, len); |
181 | return len; | 204 | return len; |
@@ -190,54 +213,56 @@ int tr_main(int argc UNUSED_PARAM, char **argv) | |||
190 | size_t in_index, out_index; | 213 | size_t in_index, out_index; |
191 | unsigned last = UCHAR_MAX + 1; /* not equal to any char */ | 214 | unsigned last = UCHAR_MAX + 1; /* not equal to any char */ |
192 | unsigned char coded, c; | 215 | unsigned char coded, c; |
193 | unsigned char *output = xmalloc(BUFSIZ); | 216 | char *str1 = xmalloc(TR_BUFSIZ); |
194 | char *vector = xzalloc((ASCII+1) * 3); | 217 | char *str2 = xmalloc(TR_BUFSIZ); |
195 | char *invec = vector + (ASCII+1); | 218 | int str2_length; |
196 | char *outvec = vector + (ASCII+1) * 2; | 219 | int str1_length; |
220 | char *vector = xzalloc(ASCII * 3); | ||
221 | char *invec = vector + ASCII; | ||
222 | char *outvec = vector + ASCII * 2; | ||
197 | 223 | ||
198 | #define TR_OPT_complement (1 << 0) | 224 | #define TR_OPT_complement (1 << 0) |
199 | #define TR_OPT_delete (1 << 1) | 225 | #define TR_OPT_delete (1 << 1) |
200 | #define TR_OPT_squeeze_reps (1 << 2) | 226 | #define TR_OPT_squeeze_reps (1 << 2) |
201 | 227 | ||
202 | flags = getopt32(argv, "+cds"); /* '+': stop at first non-option */ | 228 | for (i = 0; i < ASCII; i++) { |
203 | argv += optind; | ||
204 | |||
205 | for (i = 0; i <= ASCII; i++) { | ||
206 | vector[i] = i; | 229 | vector[i] = i; |
207 | /*invec[i] = outvec[i] = FALSE; - done by xzalloc */ | 230 | /*invec[i] = outvec[i] = FALSE; - done by xzalloc */ |
208 | } | 231 | } |
209 | 232 | ||
210 | #define tr_buf bb_common_bufsiz1 | 233 | opt_complementary = "-1"; |
211 | if (*argv != NULL) { | 234 | flags = getopt32(argv, "+cds"); /* '+': stop at first non-option */ |
212 | int output_length = 0; | 235 | argv += optind; |
213 | int input_length; | ||
214 | 236 | ||
215 | input_length = expand(*argv++, tr_buf); | 237 | str1_length = expand(*argv++, &str1); |
216 | if (flags & TR_OPT_complement) | 238 | str2_length = 0; |
217 | input_length = complement(tr_buf, input_length); | 239 | if (flags & TR_OPT_complement) |
218 | if (*argv) { | 240 | str1_length = complement(str1, str1_length); |
219 | if (argv[0][0] == '\0') | 241 | if (*argv) { |
220 | bb_error_msg_and_die("STRING2 cannot be empty"); | 242 | if (argv[0][0] == '\0') |
221 | output_length = expand(*argv, (char *)output); | 243 | bb_error_msg_and_die("STRING2 cannot be empty"); |
222 | map(vector, (unsigned char *)tr_buf, input_length, output, output_length); | 244 | str2_length = expand(*argv, &str2); |
223 | } | 245 | map(vector, str1, str1_length, |
224 | for (i = 0; i < input_length; i++) | 246 | str2, str2_length); |
225 | invec[(unsigned char)tr_buf[i]] = TRUE; | ||
226 | for (i = 0; i < output_length; i++) | ||
227 | outvec[output[i]] = TRUE; | ||
228 | } | 247 | } |
248 | for (i = 0; i < str1_length; i++) | ||
249 | invec[(unsigned char)(str1[i])] = TRUE; | ||
250 | for (i = 0; i < str2_length; i++) | ||
251 | outvec[(unsigned char)(str2[i])] = TRUE; | ||
229 | 252 | ||
230 | goto start_from; | 253 | goto start_from; |
231 | 254 | ||
255 | /* In this loop, str1 space is reused as input buffer, | ||
256 | * str2 - as output one. */ | ||
232 | for (;;) { | 257 | for (;;) { |
233 | /* If we're out of input, flush output and read more input. */ | 258 | /* If we're out of input, flush output and read more input. */ |
234 | if ((ssize_t)in_index == read_chars) { | 259 | if ((ssize_t)in_index == read_chars) { |
235 | if (out_index) { | 260 | if (out_index) { |
236 | xwrite(STDOUT_FILENO, (char *)output, out_index); | 261 | xwrite(STDOUT_FILENO, str2, out_index); |
237 | start_from: | 262 | start_from: |
238 | out_index = 0; | 263 | out_index = 0; |
239 | } | 264 | } |
240 | read_chars = safe_read(STDIN_FILENO, tr_buf, BUFSIZ); | 265 | read_chars = safe_read(STDIN_FILENO, str1, TR_BUFSIZ); |
241 | if (read_chars <= 0) { | 266 | if (read_chars <= 0) { |
242 | if (read_chars < 0) | 267 | if (read_chars < 0) |
243 | bb_perror_msg_and_die(bb_msg_read_error); | 268 | bb_perror_msg_and_die(bb_msg_read_error); |
@@ -245,14 +270,16 @@ int tr_main(int argc UNUSED_PARAM, char **argv) | |||
245 | } | 270 | } |
246 | in_index = 0; | 271 | in_index = 0; |
247 | } | 272 | } |
248 | c = tr_buf[in_index++]; | 273 | c = str1[in_index++]; |
249 | if ((flags & TR_OPT_delete) && invec[c]) | 274 | if ((flags & TR_OPT_delete) && invec[c]) |
250 | continue; | 275 | continue; |
251 | coded = vector[c]; | 276 | coded = vector[c]; |
252 | if ((flags & TR_OPT_squeeze_reps) && last == coded | 277 | if ((flags & TR_OPT_squeeze_reps) && last == coded |
253 | && (invec[c] || outvec[coded])) | 278 | && (invec[c] || outvec[coded]) |
279 | ) { | ||
254 | continue; | 280 | continue; |
255 | output[out_index++] = last = coded; | 281 | } |
282 | str2[out_index++] = last = coded; | ||
256 | } | 283 | } |
257 | 284 | ||
258 | return EXIT_SUCCESS; | 285 | return EXIT_SUCCESS; |