1 files changed, 56 insertions, 23 deletions
diff --git a/coreutils/wc.c b/coreutils/wc.c
index 4f14374c3..ecadae59b 100644
--- a/coreutils/wc.c
+++ b/coreutils/wc.c
@@ -7,7 +7,7 @@
 * Licensed under GPLv2 or later, see file LICENSE in this source tree.
 */
-/* BB_AUDIT SUSv3 _NOT_ compliant -- option -m is not currently supported. */
+/* BB_AUDIT SUSv3 compliant. */
 /* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */
 /* Mar 16, 2003      Manuel Novoa III   (mjn3@codepoet.org)
@@ -19,10 +19,6 @@
 *  3) no checking of ferror on EOF returns
 *  4) isprint() wasn't considered when word counting.
 *
- * TODO:
- *
- * When locale support is enabled, count multibyte chars in the '-m' case.
- *
 * NOTES:
 *
 * The previous busybox wc attempted an optimization using stat for the
@@ -40,8 +36,8 @@
 *
 * for which 'wc -c' should output '0'.
 */
 #include "libbb.h"
+#include "unicode.h"
 #if !ENABLE_LOCALE_SUPPORT
 # undef isprint
@@ -58,11 +54,39 @@
 # define COUNT_FMT "u"
 #endif
+/* We support -m even when UNICODE_SUPPORT is off,
+ * we just don't advertise it in help text,
+ * since it is the same as -c in this case.
+ */
+//usage:#define wc_trivial_usage
+//usage:       "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..."
+//usage:
+//usage:#define wc_full_usage "\n\n"
+//usage:       "Count lines, words, and bytes for each FILE (or stdin)\n"
+//usage:     "\nOptions:"
+//usage:     "\n        -c      Count bytes"
+//usage:        IF_UNICODE_SUPPORT(
+//usage:     "\n        -m      Count characters"
+//usage:        )
+//usage:     "\n        -l      Count newlines"
+//usage:     "\n        -w      Count words"
+//usage:     "\n        -L      Print longest line length"
+//usage:
+//usage:#define wc_example_usage
+//usage:       "$ wc /etc/passwd\n"
+//usage:       "     31      46    1365 /etc/passwd\n"
+/* Order is important if we want to be compatible with
+ * column order in "wc -cmlwL" output:
+ */
 enum {
-        WC_LINES        = 0,
+        WC_LINES    = 0,
-        WC_WORDS        = 1,
+        WC_WORDS    = 1,
-        WC_CHARS        = 2,
+        WC_UNICHARS = 2,
-        WC_LENGTH       = 3
+        WC_CHARS    = 3,
+        WC_LENGTH   = 4,
+        NUM_WCS     = 5,
 };
 int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
@@ -72,13 +96,15 @@ int wc_main(int argc UNUSED_PARAM, char **argv)
        const char *start_fmt = " %9"COUNT_FMT + 1;
        const char *fname_fmt = " %s\n";
        COUNT_T *pcounts;
-        COUNT_T counts[4];
+        COUNT_T counts[NUM_WCS];
-        COUNT_T totals[4];
+        COUNT_T totals[NUM_WCS];
        int num_files;
        smallint status = EXIT_SUCCESS;
        unsigned print_type;
-        print_type = getopt32(argv, "lwcL");
+        init_unicode();
+        print_type = getopt32(argv, "lwcmL");
        if (print_type == 0) {
                print_type = (1 << WC_LINES) | (1 << WC_WORDS) | (1 << WC_CHARS);
@@ -99,7 +125,7 @@ int wc_main(int argc UNUSED_PARAM, char **argv)
        pcounts = counts;
        num_files = 0;
-        while ((arg = *argv++) != 0) {
+        while ((arg = *argv++) != NULL) {
                FILE *fp;
                const char *s;
                unsigned u;
@@ -117,21 +143,28 @@ int wc_main(int argc UNUSED_PARAM, char **argv)
                linepos = 0;
                in_word = 0;
-                do {
+                while (1) {
                        int c;
                        /* Our -w doesn't match GNU wc exactly... oh well */
-                        ++counts[WC_CHARS];
                        c = getc(fp);
                        if (c == EOF) {
                                if (ferror(fp)) {
                                        bb_simple_perror_msg(arg);
                                        status = EXIT_FAILURE;
                                }
-                                --counts[WC_CHARS];
                                goto DO_EOF;            /* Treat an EOF as '\r'. */
                        }
-                        if (isprint_asciionly(c)) {
+                        /* Cater for -c and -m */
+                        ++counts[WC_CHARS];
+                        if (unicode_status != UNICODE_ON /* every byte is a new char */
+                         || (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */
+                        ) {
+                                ++counts[WC_UNICHARS];
+                        }
+                        if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */
                                ++linepos;
                                if (!isspace(c)) {
                                        in_word = 1;
@@ -167,18 +200,18 @@ int wc_main(int argc UNUSED_PARAM, char **argv)
                        if (c == EOF) {
                                break;
                        }
-                } while (1);
+                }
+                fclose_if_not_stdin(fp);
                if (totals[WC_LENGTH] < counts[WC_LENGTH]) {
                        totals[WC_LENGTH] = counts[WC_LENGTH];
                }
                totals[WC_LENGTH] -= counts[WC_LENGTH];
-                fclose_if_not_stdin(fp);
 OUTPUT:
                /* coreutils wc tries hard to print pretty columns
-                 * (saves results for all files, find max col len etc...)
+                 * (saves results for all files, finds max col len etc...)
                 * we won't try that hard, it will bloat us too much */
                s = start_fmt;
                u = 0;
@@ -188,7 +221,7 @@ int wc_main(int argc UNUSED_PARAM, char **argv)
                                s = " %9"COUNT_FMT; /* Ok... restore the leading space. */
                        }
                        totals[u] += pcounts[u];
-                } while (++u < 4);
+                } while (++u < NUM_WCS);
                printf(fname_fmt, arg);
        }

diff --git a/coreutils/wc.c b/coreutils/wc.c index 4f14374c3..ecadae59b 100644 --- a/coreutils/wc.c +++ b/coreutils/wc.c
@@ -7,7 +7,7 @@
7	* Licensed under GPLv2 or later, see file LICENSE in this source tree.	7	* Licensed under GPLv2 or later, see file LICENSE in this source tree.
8	*/	8	*/
9		9
10	/* BB_AUDIT SUSv3 _NOT_ compliant -- option -m is not currently supported. */	10	/* BB_AUDIT SUSv3 compliant. */
11	/* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */	11	/* http://www.opengroup.org/onlinepubs/007904975/utilities/wc.html */
12		12
13	/* Mar 16, 2003 Manuel Novoa III (mjn3@codepoet.org)	13	/* Mar 16, 2003 Manuel Novoa III (mjn3@codepoet.org)
@@ -19,10 +19,6 @@
19	* 3) no checking of ferror on EOF returns	19	* 3) no checking of ferror on EOF returns
20	* 4) isprint() wasn't considered when word counting.	20	* 4) isprint() wasn't considered when word counting.
21	*	21	*
22	* TODO:
23	*
24	* When locale support is enabled, count multibyte chars in the '-m' case.
25	*
26	* NOTES:	22	* NOTES:
27	*	23	*
28	* The previous busybox wc attempted an optimization using stat for the	24	* The previous busybox wc attempted an optimization using stat for the
@@ -40,8 +36,8 @@
40	*	36	*
41	* for which 'wc -c' should output '0'.	37	* for which 'wc -c' should output '0'.
42	*/	38	*/
43
44	#include "libbb.h"	39	#include "libbb.h"
		40	#include "unicode.h"
45		41
46	#if !ENABLE_LOCALE_SUPPORT	42	#if !ENABLE_LOCALE_SUPPORT
47	# undef isprint	43	# undef isprint
@@ -58,11 +54,39 @@
58	# define COUNT_FMT "u"	54	# define COUNT_FMT "u"
59	#endif	55	#endif
60		56
		57	/* We support -m even when UNICODE_SUPPORT is off,
		58	* we just don't advertise it in help text,
		59	* since it is the same as -c in this case.
		60	*/
		61
		62	//usage:#define wc_trivial_usage
		63	//usage: "[-c"IF_UNICODE_SUPPORT("m")"lwL] [FILE]..."
		64	//usage:
		65	//usage:#define wc_full_usage "\n\n"
		66	//usage: "Count lines, words, and bytes for each FILE (or stdin)\n"
		67	//usage: "\nOptions:"
		68	//usage: "\n -c Count bytes"
		69	//usage: IF_UNICODE_SUPPORT(
		70	//usage: "\n -m Count characters"
		71	//usage: )
		72	//usage: "\n -l Count newlines"
		73	//usage: "\n -w Count words"
		74	//usage: "\n -L Print longest line length"
		75	//usage:
		76	//usage:#define wc_example_usage
		77	//usage: "$ wc /etc/passwd\n"
		78	//usage: " 31 46 1365 /etc/passwd\n"
		79
		80	/* Order is important if we want to be compatible with
		81	* column order in "wc -cmlwL" output:
		82	*/
61	enum {	83	enum {
62	WC_LINES = 0,	84	WC_LINES = 0,
63	WC_WORDS = 1,	85	WC_WORDS = 1,
64	WC_CHARS = 2,	86	WC_UNICHARS = 2,
65	WC_LENGTH = 3	87	WC_CHARS = 3,
		88	WC_LENGTH = 4,
		89	NUM_WCS = 5,
66	};	90	};
67		91
68	int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;	92	int wc_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE;
@@ -72,13 +96,15 @@ int wc_main(int argc UNUSED_PARAM, char **argv)
72	const char *start_fmt = " %9"COUNT_FMT + 1;	96	const char *start_fmt = " %9"COUNT_FMT + 1;
73	const char *fname_fmt = " %s\n";	97	const char *fname_fmt = " %s\n";
74	COUNT_T *pcounts;	98	COUNT_T *pcounts;
75	COUNT_T counts[4];	99	COUNT_T counts[NUM_WCS];
76	COUNT_T totals[4];	100	COUNT_T totals[NUM_WCS];
77	int num_files;	101	int num_files;
78	smallint status = EXIT_SUCCESS;	102	smallint status = EXIT_SUCCESS;
79	unsigned print_type;	103	unsigned print_type;
80		104
81	print_type = getopt32(argv, "lwcL");	105	init_unicode();
		106
		107	print_type = getopt32(argv, "lwcmL");
82		108
83	if (print_type == 0) {	109	if (print_type == 0) {
84	print_type = (1 << WC_LINES) \| (1 << WC_WORDS) \| (1 << WC_CHARS);	110	print_type = (1 << WC_LINES) \| (1 << WC_WORDS) \| (1 << WC_CHARS);
@@ -99,7 +125,7 @@ int wc_main(int argc UNUSED_PARAM, char **argv)
99	pcounts = counts;	125	pcounts = counts;
100		126
101	num_files = 0;	127	num_files = 0;
102	while ((arg = *argv++) != 0) {	128	while ((arg = *argv++) != NULL) {
103	FILE *fp;	129	FILE *fp;
104	const char *s;	130	const char *s;
105	unsigned u;	131	unsigned u;
@@ -117,21 +143,28 @@ int wc_main(int argc UNUSED_PARAM, char **argv)
117	linepos = 0;	143	linepos = 0;
118	in_word = 0;	144	in_word = 0;
119		145
120	do {	146	while (1) {
121	int c;	147	int c;
122	/* Our -w doesn't match GNU wc exactly... oh well */	148	/* Our -w doesn't match GNU wc exactly... oh well */
123		149
124	++counts[WC_CHARS];
125	c = getc(fp);	150	c = getc(fp);
126	if (c == EOF) {	151	if (c == EOF) {
127	if (ferror(fp)) {	152	if (ferror(fp)) {
128	bb_simple_perror_msg(arg);	153	bb_simple_perror_msg(arg);
129	status = EXIT_FAILURE;	154	status = EXIT_FAILURE;
130	}	155	}
131	--counts[WC_CHARS];
132	goto DO_EOF; /* Treat an EOF as '\r'. */	156	goto DO_EOF; /* Treat an EOF as '\r'. */
133	}	157	}
134	if (isprint_asciionly(c)) {	158
		159	/* Cater for -c and -m */
		160	++counts[WC_CHARS];
		161	if (unicode_status != UNICODE_ON /* every byte is a new char */
		162	\|\| (c & 0xc0) != 0x80 /* it isn't a 2nd+ byte of a Unicode char */
		163	) {
		164	++counts[WC_UNICHARS];
		165	}
		166
		167	if (isprint_asciionly(c)) { /* FIXME: not unicode-aware */
135	++linepos;	168	++linepos;
136	if (!isspace(c)) {	169	if (!isspace(c)) {
137	in_word = 1;	170	in_word = 1;
@@ -167,18 +200,18 @@ int wc_main(int argc UNUSED_PARAM, char **argv)
167	if (c == EOF) {	200	if (c == EOF) {
168	break;	201	break;
169	}	202	}
170	} while (1);	203	}
		204
		205	fclose_if_not_stdin(fp);
171		206
172	if (totals[WC_LENGTH] < counts[WC_LENGTH]) {	207	if (totals[WC_LENGTH] < counts[WC_LENGTH]) {
173	totals[WC_LENGTH] = counts[WC_LENGTH];	208	totals[WC_LENGTH] = counts[WC_LENGTH];
174	}	209	}
175	totals[WC_LENGTH] -= counts[WC_LENGTH];	210	totals[WC_LENGTH] -= counts[WC_LENGTH];
176		211
177	fclose_if_not_stdin(fp);
178
179	OUTPUT:	212	OUTPUT:
180	/* coreutils wc tries hard to print pretty columns	213	/* coreutils wc tries hard to print pretty columns
181	* (saves results for all files, find max col len etc...)	214	* (saves results for all files, finds max col len etc...)
182	* we won't try that hard, it will bloat us too much */	215	* we won't try that hard, it will bloat us too much */
183	s = start_fmt;	216	s = start_fmt;
184	u = 0;	217	u = 0;
@@ -188,7 +221,7 @@ int wc_main(int argc UNUSED_PARAM, char **argv)
188	s = " %9"COUNT_FMT; /* Ok... restore the leading space. */	221	s = " %9"COUNT_FMT; /* Ok... restore the leading space. */
189	}	222	}
190	totals[u] += pcounts[u];	223	totals[u] += pcounts[u];
191	} while (++u < 4);	224	} while (++u < NUM_WCS);
192	printf(fname_fmt, arg);	225	printf(fname_fmt, arg);
193	}	226	}
194		227