diff options
| author | Denys Vlasenko <vda.linux@googlemail.com> | 2018-04-04 17:02:32 +0200 |
|---|---|---|
| committer | Denys Vlasenko <vda.linux@googlemail.com> | 2018-04-04 17:03:41 +0200 |
| commit | c29c2e60d8677cdec142a88609a3d040e4719439 (patch) | |
| tree | 9d23487088699fb4f1b4aabd9361c4a7c7d21edf /coreutils | |
| parent | ee1fd1246e7d25990531dbbb2d267605b8914d5d (diff) | |
| download | busybox-w32-c29c2e60d8677cdec142a88609a3d040e4719439.tar.gz busybox-w32-c29c2e60d8677cdec142a88609a3d040e4719439.tar.bz2 busybox-w32-c29c2e60d8677cdec142a88609a3d040e4719439.zip | |
sort: FEATURE_SORT_OPTIMIZE_MEMORY
On sorting all kernel/linux/arch/ *.[ch] files,
this reduces memory usage by 6%.
yes | head -99999999 | sort
goes down from 1900Mb to 380 Mb.
function old new delta
sort_main 862 1098 +236
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
Diffstat (limited to 'coreutils')
| -rw-r--r-- | coreutils/sort.c | 86 |
1 files changed, 80 insertions, 6 deletions
diff --git a/coreutils/sort.c b/coreutils/sort.c index b39297a26..b8bb6871d 100644 --- a/coreutils/sort.c +++ b/coreutils/sort.c | |||
| @@ -18,16 +18,24 @@ | |||
| 18 | //config: sort is used to sort lines of text in specified files. | 18 | //config: sort is used to sort lines of text in specified files. |
| 19 | //config: | 19 | //config: |
| 20 | //config:config FEATURE_SORT_BIG | 20 | //config:config FEATURE_SORT_BIG |
| 21 | //config: bool "Full SuSv3 compliant sort (support -ktcsbdfiozgM)" | 21 | //config: bool "Full SuSv3 compliant sort (support -ktcbdfiozgM)" |
| 22 | //config: default y | 22 | //config: default y |
| 23 | //config: depends on SORT | 23 | //config: depends on SORT |
| 24 | //config: help | 24 | //config: help |
| 25 | //config: Without this, sort only supports -r, -u, and an integer version | 25 | //config: Without this, sort only supports -r, -u, -s, and an integer version |
| 26 | //config: of -n. Selecting this adds sort keys, floating point support, and | 26 | //config: of -n. Selecting this adds sort keys, floating point support, and |
| 27 | //config: more. This adds a little over 3k to a nonstatic build on x86. | 27 | //config: more. This adds a little over 3k to a nonstatic build on x86. |
| 28 | //config: | 28 | //config: |
| 29 | //config: The SuSv3 sort standard is available at: | 29 | //config: The SuSv3 sort standard is available at: |
| 30 | //config: http://www.opengroup.org/onlinepubs/007904975/utilities/sort.html | 30 | //config: http://www.opengroup.org/onlinepubs/007904975/utilities/sort.html |
| 31 | //config: | ||
| 32 | //config:config FEATURE_SORT_OPTIMIZE_MEMORY | ||
| 33 | //config: bool "Use less memory (but might be slower)" | ||
| 34 | //config: default n # defaults to N since we are size-paranoid tribe | ||
| 35 | //config: depends on SORT | ||
| 36 | //config: help | ||
| 37 | //config: Attempt to use less memory (by storing only one copy | ||
| 38 | //config: of duplicated lines, and such). Useful if you work on huge files. | ||
| 31 | 39 | ||
| 32 | //applet:IF_SORT(APPLET_NOEXEC(sort, sort, BB_DIR_USR_BIN, BB_SUID_DROP, sort)) | 40 | //applet:IF_SORT(APPLET_NOEXEC(sort, sort, BB_DIR_USR_BIN, BB_SUID_DROP, sort)) |
| 33 | 41 | ||
| @@ -46,19 +54,17 @@ | |||
| 46 | //usage: "\n -f Ignore case" | 54 | //usage: "\n -f Ignore case" |
| 47 | //usage: "\n -i Ignore unprintable characters" | 55 | //usage: "\n -i Ignore unprintable characters" |
| 48 | //usage: "\n -d Dictionary order (blank or alphanumeric only)" | 56 | //usage: "\n -d Dictionary order (blank or alphanumeric only)" |
| 49 | //usage: "\n -g General numerical sort" | ||
| 50 | //usage: "\n -M Sort month" | ||
| 51 | //usage: ) | 57 | //usage: ) |
| 52 | //-h, --human-numeric-sort: compare human readable numbers (e.g., 2K 1G) | 58 | //-h, --human-numeric-sort: compare human readable numbers (e.g., 2K 1G) |
| 53 | //usage: "\n -n Sort numbers" | 59 | //usage: "\n -n Sort numbers" |
| 54 | //usage: IF_FEATURE_SORT_BIG( | 60 | //usage: IF_FEATURE_SORT_BIG( |
| 61 | //usage: "\n -g General numerical sort" | ||
| 62 | //usage: "\n -M Sort month" | ||
| 55 | //usage: "\n -t CHAR Field separator" | 63 | //usage: "\n -t CHAR Field separator" |
| 56 | //usage: "\n -k N[,M] Sort by Nth field" | 64 | //usage: "\n -k N[,M] Sort by Nth field" |
| 57 | //usage: ) | 65 | //usage: ) |
| 58 | //usage: "\n -r Reverse sort order" | 66 | //usage: "\n -r Reverse sort order" |
| 59 | //usage: IF_FEATURE_SORT_BIG( | ||
| 60 | //usage: "\n -s Stable (don't sort ties alphabetically)" | 67 | //usage: "\n -s Stable (don't sort ties alphabetically)" |
| 61 | //usage: ) | ||
| 62 | //usage: "\n -u Suppress duplicate lines" | 68 | //usage: "\n -u Suppress duplicate lines" |
| 63 | //usage: IF_FEATURE_SORT_BIG( | 69 | //usage: IF_FEATURE_SORT_BIG( |
| 64 | //usage: "\n -z Lines are terminated by NUL, not newline" | 70 | //usage: "\n -z Lines are terminated by NUL, not newline" |
| @@ -404,6 +410,14 @@ int sort_main(int argc UNUSED_PARAM, char **argv) | |||
| 404 | int i; | 410 | int i; |
| 405 | int linecount; | 411 | int linecount; |
| 406 | unsigned opts; | 412 | unsigned opts; |
| 413 | #if ENABLE_FEATURE_SORT_OPTIMIZE_MEMORY | ||
| 414 | bool can_drop_dups; | ||
| 415 | size_t prev_len = 0; | ||
| 416 | /* Postpone optimizing if the input is small, < 16k lines: | ||
| 417 | * even just free()ing duplicate lines takes time. | ||
| 418 | */ | ||
| 419 | size_t count_to_optimize_dups = 0x3fff; | ||
| 420 | #endif | ||
| 407 | 421 | ||
| 408 | xfunc_error_retval = 2; | 422 | xfunc_error_retval = 2; |
| 409 | 423 | ||
| @@ -412,6 +426,29 @@ int sort_main(int argc UNUSED_PARAM, char **argv) | |||
| 412 | sort_opt_str, | 426 | sort_opt_str, |
| 413 | &str_ignored, &str_ignored, &str_o, &lst_k, &str_t | 427 | &str_ignored, &str_ignored, &str_o, &lst_k, &str_t |
| 414 | ); | 428 | ); |
| 429 | #if ENABLE_FEATURE_SORT_OPTIMIZE_MEMORY | ||
| 430 | /* Can drop dups only if -u but no "complicating" options, | ||
| 431 | * IOW: if we do a full line compares. Safe options: | ||
| 432 | * -o FILE Output to FILE | ||
| 433 | * -z Lines are terminated by NUL, not newline | ||
| 434 | * -r Reverse sort order | ||
| 435 | * -s Stable (don't sort ties alphabetically) | ||
| 436 | * Not sure about these: | ||
| 437 | * -b Ignore leading blanks | ||
| 438 | * -f Ignore case | ||
| 439 | * -i Ignore unprintable characters | ||
| 440 | * -d Dictionary order (blank or alphanumeric only) | ||
| 441 | * -n Sort numbers | ||
| 442 | * -g General numerical sort | ||
| 443 | * -M Sort month | ||
| 444 | */ | ||
| 445 | can_drop_dups = ((opts & ~(FLAG_o|FLAG_z|FLAG_r|FLAG_s)) == FLAG_u); | ||
| 446 | /* Stable sort needs every line to be uniquely allocated, | ||
| 447 | * disable optimization to reuse strings: | ||
| 448 | */ | ||
| 449 | if (opts & FLAG_s) | ||
| 450 | count_to_optimize_dups = (size_t)-1L; | ||
| 451 | #endif | ||
| 415 | /* global b strips leading and trailing spaces */ | 452 | /* global b strips leading and trailing spaces */ |
| 416 | if (opts & FLAG_b) | 453 | if (opts & FLAG_b) |
| 417 | option_mask32 |= FLAG_bb; | 454 | option_mask32 |= FLAG_bb; |
| @@ -489,6 +526,41 @@ int sort_main(int argc UNUSED_PARAM, char **argv) | |||
| 489 | char *line = GET_LINE(fp); | 526 | char *line = GET_LINE(fp); |
| 490 | if (!line) | 527 | if (!line) |
| 491 | break; | 528 | break; |
| 529 | |||
| 530 | #if ENABLE_FEATURE_SORT_OPTIMIZE_MEMORY | ||
| 531 | if (count_to_optimize_dups != 0) | ||
| 532 | count_to_optimize_dups--; | ||
| 533 | if (count_to_optimize_dups == 0) { | ||
| 534 | size_t len; | ||
| 535 | char *new_line; | ||
| 536 | char first = *line; | ||
| 537 | |||
| 538 | /* On kernel/linux/arch/ *.[ch] files, | ||
| 539 | * this reduces memory usage by 6%. | ||
| 540 | * yes | head -99999999 | sort | ||
| 541 | * goes down from 1900Mb to 380 Mb. | ||
| 542 | */ | ||
| 543 | if (first == '\0' || first == '\n') { | ||
| 544 | len = !(first == '\0'); | ||
| 545 | new_line = (char*)"\n" + 1 - len; | ||
| 546 | goto replace; | ||
| 547 | } | ||
| 548 | len = strlen(line); | ||
| 549 | if (len <= prev_len) { | ||
| 550 | new_line = lines[linecount-1] + (prev_len - len); | ||
| 551 | if (strcmp(line, new_line) == 0) { | ||
| 552 | if (can_drop_dups && prev_len == len) { | ||
| 553 | free(line); | ||
| 554 | continue; | ||
| 555 | } | ||
| 556 | replace: | ||
| 557 | free(line); | ||
| 558 | line = new_line; | ||
| 559 | } | ||
| 560 | } | ||
| 561 | prev_len = len; | ||
| 562 | } | ||
| 563 | #endif | ||
| 492 | lines = xrealloc_vector(lines, 6, linecount); | 564 | lines = xrealloc_vector(lines, 6, linecount); |
| 493 | lines[linecount++] = line; | 565 | lines[linecount++] = line; |
| 494 | } | 566 | } |
| @@ -510,6 +582,8 @@ int sort_main(int argc UNUSED_PARAM, char **argv) | |||
| 510 | } | 582 | } |
| 511 | return EXIT_SUCCESS; | 583 | return EXIT_SUCCESS; |
| 512 | } | 584 | } |
| 585 | #else | ||
| 586 | //TODO: lighter version which only drops total dups if can_drop_dups == true | ||
| 513 | #endif | 587 | #endif |
| 514 | 588 | ||
| 515 | /* For stable sort, store original line position beyond terminating NUL */ | 589 | /* For stable sort, store original line position beyond terminating NUL */ |
