/* vi: set sw=4 ts=4: */ /* * join -- equality join on two files * * Written by Morgan Bartlett * Copyright (C) 2025 Morgan Bartlett * * Licensed under GPLv2 or later, see file LICENSE in this source tree. */ //config:config JOIN //config: bool "join (8.7 kb)" //config: default n //config: help //config: Equality join on two files //applet:IF_JOIN(APPLET_NOEXEC(join, join, BB_DIR_USR_BIN, BB_SUID_DROP, join)) //kbuild:lib-$(CONFIG_JOIN) += join.o //usage:#define join_trivial_usage //usage: "[-a file_number | -v file_number] [-e empty] [-o list] [-t sep] [-1 field] [-2 field] file1 file2" //usage:#define join_full_usage "\n\n" //usage: "Perform a join on file1 and file2, writing to stdout\n" //usage: "\n -a FNUM Also write unpaired lines from file FNUM" //usage: "\n -v FNUM Only write unpaired lines from file FNUM" //usage: "\n -e STR Replace empty fields with STR" //usage: "\n -o LIST Write the fields in LIST" //usage: "\n -t SEP Use a different field separator" //usage: "\n -1 N Join on field N from file1" //usage: "\n -2 N Join on field N from file2" //usage: "\n" //usage: "\nLIST is a space or comma separated list of FNUM.FIELD or 0 (the join field)" #include "libbb.h" #include "unicode.h" /* This is a NOEXEC applet. Be very careful! */ /* Must match getopt32 call */ #define FLAG_UNPAIRED_ADD 1 #define FLAG_UNPAIRED_ONLY 2 #define FLAG_EMPTY_STRING 4 #define FLAG_LIST_OUTPUT 8 #define FLAG_SEP_USED 16 #define FLAG_FIELD_1 32 #define FLAG_FIELD_2 64 typedef struct { char **fields; int fieldcount; } LINE; typedef struct { FILE *fp; int idx; const char *field; /* this is a pointer to lines[0].fields[idx - 1] or "" */ LINE *lines; LINE pushback; int linecount; int linecap; } FDAT; static int field_split(const char *s, char sep, char ***fields) { /* compare awk_split from editors/awk.c */ int n; const char *ps = s; const char *s1 = s; char **sl; size_t sn; /* in worst case, every character is a separator */ *fields = sl = xzalloc(sizeof(char*) * (strlen(s) + 2)); n = 0; if (sep != '\0') { /* single-character split */ while ((s1 = strchr(ps, sep)) != NULL) { size_t count = s1 - ps; sl[n] = xstrndup(ps, count); ps = s1 + 1; n++; } /* Add the last field */ sl[n] = xstrdup(ps); n++; return n; } /* default split: skip the initial whitespace and then any run of non-whitespace characters is a field */ while (*s) { /* blanks are space and tab in the POSIX locale */ while (*s == ' ' || *s == '\t') s++; if (!*s) break; s1 = strpbrk(s, " \t"); if (s1 == NULL) /* last field */ sn = strlen(s); else sn = s1 - s; sl[n] = xstrndup(s, s1 - s); n++; s += sn; } return n; } static inline void freefields(char **fields) { char **fieldsiter = fields; while (*fieldsiter) { free(*fieldsiter); fieldsiter++; } free(fields); } static void readfields(char sep, FDAT *f) { LINE curr = { .fields = NULL, .fieldcount = 0 }; char *line; const char *field2; int n; for (n = 0; n < f->linecount; n ++) { if (f->lines[n].fields) { freefields(f->lines[n].fields); f->lines[n].fields = NULL; } } f->linecount = 0; f->field = ""; /* read a line and split, checking the pushed back line */ if (f->pushback.fields) { memcpy(&curr, &f->pushback, sizeof(LINE)); f->pushback.fields = NULL; f->pushback.fieldcount = 0; } else { line = xmalloc_fgetline(f->fp); if (line == NULL) { return; } n = field_split(line, sep, &curr.fields); free(line); curr.fieldcount = n; } if (f->idx > curr.fieldcount) f->field = ""; else f->field = (curr.fields)[f->idx - 1]; if (f->linecount == f->linecap) { f->linecap = f->linecap ? f->linecap * 2 : 8; f->lines = xrealloc(f->lines, sizeof(LINE) * f->linecap); } memcpy(&f->lines[f->linecount], &curr, sizeof(LINE)); f->linecount ++; while (true) { /* read a line and split */ line = xmalloc_fgetline(f->fp); if (line == NULL) { return; } n = field_split(line, sep, &curr.fields); free(line); curr.fieldcount = n; if (f->idx > n) field2 = ""; else field2 = (curr.fields)[f->idx - 1]; if (strcmp(f->field, field2) == 0) { /* add to the stack */ if (f->linecount == f->linecap) { f->linecap = f->linecap ? 8 : f->linecap * 2; f->lines = xrealloc(f->lines, sizeof(LINE) * f->linecap); } memcpy(&f->lines[f->linecount], &curr, sizeof(LINE)); f->linecount ++; } else { memcpy(&f->pushback, &curr, sizeof(LINE)); return; } } } static inline const char *fieldorempty(const char *field, const char *empty_str) { if (*field == '\0') return empty_str; else return field; } static void printfields(int *format, const char *empty_str, char sep, FDAT *f1, FDAT *f2) { const char *field = (f1 == NULL) ? f2->field : f1->field; int linef1; int linef2; int fn; int format_fl; int format_no; bool first; LINE *l1; LINE *l2; if (sep == '\0') sep = ' '; for (linef1 = 0; linef1 < (f1 ? f1->linecount : 1); linef1 ++) { l1 = f1 ? &f1->lines[linef1] : NULL; for (linef2 = 0; linef2 < (f2 ? f2->linecount : 1); linef2 ++) { l2 = f2 ? &f2->lines[linef2] : NULL; if (format) { /* Format is a sort of null-terminated array: They are indexed by [n] for file number and [n + 1] for field number. If file number is 0 then that is the null-terminator. If file number is neither 1 nor 2 then we have the join field. */ first = true; while (format[0]) { if (first) first = false; else bb_putchar(sep); format_fl = format[0]; format_no = format[1]; if (format_fl == 1) { if (l1 != NULL && l1->fieldcount >= format_no && format_no > 0) fputs_stdout(fieldorempty(l1->fields[format_no - 1], empty_str)); else fputs_stdout(empty_str); } else if (format_fl == 2) { if (l2 != NULL && l2->fieldcount >= format_no && format_no > 0) fputs_stdout(fieldorempty(l2->fields[format_no - 1], empty_str)); else fputs_stdout(empty_str); } else fputs_stdout(fieldorempty(field, empty_str)); format += 2; } } else { fputs_stdout(fieldorempty(field, empty_str)); fn = 1; if (l1 != NULL) while (l1->fields[fn - 1]) { if (fn != f1->idx) { bb_putchar(sep); fputs_stdout(fieldorempty(l1->fields[fn - 1], empty_str)); } fn++; } fn = 1; if (l2 != NULL) while (l2->fields[fn - 1]) { if (fn != f2->idx) { bb_putchar(sep); fputs_stdout(fieldorempty(l2->fields[fn - 1], empty_str)); } fn++; } } bb_putchar('\n'); } } } static void parsejformat(int **format_p, const char *format_str) { int *format; int field_no; char scache[20] = { 0 }; int n = 0; const char *s1 = format_str; size_t sn; /* The most formats we can have is if the format_str is noted as 0,0,0,0 which ends up being (strlen + 1) / 2 and then we need to have two for each entry plus one for the terminator */ *format_p = format = xzalloc(sizeof(int) * ((strlen(format_str) + 1) / 2) * 2 + 1); /* default split: skip the initial whitespace and then any run of non-whitespace characters is a field */ while (*format_str) { /* blanks are space and tab in the POSIX locale */ while (*format_str == ' ' || *format_str == '\t' || *format_str == ',') format_str++; if (!*format_str) break; s1 = strpbrk(format_str, " \t,"); if (s1 == NULL) /* last field */ sn = strlen(format_str); else sn = s1 - format_str; if (sn == 1 && *format_str == '0') { format[n * 2] = 3; } else if (sn >= 3 && (*format_str == '1' || *format_str == '2') && *(format_str + 1) == '.') { if (sn > 21) /* res_str > 19 */ bb_simple_error_msg_and_die("field specifier too large"); memcpy(scache, format_str + 2, sn - 2); scache[sn - 2] = '\0'; field_no = xatoi_positive(scache); if (field_no <= 0) bb_simple_error_msg_and_die("field number can't be 0"); format[n * 2] = *format_str - '0'; format[n * 2 + 1] = field_no; } else { bb_simple_error_msg_and_die("field specifier must be 0, 1.x or 2.x"); } n++; format_str += sn; } } int join_main(int argc, char **argv) MAIN_EXTERNALLY_VISIBLE; int join_main(int argc, char **argv) { exitcode_t exitcode = EXIT_SUCCESS; llist_t *unpaired_add = NULL; llist_t *unpaired_only = NULL; const char *empty_str = ""; const char *format_str = NULL; char *separator = NULL; uint32_t opts; bool print1unpaired = false; bool print2unpaired = false; bool printpaired = true; const char *filename1; const char *filename2; int *format = NULL; FDAT f1 = { .fp = NULL, .idx = 1, .field = NULL, .lines = NULL, .pushback = { .fields = NULL, .fieldcount = 0 }, .linecount = 0, .linecap = 0, }; FDAT f2 = { .fp = NULL, .idx = 1, .field = NULL, .lines = NULL, .pushback = { .fields = NULL, .fieldcount = 0 }, .linecount = 0, .linecap = 0, }; char *unpaired_str; /* We can't use \0 as a real separator, so this stands in for the whitespace+ pattern */ char sep = 0; int i; init_unicode(); opts = getopt32(argv, "^a:*v:*" "e:" "o:" "t:" "1:+2:+" "\0""=2", &unpaired_add, &unpaired_only, &empty_str, &format_str, &separator, &f1.idx, &f2.idx); argc -= optind; argv += optind; if ((opts & FLAG_UNPAIRED_ADD) || (opts & FLAG_UNPAIRED_ONLY)) { if ((opts & FLAG_UNPAIRED_ADD) && (opts & FLAG_UNPAIRED_ONLY)) bb_simple_error_msg_and_die("-a and -v are exclusive"); printpaired = !(opts & FLAG_UNPAIRED_ONLY); while ((unpaired_str = llist_pop((opts & FLAG_UNPAIRED_ONLY) ? &unpaired_only : &unpaired_add)) != NULL) { if (*unpaired_str == '\0' || unpaired_str[1] != '\0') bb_simple_error_msg_and_die("-a and -v take either 1 or 2"); else if (*unpaired_str == '1') print1unpaired = true; else if (*unpaired_str == '2') print2unpaired = true; else bb_simple_error_msg_and_die("-a and -v take either 1 or 2"); } unpaired_str = NULL; } if (f1.idx < 1 || f2.idx < 1) bb_simple_error_msg_and_die("field 0 doesn't exist"); if (opts & FLAG_SEP_USED) { if (strlen(separator) != 1) bb_simple_error_msg_and_die("separators are single characters"); sep = *separator; } if (opts & FLAG_LIST_OUTPUT) parsejformat(&format, format_str); filename1 = argv[0]; filename2 = argv[1]; f1.fp = xfopen_stdin(filename1); f2.fp = xfopen_stdin(filename2); if (f1.fp == f2.fp) bb_simple_error_msg_and_die("can't combine stdin with itself"); /* Outline of the program: - Read a line from each file into cache - While there are cached current lines from both files: - If the two indexing fields are the same: - If no unpaired_only options: - Print joined line formatted with fields - Read lines from both files - Elseif indexing1 < indexing2 then - If print1unpaired: - Print line from file 1 - Read another line from file 1 - Elseif indexing1 > indexing2 then - If print2unpaired: - Print line from file 2 - Read another line from file 2 - If there are cached lines from file 1 AND print1unpaired: - Print the rest of the lines from file 1 - If there are cached lines from file 2 AND print2unpaired: - Print the rest of the lines from file 2 */ /* Notes for POSIX: https://pubs.opengroup.org/onlinepubs/9699919799/utilities/join.html Multiple instances of the same key are meant to give combinatorial results. E.g: file1: a 1 a 2 file2: a A a B a C result: a 1 A a 1 B a 1 C a 2 A a 2 B a 2 C This is done in this implementation by reading all the lines that have the same key and then iterating through the combinations when printing them. (test with `./busybox join <(printf 'a 1\na 2\n';) <(printf 'a A\na B\na C\n')`) */ readfields(sep, &f1); readfields(sep, &f2); while (f1.linecount && f2.linecount) { int res = strcmp(f1.field, f2.field); if (res == 0) { if (printpaired) printfields(format, empty_str, sep, &f1, &f2); readfields(sep, &f1); readfields(sep, &f2); } else if (res < 0) { if (print1unpaired) printfields(format, empty_str, sep, &f1, NULL); readfields(sep, &f1); } else { if (print2unpaired) printfields(format, empty_str, sep, NULL, &f2); readfields(sep, &f2); } } if (f1.linecount && print1unpaired) { do { printfields(format, empty_str, sep, &f1, NULL); readfields(sep, &f1); } while (f1.linecount); } if (f2.linecount && print2unpaired) { do { printfields(format, empty_str, sep, NULL, &f2); readfields(sep, &f2); } while (f2.linecount); } if (f1.linecap) { for (i = 0; i < f1.linecount; i ++) { if (f1.lines[i].fields) freefields(f1.lines[i].fields); } free(f1.lines); } if (f2.linecap) { for (i = 0; i < f2.linecount; i ++) { if (f2.lines[i].fields) freefields(f2.lines[i].fields); } free(f2.lines); } if (format) free(format); fclose_if_not_stdin(f1.fp); fclose_if_not_stdin(f2.fp); fflush_stdout_and_exit(exitcode); }