Author | Jamozed <[email protected]> |
Date | 2021-01-07 13:51:54 |
Commit | be0a1d282aa384272424ae6966cc85342e66bcfe |
Parent | ca214639145b349fd7ba5bd02eb0bcd8b02746f1 |
wc: Handle multibyte characters
Diffstat
M | src/wc.c | | | 55 | +++++++++++++++++++++++++++++++++++++------------------ |
1 file changed, 37 insertions, 18 deletions
diff --git a/src/wc.c b/src/wc.c index aff65f9..c130415 100644 --- a/src/wc.c +++ b/src/wc.c @@ -1,4 +1,4 @@ -// wc.c, version 0.1.0 +// wc.c, version 0.2.0 // OMKOV coreutils implementation of POSIX wc // Copyright (C) 2020, Jakob Wakeling // All rights reserved. @@ -31,8 +31,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. */ /* - TODO Handle multi-byte characters according to locale. - TODO Improve word counting algorithm. + FIXME See line 112. */ #include "lib/error.h" @@ -42,8 +41,10 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE. #include <locale.h> #include <stdbool.h> #include <stdio.h> +#include <wchar.h> +#include <wctype.h> -#define VERSION "0.1.0" +#define VERSION "0.2.0" static struct lop lops[] = { { "help", ARG_NUL, 256 }, @@ -55,7 +56,7 @@ static bool cflag, lflag, mflag, wflag; static size_t ctotal, ltotal, mtotal, wtotal; static inline int wc(const char *file); -static inline void report(size_t c, size_t l, size_t m, size_t w, const char *f); +static void report(size_t c, size_t l, size_t m, size_t w, const char *f); static void hlp(void); static void ver(void); @@ -73,7 +74,6 @@ int main(int ac, char *av[]) { A0 = av[0]; } char *lc = setlocale(LC_ALL, ""); - printf("%s\n", lc); // If no options specified, use default format if (!cflag && !lflag && !mflag && !wflag) { cflag = lflag = wflag = true; } @@ -94,19 +94,40 @@ int main(int ac, char *av[]) { A0 = av[0]; If the file path given is NULL or "-", then use stdin. 
*/ static inline int wc(const char *file) { - FILE *fi; size_t ccount = 0, lcount = 0, mcount = 0, wcount = 0; - bool wordflag = false; + char buf[BUFSIZ * 16]; FILE *fi; + + size_t ccount = 0, lcount = 0, mcount = 0, wcount = 0; + bool inword = false; if (!file || (file[0] == '-' && file[1] == 0)) { fi = stdin; } else if (!(fi = fopen(file, "r"))) { return 1; } - for (int c; (c = fgetc(fi)) != EOF;) { - ++ccount; ++mcount; - if (c == '\n') { ++lcount; } - if (isspace(c)) { if (!wordflag) { ++wcount; } wordflag = true; } - else { wordflag = false; } + for (size_t c; (c = fread(buf, 1, sizeof (buf), fi)) != 0;) { + size_t r = c; ccount += c; bool failflag = false; + + for (char *p = buf; r != 0;) { + wchar_t wc; size_t n = mbrtowc(&wc, p, r, NULL); + + switch (n) { + // FIXME Case (size_t)-2 fails when incomplete strings of more than + // one byte are encountered, I can't think of a way to fix this + // right now because I have no way of knowing how many bytes it has + // processed. This shouldn't occur on correctly encoded files though + case (size_t)-2: { ++p; --r; continue; } + case (size_t)-1: { ++p; --r; continue; } + case 0: { n = 1; wc = 0; break; } + } + + if (wc == '\n') { ++lcount; } + if (iswspace(wc)) { if (inword) { inword = false; ++wcount; }} + else { inword = true; } + + p += n; r -= n; ++mcount; + } } + if (inword) { ++wcount; } // Add final word to count if applicable + report(ccount, lcount, mcount, wcount, file); ctotal += ccount; ltotal += lcount; mtotal += mcount; wtotal += wcount; @@ -114,11 +135,9 @@ static inline int wc(const char *file) { } /* Report the appropriate metrics */ -static inline void report(size_t c, size_t l, size_t m, size_t w, const char *f) { - if (lflag) { printf("%zu ", l); } - if (wflag) { printf("%zu ", w); } - if (mflag) { printf("%zu ", m); } - if (cflag) { printf("%zu ", c); } +static void report(size_t c, size_t l, size_t m, size_t w, const char *f) { + if (lflag) { printf("%zu ", l); } if (wflag) { printf("%zu ", 
w); } + if (mflag) { printf("%zu ", m); } if (cflag) { printf("%zu ", c); } if (f) { fputs(f, stdout); } fputc('\n', stdout); return; }