coreutils

General Software Utilities
git clone http://git.omkov.net/coreutils
Log | Tree | Refs | README | LICENCE | Download

Author: Jamozed <[email protected]>
Date: 2021-01-07 13:51:54
Commit: be0a1d282aa384272424ae6966cc85342e66bcfe
Parent: ca214639145b349fd7ba5bd02eb0bcd8b02746f1

wc: Handle multibyte characters

Diffstat

M src/wc.c | 53 +++++++++++++++++++++++++++++++++++++++--------------

1 file changed, 39 insertions, 14 deletions

diff --git a/src/wc.c b/src/wc.c
index aff65f9..c130415 100644
--- a/src/wc.c
+++ b/src/wc.c
@@ -1,4 +1,4 @@
-// wc.c, version 0.1.0
+// wc.c, version 0.2.0
 // OMKOV coreutils implementation of POSIX wc
 // Copyright (C) 2020, Jakob Wakeling
 // All rights reserved.
@@ -31,8 +31,7 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
 */
 
 /*
-	TODO Handle multi-byte characters according to locale.
-	TODO Improve word counting algorithm.
+	FIXME See line 112.
 */
 
 #include "lib/error.h"
@@ -42,8 +41,10 @@ CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE SOFTWARE.
 #include <locale.h>
 #include <stdbool.h>
 #include <stdio.h>
+#include <wchar.h>
+#include <wctype.h>
 
-#define VERSION "0.1.0"
+#define VERSION "0.2.0"
 
 static struct lop lops[] = {
 	{ "help",    ARG_NUL, 256 },
@@ -55,7 +56,7 @@ static bool cflag, lflag, mflag, wflag;
 static size_t ctotal, ltotal, mtotal, wtotal;
 
 static inline int wc(const char *file);
-static inline void report(size_t c, size_t l, size_t m, size_t w, const char *f);
+static void report(size_t c, size_t l, size_t m, size_t w, const char *f);
 
 static void hlp(void);
 static void ver(void);
@@ -73,7 +74,6 @@ int main(int ac, char *av[]) { A0 = av[0];
 	}
 
 	char *lc = setlocale(LC_ALL, "");
-	printf("%s\n", lc);
 
 	// If no options specified, use default format
 	if (!cflag && !lflag && !mflag && !wflag) { cflag = lflag = wflag = true; }
@@ -94,19 +94,40 @@ int main(int ac, char *av[]) { A0 = av[0];
 	If the file path given is NULL or "-", then use stdin.
 */
 static inline int wc(const char *file) {
-	FILE *fi; size_t ccount = 0, lcount = 0, mcount = 0, wcount = 0;
-	bool wordflag = false;
+	char buf[BUFSIZ * 16]; FILE *fi;
+	
+	size_t ccount = 0, lcount = 0, mcount = 0, wcount = 0;
+	bool inword = false;
 
 	if (!file || (file[0] == '-' && file[1] == 0)) { fi = stdin; }
 	else if (!(fi = fopen(file, "r"))) { return 1; }
 
-	for (int c; (c = fgetc(fi)) != EOF;) {
-		++ccount; ++mcount;
-		if (c == '\n') { ++lcount; }
-		if (isspace(c)) { if (!wordflag) { ++wcount; } wordflag = true; }
-		else { wordflag = false; }
+	for (size_t c; (c = fread(buf, 1, sizeof (buf), fi)) != 0;) {
+		size_t r = c; ccount += c; bool failflag = false;
+		
+		for (char *p = buf; r != 0;) {
+			wchar_t wc; size_t n = mbrtowc(&wc, p, r, NULL);
+			
+			switch (n) {
+			// FIXME Case (size_t)-2 fails when incomplete strings of more than
+			// one byte are encountered, I can't think of a way to fix this
+			// right now because I have no way of knowing how many bytes it has
+			// processed. This shouldn't occur on correctly encoded files though
+			case (size_t)-2: { ++p; --r; continue; }
+			case (size_t)-1: { ++p; --r; continue; }
+			case 0: { n = 1; wc = 0; break; }
+			}
+			
+			if (wc == '\n') { ++lcount; }
+			if (iswspace(wc)) { if (inword) { inword = false; ++wcount; }}
+			else { inword = true; }
+			
+			p += n; r -= n; ++mcount;
+		}
 	}
 
+	if (inword) { ++wcount; } // Add final word to count if applicable
+	
 	report(ccount, lcount, mcount, wcount, file);
 	ctotal += ccount; ltotal += lcount; mtotal += mcount; wtotal += wcount;
 
@@ -114,11 +135,9 @@ static inline int wc(const char *file) {
 }
 
 /* Report the appropriate metrics */
-static inline void report(size_t c, size_t l, size_t m, size_t w, const char *f) {
-	if (lflag) { printf("%zu ", l); }
-	if (wflag) { printf("%zu ", w); }
-	if (mflag) { printf("%zu ", m); }
-	if (cflag) { printf("%zu ", c); }
+static void report(size_t c, size_t l, size_t m, size_t w, const char *f) {
+	if (lflag) { printf("%zu ", l); } if (wflag) { printf("%zu ", w); }
+	if (mflag) { printf("%zu ", m); } if (cflag) { printf("%zu ", c); }
 	if (f) { fputs(f, stdout); } fputc('\n', stdout); return;
 }