util.c: printutf8pad(): improve padded printing and printing invalid unicode characters

This affects sfeed_plain.

- Use the Unicode replacement character (codepoint 0xfffd) when a codepoint is
  invalid and proceed with printing the rest of the characters.

- When a codepoint is invalid, reset the internal state of mbtowc(3). From the
  OpenBSD man page:

  "  If a call to mbtowc() resulted in an undefined internal state, mbtowc()
     must be called with s set to NULL to reset the internal state before it
     can safely be used again."

- Optimize for the common ASCII case and use a macro to print the character
  instead of a wasteful fwrite() function call. With 250k lines (about 350MB)
  this improves printing performance from 1.7s to 1.0s on my laptop. On another
  system it improved by about 25%.  Tested with clang and gcc and also tested
  the worst-case (non-ASCII) with no penalty.

To test:

	printf '0\tabc\xc3 def' | sfeed_plain

Before:

  1970-01-01 01:00  abc

After:

  1970-01-01 01:00  abc� def
This commit is contained in:
Hiltjo Posthuma 2021-01-08 19:08:59 +01:00
parent c7e3ec5f37
commit 04b832539c
1 changed files with 34 additions and 15 deletions

49
util.c
View File

@ -234,29 +234,48 @@ printutf8pad(FILE *fp, const char *s, size_t len, int pad)
{
wchar_t wc;
size_t col = 0, i, slen;
int rl, w;
int rl, siz, w;
if (!len)
return;
slen = strlen(s);
for (i = 0; i < slen; i += rl) {
rl = w = 1;
if ((unsigned char)s[i] < 32)
continue;
if ((unsigned char)s[i] >= 127) {
if ((rl = mbtowc(&wc, s + i, slen - i < 4 ? slen - i : 4)) <= 0)
break;
if ((w = wcwidth(wc)) == -1)
for (i = 0; i < slen; i += siz) {
siz = 1;
if ((unsigned char)s[i] < 32) {
continue; /* skip control characters */
} else if ((unsigned char)s[i] >= 127) {
rl = siz = mbtowc(&wc, s + i, slen - i < 4 ? slen - i : 4);
if (rl < 0) {
mbtowc(NULL, NULL, 0); /* reset state */
siz = 1; /* next byte */
w = 1; /* replacement char is one width */
} else if ((w = wcwidth(wc)) == -1) {
continue;
}
if (col + w > len || (col + w == len && s[i + rl])) {
fputs("\xe2\x80\xa6", fp);
}
if (col + w > len || (col + w == len && s[i + siz])) {
fputs("\xe2\x80\xa6", fp); /* ellipsis */
col++;
break;
} else if (rl < 0) {
fputs("\xef\xbf\xbd", fp); /* replacement */
col++;
continue;
}
fwrite(&s[i], 1, siz, fp);
col += w;
} else {
/* simple ASCII character */
if (col + 1 > len || (col + 1 == len && s[i + 1])) {
fputs("\xe2\x80\xa6", fp); /* ellipsis */
col++;
break;
}
putc(s[i], fp);
col++;
break;
}
fwrite(&s[i], 1, rl, fp);
col += w;
}
for (; col < len; ++col)
putc(pad, fp);