Resolution of improper multibyte rendering.

2023-03-02 20:00:49 -05:00 · 2023-03-02 20:00:49 -05:00 · 0a8acd40cc
parent 8959f4ef97
commit 0a8acd40cc
2 changed files with 26 additions and 52 deletions
--- a/README.md
+++ b/README.md
@@ -36,7 +36,8 @@ When the software is running, the '?' key will give you the list of commands ava


 ## Known issues
-Menus and forms will not render correctly if your terminal window is too narrow.  Resize the terminal and this should work.
+* Menus and forms will not render correctly if your terminal window is too narrow.  Widen the terminal window and they should render correctly.
+* Extended ASCII characters (single-byte, non-UTF-8 text, typically from the Latin-1 set) are displayed as '?' because ncurses cannot print them once it has been set up for a UTF-8 locale.


 ## Feedback / Support / Gratuity
--- a/pane.cpp
+++ b/pane.cpp
@@ -101,19 +101,7 @@ void pane::renderText() {
 	int nontextual = parseConf(config["markup"]["nontextual"]);
 	// int footnotes = 0;		not implemented yet

-	// convert input to wide characters for proper rendering
-	//wchar_t* text = (wchar_t*) malloc(sizeof(wchar_t) * (length + 1));
-	//if (! text) wrapup(1, "Error allocating memory in renderText.\n");
-	//wmemset(text, L'\0', length + 1);
-	//
-	///* some typographical quotes won't go through mbstowcs (eg. Eph 4:8 BWE), so
-	 //* alter those; this seems inefficient, but we don't know how many bytes are
-	 //* in each multibyte char so I think it has to be done this way */
-	//for (int i = 0; i < length; i++) {
-	//int converted = mbstowcs(text, rawtext, i);
-	//if (converted == -1) strncpy(&(rawtext[i-1]), "'", 2);
-	//}
-	//mbstowcs(text, rawtext, length - 1);
+	// copy our unformatted text to local scope for window formatting
 	char* text = (char*) malloc(sizeof(char*) * length + 1);
 	if (! text) wrapup(1, "Error allocating memory in renderText.\n");
 	memset(text, '\0', strlen(rawtext) + 1);
@@ -129,13 +117,14 @@ void pane::renderText() {
 	for (int p = 0; p < 2; p++) {

 		// loop through the text
-		// XXX -- we have to use mbrlen() to determine how large the next character is, and advance by that
-		for (int i = 0; i < length; i++) {
+		int i = 0;
+		while (i < length) {

 			/* check if we're in markup - it's not printed and it doesn't
 			 * affect our line lengths, so spin through it */
 			if (inmarkup) {
 				if (text[i] == '>') inmarkup = 0;
+				i++;
 				continue;
 			}

@@ -143,27 +132,21 @@ void pane::renderText() {
 			if ((text[i] == '<') && (! rawonly)) {
 				inmarkup = 1;

-				//if ((! strmatch(&text[i], L"</q>", 1)) && (p == 1)) {
 				if ((! strmatch(&text[i], "</q>", 1)) && (p == 1)) {
 					// end of redletter bracket
 					makered = 0;

-				//} else if ((! strmatch(&text[i], L"</transChange>", 1))
 				} else if ((! strmatch(&text[i], "</transChange>", 1))
 									 && (p == 1)) {
 					// end of interpretive text bracket
 					makeital = 0;

-				//} else if ((! strmatch(&text[i], L"<p>", 1)) && (p == 0)) {
 				} else if ((! strmatch(&text[i], "<p>", 1)) && (p == 0)) {
 					// paragraph break - replace </p> with </>'\n'
-					//int endindex = strmatch(&text[i], L"</p>", 0);
 					int endindex = strmatch(&text[i], "</p>", 0);
 					if (endindex != -1)
-						//wmemcpy(&text[i + endindex + 2], L">\n", 2);
 						memcpy(&text[i + endindex + 2], ">\n", 2);

-				//} else if ((! strmatch(&text[i], L"<w savlm=", 1))
 				} else if ((! strmatch(&text[i], "<w savlm=", 1))
 									 && (p == 0) && (strongs == 1)) {
 					/* Strong's number - the format is below, but note
@@ -173,35 +156,27 @@ void pane::renderText() {
 					 * <w savlm="strong:[G|H]NNNN(N)">word</w> */

 					// 1. get boundary of open tag
-					//int endbracket = strmatch(&text[i], L">", 0);
 					int endbracket = strmatch(&text[i], ">", 0);

 					// 2. get Strong's parameters
-					//wchar_t* num = (wchar_t*) malloc(sizeof(wchar_t*)
 					char* num = (char*) malloc(sizeof(char*) * 100);
 					if (! num) wrapup(1,
 										"Error declaring memory in renderText.\n");
 					int numidx = 0;

-					//int strnum = strmatch(&text[i], L"strong:", 0) + 7;
 					int strnum = strmatch(&text[i], "strong:", 0) + 7;
 					while ((strnum < endbracket) && (strnum != -1)) {
-						//int nextspace = strmatch(&text[i+strnum], L" ", 0);
 						int nextspace = strmatch(&text[i+strnum], " ", 0);
-						//int space1 = strmatch(&text[i+strnum], L"\"", 0);
 						int space1 = strmatch(&text[i+strnum], "\"", 0);
 						int len = ((nextspace == -1) || (space1 < nextspace)
 														 ? space1
 														 : nextspace);

-						//if (numidx != 0) wmemcpy(&num[numidx++], L" ", 1);
 						if (numidx != 0) memcpy(&num[numidx++], " ", 1);
-						//wmemcpy(&num[numidx], &text[i+strnum], len);
 						memcpy(&num[numidx], &text[i+strnum], len);
 						numidx += len;

 						int nextnum =
-							//strmatch(&text[i+strnum], L"strong:", 0);
 							strmatch(&text[i+strnum], "strong:", 0);
 						strnum = (nextnum != -1
 											? strnum + nextnum + 7
@@ -213,31 +188,21 @@ void pane::renderText() {
 					 * below zero, otherwise get word boundaries */
 					int wordstart = endbracket + 1;
 					int endtag =
-						//(endbracket > strmatch(&text[i], L"/", 0)
 						(endbracket > strmatch(&text[i], "/", 0)
 						 ? endbracket + 1
-						 //: strmatch(&text[i], L"</w>", 0));
 						 : strmatch(&text[i], "</w>", 0));

 					// 4. determine word boundaries & rewrite
 					int wordlen = endtag - wordstart;
-					//wchar_t* word = (wchar_t*) malloc(sizeof(wchar_t*)
 					char* word = (char*) malloc(sizeof(char*)
 														* (wordlen == 0 ? 1 : wordlen));

 					if (! word) wrapup(1,
 								 "Error rewriting markup in renderText.\n");
-					//wmemcpy(word, &text[i+wordstart], wordlen);
 					memcpy(word, &text[i+wordstart], wordlen);

 					// rewrite
 					int start = i + endtag - wordlen - numidx - 4;
-					//wmemcpy(&text[start], L"\"", 1);
-					//wmemcpy(&text[start + 1], L">", 1);
-					//wmemcpy(&text[start + 2], word, wordlen);
-					//wmemcpy(&text[start + 2 + wordlen], L"[", 1);
-					//wmemcpy(&text[start + 3 + wordlen], num, numidx);
-					//wmemcpy(&text[start + 3 + wordlen + numidx], L"]", 1);
 					memcpy(&text[start], "\"", 1);
 					memcpy(&text[start + 1], ">", 1);
 					memcpy(&text[start + 2], word, wordlen);
@@ -252,13 +217,11 @@ void pane::renderText() {
 					// kept for debugging
 					//fwprintf(stderr, L"(I): %ls\n", text);

-				//} else if ((! strmatch(&text[i], L"<q marker", 1))
 				} else if ((! strmatch(&text[i], "<q marker", 1))
 									 && (p == 1) && (redletter == 1)) {
 					// start of redletter bracket
 					makered = 1;

-				//} else if ((! strmatch(&text[i], L"<transChange type=\"added\"", 1))
 				} else if ((! strmatch(&text[i], "<transChange type=\"added\"", 1))
 									 && (p == 1) && (nontextual == 1)) {
 					// start of interpretive text bracket
@@ -269,8 +232,16 @@ void pane::renderText() {
 				continue;
 			} // markup check

-			// XXX -- add in here call to mbrlen() to determine number of bytes this character is --
-			// we will need it to determine our advance amount in the loop
+			/* determine how large this multibyte character is -- we will need it to
+			 * determine our advance amount in the loop */
+			int offset = 0;
+			size_t charlen = mbrlen(&text[i], 5, NULL);
+			while (((int) mbrlen(&text[i+offset], 5, NULL) == -1) &&
+						 (i + offset < length)) {
+				/* mbrlen() says the next char is not a valid multibyte char, so find
+				 * the length by figuring out where the succeeding character is */
+				offset++;
+			}

 			if (p == 0) {
 				// handle word wrapping
@@ -301,7 +272,6 @@ void pane::renderText() {
 					lastprintspace = printable;
 				}

-				// XXX -- this is probably okay; check for linelength issues when moving to mbrlen()
 				linelength++;

 				// various word wrapping debugging statements
@@ -312,18 +282,21 @@ void pane::renderText() {

 			} else {
 				// printing -- pull out the single character we care about
-				//wchar_t single[] = L"\0\0";
-				// XXX -- need to use mbrlen() to determine how many bytes we really need to pull
-				char single[] = "\0\0";
-				//wcsncpy(&single[0], &text[i], 1);
-				strncpy(&single[0], &text[i], 1);
-
 				wattrset(pad, COLOR_PAIR(makered)
 								 | (makeital ? A_ITALIC : 0));
-				waddstr(pad, single);
+
+				if (((int) charlen == -1) && (offset == 1)) {
+					/* This is an extended ascii character (probably Latin-1) and not
+					 * UTF-8.  In setting up ncurses for UTF-8 we set a locale and seem to
+					 * make these characters unprintable.  To avoid massive rendering
+					 * errors we have to substitute them with something else. */
+					waddstr(pad, "?");
+
+				} else waddnstr(pad, &text[i], charlen);
 			}

 			printable++;
+			i += ((int) charlen == -1 ? offset : (int) charlen);
 		} // text loop

 		// text rewriting debugging