Resolution of improper multibyte rendering.

Commit to log changes in progress for retooling away from wide character strings:
- Functions in free.cpp retooled away from wide chars. Commented lines for removal still present.
- Locale change line moved in scriptura.cpp. No further changes needed here.
- Partial changes made in pane.cpp - more to come here.
2023-03-02 20:00:49 -05:00 · 2023-02-23 15:01:21 -05:00 · 2022-12-27 17:46:17 -05:00
7 changed files with 83 additions and 64 deletions
--- a/README.md
+++ b/README.md
@@ -36,7 +36,8 @@ When the software is running, the '?' key will give you the list of commands ava


 ## Known issues
-Menus and forms will not render correctly if your terminal window is too narrow.  Resize the terminal and this should work.
+* Menus and forms will not render correctly if your terminal window is too narrow.  Resize the terminal and this should work.
+* Extended ASCII characters, typically in the Latin-1 set, will not render due to a quirk of how ncurses handles UTF-8.


 ## Feedback / Support / Gratuity
--- a/free.cpp
+++ b/free.cpp
@@ -75,18 +75,25 @@ void trim(char* str) {
 	while (len-- && isspace(str[len])) str[len] = 0;
 }

-int strmatch(wchar_t text[], const wchar_t* key, int matchnow) {
+//int strmatch(wchar_t text[], const wchar_t* key, int matchnow) {
+int strmatch(char text[], const char* key, int matchnow) {
 	int retval = -1;
-	int length = wcslen(key);
-	wchar_t* substr = (wchar_t*) malloc((length + 1) * sizeof(wchar_t));
-	wmemset(substr, L'\0', length + 1);
+	//int length = wcslen(key);
+	int length = strlen(key);
+	//wchar_t* substr = (wchar_t*) malloc((length + 1) * sizeof(wchar_t));
+	char* substr = (char*) malloc((length + 1) * sizeof(char*));
+	//wmemset(substr, L'\0', length + 1);
+	memset(substr, '\0', length + 1);
 	if (! substr) wrapup(1, "Error allocating memory in strmatch.\n");

 	int i = 0;
-	int textlen = wcslen(text);
+	//int textlen = wcslen(text);
+	int textlen = strlen(text);
 	while ((text[i] != '\0') && (i < (textlen - length))) {
-		wmemcpy(substr, &text[i], length);
-		if (! wcscmp(substr, key)) {
+		//wmemcpy(substr, &text[i], length);
+		memcpy(substr, &text[i], length);
+		//if (! wcscmp(substr, key)) {
+		if (! strcmp(substr, key)) {
 			retval = i;
 			break;
 		}
--- a/free.h
+++ b/free.h
@@ -75,7 +75,8 @@ int parseConf(sword::SWBuf buf);
 /*! Determine if a substring is present at either the beginning of a string or
 * at any point within it, depending on a toggle for a flag -- return integer
 * of matching index. */
-int strmatch(wchar_t text[], const wchar_t* key, int matchnow);
+//int strmatch(wchar_t text[], const wchar_t* key, int matchnow);
+int strmatch(char text[], const char* key, int matchnow);

 //! Append a new string to a starray.
 starray stappend(starray arr, const char* newstr);
--- a/pane.cpp
+++ b/pane.cpp
@@ -101,19 +101,11 @@ void pane::renderText() {
 	int nontextual = parseConf(config["markup"]["nontextual"]);
 	// int footnotes = 0;		not implemented yet

-	// convert input to wide characters for proper rendering
-	wchar_t* text = (wchar_t*) malloc(sizeof(wchar_t) * (length + 1));
+	// copy our unformatted text to local scope for window formatting
+	char* text = (char*) malloc(sizeof(char*) * length + 1);
 	if (! text) wrapup(1, "Error allocating memory in renderText.\n");
-	wmemset(text, L'\0', length + 1);
-
-	/* some typographical quotes won't go through mbstowcs, so alter those; this
-	 * seems inefficient, but we don't know how many bytes are in each multibyte
-	 * char so I think it has to be done this way */
-	for (int i = 0; i < length; i++) {
-		int converted = mbstowcs(text, rawtext, 1);
-		if (converted == -1) strncpy(&(rawtext[i-1]), "'", 2);
-	}
-	mbstowcs(text, rawtext, length - 1);
+	memset(text, '\0', strlen(rawtext) + 1);
+	strncpy(text, rawtext, length);

 	// kept for debugging
 	//fwprintf(stderr, L"(O): %ls\n\n", text);
@@ -125,12 +117,14 @@ void pane::renderText() {
 	for (int p = 0; p < 2; p++) {

 		// loop through the text
-		for (int i = 0; i < length; i++) {
+		int i = 0;
+		while (i < length) {

 			/* check if we're in markup - it's not printed and it doesn't
 			 * affect our line lengths, so spin through it */
 			if (inmarkup) {
 				if (text[i] == '>') inmarkup = 0;
+				i++;
 				continue;
 			}

@@ -138,22 +132,22 @@ void pane::renderText() {
 			if ((text[i] == '<') && (! rawonly)) {
 				inmarkup = 1;

-				if ((! strmatch(&text[i], L"</q>", 1)) && (p == 1)) {
+				if ((! strmatch(&text[i], "</q>", 1)) && (p == 1)) {
 					// end of redletter bracket
 					makered = 0;

-				} else if ((! strmatch(&text[i], L"</transChange>", 1))
+				} else if ((! strmatch(&text[i], "</transChange>", 1))
 									 && (p == 1)) {
 					// end of interpretive text bracket
 					makeital = 0;

-				} else if ((! strmatch(&text[i], L"<p>", 1)) && (p == 0)) {
+				} else if ((! strmatch(&text[i], "<p>", 1)) && (p == 0)) {
 					// paragraph break - replace </p> with </>'\n'
-					int endindex = strmatch(&text[i], L"</p>", 0);
+					int endindex = strmatch(&text[i], "</p>", 0);
 					if (endindex != -1)
-						wmemcpy(&text[i + endindex + 2], L">\n", 2);
+						memcpy(&text[i + endindex + 2], ">\n", 2);

-				} else if ((! strmatch(&text[i], L"<w savlm=", 1))
+				} else if ((! strmatch(&text[i], "<w savlm=", 1))
 									 && (p == 0) && (strongs == 1)) {
 					/* Strong's number - the format is below, but note
 					 * there may be 1+ strong:[G|H]NNNN(N)'s to find,
@@ -162,29 +156,28 @@ void pane::renderText() {
 					 * <w savlm="strong:[G|H]NNNN(N)">word</w> */

 					// 1. get boundary of open tag
-					int endbracket = strmatch(&text[i], L">", 0);
+					int endbracket = strmatch(&text[i], ">", 0);

 					// 2. get Strong's parameters
-					wchar_t* num = (wchar_t*) malloc(sizeof(wchar_t*)
-																					 * 100);
+					char* num = (char*) malloc(sizeof(char*) * 100);
 					if (! num) wrapup(1,
 										"Error declaring memory in renderText.\n");
 					int numidx = 0;

-					int strnum = strmatch(&text[i], L"strong:", 0) + 7;
+					int strnum = strmatch(&text[i], "strong:", 0) + 7;
 					while ((strnum < endbracket) && (strnum != -1)) {
-						int nextspace = strmatch(&text[i+strnum], L" ", 0);
-						int space1 = strmatch(&text[i+strnum], L"\"", 0);
+						int nextspace = strmatch(&text[i+strnum], " ", 0);
+						int space1 = strmatch(&text[i+strnum], "\"", 0);
 						int len = ((nextspace == -1) || (space1 < nextspace)
 														 ? space1
 														 : nextspace);

-						if (numidx != 0) wmemcpy(&num[numidx++], L" ", 1);
-						wmemcpy(&num[numidx], &text[i+strnum], len);
+						if (numidx != 0) memcpy(&num[numidx++], " ", 1);
+						memcpy(&num[numidx], &text[i+strnum], len);
 						numidx += len;

 						int nextnum =
-							strmatch(&text[i+strnum], L"strong:", 0);
+							strmatch(&text[i+strnum], "strong:", 0);
 						strnum = (nextnum != -1
 											? strnum + nextnum + 7
 											: -1);
@@ -195,27 +188,27 @@ void pane::renderText() {
 					 * below zero, otherwise get word boundaries */
 					int wordstart = endbracket + 1;
 					int endtag =
-						(endbracket > strmatch(&text[i], L"/", 0)
+						(endbracket > strmatch(&text[i], "/", 0)
 						 ? endbracket + 1
-						 : strmatch(&text[i], L"</w>", 0));
+						 : strmatch(&text[i], "</w>", 0));

 					// 4. determine word boundaries & rewrite
 					int wordlen = endtag - wordstart;
-					wchar_t* word = (wchar_t*) malloc(sizeof(wchar_t*)
+					char* word = (char*) malloc(sizeof(char*)
 														* (wordlen == 0 ? 1 : wordlen));

 					if (! word) wrapup(1,
 								 "Error rewriting markup in renderText.\n");
-					wmemcpy(word, &text[i+wordstart], wordlen);
+					memcpy(word, &text[i+wordstart], wordlen);

 					// rewrite
 					int start = i + endtag - wordlen - numidx - 4;
-					wmemcpy(&text[start], L"\"", 1);
-					wmemcpy(&text[start + 1], L">", 1);
-					wmemcpy(&text[start + 2], word, wordlen);
-					wmemcpy(&text[start + 2 + wordlen], L"[", 1);
-					wmemcpy(&text[start + 3 + wordlen], num, numidx);
-					wmemcpy(&text[start + 3 + wordlen + numidx], L"]", 1);
+					memcpy(&text[start], "\"", 1);
+					memcpy(&text[start + 1], ">", 1);
+					memcpy(&text[start + 2], word, wordlen);
+					memcpy(&text[start + 2 + wordlen], "[", 1);
+					memcpy(&text[start + 3 + wordlen], num, numidx);
+					memcpy(&text[start + 3 + wordlen + numidx], "]", 1);

 					// clean up so we can do this again
 					free(num);
@@ -224,12 +217,12 @@ void pane::renderText() {
 					// kept for debugging
 					//fwprintf(stderr, L"(I): %ls\n", text);

-				} else if ((! strmatch(&text[i], L"<q marker", 1))
+				} else if ((! strmatch(&text[i], "<q marker", 1))
 									 && (p == 1) && (redletter == 1)) {
 					// start of redletter bracket
 					makered = 1;

-				} else if ((! strmatch(&text[i], L"<transChange type=\"added\"", 1))
+				} else if ((! strmatch(&text[i], "<transChange type=\"added\"", 1))
 									 && (p == 1) && (nontextual == 1)) {
 					// start of interpretive text bracket
 					makeital = 1;
@@ -239,6 +232,17 @@ void pane::renderText() {
 				continue;
 			} // markup check

+			/* determine how large this multibyte character is -- we will need it to
+			 * determine our advance amount in the loop */
+			int offset = 0;
+			size_t charlen = mbrlen(&text[i], 5, NULL);
+			while (((int) mbrlen(&text[i+offset], 5, NULL) == -1) &&
+						 (i + offset < length)) {
+				/* mbrlen() says the next char is not a valid multibyte char, so find
+				 * the length by figuring out where the succeeding character is */
+				offset++;
+			}
+
 			if (p == 0) {
 				// handle word wrapping
 				if (text[i] == ' ') {
@@ -278,15 +282,21 @@ void pane::renderText() {

 			} else {
 				// printing -- pull out the single character we care about
-				wchar_t single[] = L"\0\0";
-				wcsncpy(&single[0], &text[i], 1);
-
 				wattrset(pad, COLOR_PAIR(makered)
 								 | (makeital ? A_ITALIC : 0));
-				waddwstr(pad, single);
+
+				if (((int) charlen == -1) && (offset == 1)) {
+					/* This is an extended ascii character (probably Latin-1) and not
+					 * UTF-8.  In setting up ncurses for UTF-8 we set a locale and seem to
+					 * make these characters unprintable.  To avoid massive rendering
+					 * errors we have to substitute them with something else. */
+					waddstr(pad, "?");
+
+				} else waddnstr(pad, &text[i], charlen);
 			}

 			printable++;
+			i += ((int) charlen == -1 ? offset : (int) charlen);
 		} // text loop

 		// text rewriting debugging
@@ -318,9 +328,7 @@ void pane::setTitle(const char* newtitle) {

 void pane::retitle() {
 	(hasFocus ? wattrset(win, A_STANDOUT) : standend());
-	mvwprintw(win, 0, 1, "%s", "  ");
-	mvwprintw(win, 0, 3, "%s", titlebar);
-	mvwprintw(win, 0, 3+strlen(titlebar), "%s", "   ");
+	mvwprintw(win, 0, 1, "%s%s%s", "  ", titlebar, "   ");
 	wstandend(win);
 }

--- a/scabbard.cpp
+++ b/scabbard.cpp
@@ -41,15 +41,17 @@ scabbard::scabbard() {
 }

 void scabbard::constructModlist() {
-	int tmpnum = 4;
-	modtype tmpmods[4];
-	strcpy(tmpmods[0].label, "Generic Books");
+	int tmpnum = 5;
+	modtype tmpmods[5];
+	strcpy(tmpmods[0].label, sword::SWMgr::MODTYPE_GENBOOKS);
 	tmpmods[0].keytype = 1;
-	strcpy(tmpmods[1].label, "Biblical Texts");
+	strcpy(tmpmods[1].label, sword::SWMgr::MODTYPE_BIBLES);
 	tmpmods[1].keytype = 0;
-	strcpy(tmpmods[2].label, "Lexicons / Dictionaries");
+	strcpy(tmpmods[2].label, sword::SWMgr::MODTYPE_LEXDICTS);
 	tmpmods[2].keytype = 2;
-	strcpy(tmpmods[3].label, "Commentaries");
+	strcpy(tmpmods[3].label, sword::SWMgr::MODTYPE_COMMENTARIES);
+	tmpmods[3].keytype = 0;
+	strcpy(tmpmods[4].label, sword::SWMgr::MODTYPE_DAILYDEVOS);
 	tmpmods[3].keytype = 0;

 	// need some throwaway ints
--- a/scabbard.h
+++ b/scabbard.h
@@ -93,7 +93,7 @@ class scabbard {

 	/*! Struct to hold listing of different modules - each modtype corresponds
 	 * to a different type of module - see comments for modtype */
-	modtype modules[4];
+	modtype modules[5];

 	/*! Get all module types loaded in and accessible by Sword.
 	 * \returns the module types loaded into an array */
--- a/scriptura.cpp
+++ b/scriptura.cpp
@@ -265,6 +265,7 @@ int main(int argc, char** argv) {

 	/* start ncurses, disable line buffering hide cursor, and allow for fancy
 	 * keys & so on */
+	setlocale(LC_ALL, "");
 	initscr();
 	cbreak();

@@ -272,7 +273,6 @@ int main(int argc, char** argv) {
 	noecho();
 	start_color();
 	keypad(stdscr, true);
-	setlocale(LC_ALL, "");

 	// get color settings
 	if (! strcmp(config["markup"]["lettercolor"], "green")) {
Author	SHA1	Message	Date
Paul Mosier	0a8acd40cc	Resolution of improper multibyte rendering.	2023-03-02 20:00:49 -05:00
Paul Mosier	8959f4ef97	Commit to log changes in progress for retooling away from wide character strings: - Functions in free.cpp retooled away from wide chars. Commented lines for removal still present. - Locale change line moved in scriptura.cpp. No further changes needed here. - Partial changes made in pane.cpp - more to come here.	2023-02-23 15:01:21 -05:00
Paul Mosier	8bf642a8a9	Use swmgr's module type constants instead of magic strings.	2022-12-27 17:46:17 -05:00