Added tokenizing/stemming to marc example.

2019-02-26 08:34:31 -05:00 · 2019-02-26 08:34:31 -05:00 · bc69373828
parent 3a16f4273d
commit bc69373828
1 changed files with 13 additions and 3 deletions
--- a/Class4/marc_template/index.js
+++ b/Class4/marc_template/index.js
@@ -124,11 +124,21 @@ var allRecords = [];

 //Function to check a record for specific words
 function checkForWords(_r, _w) {
+
+	//Tokenize the record (break it into words)
+	tokenizer = new natural.TreebankWordTokenizer();
+	var words = [tokenizer.tokenize(_r)][0];
+
 	var chk = {chk:false, w:null};
+	
 	for (var i = 0; i < _w.length; i++) {
-		if (_r.indexOf(' ' + _w[i] + ' ') != -1) {
-			chk.chk = true;
-			chk.w = _w[i];
+		for (var j = 0; j < words.length; j++) {
+			//Stem the word we're checking so that dogs becomes dog, etc.
+			var stemmedWord = natural.PorterStemmer.stem(words[j]);
+			if (stemmedWord.toLowerCase() == _w[i].toLowerCase()) {
+				chk.chk = true;
+				chk.w = _w[i];
+			}
 		}
 	}
 	return(chk);