Adding Class 11 examples.

2019-04-17 11:25:22 -04:00 · 2019-04-17 11:25:22 -04:00 · bff249de75
parent 3be45b7b32
commit bff249de75
5 changed files with 363 additions and 0 deletions
--- a/Class11/MARC_Disaster/.gitignore
+++ b/Class11/MARC_Disaster/.gitignore
@ -0,0 +1,4 @@
+node_modules/*
+data/*.gz
+data/*.xml
+out/*
--- a/Class11/MARC_Disaster/data/urls.txt
+++ b/Class11/MARC_Disaster/data/urls.txt
@ -0,0 +1 @@
+https://www.loc.gov/cds/downloads/MDSConnect/Maps.2014.part01.xml.gz
--- a/Class11/MARC_Disaster/index.js
+++ b/Class11/MARC_Disaster/index.js
@ -0,0 +1,254 @@
+/*
+
+MARC List Example
+Jer Thorp
+3/11/19
+
+- npm install
+- npm run download-data
+- npm start
+
+NOTE: You will need to have the URLs you'd like the download script to get listed in data/urls.txt
+NOTE: If the download-data command doesn't work you probably need to install wget:
+	OSX: brew install wget
+	Windows: http://gnuwin32.sourceforge.net/packages/wget.htm
+
+This example runs through the Maps records and writes a tab-separated file listing the title, year
+and location of every record whose title or subject mentions a disaster keyword (fire, flood, storm, ...).
+
+This tactic could be used to get distribution data for any MARC field - and could be filtered by
+title keyword (see CleanMARC example)
+
+*/
+
+//We're using the xml2object package, which takes XML loaded as text and parses it into a javascript object
+const xml2object = require('xml2object');
+//The filesystem package is used to load the .gz files from the local directory
+const fs = require('fs');  
+//The zlib package is used to unzip the .gz files
+const zlib = require('zlib');
+//I like to use this package, which provides a clean way to reference the root directory of a node project
+const appRoot = require('app-root-path');
+//Natural is a nice NLP package for node: https://www.npmjs.com/package/natural
+const natural = require('natural');
+
+//Where is the data?
+var dataPath = appRoot + "/data";
+
+//Which subset of the MARC files are we looking for?
+const filePrefix = "Maps";
+//How many part files does each subset have?
+//This is a lookup table keyed by file prefix, so it is a plain object rather than an array.
+const fileMap = {
+	"BooksAll": 41,
+	"Computer.Files": 1,
+	"Maps": 1,
+	"Music": 1,
+	"Names": 37,
+	"Serials": 11,
+	"Subjects": 2,
+	"Visual.Materials": 1
+};
+//Total number of files to load
+const fileCount = fileMap[filePrefix];
+//Number of the next file to load
+//We start at 1 because the MARC files are 1-indexed
+var fileCounter = 1;
+
+//Counter to keep track of years (not referenced anywhere else in this file)
+var outCounter = {};
+
+//Array to hold the rows of the output file
+var rows = [];
+
+//Search word - used to name the output file in writeFile()
+var search = "fire";
+
+//The xml2object package needs us to build a parser object - that will ingest the XML and then
+//trigger functions when the parse is complete.
+//It is constructed with an array of the XML element names we want handed back as objects.
+const parser = new xml2object([ 'record' ]);
+
+//XML PARSER ---------------------------------------------------------------------------!!
+//The parser is constructed with an array of the XML elements to look for. In our case, we're
+//interested in the record objects. makeParser() attaches the handlers that process each record
+//and that run once a file has been parsed completely.
+function makeParser() {
+
+	//Which MARC tags/subfields to pull out of each record, and the property name to store them under.
+	//The mapping is the same for every record, so it is built once instead of once per record.
+	const marcDict = {
+		"245": {"*": "Title"},
+		"260": {"c": "Year"},
+		"752": {"*": "Location"},
+		"650": {"a": "Subject"}
+	};
+
+	//Keywords that mark a record as disaster-related
+	const keywords = ["hazard","fire","flood","storm","tornado","earthquake"];
+
+	//First run of four digits in the cleaned-up date string
+	const yearRegex = /(\d{4})/;
+
+	//The parser's on method handles events. Here, we'll define what happens when it finishes parsing an object
+	parser.on('object', function(name, obj) {
+
+		const record = parseRecord(obj, marcDict);
+
+		//Start from the raw 260$c values; replaced below by a four-digit year when one can be found
+		let year = record.Year;
+		if (record.Year && typeof record.Year[0] === "string") {
+			//Dashes become 5s - presumably so partial dates such as "196-" turn into a usable year
+			let cy = record.Year[0].replace(/-/g, "5");
+			//Strip punctuation
+			cy = cy.replace(/[.,\/#!$%\^&\*\[\];:{}=\-_`~()]/g,"");
+
+			//match() returns null when there is no four-digit run; in that case keep the raw value
+			const found = cy.match(yearRegex);
+			if (found) year = found[0];
+		}
+
+		//Only records that have both a location and a title are candidates
+		if (record.Location && record.Title) {
+			let matched = checkForMatches(record.Title.join(" ").toLowerCase(), keywords).chk;
+			//checkForMatches is slow, so the subjects are only checked when the title did not match
+			if (!matched && record.Subject) {
+				matched = checkForMatches(record.Subject.join(" ").toLowerCase(), keywords).chk;
+			}
+			if (matched) {
+				//A record without a year gets an empty column rather than the text "undefined"
+				const yearText = (year === undefined || year === null) ? "" : year;
+				rows.push(record.Title.join(" ") + "\t" + yearText + "\t" + record.Location.join(" "));
+				console.log(record);
+			}
+		}
+	});
+
+	//And what happens when it finishes parsing all of the records.
+	parser.on('end', function() {
+		onParseFinished();
+	});
+
+}
+
+//------------------CHECK FOR MATCHES FUNCTION ---------------------------------------------------------------------------!!
+//This function checks any string (input) against any list of candidate strings (candidates)
+//Uses NLP to split the sentence into words and also to stem
+var tokenizer = new natural.TreebankWordTokenizer();
+//Used to singularize the words so that frogs matches frog. Whether or not you have to do this will depend on what data you're trying to match.
+//For example if it's something *already* standardized (ie. Subjects) you won't have to. 
+//This function is SLOW if there are a lot of words to check against 
+var nounInflector = new natural.NounInflector();
+
+//Checks a string against a list of candidate words, ignoring case and plural forms.
+//  input      - the text to search (a string)
+//  candidates - array of words to look for
+//Returns an object with a boolean and a list of the candidates that matched (if any),
+//ie {chk:true, words:["frog","monkey"]}. A candidate is listed once per matching word in the input.
+function checkForMatches(input, candidates) {
+
+	//Tokenize the input (break it into words) and singularize every word ONCE up front.
+	//Singularizing is the slow part, so it must not be repeated for every candidate.
+	var words = tokenizer.tokenize(input).map(function(word) {
+		return nounInflector.singularize(word.toLowerCase());
+	});
+
+	//Set up our return object, this is the state that is returned with no matches
+	var chk = {chk:false, words:[]};
+
+	for (var i = 0; i < candidates.length; i++) {
+		var cand = nounInflector.singularize(candidates[i].toLowerCase());
+		for (var j = 0; j < words.length; j++) {
+			if (words[j] === cand) {
+				chk.chk = true;
+				//Report the candidate exactly as the caller spelled it
+				chk.words.push(candidates[i]);
+			}
+		}
+	}
+
+	return(chk);
+}
+
+//MARC PARSE FUNCTION ---------------------------------------------------------------------------!!
+//This function expects an object from xml2object, and a dictionary object which links
+//the MARC tags and subfields to a property name.
+//
+//For example, you could do this:
+//  marcDict["260"] = {"c" :"Year"};
+//
+//Which asks the parser to link records with a tag of 260 and a subfield of c to the property Year.
+//
+//You can also use * to say you want ALL subfields of a tag to be stored in a property:
+//
+//	marcDict["245"] = {"*" :"Title"};
+
+//Pulls the requested MARC fields out of one parsed record.
+//  obj      - a record object from the XML parser; its datafield entries carry a tag and subfields
+//  marcDict - maps tag -> { subfield code (or "*" for every subfield) -> property name }
+//Returns an object whose properties are arrays of the matching subfield texts,
+//ie { Title:["..."], Year:["1999."] }. Properties with no matching subfield are absent.
+function parseRecord(obj, marcDict) {
+	//Must be a local: assigning to an undeclared name creates an implicit global (and throws in strict mode)
+	const record = {};
+
+	//NOTE(review): xml2object appears to hand back a lone child element as a single object rather
+	//than a one-item array - TODO confirm. Normalizing here means such fields are not skipped.
+	function toArray(value) {
+		if (value === undefined || value === null) return [];
+		return Array.isArray(value) ? value : [value];
+	}
+
+	const datafields = toArray(obj.datafield);
+	for (let i = 0; i < datafields.length; i++) {
+		const df = datafields[i];
+
+		//Only tags listed in the dictionary are of interest
+		const mapping = marcDict[df.tag];
+		if (!mapping || !df.subfield) continue;
+
+		//When the dictionary has a "*" entry, every subfield of this tag goes into that property
+		const allName = mapping['*'];
+
+		const subfields = toArray(df.subfield);
+		for (let j = 0; j < subfields.length; j++) {
+			const name = allName || mapping[subfields[j].code];
+			if (!name) continue;
+
+			if (!record[name]) {
+				record[name] = [];
+			}
+			//$t holds the text content of the subfield
+			record[name].push(subfields[j]['$t']);
+		}
+	}
+	return(record);
+}
+
+
+//FILE LOADING CASCADE ---------------------------------------------------------------------------!!
+//These two functions sequence through the list of MARC records one by one and process them with our 
+//xml2object parser
+//Starts streaming the next part file (if any are left) through gunzip into the XML parser.
+//Does nothing once fileCounter has passed fileCount.
+function loadNextFile() {
+	//Nothing left to load
+	if (fileCounter > fileCount) return;
+
+	//Put a zero in file names under 10
+	var n = (fileCounter < 10 ? "0":"") + fileCounter;
+	//Construct the path to the local .gz file
+	var url = dataPath + "/" + filePrefix + ".2014.part" + n + ".xml.gz";
+
+	//Open up a read stream and unzip it
+	var rstream = fs.createReadStream(url);
+	var gunzip = zlib.createGunzip();
+
+	//Stream errors are emitted asynchronously and pipe() does not pass them along, so a
+	//try/catch around this function never sees them. Without these handlers a missing or
+	//corrupt file would crash the process with an unhandled 'error' event.
+	rstream.on('error', function(err) {
+		console.log("ERROR READING FILE : " + url, err);
+		process.exitCode = 1;
+	});
+	gunzip.on('error', function(err) {
+		console.log("ERROR UNZIPPING FILE : " + url, err);
+		process.exitCode = 1;
+	});
+
+	rstream   // reads from the path we've constructed
+	  .pipe(gunzip)  // uncompresses
+	  .pipe(parser.saxStream); //Parses into record objects
+
+	fileCounter ++;
+	console.log("LOADING FILE : " + url);
+}
+
+
+//Runs when the parser reports the end of a file: saves what has been collected so far,
+//then moves on to the next file.
+function onParseFinished() {
+
+	//Write every time - useful in very long processes
+	writeFile(rows);
+	try {
+		loadNextFile();
+	} catch(err) {
+		//Log the error itself as well, so the cause is not lost
+		console.log("ERROR LOADING NEXT FILE: " + fileCounter, err);
+	}
+}
+
+//File WRITER ---------------------------------------------------------------------------!!
+
+//Writes the collected rows, one per line, to out/<search>.tsv under the project root.
+//  rows - array of row values; each is converted to text and joined with newlines
+//The write is asynchronous; success or failure is reported on the console.
+function writeFile(rows) {
+	var text = rows.join("\n");
+	var outDir = appRoot + "/out";
+	//The output file is named after the search word
+	var outPath = outDir + "/" + search + ".tsv";
+
+	//The contents of out/ are git-ignored, so the folder may not exist in a fresh checkout
+	try {
+		if (!fs.existsSync(outDir)) {
+			fs.mkdirSync(outDir);
+		}
+	} catch (err) {
+		console.log("ERROR CREATING FOLDER : " + outDir, err);
+		return;
+	}
+
+	//Write
+	fs.writeFile(outPath, text, 'utf8', function(err) {
+		//Only report success when the write actually succeeded
+		if (err) {
+			console.log("ERROR WRITING FILE : " + outPath, err);
+			return;
+		}
+		console.log("Saved TSV : " + outPath);
+	});
+}
+
+//PULL THE TRIGGER.
+makeParser();
+loadNextFile();
+
+
+
--- a/Class11/MARC_Disaster/package-lock.json
+++ b/Class11/MARC_Disaster/package-lock.json
@ -0,0 +1,84 @@
+{
+  "name": "cleanmarc_list",
+  "version": "1.0.0",
+  "lockfileVersion": 1,
+  "requires": true,
+  "dependencies": {
+    "afinn-165": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/afinn-165/-/afinn-165-1.0.2.tgz",
+      "integrity": "sha512-oVbXkteWA6XgYndv3dXYVvulStflVYQtR2K+zp2PyaVhPkkOhZ8tAvk9V7cwaI43GwZaNqRoC2VTpoaWmFyBTA=="
+    },
+    "app-root-path": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/app-root-path/-/app-root-path-2.1.0.tgz",
+      "integrity": "sha1-mL9lmTJ+zqGZMJhm6BQDaP0uZGo="
+    },
+    "apparatus": {
+      "version": "0.0.10",
+      "resolved": "https://registry.npmjs.org/apparatus/-/apparatus-0.0.10.tgz",
+      "integrity": "sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==",
+      "requires": {
+        "sylvester": ">= 0.0.8"
+      }
+    },
+    "fs": {
+      "version": "0.0.1-security",
+      "resolved": "https://registry.npmjs.org/fs/-/fs-0.0.1-security.tgz",
+      "integrity": "sha1-invTcYa23d84E/I4WLV+yq9eQdQ="
+    },
+    "json-stable-stringify": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/json-stable-stringify/-/json-stable-stringify-1.0.1.tgz",
+      "integrity": "sha1-mnWdOcXy/1A/1TAGRu1EX4jE+a8=",
+      "requires": {
+        "jsonify": "~0.0.0"
+      }
+    },
+    "jsonify": {
+      "version": "0.0.0",
+      "resolved": "https://registry.npmjs.org/jsonify/-/jsonify-0.0.0.tgz",
+      "integrity": "sha1-LHS27kHZPKUbe1qu6PUDYx0lKnM="
+    },
+    "natural": {
+      "version": "0.6.3",
+      "resolved": "https://registry.npmjs.org/natural/-/natural-0.6.3.tgz",
+      "integrity": "sha512-78fcEdNN6Y4pv8SOLPDhJTlUG+8IiQzNx0nYpl0k7q00K4ZZuds+wDWfSa6eeiPcSQDncvV44WWGsi70/ZP3+w==",
+      "requires": {
+        "afinn-165": "^1.0.2",
+        "apparatus": "^0.0.10",
+        "json-stable-stringify": "^1.0.1",
+        "sylvester": "^0.0.12",
+        "underscore": "^1.3.1"
+      }
+    },
+    "sax": {
+      "version": "1.2.4",
+      "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz",
+      "integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw=="
+    },
+    "sylvester": {
+      "version": "0.0.12",
+      "resolved": "https://registry.npmjs.org/sylvester/-/sylvester-0.0.12.tgz",
+      "integrity": "sha1-WohEFc0tACxX56OqyZRip1zp/bQ="
+    },
+    "underscore": {
+      "version": "1.9.1",
+      "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz",
+      "integrity": "sha512-5/4etnCkd9c8gwgowi5/om/mYO5ajCaOgdzj/oW+0eQV9WxKBDZw5+ycmKmeaTXjInS/W0BzpGLo2xR2aBwZdg=="
+    },
+    "xml2object": {
+      "version": "0.1.2",
+      "resolved": "https://registry.npmjs.org/xml2object/-/xml2object-0.1.2.tgz",
+      "integrity": "sha1-hylkKI6BgaUP3UT3iRCX/lyYK0U=",
+      "requires": {
+        "sax": ">=0.3.5"
+      }
+    },
+    "zlib": {
+      "version": "1.0.5",
+      "resolved": "https://registry.npmjs.org/zlib/-/zlib-1.0.5.tgz",
+      "integrity": "sha1-bnyXL8NxxkWmr7A6sUdp3vEU/MA="
+    }
+  }
+}
--- a/Class11/MARC_Disaster/package.json
+++ b/Class11/MARC_Disaster/package.json
@ -0,0 +1,20 @@
+{
+  "name": "cleanmarc_list",
+  "version": "1.0.0",
+  "description": "ITP Artists in the Archive MARC Example",
+  "main": "index.js",
+  "scripts": {
+    "start": "node index.js",
+    "download-data": "wget --input-file data/urls.txt --directory-prefix data"
+  },
+  "dependencies": {
+    "app-root-path": "^2.1.0",
+    "fs": "^0.0.1-security",
+    "natural": "^0.6.3",
+    "xml2object": "^0.1.2",
+    "zlib": "^1.0.5"
+  },
+  "devDependencies": {},
+  "author": "Jer Thorp",
+  "license": "ISC"
+}
				`@ -0,0 +1 @@`
				`https://www.loc.gov/cds/downloads/MDSConnect/Maps.2014.part01.xml.gz`