From bff249de75fb99326c721dddc7b3762a01fc9ea1 Mon Sep 17 00:00:00 2001 From: Jer Thorp Date: Wed, 17 Apr 2019 11:25:22 -0400 Subject: [PATCH] Adding Class 11 examples. --- Class11/MARC_Disaster/.gitignore | 4 + Class11/MARC_Disaster/data/urls.txt | 1 + Class11/MARC_Disaster/index.js | 254 ++++++++++++++++++++++++ Class11/MARC_Disaster/package-lock.json | 84 ++++++++ Class11/MARC_Disaster/package.json | 20 ++ 5 files changed, 363 insertions(+) create mode 100644 Class11/MARC_Disaster/.gitignore create mode 100644 Class11/MARC_Disaster/data/urls.txt create mode 100644 Class11/MARC_Disaster/index.js create mode 100644 Class11/MARC_Disaster/package-lock.json create mode 100644 Class11/MARC_Disaster/package.json diff --git a/Class11/MARC_Disaster/.gitignore b/Class11/MARC_Disaster/.gitignore new file mode 100644 index 0000000..1d53c39 --- /dev/null +++ b/Class11/MARC_Disaster/.gitignore @@ -0,0 +1,4 @@ +node_modules/* +data/*.gz +data/*.xml +out/* \ No newline at end of file diff --git a/Class11/MARC_Disaster/data/urls.txt b/Class11/MARC_Disaster/data/urls.txt new file mode 100644 index 0000000..f4f4af4 --- /dev/null +++ b/Class11/MARC_Disaster/data/urls.txt @@ -0,0 +1 @@ +https://www.loc.gov/cds/downloads/MDSConnect/Maps.2014.part01.xml.gz \ No newline at end of file diff --git a/Class11/MARC_Disaster/index.js b/Class11/MARC_Disaster/index.js new file mode 100644 index 0000000..a1f27b1 --- /dev/null +++ b/Class11/MARC_Disaster/index.js @@ -0,0 +1,254 @@ +/* + +MARC List Example +Jer Thorp +3/11/19 + +- npm install +- npm run download-data +- npm start + +NOTE: You will need to have the URLs you'd like the download script to get listed in data/urls.txt +NOTE: If the download-data command doesn't work you probably need to install wget: + OSX: brew install wget + Windows: http://gnuwin32.sourceforge.net/packages/wget.htm + +This example runs through the Visual Materials and makes a data file recording how many items were +published in each given year. 
+ +This tactic could be used to get distribution data for any MARC field - and could be filtered by +title keyword (see CleanMARC example) + +*/ + +//We're using the xml2object package, which takes XML loaded as text and parses it into a javascript object +const xml2object = require('xml2object'); +//The filesystem package is used to load the .gz files from the local directory +const fs = require('fs'); +//The zlib package is used to unzip the .gz files +const zlib = require('zlib'); +//I like to use this package which provides a clean way to reference to root directory of a node project +const appRoot = require('app-root-path'); +//Natural is a nice NLP package for node: https://www.npmjs.com/package/natural +const natural = require('natural'); + +//Where is the data? +var dataPath = appRoot + "/data"; + +//Which subset of the MARC files were we looking for? +const filePrefix = "Maps"; +//How many of them are there? +const fileMap = []; +fileMap["BooksAll"] = 41; +fileMap["Computer.Files"] = 1; +fileMap["Maps"] = 1; +fileMap["Music"] = 1; +fileMap["Names"] = 37; +fileMap["Serials"] = 11; +fileMap["Subjects"] = 2; +fileMap["Visual.Materials"] = 1; +//Total number of files to load +const fileCount = fileMap[filePrefix]; +//Number of files we've already loaded +//We start at 1 because the MARC files are 1-indexed +var fileCounter = 1; + +//Counter to keep track of years +var outCounter = {}; + +//Array to hold CSV outs +var rows = []; + +//Search word +var search = "fire"; + +//The xml2object package needs us to build a parser object - that will ingest the XML and then +//trigger functions when the parse is complete. +const parser = new xml2object([ 'record' ]); + +//XML PARSER ---------------------------------------------------------------------------!! +//When we construct the parser with an array of which xml elements to look for. In our case, we're +//interested in the record objects. We also can pass in a reference to the file name. 
//Wires up the two parser event handlers: one that processes each parsed
//<record>, and one that advances to the next file when a file is finished.
function makeParser() {

	//The parser's "object" event fires once per parsed record element.
	parser.on('object', function(name, obj) {

		//Map the MARC tags/subfields we care about onto friendly property names.
		//(See parseRecord for how this dictionary is interpreted.)
		var marcDict = {};
		marcDict["245"] = {"*" :"Title"};     //Title statement (all subfields)
		marcDict["260"] = {"c" :"Year"};      //Publication date
		marcDict["752"] = {"*" : "Location"}; //Hierarchical place name
		marcDict["650"] = {"a" : "Subject"};  //Topical subject heading

		var record = parseRecord(obj, marcDict);

		//Try to reduce the free-text 260$c date down to a single 4-digit year.
		var year = record.Year;
		if (record.Year) {
			var cy;
			try {
				//Dates like "18--" mean "sometime that century"; turning the dashes
				//into 5s ("1855") keeps them matchable by the 4-digit regex below.
				cy = record.Year[0].replace(/-/g, "5");
				//Strip the punctuation MARC dates are littered with (brackets, dots...)
				cy = cy.replace(/[.,\/#!$%\^&\*\[\];:{}=\-_`~()]/g,"");

				var yearRegex = /(\d{4})/;

				var y = cy.match(yearRegex)[0];
				if (y) year = y;
			} catch (error) {
				//No 4-digit year could be extracted; fall through with the raw value.
				//console.log("failed year extract" + record.Year);
				//console.log(cy);
			}
		}

		//Keywords that flag a record as disaster-related.
		//FIX: defined once so the title check and the subject check can't drift
		//apart (the original repeated this literal in both calls below).
		var disasterWords = ["hazard","fire","flood","storm","tornado","earthquake"];

		if (record.Location && record.Title) {
			var chk1 = checkForMatches(record.Title.join(" ").toLowerCase(), disasterWords).chk;
			var chk2;
			if (record.Subject) {
				chk2 = checkForMatches(record.Subject.join(" ").toLowerCase(), disasterWords).chk;
			}
			if (chk1 || chk2) {
				rows.push([record.Title.join(" ") + " " + year + " " + record.Location.join(" ")]);
				console.log(record);
			}
		}
	});

	//And what happens when it finishes parsing all of the records.
	parser.on('end', function() {
		onParseFinished();
	});

}

//------------------CHECK FOR MATCHES FUNCTION ---------------------------------------------------------------------------!!
//This function checks any string (input) against any list of candidate strings (candidates)
//Uses NLP to split the sentence into words and also to stem
var tokenizer = new natural.TreebankWordTokenizer();
//Used to singularize the words so that frogs matches frog. Whether or not you have to do this will depend on what data you're trying to match.
//For example if it's something *already* standardized (ie. Subjects) you won't have to.
//This function is SLOW if there are a lot of words to check against
var nounInflector = new natural.NounInflector();

//Checks an input string against a list of candidate words.
//Returns {chk:boolean, words:[matched candidates]} — e.g. {chk:true, words:["frog"]}.
function checkForMatches(input, candidates) {

	//Tokenize the record (break it into words)
	var words = [tokenizer.tokenize(input)][0];

	//Set up our return object, this is the state that is returned with no matches
	var chk = {chk:false, words:[]};

	for (var i = 0; i < candidates.length; i++) {
		//Singularize both sides so "floods" matches "flood"
		var cand = nounInflector.singularize(candidates[i].toLowerCase());
		for (var j = 0; j < words.length; j++) {
			if (nounInflector.singularize(words[j].toLowerCase()) == cand) {
				chk.chk = true;
				chk.words.push(candidates[i]);
			}
		}
	}

	//Returns an object with a boolean and a list of words (if any)
	//ie {chk:true, words:["frog","monkey"]}
	return(chk);
}

//MARC PARSE FUNCTION ---------------------------------------------------------------------------!!
//This function expects an object from xml2obj, and a dictionary object which links
//the mark tags and subfields to a property name.
//
//For example, you could do this:
// marcDict["260"] = {"c" :"Year"};
//
//Which asks the parser to link records with a tag of 260 and a subfield of c to the property Year.
//
//You can also use * to say you want ALL subfields of a tag to be stored in a property:
//
// marcDict["245"] = {"*" :"Title"};

function parseRecord(obj, marcDict) {
	//FIX: the original assigned to an undeclared identifier here, creating an
	//implicit global that every call shared. Declaring it keeps the accumulator
	//local to this invocation.
	var record = {};

	for (var i = 0; i < obj.datafield.length; i++) {
		var df = obj.datafield[i];
		//Get the numeric tag
		var tag = df.tag;

		//If we have the tag in our dictionary, write to the JSON object
		//Based on the code (doesn't work for all cases?)
		if (marcDict[tag] && df.subfield) {
			var isAll = marcDict[tag]['*'];

			for (var j = 0; j < df.subfield.length; j++) {

				//If the dictionary entry is a wildcard, every subfield maps to the
				//same property; otherwise look up this subfield's own code.
				var code = isAll ? "*" : df.subfield[j].code;
				var disp = df.subfield[j]['$t'];

				if (marcDict[tag][code] || isAll) {
					if (!record[marcDict[tag][code]]) {
						record[marcDict[tag][code]] = [];
					}
					record[marcDict[tag][code]].push(disp);
				}
			}
		}
	}
	return(record);
}


//FILE LOADING CASCADE ---------------------------------------------------------------------------!!
//These two functions sequence through the list of MARC records one by one and process them with our
//xml2object parser
function loadNextFile() {
	if (fileCounter <= fileCount) {
		//Put a zero in file names under 10
		var n = (fileCounter < 10 ? "0":"") + fileCounter;
		//Construct the URL
		var url = dataPath + "/" + filePrefix + ".2014.part" + n + ".xml.gz";

		//Open up a read stream and unzip it
		var rstream = fs.createReadStream(url);
		var gunzip = zlib.createGunzip();

		rstream                    // reads from the url we've constructed
			.pipe(gunzip)          // uncompresses
			.pipe(parser.saxStream); //Parses into record objects

		fileCounter ++;
		console.log("LOADING FILE : " + url);

	}
}

//Called by the parser's 'end' event: flush results to disk, then chain to the
//next file in the sequence.
function onParseFinished() {

	//Write every time - useful in very long processes
	writeFile(rows);
	try {
		loadNextFile();
	} catch(err) {
		console.log("ERROR LOADING NEXT FILE: " + fileCounter);
	}
}

//File WRITER ---------------------------------------------------------------------------!!

//Writes the accumulated rows to out/<search>.tsv, one row per line.
function writeFile(rows) {
	var text = rows.join("\n");
	//FIX: the original callback ignored the error argument, so a failed write
	//(e.g. a missing out/ directory) still logged success — and the success
	//message said "JSON" although the file written is a TSV.
	fs.writeFile(appRoot + "/out/" + search + ".tsv", text, 'utf8', function(err) {
		if (err) {
			console.log("ERROR WRITING FILE: " + err);
		} else {
			console.log("Saved " + search + ".tsv");
		}
	});
}

//PULL THE TRIGGER.
+makeParser(); +loadNextFile(); + + + diff --git a/Class11/MARC_Disaster/package-lock.json b/Class11/MARC_Disaster/package-lock.json new file mode 100644 index 0000000..189ab00 --- /dev/null +++ b/Class11/MARC_Disaster/package-lock.json @@ -0,0 +1,84 @@ +{ + "name": "cleanmarc_list", + "version": "1.0.0", + "lockfileVersion": 1, + "requires": true, + "dependencies": { + "afinn-165": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/afinn-165/-/afinn-165-1.0.2.tgz", + "integrity": "sha512-oVbXkteWA6XgYndv3dXYVvulStflVYQtR2K+zp2PyaVhPkkOhZ8tAvk9V7cwaI43GwZaNqRoC2VTpoaWmFyBTA==" + }, + "app-root-path": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/app-root-path/-/app-root-path-2.1.0.tgz", + "integrity": "sha1-mL9lmTJ+zqGZMJhm6BQDaP0uZGo=" + }, + "apparatus": { + "version": "0.0.10", + "resolved": "https://registry.npmjs.org/apparatus/-/apparatus-0.0.10.tgz", + "integrity": "sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==", + "requires": { + "sylvester": ">= 0.0.8" + } + }, + "fs": { + "version": "0.0.1-security", + "resolved": "https://registry.npmjs.org/fs/-/fs-0.0.1-security.tgz", + "integrity": "sha1-invTcYa23d84E/I4WLV+yq9eQdQ=" + }, + "json-stable-stringify": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/json-stable-stringify/-/json-stable-stringify-1.0.1.tgz", + "integrity": "sha1-mnWdOcXy/1A/1TAGRu1EX4jE+a8=", + "requires": { + "jsonify": "~0.0.0" + } + }, + "jsonify": { + "version": "0.0.0", + "resolved": "https://registry.npmjs.org/jsonify/-/jsonify-0.0.0.tgz", + "integrity": "sha1-LHS27kHZPKUbe1qu6PUDYx0lKnM=" + }, + "natural": { + "version": "0.6.3", + "resolved": "https://registry.npmjs.org/natural/-/natural-0.6.3.tgz", + "integrity": "sha512-78fcEdNN6Y4pv8SOLPDhJTlUG+8IiQzNx0nYpl0k7q00K4ZZuds+wDWfSa6eeiPcSQDncvV44WWGsi70/ZP3+w==", + "requires": { + "afinn-165": "^1.0.2", + "apparatus": "^0.0.10", + "json-stable-stringify": "^1.0.1", + "sylvester": 
"^0.0.12", + "underscore": "^1.3.1" + } + }, + "sax": { + "version": "1.2.4", + "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz", + "integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==" + }, + "sylvester": { + "version": "0.0.12", + "resolved": "https://registry.npmjs.org/sylvester/-/sylvester-0.0.12.tgz", + "integrity": "sha1-WohEFc0tACxX56OqyZRip1zp/bQ=" + }, + "underscore": { + "version": "1.9.1", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz", + "integrity": "sha512-5/4etnCkd9c8gwgowi5/om/mYO5ajCaOgdzj/oW+0eQV9WxKBDZw5+ycmKmeaTXjInS/W0BzpGLo2xR2aBwZdg==" + }, + "xml2object": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/xml2object/-/xml2object-0.1.2.tgz", + "integrity": "sha1-hylkKI6BgaUP3UT3iRCX/lyYK0U=", + "requires": { + "sax": ">=0.3.5" + } + }, + "zlib": { + "version": "1.0.5", + "resolved": "https://registry.npmjs.org/zlib/-/zlib-1.0.5.tgz", + "integrity": "sha1-bnyXL8NxxkWmr7A6sUdp3vEU/MA=" + } + } +} diff --git a/Class11/MARC_Disaster/package.json b/Class11/MARC_Disaster/package.json new file mode 100644 index 0000000..6333293 --- /dev/null +++ b/Class11/MARC_Disaster/package.json @@ -0,0 +1,20 @@ +{ + "name": "cleanmarc_list", + "version": "1.0.0", + "description": "ITP Artists in the Archive MARC Example", + "main": "index.js", + "scripts": { + "start": "node index.js", + "download-data": "wget --input-file data/urls.txt --directory-prefix data" + }, + "dependencies": { + "app-root-path": "^2.1.0", + "fs": "^0.0.1-security", + "natural": "^0.6.3", + "xml2object": "^0.1.2", + "zlib": "^1.0.5" + }, + "devDependencies": {}, + "author": "Jer Thorp", + "license": "ISC" +}