Adding Class 11 examples.
This commit is contained in:
parent
3be45b7b32
commit
bff249de75
|
@ -0,0 +1,4 @@
|
|||
node_modules/*
|
||||
data/*.gz
|
||||
data/*.xml
|
||||
out/*
|
|
@ -0,0 +1 @@
|
|||
https://www.loc.gov/cds/downloads/MDSConnect/Maps.2014.part01.xml.gz
|
|
@ -0,0 +1,254 @@
|
|||
/*
|
||||
|
||||
MARC List Example
|
||||
Jer Thorp
|
||||
3/11/19
|
||||
|
||||
- npm install
|
||||
- npm run download-data
|
||||
- npm start
|
||||
|
||||
NOTE: You will need to have the URLs you'd like the download script to get listed in data/urls.txt
|
||||
NOTE: If the download-data command doesn't work you probably need to install wget:
|
||||
OSX: brew install wget
|
||||
Windows: http://gnuwin32.sourceforge.net/packages/wget.htm
|
||||
|
||||
This example runs through the Maps records and writes a list of the items whose title or subject
|
||||
mentions a natural hazard (hazard, fire, flood, storm, tornado or earthquake), with year and location.
|
||||
|
||||
This tactic could be used to get distribution data for any MARC field - and could be filtered by
|
||||
title keyword (see CleanMARC example)
|
||||
|
||||
*/
|
||||
|
||||
//We're using the xml2object package, which takes XML loaded as text and parses it into a javascript object
|
||||
const xml2object = require('xml2object');
|
||||
//The filesystem package is used to load the .gz files from the local directory
|
||||
const fs = require('fs');
|
||||
//The zlib package is used to unzip the .gz files
|
||||
const zlib = require('zlib');
|
||||
//I like to use this package, which provides a clean way to reference the root directory of a node project
|
||||
const appRoot = require('app-root-path');
|
||||
//Natural is a nice NLP package for node: https://www.npmjs.com/package/natural
|
||||
const natural = require('natural');
|
||||
|
||||
//Where is the data?
const dataPath = appRoot + "/data";

//Which subset of the MARC files are we looking for?
const filePrefix = "Maps";

//How many .gz parts is each subset split into? Keyed by file prefix.
//This is a plain object rather than an array because the keys are names, not numeric indexes.
const fileMap = {
  "BooksAll": 41,
  "Computer.Files": 1,
  "Maps": 1,
  "Music": 1,
  "Names": 37,
  "Serials": 11,
  "Subjects": 2,
  "Visual.Materials": 1
};

//Total number of files to load.
//This is undefined when filePrefix is not listed in fileMap, in which case loadNextFile loads nothing.
const fileCount = fileMap[filePrefix];

//Number of the next file to load (incremented by loadNextFile)
//We start at 1 because the MARC files are 1-indexed
let fileCounter = 1;

//Counter to keep track of years (not used by this example)
const outCounter = {};

//Array to hold the output rows
const rows = [];

//Search word - also used as the name of the output file (out/<search>.tsv)
const search = "fire";

//The xml2object package needs us to build a parser object - that will ingest the XML and then
//trigger functions when the parse is complete.
const parser = new xml2object([ 'record' ]);
|
||||
|
||||
//XML PARSER ---------------------------------------------------------------------------!!
|
||||
//We construct the parser with an array of the XML elements to look for. In our case, we're
|
||||
//interested in the record objects. We also can pass in a reference to the file name.
|
||||
//Dictionary handed to parseRecord: MARC tag -> subfield code ("*" = every subfield) -> record property.
//Built once here instead of once per record.
const MARC_FIELDS = {
  "245": { "*": "Title" },
  "260": { "c": "Year" },
  "752": { "*": "Location" },
  "650": { "a": "Subject" }
};

//A record is kept when its title or its subjects contain any of these words.
const HAZARD_WORDS = ["hazard", "fire", "flood", "storm", "tornado", "earthquake"];

//Pulls a four-digit year out of the raw 260$c values.
//  rawYears: the record's Year array, or undefined when the record has no 260$c.
//Returns the first run of four digits found in the first value, after replacing every "-" with "5"
//and stripping punctuation (so "[19--?]" becomes "1955").
//Returns the raw values unchanged when no year can be extracted, and "" when there are no values,
//so that the text "undefined" never ends up in an output row.
function extractYear(rawYears) {
  if (!rawYears) {
    return "";
  }
  const first = rawYears[0];
  if (typeof first !== "string") {
    return rawYears;
  }
  const cleaned = first
    .replace(/-/g, "5")
    .replace(/[.,\/#!$%\^&\*\[\];:{}=\-_`~()]/g, "");
  const found = cleaned.match(/(\d{4})/);
  return found ? found[0] : rawYears;
}

//Registers the handlers on the shared parser:
//  'object' - fired for each parsed record; records that have a Location and a Title, and whose
//             title or subjects match HAZARD_WORDS, are added to rows as "<title> <year> <location>".
//  'end'    - fired when the parser finishes; hands over to onParseFinished.
function makeParser() {

  //The parser's on method handles events. Here, we define what happens when it finishes parsing an object
  parser.on('object', function (name, obj) {
    const record = parseRecord(obj, MARC_FIELDS);

    //Only records with both a location and a title are candidates
    if (!record.Location || !record.Title) {
      return;
    }

    const title = record.Title.join(" ");
    const inTitle = checkForMatches(title.toLowerCase(), HAZARD_WORDS).chk;
    const inSubject = record.Subject
      ? checkForMatches(record.Subject.join(" ").toLowerCase(), HAZARD_WORDS).chk
      : false;

    if (inTitle || inSubject) {
      //Each row is a one-element array; writeFile joins the rows with newlines
      rows.push([title + " " + extractYear(record.Year) + " " + record.Location.join(" ")]);
      console.log(record);
    }
  });

  //And what happens when it finishes parsing all of the records.
  parser.on('end', function () {
    onParseFinished();
  });
}
|
||||
|
||||
//------------------CHECK FOR MATCHES FUNCTION ---------------------------------------------------------------------------!!
|
||||
//This function checks any string (input) against any list of candidate strings (candidates)
|
||||
//Uses NLP to split the sentence into words and also to stem
|
||||
//Tokenizer from natural; checkForMatches uses it to split a string into words.
const tokenizer = new natural.TreebankWordTokenizer();

//Inflector used to singularize words so that frogs matches frog. Whether or not you have to do this
//depends on the data you're trying to match: something *already* standardized (ie. Subjects) won't need it.
//Matching is SLOW if there are a lot of words to check against.
const nounInflector = new natural.NounInflector();
|
||||
|
||||
//Checks a string against a list of candidate words, comparing singularized, lower-cased forms.
//  input:      the text to search (it is split into words with the shared tokenizer)
//  candidates: array of words to look for
//Returns an object with a boolean and the list of matching candidates (as they were passed in),
//ie {chk:true, words:["frog","monkey"]}. A candidate is listed once for every word it matches.
function checkForMatches(input, candidates) {

  //Tokenize the input and singularize each word ONCE, rather than once per candidate
  const stems = tokenizer.tokenize(input).map(function (word) {
    return nounInflector.singularize(word.toLowerCase());
  });

  //Set up our return object, this is the state that is returned with no matches
  const chk = { chk: false, words: [] };

  for (const candidate of candidates) {
    const cand = nounInflector.singularize(candidate.toLowerCase());
    for (const stem of stems) {
      if (stem === cand) {
        chk.chk = true;
        chk.words.push(candidate);
      }
    }
  }

  return chk;
}
|
||||
|
||||
//MARC PARSE FUNCTION ---------------------------------------------------------------------------!!
|
||||
//This function expects an object from xml2object, and a dictionary object which links
|
||||
//the MARC tags and subfields to a property name.
|
||||
//
|
||||
//For example, you could do this:
|
||||
// marcDict["260"] = {"c" :"Year"};
|
||||
//
|
||||
//Which asks the parser to link records with a tag of 260 and a subfield of c to the property Year.
|
||||
//
|
||||
//You can also use * to say you want ALL subfields of a tag to be stored in a property:
|
||||
//
|
||||
// marcDict["245"] = {"*" :"Title"};
|
||||
|
||||
//Wraps a value in an array unless it already is one; null/undefined become an empty array.
//NOTE(review): xml2object appears to hand back a lone child element as a plain object and only
//repeated children as an array - confirm against the package. Without this normalization a
//datafield with a single subfield (ie. a 650 with only $a) is silently skipped.
function asList(value) {
  if (value === undefined || value === null) {
    return [];
  }
  return Array.isArray(value) ? value : [value];
}

//Turns one parsed MARC record into a simple object of named properties.
//  obj:      the record object from the parser; its datafield entries carry a tag and subfield(s),
//            and each subfield carries a code and its text in $t
//  marcDict: tag -> { subfield code (or "*" for all subfields) -> property name }
//Returns an object whose properties are arrays of the matching subfield texts, in document order.
//Tags that are not in marcDict, and datafields without subfields, are ignored.
function parseRecord(obj, marcDict) {
  //Declared locally: this used to be an implicit global, which throws in strict mode
  const record = {};

  for (const df of asList(obj.datafield)) {
    const tagDict = marcDict[df.tag];

    //Skip tags we weren't asked for
    if (!tagDict || !df.subfield) {
      continue;
    }

    //With "*" in the dictionary every subfield of this tag goes into one property
    const isAll = tagDict['*'];

    for (const sf of asList(df.subfield)) {
      const code = isAll ? "*" : sf.code;
      const prop = tagDict[code];

      if (prop) {
        if (!record[prop]) {
          record[prop] = [];
        }
        record[prop].push(sf['$t']);
      }
    }
  }

  return record;
}
|
||||
|
||||
|
||||
//FILE LOADING CASCADE ---------------------------------------------------------------------------!!
|
||||
//These two functions sequence through the list of MARC records one by one and process them with our
|
||||
//xml2object parser
|
||||
//Streams the next MARC file into the parser: read from disk -> gunzip -> parser.saxStream.
//Does nothing once fileCounter has passed fileCount (or when fileCount is undefined).
//Read and unzip failures (ie. a file that was never downloaded) are reported on the console;
//they arrive as stream 'error' events, which a try/catch around this call cannot intercept.
function loadNextFile() {
  if (!(fileCounter <= fileCount)) {
    return;
  }

  //Put a zero in file names under 10
  const n = String(fileCounter).padStart(2, "0");
  //Construct the path of the file
  const url = dataPath + "/" + filePrefix + ".2014.part" + n + ".xml.gz";

  const reportError = function (err) {
    console.error("ERROR READING FILE : " + url + " (" + err.message + ")");
  };

  //Open up a read stream and unzip it
  const rstream = fs.createReadStream(url);
  const gunzip = zlib.createGunzip();
  rstream.on('error', reportError);
  gunzip.on('error', reportError);

  rstream // reads from the path we've constructed
    .pipe(gunzip) // uncompresses
    .pipe(parser.saxStream); //Parses into record objects

  fileCounter++;
  console.log("LOADING FILE : " + url);
}
|
||||
|
||||
|
||||
//Called when the parser has finished a file: saves what we have so far, then starts the next file.
function onParseFinished() {

  //Write every time - useful in very long processes
  writeFile(rows);

  try {
    loadNextFile();
  } catch (err) {
    //Include the cause, otherwise the log only says that something went wrong
    const detail = err && err.message ? err.message : String(err);
    console.log("ERROR LOADING NEXT FILE: " + fileCounter + " (" + detail + ")");
  }
}
|
||||
|
||||
//File WRITER ---------------------------------------------------------------------------!!
|
||||
|
||||
//Writes the rows, one per line, to out/<search>.tsv under the project root.
//  rows: array of rows; they are joined with newlines
//The out directory is created when it is missing (it is not checked in, so a fresh clone has none).
//Failures are reported on the console rather than thrown, so a long run is not interrupted.
function writeFile(rows) {
  const text = rows.join("\n");
  const outDir = appRoot + "/out";
  //The file is named after the search word
  const outPath = outDir + "/" + search + ".tsv";

  try {
    fs.mkdirSync(outDir, { recursive: true });
  } catch (err) {
    console.error("ERROR CREATING " + outDir + ": " + err.message);
    return;
  }

  //Write
  fs.writeFile(outPath, text, 'utf8', function (err) {
    if (err) {
      console.error("ERROR SAVING " + outPath + ": " + err.message);
      return;
    }
    console.log("Saved " + outPath);
  });
}
|
||||
|
||||
//PULL THE TRIGGER.
|
||||
//Register the parser's 'object' and 'end' handlers first, so they are in place before any data is streamed in.
makeParser();
|
||||
//Start the loading cascade with the first file; the parser's 'end' handler calls onParseFinished, which calls loadNextFile again.
loadNextFile();
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,84 @@
|
|||
{
|
||||
"name": "cleanmarc_list",
|
||||
"version": "1.0.0",
|
||||
"lockfileVersion": 1,
|
||||
"requires": true,
|
||||
"dependencies": {
|
||||
"afinn-165": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/afinn-165/-/afinn-165-1.0.2.tgz",
|
||||
"integrity": "sha512-oVbXkteWA6XgYndv3dXYVvulStflVYQtR2K+zp2PyaVhPkkOhZ8tAvk9V7cwaI43GwZaNqRoC2VTpoaWmFyBTA=="
|
||||
},
|
||||
"app-root-path": {
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/app-root-path/-/app-root-path-2.1.0.tgz",
|
||||
"integrity": "sha1-mL9lmTJ+zqGZMJhm6BQDaP0uZGo="
|
||||
},
|
||||
"apparatus": {
|
||||
"version": "0.0.10",
|
||||
"resolved": "https://registry.npmjs.org/apparatus/-/apparatus-0.0.10.tgz",
|
||||
"integrity": "sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==",
|
||||
"requires": {
|
||||
"sylvester": ">= 0.0.8"
|
||||
}
|
||||
},
|
||||
"fs": {
|
||||
"version": "0.0.1-security",
|
||||
"resolved": "https://registry.npmjs.org/fs/-/fs-0.0.1-security.tgz",
|
||||
"integrity": "sha1-invTcYa23d84E/I4WLV+yq9eQdQ="
|
||||
},
|
||||
"json-stable-stringify": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/json-stable-stringify/-/json-stable-stringify-1.0.1.tgz",
|
||||
"integrity": "sha1-mnWdOcXy/1A/1TAGRu1EX4jE+a8=",
|
||||
"requires": {
|
||||
"jsonify": "~0.0.0"
|
||||
}
|
||||
},
|
||||
"jsonify": {
|
||||
"version": "0.0.0",
|
||||
"resolved": "https://registry.npmjs.org/jsonify/-/jsonify-0.0.0.tgz",
|
||||
"integrity": "sha1-LHS27kHZPKUbe1qu6PUDYx0lKnM="
|
||||
},
|
||||
"natural": {
|
||||
"version": "0.6.3",
|
||||
"resolved": "https://registry.npmjs.org/natural/-/natural-0.6.3.tgz",
|
||||
"integrity": "sha512-78fcEdNN6Y4pv8SOLPDhJTlUG+8IiQzNx0nYpl0k7q00K4ZZuds+wDWfSa6eeiPcSQDncvV44WWGsi70/ZP3+w==",
|
||||
"requires": {
|
||||
"afinn-165": "^1.0.2",
|
||||
"apparatus": "^0.0.10",
|
||||
"json-stable-stringify": "^1.0.1",
|
||||
"sylvester": "^0.0.12",
|
||||
"underscore": "^1.3.1"
|
||||
}
|
||||
},
|
||||
"sax": {
|
||||
"version": "1.2.4",
|
||||
"resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz",
|
||||
"integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw=="
|
||||
},
|
||||
"sylvester": {
|
||||
"version": "0.0.12",
|
||||
"resolved": "https://registry.npmjs.org/sylvester/-/sylvester-0.0.12.tgz",
|
||||
"integrity": "sha1-WohEFc0tACxX56OqyZRip1zp/bQ="
|
||||
},
|
||||
"underscore": {
|
||||
"version": "1.9.1",
|
||||
"resolved": "https://registry.npmjs.org/underscore/-/underscore-1.9.1.tgz",
|
||||
"integrity": "sha512-5/4etnCkd9c8gwgowi5/om/mYO5ajCaOgdzj/oW+0eQV9WxKBDZw5+ycmKmeaTXjInS/W0BzpGLo2xR2aBwZdg=="
|
||||
},
|
||||
"xml2object": {
|
||||
"version": "0.1.2",
|
||||
"resolved": "https://registry.npmjs.org/xml2object/-/xml2object-0.1.2.tgz",
|
||||
"integrity": "sha1-hylkKI6BgaUP3UT3iRCX/lyYK0U=",
|
||||
"requires": {
|
||||
"sax": ">=0.3.5"
|
||||
}
|
||||
},
|
||||
"zlib": {
|
||||
"version": "1.0.5",
|
||||
"resolved": "https://registry.npmjs.org/zlib/-/zlib-1.0.5.tgz",
|
||||
"integrity": "sha1-bnyXL8NxxkWmr7A6sUdp3vEU/MA="
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,20 @@
|
|||
{
|
||||
"name": "cleanmarc_list",
|
||||
"version": "1.0.0",
|
||||
"description": "ITP Artists in the Archive MARC Example",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"start": "node index.js",
|
||||
"download-data": "wget --input-file data/urls.txt --directory-prefix data"
|
||||
},
|
||||
"dependencies": {
|
||||
"app-root-path": "^2.1.0",
|
||||
"fs": "^0.0.1-security",
|
||||
"natural": "^0.6.3",
|
||||
"xml2object": "^0.1.2",
|
||||
"zlib": "^1.0.5"
|
||||
},
|
||||
"devDependencies": {},
|
||||
"author": "Jer Thorp",
|
||||
"license": "ISC"
|
||||
}
|
Loading…
Reference in New Issue