// ArtArchive/Class6/CleanMARC/index.js

/*
Clean Base MARC example
Jer Thorp
3/11/19
- npm install
- npm run download-data
- npm start
NOTE: You will need to have the URLs you'd like the download script to get listed in data/urls.txt
NOTE: If the download-data command doesn't work you probably need to install wget:
OSX: brew install wget
Windows: http://gnuwin32.sourceforge.net/packages/wget.htm
This example runs through the Books set (~25m records) and finds titles containing the word 'monkey'.
It's a purposely simple example to build on. The following examples look at how to expand on these concepts.
*/
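//Example entry for data/urls.txt (illustrative only - this assumes the MARC files are hosted on the
//Library of Congress MDSConnect download site; substitute whatever source you're actually using):
//
//  https://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part01.xml.gz
//
//One URL per line. The code below expects the downloaded .gz files to end up in data/.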
//We're using the xml2object package, which takes XML loaded as text and parses it into a javascript object
const xml2object = require('xml2object');
//The built-in fs module is used to read the .gz files from the local data directory
const fs = require('fs');
//The built-in zlib module is used to unzip the .gz files
const zlib = require('zlib');
//I like to use this package because it provides a clean way to reference the root directory of a node project
const appRoot = require('app-root-path');
//Where is the data?
var dataPath = appRoot + "/data";
//Which subset of the MARC files are we looking for?
const filePrefix = "BooksAll";
//How many files are there in each subset?
const fileMap = {};
fileMap["BooksAll"] = 41;
fileMap["Computer.Files"] = 1;
fileMap["Maps"] = 1;
fileMap["Music"] = 1;
fileMap["Names"] = 37;
fileMap["Serials"] = 11;
fileMap["Subjects"] = 2;
fileMap["Visual.Materials"] = 1;
//Total number of files to load
const fileCount = fileMap[filePrefix];
//Number of files we've already loaded
//We start at 1 because the MARC files are 1-indexed
var fileCounter = 1;
//List to hold objects we want to write to file at the end
var outList = [];
//The xml2object package needs us to build a parser object - that will ingest the XML and then
//trigger functions when the parse is complete.
const parser = new xml2object([ 'record' ]);
//------------------XML PARSER ---------------------------------------------------------------------------!!
//We construct the parser with an array of the XML elements we want to look for. In our case, we're
//interested in the record elements. We can also pass in a reference to the file name.
function makeParser() {
  //The parser's on method handles events. Here, we'll define what happens when it finishes parsing an object
  parser.on('object', function(name, obj) {
    //Use the parseRecord method to get the Title of the object
    var marcDict = {};
    marcDict["245"] = {"*" :"Title"};
    var record = parseRecord(obj, marcDict);
    //****************************** HERE'S THE PIECE OF CODE THAT ACTUALLY DOES THE THING!!***********
    if (record.Title) {
      var fullTitle = record.Title.join(" ");
      if (fullTitle.toLowerCase().indexOf('monkey') != -1) {
        console.log("FOUND A MONKEY!");
        console.log(record.Title);
        outList.push({title:record.Title});
      }
    }
  });
  //And what happens when it finishes parsing all of the records.
  parser.on('end', function() {
    onParseFinished();
  });
}
//------------------MARC PARSE FUNCTION ---------------------------------------------------------------------------!!
//This function expects an object from xml2object, and a dictionary object which links
//the MARC tags and subfields to a property name.
//
//For example, you could do this:
// marcDict["260"] = {"c" :"Year"};
//
//Which asks the parser to link records with a tag of 260 and a subfield of c to the property Year.
//
//You can also use * to say you want ALL subfields of a tag to be stored in a property:
//
// marcDict["245"] = {"*" :"Title"};
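//
//For reference, here's an illustrative sketch of the xml2object output, based on how the code below reads it
//(the title text is just an example):
//
//  obj = { datafield: [ { tag: "245", subfield: [ { code: "a", "$t": "The monkey's paw." } ] }, ... ] }
//
//parseRecord walks that structure and copies the "$t" text of any matching tag/subfield into the
//returned record object.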
function parseRecord(obj, marcDict) {
  var record = {};
  for (var i = 0; i < obj.datafield.length; i++) {
    var df = obj.datafield[i];
    //Get the numeric tag
    var tag = df.tag;
    //If we have the tag in our dictionary, write to the JSON object
    //based on the subfield code (doesn't work for all cases?)
    if (marcDict[tag] && df.subfield) {
      var isAll = marcDict[tag]['*'];
      for (var j = 0; j < df.subfield.length; j++) {
        var code = isAll ? "*" : df.subfield[j].code;
        var disp = df.subfield[j]['$t'];
        if (marcDict[tag][code] || isAll) {
          if (!record[marcDict[tag][code]]) {
            record[marcDict[tag][code]] = [];
          }
          record[marcDict[tag][code]].push(disp);
        }
      }
    }
  }
  return record;
}
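//A quick sketch of how you might ask for more than one field (the property names here are just examples,
//following the pattern described above):
//
//  var marcDict = {};
//  marcDict["245"] = {"*": "Title"};  // all subfields of the title field
//  marcDict["260"] = {"c": "Year"};   // just the date subfield of the publication field
//  var record = parseRecord(obj, marcDict);
//  //record might look like: { Title: ["The monkey's paw."], Year: ["1927."] }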
//------------------FILE LOADING CASCADE ---------------------------------------------------------------------------!!
//These two functions sequence through the list of MARC files one by one and process them with our
//xml2object parser. The next file is only loaded once the parser fires its 'end' event for the current one.
function loadNextFile() {
  if (fileCounter <= fileCount) {
    //Put a zero in file names under 10
    var n = (fileCounter < 10 ? "0" : "") + fileCounter;
    //Construct the path to the local file
    var url = dataPath + "/" + filePrefix + ".2014.part" + n + ".xml.gz";
    //Open up a read stream and unzip it
    var rstream = fs.createReadStream(url);
    var gunzip = zlib.createGunzip();
    rstream                     // reads from the path we've constructed
      .pipe(gunzip)             // uncompresses
      .pipe(parser.saxStream);  // parses into record objects
    fileCounter++;
    console.log("LOADING FILE : " + url);
  }
}
function onParseFinished() {
  //Write every time - useful in very long processes
  writeFile(outList);
  try {
    loadNextFile();
  } catch (err) {
    console.log("ERROR LOADING NEXT FILE: " + fileCounter);
  }
}
//------------------JSON WRITER ---------------------------------------------------------------------------!!
//Writes any JSON object to a file
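//NOTE: this writes into the out/ folder at the project root; fs.writeFile won't create that folder
//for you, so make sure it exists before running.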
function writeFile(json) {
  var jsonString = JSON.stringify(json, null, 2);
  //Write
  console.log("WRITING. " + jsonString.length);
  //filePrefix is defined near the top of this file
  fs.writeFile(appRoot + "/out/output_" + filePrefix + ".json", jsonString, 'utf8', function(err) {
    if (err) {
      console.log("ERROR SAVING JSON: " + err);
    } else {
      console.log("Saved JSON.");
    }
  });
}
//PULL THE TRIGGER.
makeParser();
loadNextFile();