Adding Class 11 examples.
This commit is contained in:
parent
b2a6e5070a
commit
3aadce4524
|
@ -1 +0,0 @@
|
||||||
https://www.loc.gov/cds/downloads/MDSConnect/Maps.2014.part01.xml.gz
|
|
|
@ -1,4 +0,0 @@
|
||||||
node_modules/*
|
|
||||||
data/*.gz
|
|
||||||
data/*.xml
|
|
||||||
out/*
|
|
|
@ -1,41 +0,0 @@
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part01.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part02.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part03.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part04.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part05.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part06.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part07.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part08.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part09.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part10.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part11.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part12.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part13.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part14.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part15.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part16.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part17.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part18.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part19.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part20.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part21.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part22.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part23.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part24.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part25.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part26.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part27.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part28.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part29.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part30.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part31.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part32.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part33.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part34.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part35.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part36.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part37.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part38.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part39.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part40.xml.gz
|
|
||||||
http://www.loc.gov/cds/downloads/MDSConnect/BooksAll.2014.part41.xml.gz
|
|
|
@ -1,197 +0,0 @@
|
||||||
/*
|
|
||||||
|
|
||||||
Clean Base MARC example
|
|
||||||
Jer Thorp
|
|
||||||
3/11/19
|
|
||||||
|
|
||||||
- npm install
|
|
||||||
- npm run download-data
|
|
||||||
- npm start
|
|
||||||
|
|
||||||
NOTE: The URLs you'd like the download script to fetch must be listed in data/urls.txt
|
|
||||||
NOTE: If the download-data command doesn't work, you probably need to install wget:
|
|
||||||
OSX: brew install wget
|
|
||||||
Windows: http://gnuwin32.sourceforge.net/packages/wget.htm
|
|
||||||
|
|
||||||
This example runs through the Books - (~25m records) - and finds titles with the word 'monkey'.
|
|
||||||
It's a purposely simple example to build on. Following examples look at how to expand these concepts.
|
|
||||||
|
|
||||||
*/
|
|
||||||
|
|
||||||
//We're using the xml2object package, which takes XML loaded as text and parses it into a javascript object
|
|
||||||
const xml2object = require('xml2object');
|
|
||||||
//The filesystem package is used to load the .gz files from the local directory
|
|
||||||
const fs = require('fs');
|
|
||||||
//The zlib package is used to unzip the .gz files
|
|
||||||
const zlib = require('zlib');
|
|
||||||
//I like to use this package which provides a clean way to reference to root directory of a node project
|
|
||||||
const appRoot = require('app-root-path');
|
|
||||||
|
|
||||||
//Where is the data?
const dataPath = appRoot + "/data";

//Which subset of the MARC files were we looking for?
const filePrefix = "BooksAll";

//How many files are in each subset?
//A plain object is the idiomatic container for string-keyed lookups
//(the original used an array purely for its expando properties).
const fileMap = {
  "BooksAll": 41,
  "Computer.Files": 1,
  "Maps": 1,
  "Music": 1,
  "Names": 37,
  "Serials": 11,
  "Subjects": 2,
  "Visual.Materials": 1
};

//Total number of files to load for the chosen prefix
const fileCount = fileMap[filePrefix];

//Number of files we've already loaded.
//We start at 1 because the MARC files are 1-indexed.
var fileCounter = 1;

//List to hold objects we want to write to file at the end
const outList = [];

//The xml2object package needs us to build a parser object - it ingests the XML
//and triggers functions as records are parsed and when parsing is complete.
//We construct it with the list of XML element names we care about ('record').
const parser = new xml2object([ 'record' ]);
//------------------XML PARSER ---------------------------------------------------------------------------!!
//The parser was constructed with an array of the xml elements we're interested
//in - in our case, 'record' objects. This function wires up what happens as
//those records are parsed.
function makeParser() {

  //The parser's on method handles events. Here, we define what happens when it
  //finishes parsing a single record object.
  parser.on('object', function(name, obj) {

    //Use the parseRecord method to get the Title of the object.
    //The '*' wildcard collects every subfield of tag 245 into the Title property.
    var marcDict = {};
    marcDict["245"] = {"*" :"Title"};

    var record = parseRecord(obj, marcDict);

    //****************************** HERE'S THE PIECE OF CODE THAT ACTUALLY DOES THE THING!!***********
    if (record.Title) {
      var fullTitle = record.Title.join(" ");
      //Case-insensitive search for 'monkey' anywhere in the full title
      if (fullTitle.toLowerCase().includes('monkey')) {
        console.log("FOUND A MONKEY!");
        console.log(record.Title);
        outList.push({title: record.Title});
      }
    }
  });

  //And what happens when it finishes parsing all of the records:
  //write out results so far and move on to the next file.
  parser.on('end', function() {
    onParseFinished();
  });
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//------------------MARC PARSE FUNCTION ---------------------------------------------------------------------------!!
//This function expects a record object from xml2object, and a dictionary
//object which links the MARC tags and subfields to a property name.
//
//For example, you could do this:
// marcDict["260"] = {"c" :"Year"};
//
//Which asks the parser to link records with a tag of 260 and a subfield of c
//to the property Year.
//
//You can also use * to say you want ALL subfields of a tag to be stored in a
//property:
//
// marcDict["245"] = {"*" :"Title"};
//
//Returns an object mapping each requested property name to an ARRAY of the
//matching subfield display strings.
function parseRecord(obj, marcDict) {

  //Declared locally - the original leaked `record` as an implicit global,
  //so concurrent/interleaved parses would have corrupted each other's results.
  var record = {};

  //A record with no datafields (or a malformed one) simply yields an empty result.
  var datafields = (obj && obj.datafield) ? obj.datafield : [];

  for (var i = 0; i < datafields.length; i++) {
    var df = datafields[i];
    //Get the numeric tag
    var tag = df.tag;

    //If we have the tag in our dictionary, write to the JSON object
    //(skip datafields with no subfields at all)
    if (marcDict[tag] && df.subfield) {
      //A '*' mapping means every subfield of this tag maps to one property
      var isAll = marcDict[tag]['*'];

      for (var j = 0; j < df.subfield.length; j++) {
        var code = isAll ? "*" : df.subfield[j].code;
        var disp = df.subfield[j]['$t'];

        if (marcDict[tag][code] || isAll) {
          //Lazily create the array the first time this property is seen
          if (!record[marcDict[tag][code]]) {
            record[marcDict[tag][code]] = [];
          }
          record[marcDict[tag][code]].push(disp);
        }
      }
    }
  }

  return record;
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//------------------FILE LOADING CASCADE ---------------------------------------------------------------------------!!
//These two functions sequence through the list of MARC records one by one and
//process them with our xml2object parser.
function loadNextFile() {

  //Stop once every file in the set has been streamed through the parser.
  if (fileCounter > fileCount) {
    return;
  }

  //Zero-pad single-digit file numbers ("1" -> "01") to match the file names.
  const n = String(fileCounter).padStart(2, "0");

  //Construct the path to the local .gz file.
  const url = `${dataPath}/${filePrefix}.2014.part${n}.xml.gz`;

  //Open a read stream on the compressed file and an unzip transform.
  const rstream = fs.createReadStream(url);
  const gunzip = zlib.createGunzip();

  rstream            // reads from the path we've constructed
    .pipe(gunzip)    // uncompresses
    .pipe(parser.saxStream);  // parses into record objects

  fileCounter++;
  console.log("LOADING FILE : " + url);
}
|
|
||||||
|
|
||||||
|
|
||||||
//Called when the parser finishes one file: persist everything collected so
//far, then kick off the next file in the sequence.
function onParseFinished() {

  //Write every time - useful in very long processes
  writeFile(outList);

  try {
    loadNextFile();
  } catch(err) {
    //Include the underlying error so failures are diagnosable -
    //the original logged only which file number we were on.
    console.log("ERROR LOADING NEXT FILE: " + fileCounter + " (" + err.message + ")");
  }
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
//------------------JSON WRITER ---------------------------------------------------------------------------!!
//Serializes any JSON-serializable object and writes it (asynchronously) to
//out/output_<filePrefix>.json.
function writeFile(json) {

  //Use a distinct name - the original shadowed the parameter with
  //`var json = JSON.stringify(json, ...)`.
  const payload = JSON.stringify(json, null, 2);

  //Write
  console.log("WRITING." + payload.length);

  //filePrefix (defined near the top of this file) selects the output file name
  fs.writeFile(appRoot + "/out/output_" + filePrefix + ".json", payload, 'utf8', function(err) {
    //The original callback silently ignored write failures.
    if (err) {
      console.log("ERROR SAVING JSON: " + err.message);
    } else {
      console.log("Saved JSON.");
    }
  });
}
|
|
||||||
|
|
||||||
//PULL THE TRIGGER.
//Wire up the parser's event handlers, then start the file-loading cascade;
//each file's 'end' event advances to the next file via onParseFinished().
makeParser();
loadNextFile();
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,36 +0,0 @@
|
||||||
{
|
|
||||||
"name": "cleanmarc_list",
|
|
||||||
"version": "1.0.0",
|
|
||||||
"lockfileVersion": 1,
|
|
||||||
"requires": true,
|
|
||||||
"dependencies": {
|
|
||||||
"app-root-path": {
|
|
||||||
"version": "2.1.0",
|
|
||||||
"resolved": "https://registry.npmjs.org/app-root-path/-/app-root-path-2.1.0.tgz",
|
|
||||||
"integrity": "sha1-mL9lmTJ+zqGZMJhm6BQDaP0uZGo="
|
|
||||||
},
|
|
||||||
"fs": {
|
|
||||||
"version": "0.0.1-security",
|
|
||||||
"resolved": "https://registry.npmjs.org/fs/-/fs-0.0.1-security.tgz",
|
|
||||||
"integrity": "sha1-invTcYa23d84E/I4WLV+yq9eQdQ="
|
|
||||||
},
|
|
||||||
"sax": {
|
|
||||||
"version": "1.2.4",
|
|
||||||
"resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz",
|
|
||||||
"integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw=="
|
|
||||||
},
|
|
||||||
"xml2object": {
|
|
||||||
"version": "0.1.2",
|
|
||||||
"resolved": "https://registry.npmjs.org/xml2object/-/xml2object-0.1.2.tgz",
|
|
||||||
"integrity": "sha1-hylkKI6BgaUP3UT3iRCX/lyYK0U=",
|
|
||||||
"requires": {
|
|
||||||
"sax": ">=0.3.5"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"zlib": {
|
|
||||||
"version": "1.0.5",
|
|
||||||
"resolved": "https://registry.npmjs.org/zlib/-/zlib-1.0.5.tgz",
|
|
||||||
"integrity": "sha1-bnyXL8NxxkWmr7A6sUdp3vEU/MA="
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,19 +0,0 @@
|
||||||
{
|
|
||||||
"name": "cleanmarc_list",
|
|
||||||
"version": "1.0.0",
|
|
||||||
"description": "ITP Artists in the Archive MARC Example",
|
|
||||||
"main": "index.js",
|
|
||||||
"scripts": {
|
|
||||||
"start": "node index.js",
|
|
||||||
"download-data": "wget --input-file data/urls.txt --directory-prefix data"
|
|
||||||
},
|
|
||||||
"dependencies": {
|
|
||||||
"app-root-path": "^2.1.0",
|
|
||||||
"fs": "^0.0.1-security",
|
|
||||||
"xml2object": "^0.1.2",
|
|
||||||
"zlib": "^1.0.5"
|
|
||||||
},
|
|
||||||
"devDependencies": {},
|
|
||||||
"author": "Jer Thorp",
|
|
||||||
"license": "ISC"
|
|
||||||
}
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue