ArtArchive/Class2/marc_template/index.js

153 lines
3.0 KiB
JavaScript

/*
Node.js parser for MARC Files from Library of Congress
- Make network JSON files for consumption by sigma.js
- npm run download-data to get data files (you may have to install wget)
Jer Thorp (@blprnt)
December, 2017
*/
let request = require('request');
const fs = require('fs');
const zlib = require('zlib');
const concat = require('concat-stream');
const xml2object = require('xml2object');
const appRoot = require('app-root-path');
const natural = require('natural');
var dataPath = appRoot + "/data";
const marc_location = dataPath;
const filePrefix = "Visual.Materials"
var docCount = 0;
var docCounts = [];
var callNumCounts = [];
//XML Parser
var parser;
// Create a new xml parser looking for the record objects
function makeParser() {
parser = new xml2object([ 'record' ]);
parser.outs = [];
parser.on('object', function(name, obj) {
parseRecord(obj);
});
parser.on('end', function() {
console.log('Finished parsing xml!');
onParseFinished();
});
}
//Record parser
//Parse MARC record into a usable JSON object
//https://folgerpedia.folger.edu/Interpreting_MARC_records#2xx
//SUPER rough for now!
const marcDict = {};
marcDict["245"] = {"a" :"Title"};
var nameDict = {};
var lastNameDict = {};
var callDict = {};
var callYearTotals = [];
var allRecords = [];
function parseRecord(obj) {
record = {};
for (var i = 0; i < obj.datafield.length; i++) {
var df = obj.datafield[i];
//Get the numeric tag
var tag = df.tag;
//If we have the tag in our dictionary, write to the JSON object
//Based on the code (doesn't work for all cases?)
if (marcDict[tag]) {
for (var j = 0; j < df.subfield.length; j++) {
var code = df.subfield[j].code;
var disp = df.subfield[j]['$t'];
if (marcDict[tag][code]) {
if (!record[marcDict[tag][code]]) {
record[marcDict[tag][code]] = [];
}
record[marcDict[tag][code]].push(disp);
}
}
}
}
if (record.Title) {
if (record.Title.length > 0) {
var t = record.Title[0];
var chk = checkForColor(t);
if (chk.chk) console.log(chk.col + ":" + record.Title[0]);
}
}
}
function checkForColor(_s) {
var colors = [" red"," blue"," green"];
var chk = {chk:false, col:null};
for (var i = 0; i < colors.length; i++) {
if (_s.indexOf(colors[i]) != -1) {
chk.chk = true;
chk.col = colors[i];
}
}
return(chk);
}
function incrementDict(dict, val, yi) {
if (!dict[val]) {
dict[val] = {
"name":val,
"total":0,
"years":[],
"callNums":{}
};
}
}
function onParseFinished() {
nextFile();
}
var counter = 1;
function nextFile() {
var n = (counter < 10 ? "0":"") + counter;
var url = marc_location + "/" + filePrefix + ".2014.part" + n + ".xml.gz";
var rstream = fs.createReadStream(url);
var gunzip = zlib.createGunzip();
makeParser();
allRecords = [];
console.log("LOADING FILE : " + url);
counter ++;
if (counter < 42) {
rstream // reads from myfile.txt.gz
.pipe(gunzip) // uncompresses
.pipe(parser.saxStream); //Parses into record objects
}
}
nextFile();