/*

Node.js parser for MARC Files from Library of Congress
- Make network JSON files for consumption by sigma.js
- npm run download-data to get data files (you may have to install wget)

Jer Thorp (@blprnt)
December, 2017

*/

let request = require('request');
const fs = require('fs');
const zlib = require('zlib');
const concat = require('concat-stream');
const xml2object = require('xml2object');
const appRoot = require('app-root-path');
const natural = require('natural');

var dataPath = appRoot + "/data";
const marc_location = dataPath;

var docCount = 0;
var docCounts = [];
var callNumCounts = [];

// Maps a network key ("Occupation", "Places") to
// { network: { nodes: [], edges: [] }, maps: { nodes: { label -> node } } }
var networkMap = {};

// XML parser for the file currently being read; replaced by makeParser().
var parser;

/**
 * Wrap a value in an array unless it already is one.
 *
 * NOTE(review): this assumes xml2object hands back a single child element as
 * a plain object and repeated children as an array - confirm against the
 * xml2object documentation. Missing values become an empty array.
 *
 * @param {*} value - Array, single value, or null/undefined.
 * @returns {Array} The value as an array.
 */
function asArray(value) {
  if (value === undefined || value === null) {
    return [];
  }
  return Array.isArray(value) ? value : [value];
}

/**
 * Own-property test that ignores keys inherited from Object.prototype
 * (so data values such as "constructor" never match a dictionary).
 *
 * @param {Object} obj - Object to inspect.
 * @param {string} key - Property name.
 * @returns {boolean} True when obj has key as its own property.
 */
function hasOwnKey(obj, key) {
  return Object.prototype.hasOwnProperty.call(obj, key);
}

/**
 * Create a new xml parser looking for the record objects and store it in the
 * module-level `parser`. Each 'record' object is passed to parseRecord(); when
 * the parser emits 'end', onParseFinished() is called.
 */
function makeParser() {
  parser = new xml2object(['record']);
  parser.outs = [];

  parser.on('object', function(name, obj) {
    // Sampling hook: Math.random() is always < 1, so every record is parsed.
    if (Math.random() < 1) parseRecord(obj);
  });

  parser.on('end', function() {
    console.log('Finished parsing xml!');
    onParseFinished();
  });
}

//Record parser
//Parse MARC record into a usable JSON object
//https://folgerpedia.folger.edu/Interpreting_MARC_records#2xx
//SUPER rough for now!

// MARC tag -> (subfield code -> property name on the parsed record)
const marcDict = {};
marcDict["370"] = {"a": "BirthPlace", "b": "DeathPlace", "e": "ResidencePlace", "f": "OtherPlace"};
marcDict["046"] = {"f": "BirthDate", "g": "DeathDate"};
marcDict["374"] = {"a": "Occupation"};

var nameDict = {};
var lastNameDict = {};
var callDict = {};
var callYearTotals = [];

var allRecords = [];

/**
 * Extract the fields listed in marcDict from one parsed MARC record and file
 * them into the Occupation and Places networks.
 *
 * - Every Occupation value is stemmed (ie writers -> writer) to reduce
 *   duplicates, and the stemmed list is filed under "Occupation".
 * - When both a birth place and a death place are present, the first of each
 *   is filed as a pair under "Places".
 *
 * @param {Object} obj - Record object emitted by the xml parser; its
 *   `datafield` entries carry a `tag` and `subfield` entries with `code` and
 *   `$t` (text) properties.
 */
function parseRecord(obj) {
  // Local on purpose: the original assigned to an undeclared (global) `record`.
  const record = {};

  for (const df of asArray(obj && obj.datafield)) {
    //Get the numeric tag
    const tag = df.tag;

    //If we have the tag in our dictionary, write to the JSON object
    //Based on the code (doesn't work for all cases?)
    if (!hasOwnKey(marcDict, tag)) {
      continue;
    }
    const codeMap = marcDict[tag];

    for (const sf of asArray(df.subfield)) {
      const code = sf.code;
      const disp = sf['$t'];

      if (!hasOwnKey(codeMap, code)) {
        continue;
      }
      // A subfield without text would otherwise reach the stemmer / node
      // labels as undefined.
      if (typeof disp !== 'string') {
        continue;
      }

      const field = codeMap[code];
      if (!record[field]) {
        record[field] = [];
      }
      record[field].push(disp);
    }
  }

  if (record.Occupation && record.Occupation.length > 0) {
    //Stem the occupation (ie writers -> writer) to reduce duplicates
    const stemmed = record.Occupation.map(function(occupation) {
      return natural.PorterStemmer.stem(occupation);
    });
    fileNetworkEntry("Occupation", stemmed);
  }

  if (record.BirthPlace && record.DeathPlace) {
    fileNetworkEntry("Places", [record.BirthPlace[0], record.DeathPlace[0]]);
  }
}

/**
 * Add a group of co-occurring labels to the named network: one node per
 * distinct label and one edge for every pair of entries in the group.
 *
 * Node shape:  { "id": "n0", "label": "A node", "x": 0, "y": 0, "size": 3 }
 * Edge shape:  { "id": "e0", "source": "n0", "target": "n1" }
 *
 * A node is created with size 1 and incremented on every sighting (including
 * the first), so a label seen k times ends up with size k + 1.
 *
 * @param {string} networkKey - Network to file into; created on first use.
 * @param {string[]} entryArray - Labels that occurred together in one record.
 */
function fileNetworkEntry(networkKey, entryArray) {
  //Does the key exist? If not create a new network object
  if (!networkMap[networkKey]) {
    networkMap[networkKey] = {
      "network": {"nodes": [], "edges": []},
      // Prototype-less lookup so labels like "constructor" cannot collide
      // with inherited Object.prototype members.
      "maps": {"nodes": Object.create(null)}
    };
    console.log("CREATED KEY:" + networkKey)
  }

  const network = networkMap[networkKey];
  const nodeList = network.network.nodes;
  const edgeList = network.network.edges;
  const nodeByLabel = network.maps.nodes;

  //Make a node object for each item in the entryArray
  for (const nodeName of entryArray) {
    if (!nodeByLabel[nodeName]) {
      const node = {
        "id": "n" + nodeList.length,
        "label": nodeName,
        "x": Math.random() * 100,
        "y": Math.random() * 100,
        "size": 1
      };
      nodeByLabel[nodeName] = node;
      nodeList.push(node);
    }
    nodeByLabel[nodeName].size++;
  }

  //Make edge objects: one per unordered pair of positions in entryArray
  for (let i = 0; i < entryArray.length; i++) {
    const source = nodeByLabel[entryArray[i]];
    for (let j = i + 1; j < entryArray.length; j++) {
      const target = nodeByLabel[entryArray[j]];
      edgeList.push({
        "id": "e" + edgeList.length,
        "source": source.id,
        "target": target.id
      });
    }
  }
}

/**
 * Ensure `dict` has an entry for `val`, creating an empty tally record
 * ({ name, total, years, callNums }) when it is missing. Despite the name,
 * nothing is incremented here and `yi` is not used.
 *
 * @param {Object} dict - Dictionary to initialise the entry in.
 * @param {string} val - Key (also stored as the entry's name).
 * @param {*} yi - Unused.
 */
function incrementDict(dict, val, yi) {
  if (!dict[val]) {
    dict[val] = {
      "name": val,
      "total": 0,
      "years": [],
      "callNums": {}
    };
  }
}

/**
 * Called when one XML file has been fully parsed: logs the running node
 * totals, writes both networks to disk and starts on the next file.
 * A network that has not been created yet is reported as 0 nodes.
 */
function onParseFinished() {
  const occupations = networkMap["Occupation"];
  const places = networkMap["Places"];

  console.log("TOTAL OCCS:" + (occupations ? occupations.network.nodes.length : 0));
  console.log("TOTAL PLACES:" + (places ? places.network.nodes.length : 0));

  saveNetwork("Occupation");
  saveNetwork("Places");
  nextFile();
}

/**
 * Write the named network's nodes and edges to `<name>_network.json`
 * (relative to the current working directory) as pretty-printed JSON.
 * Does nothing except log when the network does not exist.
 *
 * @param {string} name - Key into networkMap.
 */
function saveNetwork(name) {
  const network = networkMap[name];
  if (!network) {
    console.log("No " + name + " network to save.");
    return;
  }

  const json = JSON.stringify({
    "nodes": network.network.nodes,
    "edges": network.network.edges
  }, null, 2);

  //Write
  fs.writeFile(name + '_network.json', json, 'utf8', function(err) {
    if (err) {
      console.error("Failed to save " + name + " JSON: " + err.message);
      return;
    }
    console.log("Saved " + name + " JSON.");
  });
}

// Number of the next Names.2014.partNN.xml.gz file to load.
var counter = 1;

// Highest part number that is parsed. The original code tested `counter < 10`
// after incrementing, which parsed parts 01-08 and merely opened part 09.
// NOTE(review): raise this if more parts should be processed.
const LAST_PART = 8;

/**
 * Start parsing the next gzipped MARC XML file: read -> gunzip -> xml parser.
 * The chain continues from onParseFinished() until LAST_PART has been parsed.
 * The limit is checked before anything is opened so no unused file handle or
 * parser is created. Read and decompression errors are logged and mark the
 * process as failed.
 */
function nextFile() {
  if (counter > LAST_PART) {
    console.log("No more files to load.");
    return;
  }

  const n = (counter < 10 ? "0" : "") + counter;
  const url = marc_location + "/Names.2014.part" + n + ".xml.gz";
  counter++;

  makeParser();
  allRecords = [];
  console.log("LOADING FILE : " + url);

  const rstream = fs.createReadStream(url);
  const gunzip = zlib.createGunzip();

  // pipe() does not forward errors, so each stage needs its own handler.
  rstream.on('error', function(err) {
    console.error("Could not read " + url + ": " + err.message);
    process.exitCode = 1;
  });
  gunzip.on('error', function(err) {
    console.error("Could not decompress " + url + ": " + err.message);
    process.exitCode = 1;
  });

  rstream                     // reads the .xml.gz file
    .pipe(gunzip)             // uncompresses
    .pipe(parser.saxStream);  // parses into record objects
}

nextFile();