ArtArchive/Class5/marc_network/network.js

234 lines
5.2 KiB
JavaScript

/*
Node.js parser for MARC Files from Library of Congress
- Make network JSON files for consumption by sigma.js
- npm run download-data to get data files (you may have to install wget)
Jer Thorp (@blprnt)
December, 2017
*/
let request = require('request');
const fs = require('fs');
const zlib = require('zlib');
const concat = require('concat-stream');
const xml2object = require('xml2object');
const appRoot = require('app-root-path');
const natural = require('natural');
// Directory that holds the downloaded MARC data files: "<app root>/data".
var dataPath = appRoot + "/data";
// Alias of dataPath; nextFile() builds each input file path from it.
const marc_location = dataPath;
// Counters kept at module level. They are not read or written by any function
// in the visible part of this file.
var docCount = 0;
var docCounts = [];
var callNumCounts = [];
// Accumulated networks keyed by network name (e.g. "Occupation", "Places").
// Entries are created and filled by fileNetworkEntry() and written out by
// saveNetwork().
var networkMap = {};
//XML Parser
// The current xml2object parser; makeParser() replaces it for every input file.
var parser;
// Build a fresh xml2object parser that emits one object per <record> element
// and store it in the module-level `parser` variable.
//  - every emitted record is handed to parseRecord()
//  - when the stream ends, onParseFinished() is called
function makeParser() {
  // Fraction of records to process. Math.random() is always < 1, so at this
  // setting every record is parsed.
  const sampleRate = 1;

  const handleRecord = (name, obj) => {
    if (Math.random() < sampleRate) {
      parseRecord(obj);
    }
  };

  const handleEnd = () => {
    console.log('Finished parsing xml!');
    onParseFinished();
  };

  parser = new xml2object(['record']);
  parser.outs = [];
  parser.on('object', handleRecord);
  parser.on('end', handleEnd);
}
//Record parser
//Parse a MARC record into a usable JSON object
//https://folgerpedia.folger.edu/Interpreting_MARC_records#2xx
//SUPER rough for now!
// marcDict maps a MARC datafield tag to a table of subfield code -> property
// name. parseRecord() uses it to decide which subfields to keep and under which
// key to store their text.
const marcDict = {};
marcDict["370"] = {"a" :"BirthPlace", "b": "DeathPlace", "e": "ResidencePlace", "f":"OtherPlace"};
marcDict["046"] = {"f" :"BirthDate", "g":"DeathDate"};
marcDict["374"] = {"a" :"Occupation"};
// Module-level dictionaries/lists. Apart from allRecords (reset by nextFile()
// for every input file) none of these is used in the visible part of this file.
var nameDict = {};
var lastNameDict = {};
var callDict = {};
var callYearTotals = [];
var allRecords = [];
// Normalise a parsed XML child into an array: a missing child becomes [], a
// single object becomes a one-element array, an array is returned unchanged.
// NOTE(review): presumably xml2object yields a bare object (not an array) when
// an element occurs only once - confirm against the parser's output.
function asList(value) {
  if (value === undefined || value === null) {
    return [];
  }
  return Array.isArray(value) ? value : [value];
}

/**
 * Extract the fields listed in marcDict from one parsed MARC record and file
 * them into the networks.
 *
 * For every datafield whose tag is in marcDict, each subfield whose code is
 * mapped there has its text ('$t') appended to record[<mapped name>].
 * Afterwards:
 *  - if any Occupation values were found, they are stemmed with the Porter
 *    stemmer (ie writers -> writer, to reduce duplicates) and filed together
 *    under the "Occupation" network;
 *  - if both a BirthPlace and a DeathPlace were found, the first of each is
 *    filed as a pair under the "Places" network.
 *
 * @param {Object} obj - record object emitted by the XML parser; its
 *   `datafield` entries carry `tag` and `subfield` ({code, $t}) properties.
 */
function parseRecord(obj) {
  // Local on purpose: the record must not leak into (or depend on) a global.
  const record = {};

  for (const df of asList(obj.datafield)) {
    //If we have the tag in our dictionary, write to the JSON object
    //Based on the code (doesn't work for all cases?)
    const fieldNames = marcDict[df.tag];
    if (!fieldNames) {
      continue;
    }
    for (const sub of asList(df.subfield)) {
      const key = fieldNames[sub.code];
      const text = sub['$t'];
      // Skip unmapped codes and subfields that carry no text at all.
      if (!key || text === undefined || text === null) {
        continue;
      }
      if (!record[key]) {
        record[key] = [];
      }
      record[key].push(text);
    }
  }

  if (record.Occupation) {
    //Stem the occupation (ie writers -> writer) to reduce duplicates
    const stemmed = record.Occupation.map((occupation) => natural.PorterStemmer.stem(occupation));
    fileNetworkEntry("Occupation", stemmed);
  }

  if (record.BirthPlace && record.DeathPlace) {
    fileNetworkEntry("Places", [record.BirthPlace[0], record.DeathPlace[0]]);
  }
}
/**
 * Add one group of co-occurring labels to the named network.
 *
 * Creates networkMap[networkKey] on first use, makes sure a node exists for
 * every label in entryArray, bumps each node's size, and then adds one edge for
 * every pair of positions (i < j) in entryArray.
 *
 * Node shape:  { id: "n<index>", label, x, y, size }
 *   - x / y are random start positions in [0, 100)
 *   - size starts at 1 and is incremented on every occurrence, including the
 *     first one
 * Edge shape:  { id: "e<index>", source: <node id>, target: <node id> }
 *
 * @param {string} networkKey - name of the network (e.g. "Occupation").
 * @param {Array<string>} entryArray - labels that occur together in one record.
 */
function fileNetworkEntry(networkKey, entryArray) {
  //Does the key exist? If not create a new network object
  if (!networkMap[networkKey]) {
    networkMap[networkKey] = {
      "network": {"nodes": [], "edges": []},
      // Prototype-less lookup table so that labels such as "constructor",
      // "toString" or "__proto__" are ordinary keys instead of resolving to
      // inherited Object.prototype members.
      "maps": {"nodes": Object.create(null)}
    };
    console.log("CREATED KEY:" + networkKey);
  }
  var graph = networkMap[networkKey].network;
  var nodesByLabel = networkMap[networkKey].maps.nodes;
  var hasOwn = Object.prototype.hasOwnProperty;

  //Make (or look up) a node object for each item in the entryArray
  var entryNodes = [];
  for (var i = 0; i < entryArray.length; i++) {
    var label = entryArray[i];
    // Own-property check: never mistake an inherited member for a node.
    if (!hasOwn.call(nodesByLabel, label)) {
      var created = {
        "id": "n" + graph.nodes.length,
        "label": label,
        "x": Math.random() * 100,
        "y": Math.random() * 100,
        "size": 1
      };
      nodesByLabel[label] = created;
      graph.nodes.push(created);
    }
    var node = nodesByLabel[label];
    node.size++;
    entryNodes.push(node);
  }

  //Make edge objects: one per unordered pair of positions in the entry
  for (var a = 0; a < entryNodes.length; a++) {
    for (var b = a + 1; b < entryNodes.length; b++) {
      graph.edges.push({
        "id": "e" + graph.edges.length,
        "source": entryNodes[a].id,
        "target": entryNodes[b].id
      });
    }
  }
}
/**
 * Make sure `dict` has an entry for `val`.
 *
 * When dict[val] is missing (or falsy) a fresh, zeroed entry is stored there;
 * an existing entry is left untouched. Despite the name, nothing is
 * incremented here.
 *
 * @param {Object} dict - dictionary to update in place.
 * @param {string} val - key to look up; also stored as the entry's name.
 * @param {*} yi - accepted but not used by this function.
 */
function incrementDict(dict, val, yi) {
  if (dict[val]) {
    return;
  }
  const blankEntry = { "name": val, "total": 0, "years": [], "callNums": {} };
  dict[val] = blankEntry;
}
/**
 * Called when the XML parser has finished the current file: report how many
 * nodes the "Occupation" and "Places" networks hold, save each of them, and
 * move on to the next input file.
 *
 * A network only exists once fileNetworkEntry() has filed something under its
 * key, so a network that is still missing is reported as 0 and not saved
 * (instead of throwing and stopping the whole run).
 */
function onParseFinished() {
  var occupations = networkMap["Occupation"];
  var places = networkMap["Places"];

  console.log("TOTAL OCCS:" + (occupations ? occupations.network.nodes.length : 0));
  console.log("TOTAL PLACES:" + (places ? places.network.nodes.length : 0));

  if (occupations) {
    saveNetwork("Occupation");
  }
  if (places) {
    saveNetwork("Places");
  }
  nextFile();
}
/**
 * Write the named network to "<name>_network.json" (relative to the current
 * working directory) as pretty-printed JSON of the form { nodes, edges }.
 *
 * The write is asynchronous. "Saved ..." is logged only when it succeeded; a
 * failed write is reported on stderr instead of being logged as a success.
 * Asking for a network that does not exist is reported and skipped.
 *
 * @param {string} name - key of the network in networkMap.
 */
function saveNetwork(name) {
  var entry = networkMap[name];
  if (!entry) {
    console.error("No " + name + " network to save.");
    return;
  }
  var json = JSON.stringify({"nodes": entry.network.nodes, "edges": entry.network.edges}, null, 2);
  //Write
  fs.writeFile(name + '_network.json', json, 'utf8', function(err) {
    if (err) {
      console.error("Failed to save " + name + " JSON: " + err.message);
      return;
    }
    console.log("Saved " + name + " JSON.");
  });
}
var counter = 1;
function nextFile() {
var n = (counter < 10 ? "0":"") + counter;
var url = marc_location + "/Names.2014.part" + n + ".xml.gz";
var rstream = fs.createReadStream(url);
var gunzip = zlib.createGunzip();
makeParser();
allRecords = [];
console.log("LOADING FILE : " + url);
counter ++;
if (counter < 10) {
rstream // reads from myfile.txt.gz
.pipe(gunzip) // uncompresses
.pipe(parser.saxStream); //Parses into record objects
}
}
// Entry point: start with the first data file. Every following file is
// requested by onParseFinished() once the current one has been parsed.
nextFile();