98 lines
2.5 KiB
JavaScript
98 lines
2.5 KiB
JavaScript
// When true, fetchSource logs the response status, headers and end-of-stream.
var debug = false;

// Node core modules.
var path = require("path");
var fs = require("fs");

// DOM construction/serialization helpers and the HTML pretty-printer.
var jsdomPackage = require("jsdom");
var jsdom = jsdomPackage.jsdom;
var html = require("html");
var serializeDocument = jsdomPackage.serializeDocument;

// Networking and markup clean-up.
var http = require("http");
var urlparse = require("url").parse;
var htmltidy = require("htmltidy2").tidy;

// Bundled readability implementation and its lightweight DOM parser.
var readability = require("./readability/index");
var Readability = readability.Readability;
var JSDOMParser = readability.JSDOMParser;

// User-Agent header value sent by fetchSource with every request.
var FFX_UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:38.0) Gecko/20100101 Firefox/38.0";
|
|
|
|
/**
 * Downloads the document at `url` and hands its body to sanitizeSource.
 *
 * The request carries the FFX_UA User-Agent header. The response body is
 * decoded as UTF-8, accumulated in memory and, once the response ends, passed
 * to sanitizeSource together with `callbackFn`. The status code is only
 * logged (in debug mode); it is not checked, and redirects are not followed
 * by this function.
 *
 * @param {string} url - Absolute http:// or https:// URL. When falsy, an
 *     error is printed and the process exits with status 1.
 * @param {function(?Error, string=)} callbackFn - Node-style callback. It is
 *     called with an error if the transfer fails; otherwise it is forwarded
 *     to sanitizeSource, which decides how it is invoked.
 */
function fetchSource(url, callbackFn) {
  if (!url) {
    console.error("You should pass a URL if the source doesn't exist yet!");
    process.exit(1);
    // Only reached if process.exit has been stubbed out.
    return;
  }

  var options = urlparse(url);
  options.headers = {"User-Agent": FFX_UA};

  // Pick the client from the parsed protocol rather than a raw string prefix:
  // url.parse lower-cases the scheme, so "HTTPS://..." is routed correctly.
  var client = options.protocol === "https:" ? require("https") : http;

  // A failure can surface on the request or on the response stream; make sure
  // the caller hears about the outcome exactly once.
  var settled = false;
  function fail(err) {
    if (settled) {
      return;
    }
    settled = true;
    callbackFn(err);
  }

  var request = client.get(options, function(response) {
    if (debug) {
      console.log("STATUS:", response.statusCode);
      console.log("HEADERS:", JSON.stringify(response.headers));
    }
    response.setEncoding("utf-8");

    var rv = "";
    response.on("data", function(chunk) {
      rv += chunk;
    });
    response.on("error", fail);
    response.on("end", function() {
      if (settled) {
        return;
      }
      settled = true;
      if (debug) {
        console.log("End received");
      }
      sanitizeSource(rv, callbackFn);
    });
  });

  // Without an "error" listener a DNS/connection failure is thrown as an
  // uncaught exception and the callback would never run.
  request.on("error", fail);
}
|
|
|
|
/**
 * Runs raw markup through htmltidy.
 *
 * Note: the `html` parameter shadows the module-level `html` pretty-printer
 * inside this function; only the parameter is used here.
 *
 * @param {string} html - Markup to tidy.
 * @param {function} callbackFn - Passed straight to htmltidy as its
 *     completion callback.
 */
function sanitizeSource(html, callbackFn) {
  var tidyOptions = {
    "indent": true,
    "indent-spaces": 4,
    "numeric-entities": true,
    "output-xhtml": true,
    "wrap": 0
  };

  htmltidy(html, tidyOptions, callbackFn);
}
|
|
|
|
/**
 * Extracts the readable article from tidied markup and pretty-prints it.
 *
 * The markup is parsed with JSDOMParser against a fixed placeholder URI,
 * fed to Readability, and the resulting `content` is formatted with a
 * two-space indent, written to stdout and returned.
 *
 * @param {string} source - Markup to parse.
 * @returns {string|undefined} The pretty-printed article HTML, or undefined
 *     when Readability throws or produces no result (a message is logged).
 */
function runReadability(source) {
  var uri = "http://fakehost/test/page.html";
  var doc = new JSDOMParser().parse(source, uri);

  var result;
  try {
    // We pass `caption` as a class to check that passing in extra classes
    // works, given that it appears in some of the test documents.
    var reader = new Readability(doc, { classesToPreserve: ["caption"] });
    result = reader.parse();
  } catch (ex) {
    console.error(ex);
    // `stack` is a newline-delimited string on V8, not an array; calling
    // forEach on it directly threw from inside this handler and hid `ex`.
    if (ex && typeof ex.stack === "string") {
      ex.stack.split("\n").forEach(function(line) {
        console.log(line);
      });
    }
  }

  if (!result) {
    console.error("No content generated by readability!");
    return;
  }

  var prettyHTML = html.prettyPrint(result.content, {indent_size: 2});
  console.log(prettyHTML);
  return prettyHTML;
}
|
|
|
|
/**
 * Node-style completion callback for the fetch-and-tidy step.
 *
 * @param {*} error - Truthy when the preceding step failed.
 * @param {string} source - Tidied markup; only used when `error` is falsy.
 * @returns {string|undefined} Whatever runReadability returns on success;
 *     undefined after logging when `error` is set.
 */
function onResponseReceived(error, source) {
  if (!error) {
    return runReadability(source);
  }

  console.error("Couldn't tidy source html!");
  console.error(error);
}
|
|
|
|
module.exports = {
|
|
parseUrl: function(theUrl) {
|
|
return fetchSource(theUrl, onResponseReceived);
|
|
}
|
|
};
|