Handle robots.txt properly (even with "Crawl-Delay"-Support)
This commit is contained in:
parent
b8307957a5
commit
43c0a4c3e0
|
@ -15,6 +15,7 @@
|
||||||
"md5": "^2.0.0",
|
"md5": "^2.0.0",
|
||||||
"memcached": "^2.2.1",
|
"memcached": "^2.2.1",
|
||||||
"moment": "^2.11.2",
|
"moment": "^2.11.2",
|
||||||
|
"robots": "^0.9.4",
|
||||||
"swagger-ui": "^2.1.4"
|
"swagger-ui": "^2.1.4"
|
||||||
},
|
},
|
||||||
"author": "DracoBlue <JanS@DracoBlue.de>",
|
"author": "DracoBlue <JanS@DracoBlue.de>",
|
||||||
|
|
138
src/Storage.js
138
src/Storage.js
|
@ -5,6 +5,7 @@ var http = require('http');
|
||||||
var https = require('https');
|
var https = require('https');
|
||||||
var moment = require('moment');
|
var moment = require('moment');
|
||||||
var fs = require('fs');
|
var fs = require('fs');
|
||||||
|
var robots = require('robots');
|
||||||
var info = JSON.parse(fs.readFileSync(__dirname + '/../package.json'));
|
var info = JSON.parse(fs.readFileSync(__dirname + '/../package.json'));
|
||||||
info.version = info.version || 'dev';
|
info.version = info.version || 'dev';
|
||||||
|
|
||||||
|
@ -12,6 +13,8 @@ var Storage = function(client, memcached) {
|
||||||
|
|
||||||
this.client = client;
|
this.client = client;
|
||||||
this.memcached = memcached;
|
this.memcached = memcached;
|
||||||
|
this.robotsParserUrlMap = {};
|
||||||
|
this.userAgent = "twtxt-registry/" + info.version;
|
||||||
};
|
};
|
||||||
|
|
||||||
Storage.prototype.addUser = function(url, nickname, cb) {
|
Storage.prototype.addUser = function(url, nickname, cb) {
|
||||||
|
@ -187,66 +190,119 @@ Storage.prototype.getTweetsByMentions = function(twtxtUrl, page, cb) {
|
||||||
Storage.prototype.startUpdating = function() {
|
Storage.prototype.startUpdating = function() {
|
||||||
var that = this;
|
var that = this;
|
||||||
|
|
||||||
|
var lastUpdate = 0;
|
||||||
|
var seconds = 60;
|
||||||
|
|
||||||
clearInterval(this.updatingInterval);
|
clearInterval(this.updatingInterval);
|
||||||
|
|
||||||
var updateAllUrls = function() {
|
var updateAllUrls = function() {
|
||||||
|
|
||||||
|
lastUpdate++;
|
||||||
|
|
||||||
|
if (lastUpdate > (60 * 24 * 60) / seconds) {
|
||||||
|
lastUpdate = 0;
|
||||||
|
}
|
||||||
|
|
||||||
that.forEachUser(function(user) {
|
that.forEachUser(function(user) {
|
||||||
var client = http;
|
var client = http;
|
||||||
var urlParts = urlUtils.parse(user.url);
|
var options = urlUtils.parse(user.url);
|
||||||
|
|
||||||
if (urlParts['protocol'] === "https:") {
|
if (options['protocol'] === "https:") {
|
||||||
client = https;
|
client = https;
|
||||||
}
|
}
|
||||||
|
|
||||||
var options = {
|
options.headers = that.userAgent;
|
||||||
hostname: urlParts['hostname'],
|
options.method = 'GET';
|
||||||
port: urlParts['port'] || (urlParts['protocol'] === "https:" ? 443 : 80),
|
|
||||||
path: urlParts['path'],
|
var robotsTxtOptions = JSON.parse(JSON.stringify(options));
|
||||||
method: 'GET',
|
robotsTxtOptions.path = "/robots.txt";
|
||||||
headers: {
|
robotsTxtOptions.pathname = "/robots.txt";
|
||||||
"User-Agent": "twtxt-registry/" + info.version
|
var robotsTxtUrl = urlUtils.format(robotsTxtOptions);
|
||||||
|
|
||||||
|
var fetchUrlIfAllowed = function(parser) {
|
||||||
|
/* default delay is 100 times a day */
|
||||||
|
var crawlDelay = Math.ceil((parser.getCrawlDelay(that.userAgent) || 900) / seconds);
|
||||||
|
|
||||||
|
//console.log("CrawlDelay: " + parser.getCrawlDelay(that.userAgent) + " for " + robotsTxtUrl);
|
||||||
|
//console.log("number is at", lastUpdate, "delay is at", crawlDelay, "% is at", lastUpdate % crawlDelay);
|
||||||
|
|
||||||
|
if (crawlDelay != 0 && lastUpdate % crawlDelay != 0) {
|
||||||
|
//console.log("does not match crawlDelay! STOP");
|
||||||
|
return ;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
//console.log("does match crawlDelay! FETCH");
|
||||||
|
|
||||||
|
parser.canFetch(that.userAgent, options.path, function (access, url, reason) {
|
||||||
|
if (!access) {
|
||||||
|
console.error("not allowed to fetch", user.url, "because of " + robotsTxtUrl + ":", reason.type, " statusCode:", reason.statusCode);
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
|
||||||
|
var key = md5(user.url);
|
||||||
|
|
||||||
|
that.memcached.get('last-modified-since-' + key, function(err, memcacheData) {
|
||||||
|
|
||||||
|
if (memcacheData) {
|
||||||
|
options.headers['If-Modified-Since'] = memcacheData;
|
||||||
|
}
|
||||||
|
|
||||||
|
var req = client.request(options, function(res) {
|
||||||
|
var body = [];
|
||||||
|
res.on('data', function(chunk) {
|
||||||
|
body.push(chunk);
|
||||||
|
}).on('end', function() {
|
||||||
|
if (res.statusCode == 304) {
|
||||||
|
return ;
|
||||||
|
}
|
||||||
|
body = Buffer.concat(body).toString();
|
||||||
|
|
||||||
|
var txt = new TwtxtTxt(user.url, user.nickname, body);
|
||||||
|
txt.getTweets().forEach(function(tweet) {
|
||||||
|
that.storeTweet(tweet, function() {
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
if (res.headers['last-modified']) {
|
||||||
|
that.memcached.set('last-modified-since-' + key, res.headers['last-modified'], 60*60*24, function() {
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
}).on('error', function (e) {});
|
||||||
|
req.end();
|
||||||
|
});
|
||||||
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
var key = md5(user.url);
|
|
||||||
|
|
||||||
that.memcached.get('last-modified-since-' + key, function(err, memcacheData) {
|
if (!that.robotsParserUrlMap[robotsTxtUrl]) {
|
||||||
|
console.log("Creating robots.txt parser for:", robotsTxtUrl);
|
||||||
|
that.robotsParserUrlMap[robotsTxtUrl] = new robots.RobotsParser(
|
||||||
|
robotsTxtUrl,
|
||||||
|
that.userAgent,
|
||||||
|
function (parser, success) {
|
||||||
|
fetchUrlIfAllowed(that.robotsParserUrlMap[robotsTxtUrl]);
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
if (memcacheData) {
|
/* update the parser once in a day */
|
||||||
options.headers['If-Modified-Since'] = memcacheData;
|
setInterval(function() {
|
||||||
}
|
console.log("Recreating robots.txt parser for:", robotsTxtUrl);
|
||||||
|
that.robotsParserUrlMap[robotsTxtUrl] = new robots.RobotsParser(
|
||||||
var req = client.request(options, function(res) {
|
robotsTxtUrl,
|
||||||
var body = [];
|
that.userAgent
|
||||||
res.on('data', function(chunk) {
|
);
|
||||||
body.push(chunk);
|
}, 24 * 60 * 60000);
|
||||||
}).on('end', function() {
|
} else {
|
||||||
if (res.statusCode == 304) {
|
fetchUrlIfAllowed(that.robotsParserUrlMap[robotsTxtUrl]);
|
||||||
return ;
|
}
|
||||||
}
|
|
||||||
body = Buffer.concat(body).toString();
|
|
||||||
|
|
||||||
var txt = new TwtxtTxt(user.url, user.nickname, body);
|
|
||||||
txt.getTweets().forEach(function(tweet) {
|
|
||||||
that.storeTweet(tweet, function() {
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
if (res.headers['last-modified']) {
|
|
||||||
that.memcached.set('last-modified-since-' + key, res.headers['last-modified'], 60*60*24, function() {
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
}).on('error', function (e) {});
|
|
||||||
req.end();
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
this.updatingInterval = setInterval(function() {
|
this.updatingInterval = setInterval(function() {
|
||||||
updateAllUrls();
|
updateAllUrls();
|
||||||
}, 864000); // 100 times a day
|
}, seconds * 1000); // check every minute for the crawl delay (fallback to 100 times a day!)
|
||||||
|
|
||||||
updateAllUrls();
|
updateAllUrls();
|
||||||
};
|
};
|
||||||
|
|
Reference in New Issue