Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ crawler with the following options:
| robotsIgnoreServerError | Indicates whether a `500` status code response for robots.txt should be ignored. Defaults to `false`. |
| userAgent | User agent to use for requests. This can be either a string or a function that takes the URL being crawled. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)`. |
| request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. |
| maxContentLength | The maximum content length (bytes) for requests. This can be either a number or a function that takes the URL being downloaded and returns the byte limit for that URL. Defaults to `0` (no maximum). |

Example usage:

Expand All @@ -154,6 +155,7 @@ The following methods are available:
| getInterval | Get the interval setting. |
| getConcurrentRequestsLimit | Get the maximum number of concurrent requests. |
| getUserAgent | Get the user agent. |
| getMaxContentLength | Get the maximum content length for the request. |
| start | Start crawling. |
| stop | Stop crawling. |
| addHandler(handler) | Add a handler for all content types. |
Expand Down
61 changes: 45 additions & 16 deletions lib/Crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ var Crawler,
Promise = require("bluebird"),
urlMod = require("url"),
NodeCache = require("node-cache"),
request = Promise.promisify(require("request")),
request = require("request"),
robotsParser = require("robots-parser"),
mime = require('mime-types'),
_ = require("lodash"),
Expand Down Expand Up @@ -44,6 +44,7 @@ Crawler = function (opts) {
this._outstandingRequests = 0;
this._robotsIgnoreServerError = opts.robotsIgnoreServerError || false;
this._robotsEnabled = (opts.robotsEnabled !== false);
this._maxContentLength = opts.maxContentLength || 0;
};

util.inherits(Crawler, EventEmitter);
Expand Down Expand Up @@ -99,6 +100,19 @@ Crawler.prototype.getRequestOptions = function () {
return this._request;
};

/**
 * Get the maximum content length (in bytes) allowed for a single request.
 *
 * The configured limit (`opts.maxContentLength`) may be a plain number or a
 * function; when it is a function it is invoked with the URL being
 * downloaded, so the limit can vary per URL.
 *
 * @param {string} url URL being downloaded; forwarded to a function-valued limit.
 * @return {number} Max content length in bytes; `0` (the default) means no limit.
 */
Crawler.prototype.getMaxContentLength = function (url) {
if (typeof this._maxContentLength === 'function') {
return this._maxContentLength(url);
}

return this._maxContentLength;
};

/**
* Start the crawler. Pages will be crawled according to the configuration
* provided to the Crawler's constructor.
Expand Down Expand Up @@ -379,7 +393,9 @@ Crawler.prototype._fireHandlers = function (contentType, body, url) {
*/
Crawler.prototype._downloadUrl = function (url, followRedirect) {
var defaultOptions,
requestOptions;
requestOptions,
totalBytes = 0,
self = this;

defaultOptions = {
url: url,
Expand All @@ -393,21 +409,34 @@ Crawler.prototype._downloadUrl = function (url, followRedirect) {
};
requestOptions = _.merge(defaultOptions, this.getRequestOptions());

return request(requestOptions).catch(function (err) {
err = new error.RequestError("A request error occured. " + err.message);

return Promise.reject(err);
}).then(function (response) {
var err;

if (response.statusCode >= 400) {
err = new error.HttpError("HTTP status code is " + response.statusCode);
err.statusCode = response.statusCode;

return Promise.reject(err);
}
return new Promise(function (resolve, reject) {
request(requestOptions, function (err, response) {
if (!response) {
err = new error.RequestError("A request error occured. " + err.message);
reject(err);
} else if (response.statusCode >= 400) {
err = new error.HttpError("HTTP status code is " + response.statusCode);
err.statusCode = response.statusCode;

reject(err);
} else {
resolve(response);
}
}).on('error', function (err) {
err = new error.RequestError("A request error occured. " + err.message);
reject(err);
}).on('data', function (data) {
if (self.getMaxContentLength(url) <= 0) {
return;
}

return response;
// count bytes
totalBytes += data.length;
if (totalBytes > self.getMaxContentLength(url)) {
this.abort();
reject(new error.RequestError('Max content length exceeded.'));
}
});
});
};

Expand Down