diff --git a/README.md b/README.md index 426b842..dc3ed18 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,7 @@ crawler with the following options: | robotsIgnoreServerError | Indicates if `500` status code response for robots.txt should be ignored. Defaults to `false`. | | userAgent | User agent to use for requests. This can be either a string or a function that takes the URL being crawled. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)`. | | request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. | +| maxContentLength | The maximum content length (bytes) for requests. This can be either a number or a function that takes the URL being downloaded. Defaults to `0` (no max). | Example usage: @@ -154,6 +155,7 @@ The following methods are available: | getInterval | Get the interval setting. | | getConcurrentRequestsLimit | Get the maximum number of concurrent requests. | | getUserAgent | Get the user agent. | +| getMaxContentLength | Get the maximum content length for the request. | | start | Start crawling. | | stop | Stop crawling. | | addHandler(handler) | Add a handler for all content types. 
| diff --git a/lib/Crawler.js b/lib/Crawler.js index ef8efd1..d3ce11a 100644 --- a/lib/Crawler.js +++ b/lib/Crawler.js @@ -6,7 +6,7 @@ var Crawler, Promise = require("bluebird"), urlMod = require("url"), NodeCache = require("node-cache"), - request = Promise.promisify(require("request")), + request = require("request"), robotsParser = require("robots-parser"), mime = require('mime-types'), _ = require("lodash"), @@ -44,6 +44,7 @@ Crawler = function (opts) { this._outstandingRequests = 0; this._robotsIgnoreServerError = opts.robotsIgnoreServerError || false; this._robotsEnabled = (opts.robotsEnabled !== false); + this._maxContentLength = opts.maxContentLength || 0; }; util.inherits(Crawler, EventEmitter); @@ -99,6 +100,19 @@ Crawler.prototype.getRequestOptions = function () { return this._request; }; +/** + * Get the maximum content length for a request to the given URL. + * @param {string} url - URL being downloaded; forwarded when maxContentLength is a function. + * @return {number} Max content length in bytes. + */ +Crawler.prototype.getMaxContentLength = function (url) { + if (typeof this._maxContentLength === 'function') { + return this._maxContentLength(url); + } + + return this._maxContentLength; +}; + /** * Start the crawler. Pages will be crawled according to the configuration * provided to the Crawler's constructor. @@ -379,7 +393,9 @@ Crawler.prototype._fireHandlers = function (contentType, body, url) { */ Crawler.prototype._downloadUrl = function (url, followRedirect) { var defaultOptions, - requestOptions; + requestOptions, + totalBytes = 0, + self = this; defaultOptions = { url: url, @@ -393,21 +409,34 @@ Crawler.prototype._downloadUrl = function (url, followRedirect) { }; requestOptions = _.merge(defaultOptions, this.getRequestOptions()); - return request(requestOptions).catch(function (err) { - err = new error.RequestError("A request error occured. 
" + err.message); - - return Promise.reject(err); - }).then(function (response) { - var err; - - if (response.statusCode >= 400) { - err = new error.HttpError("HTTP status code is " + response.statusCode); - err.statusCode = response.statusCode; - - return Promise.reject(err); - } + return new Promise(function (resolve, reject) { + request(requestOptions, function (err, response) { + if (!response) { + err = new error.RequestError("A request error occured. " + err.message); + reject(err); + } else if (response.statusCode >= 400) { + err = new error.HttpError("HTTP status code is " + response.statusCode); + err.statusCode = response.statusCode; + + reject(err); + } else { + resolve(response); + } + }).on('error', function (err) { + err = new error.RequestError("A request error occured. " + err.message); + reject(err); + }).on('data', function (data) { + if (self.getMaxContentLength(url) <= 0) { + return; + } - return response; + // count bytes + totalBytes += data.length; + if (totalBytes > self.getMaxContentLength(url)) { + this.abort(); + reject(new error.RequestError('Max content length exceeded.')); + } + }); }); };