Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ crawler with the following options:
| robotsIgnoreServerError | Indicates whether a `500` status code response for robots.txt should be ignored. Defaults to `false`. |
| userAgent | User agent to use for requests. This can be either a string or a function that takes the URL being crawled. Defaults to `Mozilla/5.0 (compatible; supercrawler/1.0; +https://github.com/brendonboshell/supercrawler)`. |
| request | Object of options to be passed to [request](https://github.com/request/request). Note that request does not support an asynchronous (and distributed) cookie jar. |
| maxContentLength | The maximum content length (bytes) for requests. This can be either a number or a function that takes the URL being downloaded and returns the byte limit for that URL. Defaults to `0` (no maximum). |

Example usage:

Expand All @@ -154,6 +155,7 @@ The following methods are available:
| getInterval | Get the interval setting. |
| getConcurrentRequestsLimit | Get the maximum number of concurrent requests. |
| getUserAgent | Get the user agent. |
| getMaxContentLength | Get the maximum content length for the request. |
| start | Start crawling. |
| stop | Stop crawling. |
| addHandler(handler) | Add a handler for all content types. |
Expand Down
61 changes: 45 additions & 16 deletions lib/Crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ var Crawler,
Promise = require("bluebird"),
urlMod = require("url"),
NodeCache = require("node-cache"),
request = Promise.promisify(require("request")),
request = require("request"),
robotsParser = require("robots-parser"),
mime = require('mime-types'),
_ = require("lodash"),
Expand Down Expand Up @@ -44,6 +44,7 @@ Crawler = function (opts) {
this._outstandingRequests = 0;
this._robotsIgnoreServerError = opts.robotsIgnoreServerError || false;
this._robotsEnabled = (opts.robotsEnabled !== false);
this._maxContentLength = opts.maxContentLength || 0;
};

util.inherits(Crawler, EventEmitter);
Expand Down Expand Up @@ -99,6 +100,19 @@ Crawler.prototype.getRequestOptions = function () {
return this._request;
};

/**
 * Get the maximum content length (in bytes) allowed for a single request.
 *
 * The configured limit (`opts.maxContentLength`) may be a plain number or a
 * function; when it is a function it is invoked with the URL being
 * downloaded, so the limit can vary per URL.
 *
 * @param {string} url URL being downloaded; forwarded to a function-valued limit.
 * @return {number} Max content length in bytes; `0` (the default) means no limit.
 */
Crawler.prototype.getMaxContentLength = function (url) {
if (typeof this._maxContentLength === 'function') {
return this._maxContentLength(url);
}

return this._maxContentLength;
};

/**
* Start the crawler. Pages will be crawled according to the configuration
* provided to the Crawler's constructor.
Expand Down Expand Up @@ -379,7 +393,9 @@ Crawler.prototype._fireHandlers = function (contentType, body, url) {
*/
Crawler.prototype._downloadUrl = function (url, followRedirect) {
var defaultOptions,
requestOptions;
requestOptions,
totalBytes = 0,
self = this;

defaultOptions = {
url: url,
Expand All @@ -393,21 +409,34 @@ Crawler.prototype._downloadUrl = function (url, followRedirect) {
};
requestOptions = _.merge(defaultOptions, this.getRequestOptions());

return request(requestOptions).catch(function (err) {
err = new error.RequestError("A request error occured. " + err.message);

return Promise.reject(err);
}).then(function (response) {
var err;

if (response.statusCode >= 400) {
err = new error.HttpError("HTTP status code is " + response.statusCode);
err.statusCode = response.statusCode;

return Promise.reject(err);
}
return new Promise(function (resolve, reject) {
request(requestOptions, function (err, response) {
if (!response) {
err = new error.RequestError("A request error occured. " + err.message);
reject(err);
} else if (response.statusCode >= 400) {
err = new error.HttpError("HTTP status code is " + response.statusCode);
err.statusCode = response.statusCode;

reject(err);
} else {
resolve(response);
}
}).on('error', function (err) {
err = new error.RequestError("A request error occured. " + err.message);
reject(err);
}).on('data', function (data) {
if (self.getMaxContentLength(url) <= 0) {
return;
}

return response;
// count bytes
totalBytes += data.length;
if (totalBytes > self.getMaxContentLength(url)) {
this.abort();
reject(new error.RequestError('Max content length exceeded.'));
}
});
});
};

Expand Down