From 68c155b4cdbb8224b127256ad168cf54f7e6e370 Mon Sep 17 00:00:00 2001
From: MothOnMars
Date: Sat, 9 Dec 2017 08:57:03 -0800
Subject: [PATCH] follow robots.txt redirects

---
 lib/robotex.rb       |  7 +++---
 robotex.gemspec      |  2 ++
 spec/robotex_spec.rb | 57 ++++++++++++++++++++++++++++++--------------
 3 files changed, 45 insertions(+), 21 deletions(-)

diff --git a/lib/robotex.rb b/lib/robotex.rb
index bf186ab..33c3d61 100644
--- a/lib/robotex.rb
+++ b/lib/robotex.rb
@@ -5,6 +5,7 @@
 require 'open-uri'
 require 'uri'
 require 'timeout'
+require 'open_uri_redirections'
 
 class Robotex
 
@@ -17,7 +18,7 @@ class ParsedRobots
 
     def initialize(uri, user_agent)
      io = Robotex.get_robots_txt(uri, user_agent)
-      
+
      if !io || io.content_type != "text/plain" || io.status != ["200", "OK"]
        io = StringIO.new("User-agent: *\nAllow: /\n")
      end
@@ -100,8 +101,8 @@ def to_regex(pattern)
   def self.get_robots_txt(uri, user_agent)
     begin
       Timeout::timeout(Robotex.timeout) do
-        io = URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent) rescue nil 
-      end 
+        URI.join(uri.to_s, "/robots.txt").open("User-Agent" => user_agent, allow_redirections: :all) rescue nil
+      end
     rescue Timeout::Error
       STDERR.puts "robots.txt request timed out"
     end
diff --git a/robotex.gemspec b/robotex.gemspec
index e885531..ebdf085 100644
--- a/robotex.gemspec
+++ b/robotex.gemspec
@@ -10,6 +10,8 @@ spec = Gem::Specification.new do |s|
   s.rdoc_options << '-m' << 'README.rdoc' << '-t' << 'Robotex'
   s.extra_rdoc_files = ["README.rdoc"]
 
+  s.add_runtime_dependency "open_uri_redirections", "~> 0.2.1"
+
   s.add_development_dependency "rake", ">=0.9.2"
   s.add_development_dependency "rdoc", ">=3.12"
   s.add_development_dependency "rspec", ">=2.8.0"
diff --git a/spec/robotex_spec.rb b/spec/robotex_spec.rb
index d8b4388..52cebef 100644
--- a/spec/robotex_spec.rb
+++ b/spec/robotex_spec.rb
@@ -1,25 +1,30 @@
 require 'spec_helper'
 
 describe Robotex do
+  let(:robots) do
+    <<~END
+      User-Agent: msnbot
+      Crawl-Delay: 20
 
-  before(:all) do
+      User-Agent: bender
+      Disallow: /my_shiny_metal_ass
+
+      User-Agent: *
+      Disallow: /login
+      Allow: /
+
+      Disallow: /locked
+      Allow: /locked
+    END
+  end
+
+  let(:response) do
+    { body: robots, content_type: 'text/plain', status: [200, "OK"] }
+  end
+
+  before do
     FakeWeb.allow_net_connect = false
-    robots = <<-END
-User-Agent: msnbot
-Crawl-Delay: 20
-
-User-Agent: bender
-Disallow: /my_shiny_metal_ass
-
-User-Agent: *
-Disallow: /login
-Allow: /
-
-Disallow: /locked
-Allow: /locked
-END
-    options = {:body => robots, :content_type => 'text/plain', :status => [200, "OK"]}
-    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', options)
+    FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', response)
   end
 
   describe '#initialize' do
@@ -65,13 +70,29 @@
         robotex.allowed?(SPEC_DOMAIN + 'locked').should be_false
       end
     end
+
+    context 'when the robots.txt url is redirected' do
+      let(:redirection) do
+        { status: [301], location: 'https://example.com/robots.txt' }
+      end
+
+      before do
+        FakeWeb.register_uri(:get, SPEC_DOMAIN + 'robots.txt', redirection)
+        FakeWeb.register_uri(:get, 'https://example.com/robots.txt', response)
+      end
+
+      it 'returns false' do
+        robotex = Robotex.new
+        robotex.allowed?(SPEC_DOMAIN + 'locked').should be_false
+      end
+    end
   end
 
   describe '#delay' do
     context 'when no Crawl-Delay is specified for the user-agent' do
       it 'returns nil' do
         robotex = Robotex.new
-        robotex.delay(SPEC_DOMAIN).should be_nil 
+        robotex.delay(SPEC_DOMAIN).should be_nil
       end
 
       context 'when Crawl-Delay is specified for the user-agent' do