Resolve links against the page's <base href> tag when one is present.

NOTE(review): leading whitespace inside the hunks was reconstructed from a
whitespace-collapsed copy of this patch; apply with
`git apply --ignore-whitespace` if the context lines do not match exactly.
The post-image blob id on the html.rb `index` line predates the revised
`base_uri` body below.

diff --git a/lib/spidr/page/html.rb b/lib/spidr/page/html.rb
index c7d1212c..0d4f9b99 100644
--- a/lib/spidr/page/html.rb
+++ b/lib/spidr/page/html.rb
@@ -266,7 +266,7 @@ def urls
     def to_absolute(link)
       link = link.to_s
       new_url = begin
-        url.merge(link)
+        base_uri.merge(link)
       rescue Exception
         return
       end
@@ -285,5 +285,27 @@ def to_absolute(link)
 
       return new_url
     end
+
+    #
+    # The base URI that links on the page are resolved against.
+    #
+    # @return [URI::Generic]
+    #   The `href` of the first `<base>` tag resolved against the page's
+    #   URL, or the page's URL itself when the page is not HTML, has no
+    #   `<base href>` tag, or the `href` is not a valid URI.
+    #
+    def base_uri
+      return url unless html? && doc
+
+      base_tag = doc.search('//base[@href]').first
+      return url unless base_tag
+
+      # The href may itself be relative (e.g. "/docs/"), so resolve it
+      # against the page's URL rather than parsing it on its own.
+      url.merge(base_tag.get_attribute('href').to_s.strip)
+    rescue URI::Error
+      # A malformed href must not make every link on the page unresolvable.
+      url
+    end
   end
 end
diff --git a/spec/page/html_spec.rb b/spec/page/html_spec.rb
index f3d9b5f4..e745a3f0 100644
--- a/spec/page/html_spec.rb
+++ b/spec/page/html_spec.rb
@@ -520,5 +520,17 @@
         end
       end
     end
+
+    context "when the page has a base tag" do
+      let(:base_href) { "http://www.google.com/" }
+      let(:body) { %{<html><head><title>example</title><base href="#{base_href}" /></head><body><p>hello</p></body></html>} }
+      let(:link) { "/foo/" }
+
+      subject { super().to_absolute(link) }
+
+      it "should set the hostname to that of the base tag instead of the page's URL" do
+        expect(subject).to be == URI("#{base_href}").merge(link)
+      end
+    end
   end
 end