PDF parsing support for fetch title

Merge #707
2020-03-02 20:34:01 -06:00 · 2020-03-02 20:34:01 -06:00 · 5f50c79a20
parent feea257c64
commit 5f50c79a20
4 changed files with 90 additions and 37 deletions
--- a/1
+++ b/1
@ -29,6 +29,7 @@ gem "rotp"
 gem "rqrcode"

 # parsing
+gem "pdf-reader"
 gem "nokogiri", ">= 1.7.2"
 gem "htmlentities"
 gem "commonmarker", "~> 0.14"
--- a/Gemfile.lock
+++ b/Gemfile.lock
@ -1,6 +1,7 @@
 GEM
  remote: https://rubygems.org/
  specs:
+    Ascii85 (1.0.3)
    actioncable (5.2.4.1)
      actionpack (= 5.2.4.1)
      nio4r (~> 2.0)
@ -48,6 +49,7 @@ GEM
      tzinfo (~> 1.1)
    addressable (2.6.0)
      public_suffix (>= 2.0.2, < 4.0)
+    afm (0.2.2)
    arel (9.0.0)
    ast (2.4.0)
    bcrypt (3.1.13)
@ -90,6 +92,7 @@ GEM
    good_migrations (0.0.2)
      activerecord (>= 3.1)
      railties (>= 3.1)
+    hashery (2.1.2)
    htmlentities (4.3.4)
    i18n (1.8.2)
      concurrent-ruby (~> 1.0)
@ -124,6 +127,12 @@ GEM
    parallel (1.19.1)
    parser (2.7.0.2)
      ast (~> 2.4.0)
+    pdf-reader (2.2.0)
+      Ascii85 (~> 1.0.0)
+      afm (~> 0.2.1)
+      hashery (~> 2.0)
+      ruby-rc4
+      ttfunk
    public_suffix (3.1.1)
    rack (2.1.2)
    rack-test (1.1.0)
@ -196,6 +205,7 @@ GEM
    ruby-enum (0.7.2)
      i18n
    ruby-progressbar (1.10.1)
+    ruby-rc4 (0.1.5)
    ruby_dep (1.5.0)
    ruumba (0.1.10)
      rubocop
@ -223,6 +233,7 @@ GEM
    transaction_retry (1.0.3)
      activerecord (>= 3.0.11)
      transaction_isolation (>= 1.0.2)
+    ttfunk (1.5.1)
    tzinfo (1.2.6)
      thread_safe (~> 0.1)
    uglifier (4.1.20)
@ -263,6 +274,7 @@ DEPENDENCIES
  mysql2
  nokogiri (>= 1.7.2)
  oauth
+  pdf-reader
  rails (~> 5.2)
  rb-readline
  rotp
--- a/app/models/story.rb
+++ b/app/models/story.rb
@ -138,7 +138,7 @@ class Story < ApplicationRecord

  attr_accessor :editing_from_suggestions, :editor, :fetching_ip, :is_hidden_by_cur_user,
                :is_saved_by_cur_user, :moderation_reason, :previewing, :seen_previous, :vote
-  attr_writer :fetched_content
+  attr_writer :fetched_response

  before_validation :assign_short_id_and_upvote, :on => :create
  before_create :assign_initial_hotness
@ -915,32 +915,9 @@ class Story < ApplicationRecord
    }.join(", ")
  end

-  def fetched_attributes
-    return @fetched_attributes if @fetched_attributes
-
-    @fetched_attributes = {
-      :url => self.url,
-      :title => "",
-    }
-
-    # security: do not connect to arbitrary user-submitted ports
-    return @fetched_attributes if @url_port
-
-    if !@fetched_content
-      begin
-        s = Sponge.new
-        s.timeout = 3
-        # User submitted URLs may have an incorrect https certificate, but we
-        # don't want to fail the retrieval for this. Security risk is minimal.
-        s.ssl_verify = false
-        user_agent = { "User-agent" => "#{Rails.application.domain} for #{fetching_ip}" }
-        @fetched_content = s.fetch(url, :get, nil, nil, user_agent, 3).body.force_encoding('utf-8')
-      rescue
-        return @fetched_attributes
-      end
-    end
-
-    parsed = Nokogiri::HTML(@fetched_content.to_s)
+  def fetched_attributes_html
+    converted = @fetched_response.body.force_encoding('utf-8')
+    parsed = Nokogiri::HTML(converted.to_s)

    # parse best title from html tags
    # try <meta property="og:title"> first, it probably won't have the site
@ -998,6 +975,54 @@ class Story < ApplicationRecord
    @fetched_attributes
  end

+  def fetched_attributes_pdf
+    return @fetched_attributes = {} if @fetched_response.body >= 5.megabytes
+
+    # pdf-reader only accepts a stream or filename
+    pdf_stream = StringIO.new(@fetched_response.body)
+    pdf = PDF::Reader.new(pdf_stream)
+
+    title = pdf.info[:Title]
+
+    @fetched_attributes[:title] = title
+    @fetched_attributes
+  end
+
+  def fetched_attributes
+    return @fetched_attributes if @fetched_attributes
+
+    @fetched_attributes = {
+      :url => self.url,
+      :title => "",
+    }
+
+    # security: do not connect to arbitrary user-submitted ports
+    return @fetched_attributes if @url_port
+
+    begin
+      # if we haven't had a test inject a response into us
+      if !@fetched_response
+        s = Sponge.new
+        s.timeout = 3
+        # User submitted URLs may have an incorrect https certificate, but we
+        # don't want to fail the retrieval for this. Security risk is minimal.
+        s.ssl_verify = false
+        user_agent = { "User-agent" => "#{Rails.application.domain} for #{fetching_ip}" }
+        res = s.fetch(url, :get, nil, nil, user_agent, 3)
+        @fetched_response = res
+      end
+
+      case @fetched_response["content-type"]
+      when /pdf/
+        return fetched_attributes_pdf
+      else
+        return fetched_attributes_html
+      end
+    rescue
+      return @fetched_attributes
+    end
+  end
+
 private

  def valid_canonical_uri?(url)
--- a/spec/models/story_spec.rb
+++ b/spec/models/story_spec.rb
@ -152,13 +152,26 @@ describe Story do
  context 'fetching titles' do
    let(:story_directory) { Rails.root.join 'spec/fixtures/story_pages/' }

+    # this is more elaborate than the previous system, because now it needs to know the content type
+    def fake_response(content, type, code = '200')
+      res = Net::HTTPResponse.new(1.0, code, "OK")
+      res.add_field("content-type", type)
+      # we can't seemingly just set body, so...
+      allow(res).to receive(:body).and_return(content)
+      return res
+    end
+
    it "can fetch its title properly" do
+      content = File.read(story_directory + "title_ampersand.html")
+      res = fake_response(content, "text/html")
      s = build(:story)
-      s.fetched_content = File.read(story_directory + "title_ampersand.html")
+      s.fetched_response = res
      expect(s.fetched_attributes[:title]).to eq("B2G demo & quick hack // by Paul Rouget")

+      content = File.read(story_directory + "title_google.html")
+      res = fake_response(content, "text/html")
      s = build(:story)
-      s.fetched_content = File.read(story_directory + "title_google.html")
+      s.fetched_response = res
      expect(s.fetched_attributes[:title]).to eq("Google")
    end

@ -171,44 +184,46 @@ describe Story do
    it "does not follow rel=canonical when this is to the main page" do
      url = "https://www.mcsweeneys.net/articles/who-said-it-donald-trump-or-regina-george"
+      # inject a response object rather than the raw fixture String: a String
+      # has no #body, so the fetch code would raise, hit its rescue and return
+      # the defaults, letting this example pass without parsing the fixture
+      content = File.read(story_directory + "canonical_root.html")
+      res = fake_response(content, "text/html")
      s = build(:story, url: url)
-      s.fetched_content = File.read(story_directory + "canonical_root.html")
+      s.fetched_response = res
      expect(s.fetched_attributes[:url]).to eq(url)
    end

    it "does not assign canonical url when the response is non-200" do
      url = "https://www.mcsweeneys.net/a/who-said-it-donald-trump-or-regina-george"
+      content = File.read(story_directory + "canonical_error.html")
+      res = fake_response(content, "text/html", '404')

      expect_any_instance_of(Sponge)
        .to receive(:fetch)
        .and_return(Net::HTTPResponse.new(1.0, 404, "OK"))

      s = build(:story, url: url)
-      s.fetched_content = File.read(story_directory + "canonical_error.html")
+      s.fetched_response = res
      expect(s.fetched_attributes[:url]).to eq(url)
    end

    it "assigns canonical when url when it resolves 200" do
      url = "https://www.mcsweeneys.net/a/who-said-it-donald-trump-or-regina-george"
      canonical = "https://www.mcsweeneys.net/articles/who-said-it-donald-trump-or-regina-george"
+      content = File.read(story_directory + "canonical_error.html")
+      res = fake_response(content, "text/html")

      expect_any_instance_of(Sponge)
        .to receive(:fetch)
        .and_return(Net::HTTPResponse.new(1.0, 200, "OK"))

      s = build(:story, url: url)
-      s.fetched_content = File.read(story_directory + "canonical_error.html")
+      s.fetched_response = res
      expect(s.fetched_attributes[:url]).to eq(canonical)
    end

    context "with unicode" do
-      before do
+      it "can fetch unicode titles properly" do
        content = "<!DOCTYPE html><html><title>你好世界！ Here’s a fancy apostrophe</title></html>"
                  .force_encoding('ASCII-8BIT') # This is the encoding returned by Sponge#fetch
-        allow_any_instance_of(Sponge).to receive(:fetch).and_return double(body: content)
-      end
-
-      it "can fetch unicode titles properly" do
+        res = fake_response(content, "text/html")
        s = build(:story)
+        s.fetched_response = res
        expect(s.fetched_attributes[:title]).to eq("你好世界！ Here’s a fancy apostrophe")
      end
    end