PDF parsing support for fetch title

Merge #707
This commit is contained in:
Peter Bhat Harkins 2020-03-02 20:34:01 -06:00
parent feea257c64
commit 5f50c79a20
4 changed files with 90 additions and 37 deletions

View File

@ -29,6 +29,7 @@ gem "rotp"
gem "rqrcode"
# parsing
gem "pdf-reader"
gem "nokogiri", ">= 1.7.2"
gem "htmlentities"
gem "commonmarker", "~> 0.14"

View File

@ -1,6 +1,7 @@
GEM
remote: https://rubygems.org/
specs:
Ascii85 (1.0.3)
actioncable (5.2.4.1)
actionpack (= 5.2.4.1)
nio4r (~> 2.0)
@ -48,6 +49,7 @@ GEM
tzinfo (~> 1.1)
addressable (2.6.0)
public_suffix (>= 2.0.2, < 4.0)
afm (0.2.2)
arel (9.0.0)
ast (2.4.0)
bcrypt (3.1.13)
@ -90,6 +92,7 @@ GEM
good_migrations (0.0.2)
activerecord (>= 3.1)
railties (>= 3.1)
hashery (2.1.2)
htmlentities (4.3.4)
i18n (1.8.2)
concurrent-ruby (~> 1.0)
@ -124,6 +127,12 @@ GEM
parallel (1.19.1)
parser (2.7.0.2)
ast (~> 2.4.0)
pdf-reader (2.2.0)
Ascii85 (~> 1.0.0)
afm (~> 0.2.1)
hashery (~> 2.0)
ruby-rc4
ttfunk
public_suffix (3.1.1)
rack (2.1.2)
rack-test (1.1.0)
@ -196,6 +205,7 @@ GEM
ruby-enum (0.7.2)
i18n
ruby-progressbar (1.10.1)
ruby-rc4 (0.1.5)
ruby_dep (1.5.0)
ruumba (0.1.10)
rubocop
@ -223,6 +233,7 @@ GEM
transaction_retry (1.0.3)
activerecord (>= 3.0.11)
transaction_isolation (>= 1.0.2)
ttfunk (1.5.1)
tzinfo (1.2.6)
thread_safe (~> 0.1)
uglifier (4.1.20)
@ -263,6 +274,7 @@ DEPENDENCIES
mysql2
nokogiri (>= 1.7.2)
oauth
pdf-reader
rails (~> 5.2)
rb-readline
rotp

View File

@ -138,7 +138,7 @@ class Story < ApplicationRecord
attr_accessor :editing_from_suggestions, :editor, :fetching_ip, :is_hidden_by_cur_user,
:is_saved_by_cur_user, :moderation_reason, :previewing, :seen_previous, :vote
attr_writer :fetched_content
attr_writer :fetched_response
before_validation :assign_short_id_and_upvote, :on => :create
before_create :assign_initial_hotness
@ -915,32 +915,9 @@ class Story < ApplicationRecord
}.join(", ")
end
def fetched_attributes
return @fetched_attributes if @fetched_attributes
@fetched_attributes = {
:url => self.url,
:title => "",
}
# security: do not connect to arbitrary user-submitted ports
return @fetched_attributes if @url_port
if !@fetched_content
begin
s = Sponge.new
s.timeout = 3
# User submitted URLs may have an incorrect https certificate, but we
# don't want to fail the retrieval for this. Security risk is minimal.
s.ssl_verify = false
user_agent = { "User-agent" => "#{Rails.application.domain} for #{fetching_ip}" }
@fetched_content = s.fetch(url, :get, nil, nil, user_agent, 3).body.force_encoding('utf-8')
rescue
return @fetched_attributes
end
end
parsed = Nokogiri::HTML(@fetched_content.to_s)
def fetched_attributes_html
converted = @fetched_response.body.force_encoding('utf-8')
parsed = Nokogiri::HTML(converted.to_s)
# parse best title from html tags
# try <meta property="og:title"> first, it probably won't have the site
@ -998,6 +975,54 @@ class Story < ApplicationRecord
@fetched_attributes
end
# Fills in the story title from the metadata of a fetched PDF.
#
# Reads the body of @fetched_response, hands it to PDF::Reader and, when the
# document info has a :Title entry, stores it in @fetched_attributes[:title].
# Bodies of 5 megabytes or more are not parsed. Expects @fetched_attributes
# to have been seeded by the caller (fetched_attributes does this).
#
# Returns @fetched_attributes; when no title is found the seeded values are
# returned untouched.
def fetched_attributes_pdf
  body = @fetched_response.body.to_s

  # Compare the byte count, not the String itself, against the limit:
  # String >= Integer raises ArgumentError. Keep the seeded defaults rather
  # than replacing them with an empty hash, so callers still get :url/:title.
  return @fetched_attributes if body.bytesize >= 5.megabytes

  # pdf-reader only accepts a stream or filename
  pdf = PDF::Reader.new(StringIO.new(body))

  # NOTE(review): guard against a document without an Info dictionary, where
  # pdf.info appears to be nil - confirm against the pdf-reader docs.
  info = pdf.info
  title = info[:Title] if info

  # leave the seeded title alone when the PDF does not declare one
  @fetched_attributes[:title] = title if title
  @fetched_attributes
end
# Returns a hash of attributes (:url, :title) discovered by fetching the
# story's URL.
#
# The hash is memoized in @fetched_attributes and starts out as the story's
# own url with an empty title. When @fetched_response has not been set
# beforehand, the URL is fetched through Sponge. The response is then handed
# to fetched_attributes_pdf when its content-type matches /pdf/, and to
# fetched_attributes_html otherwise. Any StandardError raised while fetching
# or parsing is swallowed and the seeded hash is returned instead.
def fetched_attributes
  return @fetched_attributes if @fetched_attributes

  @fetched_attributes = { :url => self.url, :title => "" }

  # security: do not connect to arbitrary user-submitted ports
  return @fetched_attributes if @url_port

  begin
    # a spec may already have injected a response; only hit the network when
    # nothing is there yet
    unless @fetched_response
      sponge = Sponge.new
      sponge.timeout = 3
      # User submitted URLs may have an incorrect https certificate, but we
      # don't want to fail the retrieval for this. Security risk is minimal.
      sponge.ssl_verify = false
      headers = { "User-agent" => "#{Rails.application.domain} for #{fetching_ip}" }
      @fetched_response = sponge.fetch(url, :get, nil, nil, headers, 3)
    end

    # Regexp#=== keeps the semantics of `case ... when /pdf/`: a missing
    # (nil) content-type simply does not match
    looks_like_pdf = /pdf/ === @fetched_response["content-type"]
    looks_like_pdf ? fetched_attributes_pdf : fetched_attributes_html
  rescue
    @fetched_attributes
  end
end
private
def valid_canonical_uri?(url)

View File

@ -152,13 +152,26 @@ describe Story do
context 'fetching titles' do
let(:story_directory) { Rails.root.join 'spec/fixtures/story_pages/' }
# this is more elaborate than the previous system, because now it needs to know the content type
# Builds a Net::HTTPResponse for specs: status `code` (default '200'),
# a content-type header of `type`, and a stubbed #body returning `content`.
def fake_response(content, type, code = '200')
  Net::HTTPResponse.new(1.0, code, "OK").tap do |response|
    response.add_field("content-type", type)
    # the body apparently cannot simply be assigned, so stub the reader
    allow(response).to receive(:body).and_return(content)
  end
end
# Wraps two HTML fixtures in text/html responses via fake_response, injects
# each into a freshly built story and checks the title fetched_attributes
# reports for it.
# NOTE(review): every story below gets both fetched_content and
# fetched_response assigned; the fetched_attributes definition that
# dispatches on content-type only reads @fetched_response - confirm whether
# the fetched_content assignments are still needed.
it "can fetch its title properly" do
content = File.read(story_directory + "title_ampersand.html")
res = fake_response(content, "text/html")
s = build(:story)
s.fetched_content = File.read(story_directory + "title_ampersand.html")
s.fetched_response = res
expect(s.fetched_attributes[:title]).to eq("B2G demo & quick hack // by Paul Rouget")
# same steps again with a second fixture page
content = File.read(story_directory + "title_google.html")
res = fake_response(content, "text/html")
s = build(:story)
s.fetched_content = File.read(story_directory + "title_google.html")
s.fetched_response = res
expect(s.fetched_attributes[:title]).to eq("Google")
end
@ -171,44 +184,46 @@ describe Story do
# The story's url must be reported unchanged when the fixture page
# (canonical_root.html) is what gets fetched for it.
it "does not follow rel=canonical when this is to the main page" do
  url = "https://www.mcsweeneys.net/articles/who-said-it-donald-trump-or-regina-george"
  s = build(:story, url: url)
  content = File.read(story_directory + "canonical_root.html")
  s.fetched_content = content
  # fetched_attributes asks the injected object for a "content-type" header
  # and for #body, so inject a response object. A raw String has neither; it
  # only makes fetched_attributes bail out through its rescue, and the
  # example then passes without ever parsing the fixture.
  s.fetched_response = fake_response(content, "text/html")
  expect(s.fetched_attributes[:url]).to eq(url)
end
# Injects the canonical_error.html fixture as a text/html response with a
# 404 code and expects the story's url to be reported unchanged.
it "does not assign canonical url when the response is non-200" do
url = "https://www.mcsweeneys.net/a/who-said-it-donald-trump-or-regina-george"
content = File.read(story_directory + "canonical_error.html")
res = fake_response(content, "text/html", '404')
# Requires some Sponge instance to receive #fetch during the example and
# answers it with a bare 404 response.
# NOTE(review): presumably this is the request made to check the canonical
# URL - the HTML parsing code is not fully visible here, confirm.
expect_any_instance_of(Sponge)
.to receive(:fetch)
.and_return(Net::HTTPResponse.new(1.0, 404, "OK"))
s = build(:story, url: url)
s.fetched_content = File.read(story_directory + "canonical_error.html")
s.fetched_response = res
expect(s.fetched_attributes[:url]).to eq(url)
end
# Injects the canonical_error.html fixture as a text/html response with the
# default 200 code and expects fetched_attributes to report the canonical
# URL instead of the submitted one.
it "assigns canonical when url when it resolves 200" do
url = "https://www.mcsweeneys.net/a/who-said-it-donald-trump-or-regina-george"
canonical = "https://www.mcsweeneys.net/articles/who-said-it-donald-trump-or-regina-george"
content = File.read(story_directory + "canonical_error.html")
res = fake_response(content, "text/html")
# Requires some Sponge instance to receive #fetch during the example and
# answers it with a bare 200 response.
# NOTE(review): presumably this is the request made to check the canonical
# URL - the HTML parsing code is not fully visible here, confirm.
expect_any_instance_of(Sponge)
.to receive(:fetch)
.and_return(Net::HTTPResponse.new(1.0, 200, "OK"))
s = build(:story, url: url)
s.fetched_content = File.read(story_directory + "canonical_error.html")
s.fetched_response = res
expect(s.fetched_attributes[:url]).to eq(canonical)
end
context "with unicode" do
before do
it "can fetch unicode titles properly" do
content = "<!DOCTYPE html><html><title>你好世界! Heres a fancy apostrophe</title></html>"
.force_encoding('ASCII-8BIT') # This is the encoding returned by Sponge#fetch
allow_any_instance_of(Sponge).to receive(:fetch).and_return double(body: content)
end
it "can fetch unicode titles properly" do
res = fake_response(content, "text/html")
s = build(:story)
s.fetched_response = res
expect(s.fetched_attributes[:title]).to eq("你好世界! Heres a fancy apostrophe")
end
end