parent
feea257c64
commit
5f50c79a20
1
Gemfile
1
Gemfile
|
@ -29,6 +29,7 @@ gem "rotp"
|
|||
gem "rqrcode"
|
||||
|
||||
# parsing
|
||||
gem "pdf-reader"
|
||||
gem "nokogiri", ">= 1.7.2"
|
||||
gem "htmlentities"
|
||||
gem "commonmarker", "~> 0.14"
|
||||
|
|
12
Gemfile.lock
12
Gemfile.lock
|
@ -1,6 +1,7 @@
|
|||
GEM
|
||||
remote: https://rubygems.org/
|
||||
specs:
|
||||
Ascii85 (1.0.3)
|
||||
actioncable (5.2.4.1)
|
||||
actionpack (= 5.2.4.1)
|
||||
nio4r (~> 2.0)
|
||||
|
@ -48,6 +49,7 @@ GEM
|
|||
tzinfo (~> 1.1)
|
||||
addressable (2.6.0)
|
||||
public_suffix (>= 2.0.2, < 4.0)
|
||||
afm (0.2.2)
|
||||
arel (9.0.0)
|
||||
ast (2.4.0)
|
||||
bcrypt (3.1.13)
|
||||
|
@ -90,6 +92,7 @@ GEM
|
|||
good_migrations (0.0.2)
|
||||
activerecord (>= 3.1)
|
||||
railties (>= 3.1)
|
||||
hashery (2.1.2)
|
||||
htmlentities (4.3.4)
|
||||
i18n (1.8.2)
|
||||
concurrent-ruby (~> 1.0)
|
||||
|
@ -124,6 +127,12 @@ GEM
|
|||
parallel (1.19.1)
|
||||
parser (2.7.0.2)
|
||||
ast (~> 2.4.0)
|
||||
pdf-reader (2.2.0)
|
||||
Ascii85 (~> 1.0.0)
|
||||
afm (~> 0.2.1)
|
||||
hashery (~> 2.0)
|
||||
ruby-rc4
|
||||
ttfunk
|
||||
public_suffix (3.1.1)
|
||||
rack (2.1.2)
|
||||
rack-test (1.1.0)
|
||||
|
@ -196,6 +205,7 @@ GEM
|
|||
ruby-enum (0.7.2)
|
||||
i18n
|
||||
ruby-progressbar (1.10.1)
|
||||
ruby-rc4 (0.1.5)
|
||||
ruby_dep (1.5.0)
|
||||
ruumba (0.1.10)
|
||||
rubocop
|
||||
|
@ -223,6 +233,7 @@ GEM
|
|||
transaction_retry (1.0.3)
|
||||
activerecord (>= 3.0.11)
|
||||
transaction_isolation (>= 1.0.2)
|
||||
ttfunk (1.5.1)
|
||||
tzinfo (1.2.6)
|
||||
thread_safe (~> 0.1)
|
||||
uglifier (4.1.20)
|
||||
|
@ -263,6 +274,7 @@ DEPENDENCIES
|
|||
mysql2
|
||||
nokogiri (>= 1.7.2)
|
||||
oauth
|
||||
pdf-reader
|
||||
rails (~> 5.2)
|
||||
rb-readline
|
||||
rotp
|
||||
|
|
|
@ -138,7 +138,7 @@ class Story < ApplicationRecord
|
|||
|
||||
attr_accessor :editing_from_suggestions, :editor, :fetching_ip, :is_hidden_by_cur_user,
|
||||
:is_saved_by_cur_user, :moderation_reason, :previewing, :seen_previous, :vote
|
||||
attr_writer :fetched_content
|
||||
attr_writer :fetched_response
|
||||
|
||||
before_validation :assign_short_id_and_upvote, :on => :create
|
||||
before_create :assign_initial_hotness
|
||||
|
@ -915,32 +915,9 @@ class Story < ApplicationRecord
|
|||
}.join(", ")
|
||||
end
|
||||
|
||||
def fetched_attributes
|
||||
return @fetched_attributes if @fetched_attributes
|
||||
|
||||
@fetched_attributes = {
|
||||
:url => self.url,
|
||||
:title => "",
|
||||
}
|
||||
|
||||
# security: do not connect to arbitrary user-submitted ports
|
||||
return @fetched_attributes if @url_port
|
||||
|
||||
if !@fetched_content
|
||||
begin
|
||||
s = Sponge.new
|
||||
s.timeout = 3
|
||||
# User submitted URLs may have an incorrect https certificate, but we
|
||||
# don't want to fail the retrieval for this. Security risk is minimal.
|
||||
s.ssl_verify = false
|
||||
user_agent = { "User-agent" => "#{Rails.application.domain} for #{fetching_ip}" }
|
||||
@fetched_content = s.fetch(url, :get, nil, nil, user_agent, 3).body.force_encoding('utf-8')
|
||||
rescue
|
||||
return @fetched_attributes
|
||||
end
|
||||
end
|
||||
|
||||
parsed = Nokogiri::HTML(@fetched_content.to_s)
|
||||
def fetched_attributes_html
|
||||
converted = @fetched_response.body.force_encoding('utf-8')
|
||||
parsed = Nokogiri::HTML(converted.to_s)
|
||||
|
||||
# parse best title from html tags
|
||||
# try <meta property="og:title"> first, it probably won't have the site
|
||||
|
@ -998,6 +975,54 @@ class Story < ApplicationRecord
|
|||
@fetched_attributes
|
||||
end
|
||||
|
||||
# Fills in the story title from the metadata of a fetched PDF.
#
# Reads the body of @fetched_response, parses it with PDF::Reader and copies
# the document's Info :Title entry into @fetched_attributes[:title].
#
# Returns @fetched_attributes. Bodies of 5 megabytes or more are not parsed:
# @fetched_attributes is reset to an empty hash and that hash is returned.
def fetched_attributes_pdf
  body = @fetched_response.body.to_s

  # Compare the size of the body in bytes. Comparing the String itself with
  # an Integer raises ArgumentError, so the size guard could never pass.
  return @fetched_attributes = {} if body.bytesize >= 5.megabytes

  # pdf-reader only accepts a stream or filename
  pdf = PDF::Reader.new(StringIO.new(body))

  # The Info dictionary and its Title entry are optional in a PDF; keep the
  # title that is already set rather than overwriting it with nil.
  title = (pdf.info || {})[:Title]
  @fetched_attributes[:title] = title if title

  @fetched_attributes
end
|
||||
|
||||
# Returns the attributes scraped from the story's URL, memoized in
# @fetched_attributes.
#
# The result always starts as { :url => self.url, :title => "" }. Nothing is
# fetched when @url_port is set. Otherwise the response (fetched through
# Sponge unless one was already assigned to @fetched_response) is handed to
# fetched_attributes_pdf when its content-type mentions "pdf", and to
# fetched_attributes_html in every other case. Any StandardError raised while
# fetching or parsing is swallowed and the attributes gathered so far are
# returned.
def fetched_attributes
  return @fetched_attributes if @fetched_attributes

  @fetched_attributes = { :url => self.url, :title => "" }

  # security: do not connect to arbitrary user-submitted ports
  return @fetched_attributes if @url_port

  begin
    # only go to the network when nothing (e.g. a test) injected a response
    unless @fetched_response
      sponge = Sponge.new
      sponge.timeout = 3
      # User submitted URLs may have an incorrect https certificate, but we
      # don't want to fail the retrieval for this. Security risk is minimal.
      sponge.ssl_verify = false
      headers = { "User-agent" => "#{Rails.application.domain} for #{fetching_ip}" }
      @fetched_response = sponge.fetch(url, :get, nil, nil, headers, 3)
    end

    # dispatch on the response's declared content type
    case @fetched_response["content-type"]
    when /pdf/ then fetched_attributes_pdf
    else fetched_attributes_html
    end
  rescue
    @fetched_attributes
  end
end
|
||||
|
||||
private
|
||||
|
||||
def valid_canonical_uri?(url)
|
||||
|
|
|
@ -152,13 +152,26 @@ describe Story do
|
|||
context 'fetching titles' do
|
||||
let(:story_directory) { Rails.root.join 'spec/fixtures/story_pages/' }
|
||||
|
||||
# this is more elaborate than the previous system, because now it needs to know the content type
|
||||
# Builds a stand-in HTTP response for the title-fetching specs.
#
# content - the String the response's #body should return
# type    - the value for the content-type header
# code    - the HTTP status code (defaults to '200')
#
# Returns a Net::HTTPResponse whose #body is stubbed to return content.
def fake_response(content, type, code = '200')
  Net::HTTPResponse.new(1.0, code, "OK").tap do |response|
    response.add_field("content-type", type)
    # we can't seemingly just set body, so stub the reader instead
    allow(response).to receive(:body).and_return(content)
  end
end
|
||||
|
||||
it "can fetch its title properly" do
|
||||
content = File.read(story_directory + "title_ampersand.html")
|
||||
res = fake_response(content, "text/html")
|
||||
s = build(:story)
|
||||
s.fetched_content = File.read(story_directory + "title_ampersand.html")
|
||||
s.fetched_response = res
|
||||
expect(s.fetched_attributes[:title]).to eq("B2G demo & quick hack // by Paul Rouget")
|
||||
|
||||
content = File.read(story_directory + "title_google.html")
|
||||
res = fake_response(content, "text/html")
|
||||
s = build(:story)
|
||||
s.fetched_content = File.read(story_directory + "title_google.html")
|
||||
s.fetched_response = res
|
||||
expect(s.fetched_attributes[:title]).to eq("Google")
|
||||
end
|
||||
|
||||
|
@ -171,44 +184,46 @@ describe Story do
|
|||
it "does not follow rel=canonical when this is to the main page" do
  url = "https://www.mcsweeneys.net/articles/who-said-it-donald-trump-or-regina-george"
  # fetched_attributes reads a content-type header and a body from the
  # response, so wrap the fixture in a response object; assigning the raw
  # HTML String makes parsing raise and the example pass without ever
  # looking at the fixture.
  res = fake_response(File.read(story_directory + "canonical_root.html"), "text/html")
  s = build(:story, url: url)
  s.fetched_response = res
  expect(s.fetched_attributes[:url]).to eq(url)
end
|
||||
|
||||
it "does not assign canonical url when the response is non-200" do
|
||||
url = "https://www.mcsweeneys.net/a/who-said-it-donald-trump-or-regina-george"
|
||||
content = File.read(story_directory + "canonical_error.html")
|
||||
res = fake_response(content, "text/html", '404')
|
||||
|
||||
expect_any_instance_of(Sponge)
|
||||
.to receive(:fetch)
|
||||
.and_return(Net::HTTPResponse.new(1.0, 404, "OK"))
|
||||
|
||||
s = build(:story, url: url)
|
||||
s.fetched_content = File.read(story_directory + "canonical_error.html")
|
||||
s.fetched_response = res
|
||||
expect(s.fetched_attributes[:url]).to eq(url)
|
||||
end
|
||||
|
||||
it "assigns canonical when url when it resolves 200" do
|
||||
url = "https://www.mcsweeneys.net/a/who-said-it-donald-trump-or-regina-george"
|
||||
canonical = "https://www.mcsweeneys.net/articles/who-said-it-donald-trump-or-regina-george"
|
||||
content = File.read(story_directory + "canonical_error.html")
|
||||
res = fake_response(content, "text/html")
|
||||
|
||||
expect_any_instance_of(Sponge)
|
||||
.to receive(:fetch)
|
||||
.and_return(Net::HTTPResponse.new(1.0, 200, "OK"))
|
||||
|
||||
s = build(:story, url: url)
|
||||
s.fetched_content = File.read(story_directory + "canonical_error.html")
|
||||
s.fetched_response = res
|
||||
expect(s.fetched_attributes[:url]).to eq(canonical)
|
||||
end
|
||||
|
||||
context "with unicode" do
|
||||
before do
|
||||
it "can fetch unicode titles properly" do
|
||||
content = "<!DOCTYPE html><html><title>你好世界! Here’s a fancy apostrophe</title></html>"
|
||||
.force_encoding('ASCII-8BIT') # This is the encoding returned by Sponge#fetch
|
||||
allow_any_instance_of(Sponge).to receive(:fetch).and_return double(body: content)
|
||||
end
|
||||
|
||||
it "can fetch unicode titles properly" do
|
||||
res = fake_response(content, "text/html")
|
||||
s = build(:story)
|
||||
s.fetched_response = res
|
||||
expect(s.fetched_attributes[:title]).to eq("你好世界! Here’s a fancy apostrophe")
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in New Issue