Recognize arXiv HTML and PDF URLs as duplicates (#965)
This commit is contained in:
parent
b4c64aba0e
commit
9041e3df49
|
@ -217,6 +217,16 @@ class Story < ApplicationRecord
|
|||
urls2 = [url.to_s.gsub(/(#.*)/, "")]
|
||||
urls_with_trailing_pound = []
|
||||
|
||||
# arxiv html page and its pdf link based off the [arxiv identifier](https://arxiv.org/help/arxiv_identifier)
|
||||
if /^https?:\/\/(www\d*\.)?arxiv.org/i.match(url)
|
||||
urls.each do |u|
|
||||
urls2.push u.gsub(/(arxiv.org\/)abs(\/\d{4}.\d{4,5})/i, '\1pdf\2')
|
||||
urls2.push u.gsub(/(arxiv.org\/)abs(\/\d{4}.\d{4,5})/i, '\1pdf\2.pdf')
|
||||
urls2.push u.gsub(/(arxiv.org\/)pdf(\/\d{4}.\d{4,5})(.pdf)?/i, '\1abs\2')
|
||||
end
|
||||
urls = urls2.uniq
|
||||
end
|
||||
|
||||
# https
|
||||
urls.each do |u|
|
||||
urls2.push u.gsub(/^http:\/\//i, "https://")
|
||||
|
|
|
@ -307,6 +307,26 @@ describe Story do
|
|||
s = create(:story, url: 'http://aaonline.fr/search.php?search&criteria[title-contains]=debian')
|
||||
expect(s.similar_stories).to eq([])
|
||||
end
|
||||
|
||||
it "finds arxiv html page and pdf URLs with the same arxiv identifier" do
|
||||
s1 = create(:story,
|
||||
url: 'https://arxiv.org/abs/2101.07554',
|
||||
created_at: (Story::RECENT_DAYS + 1).days.ago)
|
||||
s2 = create(:story, url: 'https://arxiv.org/pdf/2101.07554')
|
||||
|
||||
expect(s1.similar_stories).to eq([s2])
|
||||
expect(s2.similar_stories).to eq([s1])
|
||||
end
|
||||
|
||||
it "finds similar arxiv html page and pdf URLs that contain a pdf extension" do
|
||||
s1 = create(:story,
|
||||
url: 'https://arxiv.org/abs/2101.09188',
|
||||
created_at: (Story::RECENT_DAYS + 1).days.ago)
|
||||
s2 = create(:story, url: 'https://arxiv.org/pdf/2101.09188.pdf')
|
||||
|
||||
expect(s1.similar_stories).to eq([s2])
|
||||
expect(s2.similar_stories).to eq([s1])
|
||||
end
|
||||
end
|
||||
|
||||
describe "#calculated_hotness" do
|
||||
|
|
Loading…
Reference in New Issue