Recognize arXiv HTML and PDF URLs as duplicates (#965)

This commit is contained in:
Prabu Weerasinghe 2021-10-25 23:19:45 +01:00 committed by GitHub
parent b4c64aba0e
commit 9041e3df49
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 30 additions and 0 deletions

View File

@ -217,6 +217,16 @@ class Story < ApplicationRecord
urls2 = [url.to_s.gsub(/(#.*)/, "")]
urls_with_trailing_pound = []
# arXiv abstract (html) pages and their pdf links share the same
# [arxiv identifier](https://arxiv.org/help/arxiv_identifier), so treat
# /abs/<id>, /pdf/<id> and /pdf/<id>.pdf as the same story.
#
# Every literal dot in the patterns is escaped: an unescaped "." matches any
# character, which let look-alike hosts (e.g. "arxivXorg") and malformed
# identifiers slip through. \A anchors at the start of the whole string
# instead of at the start of any line.
if /\Ahttps?:\/\/(www\d*\.)?arxiv\.org/i.match(url)
  urls.each do |u|
    # abstract page -> pdf link, without and with the ".pdf" extension
    urls2.push u.gsub(/(arxiv\.org\/)abs(\/\d{4}\.\d{4,5})/i, '\1pdf\2')
    urls2.push u.gsub(/(arxiv\.org\/)abs(\/\d{4}\.\d{4,5})/i, '\1pdf\2.pdf')
    # pdf link (optional ".pdf" extension) -> abstract page
    urls2.push u.gsub(/(arxiv\.org\/)pdf(\/\d{4}\.\d{4,5})(\.pdf)?/i, '\1abs\2')
  end
  # gsub returns the input unchanged when nothing matches, so drop the
  # duplicates that the non-matching substitutions pushed.
  # NOTE(review): a pdf link is only mapped to its abstract page here; the
  # pdf <-> pdf.pdf pairing is not generated by this block.
  urls = urls2.uniq
end
# https
urls.each do |u|
urls2.push u.gsub(/^http:\/\//i, "https://")

View File

@ -307,6 +307,26 @@ describe Story do
s = create(:story, url: 'http://aaonline.fr/search.php?search&criteria[title-contains]=debian')
expect(s.similar_stories).to eq([])
end
# An arXiv abstract page and the extension-less pdf link for the same
# identifier must be reported as similar to each other, in both directions.
it "finds arxiv html page and pdf URLs with the same arxiv identifier" do
  # The abstract-page story is backdated to one day beyond Story::RECENT_DAYS.
  backdated = (Story::RECENT_DAYS + 1).days.ago
  abs_story = create(:story, url: 'https://arxiv.org/abs/2101.07554', created_at: backdated)
  pdf_story = create(:story, url: 'https://arxiv.org/pdf/2101.07554')

  expect(abs_story.similar_stories).to eq([pdf_story])
  expect(pdf_story.similar_stories).to eq([abs_story])
end
# Same pairing as the extension-less case, but the pdf link carries an
# explicit ".pdf" suffix after the arXiv identifier.
it "finds similar arxiv html page and pdf URLs that contain a pdf extension" do
  # The abstract-page story is backdated to one day beyond Story::RECENT_DAYS.
  backdated = (Story::RECENT_DAYS + 1).days.ago
  abs_story = create(:story, url: 'https://arxiv.org/abs/2101.09188', created_at: backdated)
  pdf_story = create(:story, url: 'https://arxiv.org/pdf/2101.09188.pdf')

  expect(abs_story.similar_stories).to eq([pdf_story])
  expect(pdf_story.similar_stories).to eq([abs_story])
end
end
describe "#calculated_hotness" do