Recognize arXiv HTML and PDF URLs as duplicates (#965)

2021-10-25 23:19:45 +01:00 · 2021-10-25 23:19:45 +01:00 · 9041e3df49
parent b4c64aba0e
commit 9041e3df49
2 changed files with 30 additions and 0 deletions
--- a/app/models/story.rb
+++ b/app/models/story.rb
@ -217,6 +217,16 @@ class Story < ApplicationRecord
    urls2 = [url.to_s.gsub(/(#.*)/, "")]
    urls_with_trailing_pound = []

+    # arxiv html page and its pdf link based off the [arxiv identifier](https://arxiv.org/help/arxiv_identifier)
+    if /^https?:\/\/(www\d*\.)?arxiv.org/i.match(url)
+      urls.each do |u|
+        urls2.push u.gsub(/(arxiv.org\/)abs(\/\d{4}.\d{4,5})/i, '\1pdf\2')
+        urls2.push u.gsub(/(arxiv.org\/)abs(\/\d{4}.\d{4,5})/i, '\1pdf\2.pdf')
+        urls2.push u.gsub(/(arxiv.org\/)pdf(\/\d{4}.\d{4,5})(.pdf)?/i, '\1abs\2')
+      end
+      urls = urls2.uniq
+    end
+
    # https
    urls.each do |u|
      urls2.push u.gsub(/^http:\/\//i, "https://")
--- a/spec/models/story_spec.rb
+++ b/spec/models/story_spec.rb
@ -307,6 +307,26 @@ describe Story do
      s = create(:story, url: 'http://aaonline.fr/search.php?search&criteria[title-contains]=debian')
      expect(s.similar_stories).to eq([])
    end
+
+    it "finds arxiv html page and pdf URLs with the same arxiv identifier" do
+      s1 = create(:story,
+                  url: 'https://arxiv.org/abs/2101.07554',
+                  created_at: (Story::RECENT_DAYS + 1).days.ago)
+      s2 = create(:story, url: 'https://arxiv.org/pdf/2101.07554')
+
+      expect(s1.similar_stories).to eq([s2])
+      expect(s2.similar_stories).to eq([s1])
+    end
+
+    it "finds similar arxiv html page and pdf URLs that contain a pdf extension" do
+      s1 = create(:story,
+                  url: 'https://arxiv.org/abs/2101.09188',
+                  created_at: (Story::RECENT_DAYS + 1).days.ago)
+      s2 = create(:story, url: 'https://arxiv.org/pdf/2101.09188.pdf')
+
+      expect(s1.similar_stories).to eq([s2])
+      expect(s2.similar_stories).to eq([s1])
+    end
  end

  describe "#calculated_hotness" do