Add youtube duplicate detection (#1174)

This commit is contained in:
Patryk 2023-04-26 15:21:05 +02:00 committed by GitHub
parent 64ba31c4d0
commit f3ab19ee1c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 44 additions and 0 deletions

View File

@ -237,6 +237,28 @@ class Story < ApplicationRecord
urls = urls2.uniq
end
# YouTube serves the same video under several hosts and URL shapes:
#   www.youtube.com
#   m.youtube.com
#   youtube.com redirects to www.youtube.com
#   youtu.be redirects to www.youtube.com
#   www.m.youtube.com doesn't work
#   www.youtu.be doesn't exist
#   m.youtu.be doesn't exist
# When the submitted URL is on a YouTube host, add every working variant of
# each candidate's video URL to urls2 so any of them counts as a duplicate.
if /^https?:\/\/((?:www\d*|m)\.)?(youtube\.com|youtu\.be)/i.match?(url)
  urls.each do |u|
    # Only "watch?v=<id>" and "youtu.be/<id>" URLs carry a video id. Other
    # candidates (e.g. a channel or playlist page) yield no match and are
    # skipped rather than raising NoMethodError on nil.
    # The id class is spelled out as A-Za-z because the ASCII range A-z would
    # also admit "[", "\", "]", "^" and "`".
    video = /^https?:\/\/(?:(?:m|www)\.)?(?:youtube\.com\/watch\?v=|youtu\.be\/)([A-Za-z0-9_-]+)/i
      .match(u)
    next unless video

    id = video[1]
    urls2.push "https://www.youtube.com/watch?v=#{id}"
    # In theory, youtube redirects https://youtube.com to https://www.youtube.com
    # let's check it just in case
    urls2.push "https://youtube.com/watch?v=#{id}"
    urls2.push "https://youtu.be/#{id}"
    urls2.push "https://m.youtube.com/watch?v=#{id}"
  end
  urls = urls2.uniq
end
# https
urls.each do |u|
urls2.push u.gsub(/^http:\/\//i, "https://")

View File

@ -327,6 +327,28 @@ describe Story do
expect(s1.similar_stories).to eq([s2])
expect(s2.similar_stories).to eq([s1])
end
# A www.youtube.com watch URL and the youtu.be short link for the same video
# id must each report the other story as similar.
it "finds similar www.youtube and youtu.be URLs" do
  long_form = create(
    :story,
    url: "https://www.youtube.com/watch?v=7Pq-S557XQU",
    # backdated by RECENT_DAYS + 1 days
    created_at: (Story::RECENT_DAYS + 1).days.ago
  )
  short_form = create(:story, url: "https://youtu.be/7Pq-S557XQU")

  expect(long_form.similar_stories).to eq([short_form])
  expect(short_form.similar_stories).to eq([long_form])
end
# A www.youtube.com watch URL and the m.youtube.com URL for the same video id
# must each report the other story as similar.
it "finds similar www.youtube and m.youtube URLs" do
  desktop = create(
    :story,
    url: "https://www.youtube.com/watch?v=7Pq-S557XQU",
    # backdated by RECENT_DAYS + 1 days
    created_at: (Story::RECENT_DAYS + 1).days.ago
  )
  mobile = create(:story, url: "https://m.youtube.com/watch?v=7Pq-S557XQU")

  expect(desktop.similar_stories).to eq([mobile])
  expect(mobile.similar_stories).to eq([desktop])
end
end
describe "#calculated_hotness" do