From f96e4b30473cf31acfa450a71b65dff02ea33d19 Mon Sep 17 00:00:00 2001 From: Christian Schmidt Date: Fri, 28 Jul 2023 23:02:08 +0200 Subject: [PATCH] Use original URL in preview if it redirects to 4xx page (#26200) --- app/services/fetch_link_card_service.rb | 14 +- spec/fixtures/requests/idn.txt | 483 ------------------ spec/services/fetch_link_card_service_spec.rb | 209 ++++++-- 3 files changed, 179 insertions(+), 527 deletions(-) delete mode 100644 spec/fixtures/requests/idn.txt diff --git a/app/services/fetch_link_card_service.rb b/app/services/fetch_link_card_service.rb index dccd86c1e6..8865824609 100644 --- a/app/services/fetch_link_card_service.rb +++ b/app/services/fetch_link_card_service.rb @@ -45,20 +45,18 @@ class FetchLinkCardService < BaseService def html return @html if defined?(@html) - Request.new(:get, @url).add_headers('Accept' => 'text/html', 'User-Agent' => "#{Mastodon::Version.user_agent} Bot").perform do |res| + @html = Request.new(:get, @url).add_headers('Accept' => 'text/html', 'User-Agent' => "#{Mastodon::Version.user_agent} Bot").perform do |res| + next unless res.code == 200 && res.mime_type == 'text/html' + # We follow redirects, and ideally we want to save the preview card for # the destination URL and not any link shortener in-between, so here # we set the URL to the one of the last response in the redirect chain @url = res.request.uri.to_s @card = PreviewCard.find_or_initialize_by(url: @url) if @card.url != @url - if res.code == 200 && res.mime_type == 'text/html' - @html_charset = res.charset - @html = res.body_with_limit - else - @html_charset = nil - @html = nil - end + @html_charset = res.charset + + res.body_with_limit end end diff --git a/spec/fixtures/requests/idn.txt b/spec/fixtures/requests/idn.txt deleted file mode 100644 index 5d07f2b796..0000000000 --- a/spec/fixtures/requests/idn.txt +++ /dev/null @@ -1,483 +0,0 @@ -HTTP/1.1 200 OK -Server: nginx -Date: Sun, 23 Apr 2017 19:37:13 GMT -Content-Type: text/html -Content-Length: 38111 -Last-Modified: Wed, 20 Jul 2016 02:50:52 GMT -Connection: keep-alive -Accept-Ranges: bytes - - - - - - - - - - - - - - - 中国域名网站 - - - -
- -
- -
- - -
-
-
-

网址大全

- -
-
-
-
-
-

中文域名简介

-

- “中国域名”是中文域名的一种,特指以“中国”为后缀的中文域名,是我国域名体系和全球互联网域名体系的重要组成部分。“中国”是在全球互联网上代表中国的中文顶级域名,于2010年7月正式纳入全球互联网域名体系,全球互联网域名体系,全球网民可通过联网计算机在世界任何国家和地区实现无障碍访问。“中国”域名在使用上和 .CN,相似属于互联网上的基础服务,基于域名可以提供WWW.EMAIL FTP等应用服务。 -

-
- - - - diff --git a/spec/services/fetch_link_card_service_spec.rb b/spec/services/fetch_link_card_service_spec.rb index eba38b6998..f44cbb750c 100644 --- a/spec/services/fetch_link_card_service_spec.rb +++ b/spec/services/fetch_link_card_service_spec.rb @@ -5,96 +5,233 @@ require 'rails_helper' RSpec.describe FetchLinkCardService, type: :service do subject { described_class.new } + let(:html) { 'Hello world' } + let(:oembed_cache) { nil } + before do - stub_request(:get, 'http://example.xn--fiqs8s/').to_return(request_fixture('idn.txt')) + stub_request(:get, 'http://example.com/html').to_return(headers: { 'Content-Type' => 'text/html' }, body: html) + stub_request(:get, 'http://example.com/not-found').to_return(status: 404, headers: { 'Content-Type' => 'text/html' }, body: html) + stub_request(:get, 'http://example.com/text').to_return(status: 404, headers: { 'Content-Type' => 'text/plain' }, body: 'Hello') + stub_request(:get, 'http://example.com/redirect').to_return(status: 302, headers: { 'Location' => 'http://example.com/html' }) + stub_request(:get, 'http://example.com/redirect-to-404').to_return(status: 302, headers: { 'Location' => 'http://example.com/not-found' }) + stub_request(:get, 'http://example.com/oembed?url=http://example.com/html').to_return(headers: { 'Content-Type' => 'application/json' }, body: '{ "version": "1.0", "type": "link", "title": "oEmbed title" }') + stub_request(:get, 'http://example.com/oembed?format=json&url=http://example.com/html').to_return(headers: { 'Content-Type' => 'application/json' }, body: '{ "version": "1.0", "type": "link", "title": "oEmbed title" }') + + stub_request(:get, 'http://example.xn--fiqs8s') + stub_request(:get, 'http://example.com/日本語') + stub_request(:get, 'http://example.com/test?data=file.gpx%5E1') + stub_request(:get, 'http://example.com/test-') + stub_request(:get, 'http://example.com/sjis').to_return(request_fixture('sjis.txt')) stub_request(:get, 'http://example.com/sjis_with_wrong_charset').to_return(request_fixture('sjis_with_wrong_charset.txt')) stub_request(:get, 'http://example.com/koi8-r').to_return(request_fixture('koi8-r.txt')) - stub_request(:get, 'http://example.com/日本語').to_return(request_fixture('sjis.txt')) - stub_request(:get, 'https://github.com/qbi/WannaCry').to_return(status: 404) - stub_request(:get, 'http://example.com/test?data=file.gpx%5E1').to_return(status: 200) - stub_request(:get, 'http://example.com/test-').to_return(request_fixture('idn.txt')) stub_request(:get, 'http://example.com/windows-1251').to_return(request_fixture('windows-1251.txt')) + Rails.cache.write('oembed_endpoint:example.com', oembed_cache) if oembed_cache + subject.call(status) end context 'with a local status' do - context 'with an IDN url' do + context 'with URL of a regular HTML page' do + let(:status) { Fabricate(:status, text: 'http://example.com/html') } + + it 'creates preview card' do + expect(status.preview_card).to_not be_nil + expect(status.preview_card.url).to eq 'http://example.com/html' + expect(status.preview_card.title).to eq 'Hello world' + end + end + + context 'with URL of a page with no title' do + let(:status) { Fabricate(:status, text: 'http://example.com/html') } + let(:html) { '' } + + it 'does not create a preview card' do + expect(status.preview_card).to be_nil + end + end + + context 'with a URL of a plain-text page' do + let(:status) { Fabricate(:status, text: 'http://example.com/text') } + + it 'does not create a preview card' do + expect(status.preview_card).to be_nil + end + end + + context 'with multiple URLs' do + let(:status) { Fabricate(:status, text: 'ftp://example.com http://example.com/html http://example.com/text') } + + it 'fetches the first valid URL' do + expect(a_request(:get, 'http://example.com/html')).to have_been_made + end + + it 'does not fetch the second valid URL' do + expect(a_request(:get, 'http://example.com/text/')).to_not have_been_made + end + end + + context 'with a redirect URL' do + let(:status) { Fabricate(:status, text: 'http://example.com/redirect') } + + it 'follows redirect' do + expect(a_request(:get, 'http://example.com/redirect')).to have_been_made.once + expect(a_request(:get, 'http://example.com/html')).to have_been_made.once + end + + it 'creates preview card' do + expect(status.preview_card).to_not be_nil + expect(status.preview_card.url).to eq 'http://example.com/html' + expect(status.preview_card.title).to eq 'Hello world' + end + end + + context 'with a broken redirect URL' do + let(:status) { Fabricate(:status, text: 'http://example.com/redirect-to-404') } + + it 'follows redirect' do + expect(a_request(:get, 'http://example.com/redirect-to-404')).to have_been_made.once + expect(a_request(:get, 'http://example.com/not-found')).to have_been_made.once + end + + it 'does not create a preview card' do + expect(status.preview_card).to be_nil + end + end + + context 'with a 404 URL' do + let(:status) { Fabricate(:status, text: 'http://example.com/not-found') } + + it 'does not create a preview card' do + expect(status.preview_card).to be_nil + end + end + + context 'with an IDN URL' do let(:status) { Fabricate(:status, text: 'Check out http://example.中国') } - it 'works with IDN URLs' do - expect(a_request(:get, 'http://example.xn--fiqs8s/')).to have_been_made.at_least_once + it 'fetches the URL' do + expect(a_request(:get, 'http://example.xn--fiqs8s/')).to have_been_made.once end end - context 'with an SJIS url' do + context 'with a URL of a page in Shift JIS encoding' do let(:status) { Fabricate(:status, text: 'Check out http://example.com/sjis') } - it 'works with SJIS' do - expect(a_request(:get, 'http://example.com/sjis')).to have_been_made.at_least_once + it 'decodes the HTML' do expect(status.preview_cards.first.title).to eq('SJISのページ') end end - context 'with invalid SJIS url' do + context 'with a URL of a page in Shift JIS encoding labeled as UTF-8' do let(:status) { Fabricate(:status, text: 'Check out http://example.com/sjis_with_wrong_charset') } - it 'works with SJIS even with wrong charset header' do - expect(a_request(:get, 'http://example.com/sjis_with_wrong_charset')).to have_been_made.at_least_once + it 'decodes the HTML despite the wrong charset header' do expect(status.preview_cards.first.title).to eq('SJISのページ') end end - context 'with an koi8-r url' do + context 'with a URL of a page in KOI8-R encoding' do let(:status) { Fabricate(:status, text: 'Check out http://example.com/koi8-r') } - it 'works with koi8-r' do - expect(a_request(:get, 'http://example.com/koi8-r')).to have_been_made.at_least_once + it 'decodes the HTML' do expect(status.preview_cards.first.title).to eq('Московя начинаетъ только въ XVI ст. привлекать внимане иностранцевъ.') end end - context 'with a windows-1251 url' do + context 'with a URL of a page in Windows-1251 encoding' do let(:status) { Fabricate(:status, text: 'Check out http://example.com/windows-1251') } - it 'works with windows-1251' do - expect(a_request(:get, 'http://example.com/windows-1251')).to have_been_made.at_least_once + it 'decodes the HTML' do expect(status.preview_cards.first.title).to eq('сэмпл текст') end end - context 'with a japanese path url' do + context 'with a Japanese path URL' do let(:status) { Fabricate(:status, text: 'テストhttp://example.com/日本語') } - it 'works with Japanese path string' do - expect(a_request(:get, 'http://example.com/日本語')).to have_been_made.at_least_once - expect(status.preview_cards.first.title).to eq('SJISのページ') + it 'fetches the URL' do + expect(a_request(:get, 'http://example.com/日本語')).to have_been_made.once end end - context 'with a hyphen-suffixed url' do + context 'with a hyphen-suffixed URL' do let(:status) { Fabricate(:status, text: 'test http://example.com/test-') } - it 'works with a URL ending with a hyphen' do - expect(a_request(:get, 'http://example.com/test-')).to have_been_made.at_least_once + it 'fetches the URL' do + expect(a_request(:get, 'http://example.com/test-')).to have_been_made.once end end - context 'with an isolated url' do + context 'with a caret-suffixed URL' do + let(:status) { Fabricate(:status, text: 'test http://example.com/test?data=file.gpx^1') } + + it 'fetches the URL' do + expect(a_request(:get, 'http://example.com/test?data=file.gpx%5E1')).to have_been_made.once + end + + it 'does not strip the caret before fetching' do + expect(a_request(:get, 'http://example.com/test?data=file.gpx')).to_not have_been_made + end + end + + context 'with a non-isolated URL' do let(:status) { Fabricate(:status, text: 'testhttp://example.com/sjis') } - it 'does not fetch URLs with not isolated from their surroundings' do + it 'does not fetch URLs not isolated from their surroundings' do expect(a_request(:get, 'http://example.com/sjis')).to_not have_been_made end end - context 'with a url that has a caret' do - let(:status) { Fabricate(:status, text: 'test http://example.com/test?data=file.gpx^1') } + context 'with a URL of a page with oEmbed support' do + let(:html) { 'Hello world' } + let(:status) { Fabricate(:status, text: 'http://example.com/html') } - it 'does fetch URLs with a caret in search params' do - expect(a_request(:get, 'http://example.com/test?data=file.gpx')).to_not have_been_made - expect(a_request(:get, 'http://example.com/test?data=file.gpx%5E1')).to have_been_made.once + it 'fetches the oEmbed URL' do + expect(a_request(:get, 'http://example.com/oembed?url=http://example.com/html')).to have_been_made.once + end + + it 'creates preview card' do + expect(status.preview_card).to_not be_nil + expect(status.preview_card.url).to eq 'http://example.com/html' + expect(status.preview_card.title).to eq 'oEmbed title' + end + + context 'when oEmbed endpoint cache populated' do + let(:oembed_cache) { { endpoint: 'http://example.com/oembed?format=json&url={url}', format: :json } } + + it 'uses the cached oEmbed response' do + expect(a_request(:get, 'http://example.com/oembed?url=http://example.com/html')).to_not have_been_made + expect(a_request(:get, 'http://example.com/oembed?format=json&url=http://example.com/html')).to have_been_made + end + + it 'creates preview card' do + expect(status.preview_card).to_not be_nil + expect(status.preview_card.url).to eq 'http://example.com/html' + expect(status.preview_card.title).to eq 'oEmbed title' + end + end + + # If the original HTML URL for whatever reason (e.g. DOS protection) redirects to + # an error page, we can still use the cached oEmbed but should not use the + # redirect URL on the card. + context 'when oEmbed endpoint cache populated but page returns 404' do + let(:status) { Fabricate(:status, text: 'http://example.com/redirect-to-404') } + let(:oembed_cache) { { endpoint: 'http://example.com/oembed?url=http://example.com/html', format: :json } } + + it 'uses the cached oEmbed response' do + expect(a_request(:get, 'http://example.com/oembed?url=http://example.com/html')).to have_been_made + end + + it 'creates preview card' do + expect(status.preview_card).to_not be_nil + expect(status.preview_card.title).to eq 'oEmbed title' + end + + it 'uses the original URL' do + expect(status.preview_card&.url).to eq 'http://example.com/redirect-to-404' + end end end end @@ -104,13 +241,13 @@ RSpec.describe FetchLinkCardService, type: :service do Fabricate(:status, account: Fabricate(:account, domain: 'example.com'), text: <<-TEXT) Habt ihr ein paar gute Links zu foo #Wannacry herumfliegen? - Ich will mal unter
https://github.com/qbi/WannaCry was sammeln. ! + Ich will mal unter
http://example.com/not-found was sammeln. ! security  TEXT end it 'parses out URLs' do - expect(a_request(:get, 'https://github.com/qbi/WannaCry')).to have_been_made.at_least_once + expect(a_request(:get, 'http://example.com/not-found')).to have_been_made.once end it 'ignores URLs to hashtags' do