From 11d668a8cb2c31c535c01a8afeaba9872258e4fe Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Wed, 12 Jun 2024 16:57:31 -0400 Subject: [PATCH] fix: correctly recognize encoding errors in libxml >= 2.12.0 Starting with libxml2 v2.12.0, encoding errors have a message that was not being detected by Page#encoding_error?, resulting in a page that was parsed using an incorrect encoding. This change updates Page to detect those errors as _encoding errors_, and as a result we get the expected encoding back on the parsed document. --- lib/mechanize/page.rb | 1 + test/test_mechanize_page_link.rb | 6 ++---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/lib/mechanize/page.rb b/lib/mechanize/page.rb index 96ffac06..3c8d698e 100644 --- a/lib/mechanize/page.rb +++ b/lib/mechanize/page.rb @@ -101,6 +101,7 @@ def encoding_error?(parser=nil) return false if parser.errors.empty? parser.errors.any? do |error| error.message.scrub =~ /(indicate\ encoding)| + (Invalid\ bytes)| (Invalid\ char)| (input\ conversion\ failed)/x end diff --git a/test/test_mechanize_page_link.rb b/test/test_mechanize_page_link.rb index aafdc1d0..76024a4d 100644 --- a/test/test_mechanize_page_link.rb +++ b/test/test_mechanize_page_link.rb @@ -114,13 +114,12 @@ def test_encoding_charset_after_title_bad # https://gitlab.gnome.org/GNOME/libxml2/-/issues/543 skip if Nokogiri.uses_libxml?([">= 2.11.0", "< 2.12.0"]) - expected_encoding = Nokogiri.uses_libxml?("< 2.11.0") ? 'UTF-8' : 'Shift_JIS' page = util_page UTF8.dup assert_equal false, page.encoding_error? - assert_equal expected_encoding, page.encoding + assert_equal "UTF-8", page.encoding end def test_encoding_charset_after_title_double_bad @@ -138,7 +137,6 @@ def test_encoding_charset_bad # https://gitlab.gnome.org/GNOME/libxml2/-/issues/543 skip if Nokogiri.uses_libxml?([">= 2.11.0", "< 2.12.0"]) - expected_encoding = Nokogiri.uses_libxml?("< 2.11.0") ? 
'UTF-8' : 'Shift_JIS' page = util_page(+"#{UTF8_TITLE}") page.encodings.replace %w[ @@ -148,7 +146,7 @@ def test_encoding_charset_bad assert_equal false, page.encoding_error? - assert_equal expected_encoding, page.encoding + assert_equal 'UTF-8', page.encoding end def test_encoding_meta_charset