From ec0ce00b95c4d08c2c7c9bae79d16f84829349fb Mon Sep 17 00:00:00 2001
From: Go MAEDA
Date: Fri, 7 Jan 2022 01:29:55 +0000
Subject: [PATCH] Auto guess file encoding when importing CSV file (#34718).

Patch by Go MAEDA.

git-svn-id: http://svn.redmine.org/redmine/trunk@21352 e93f8b46-1217-0410-a6f0-8f06a7374b81
---
Review notes (ignored by git am; not part of the commit message):
* CodesetUtil.guess_encoding now rescues ArgumentError, the error that
  String#force_encoding raises for an unknown encoding name, and only
  reports an encoding when force_encoding itself succeeded.
* codeset_util_test.rb: added the missing test_ prefix so the nil case
  is actually run.
* issue_import_test.rb: fixed the "cannnot" typo in a test name.
* Line breaks and indentation were reconstructed; the blob ids on the
  index lines predate these changes.

 app/models/import.rb                       | 10 +++++++-
 lib/redmine/codeset_util.rb                | 20 ++++++++++++++++
 test/unit/issue_import_test.rb             | 27 ++++++++++++++++++++++
 test/unit/lib/redmine/codeset_util_test.rb | 17 ++++++++++++++
 4 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/app/models/import.rb b/app/models/import.rb
index 92752a3db..ffa69d22e 100644
--- a/app/models/import.rb
+++ b/app/models/import.rb
@@ -65,15 +65,23 @@ class Import < ActiveRecord::Base
 
   def set_default_settings(options={})
     separator = lu(user, :general_csv_separator)
+    encoding = lu(user, :general_csv_encoding)
     if file_exists?
       begin
         content = File.read(filepath, 256)
+
         separator = [',', ';'].sort_by {|sep| content.count(sep)}.last
+
+        guessed_encoding = Redmine::CodesetUtil.guess_encoding(file_content)
+        encoding =
+          (guessed_encoding && (
+            Setting::ENCODINGS.detect {|e| e.casecmp?(guessed_encoding)} ||
+            Setting::ENCODINGS.detect {|e| Encoding.find(e) == Encoding.find(guessed_encoding)}
+          )) || lu(user, :general_csv_encoding)
       rescue => e
       end
     end
     wrapper = '"'
-    encoding = lu(user, :general_csv_encoding)
 
     date_format = lu(user, "date.formats.default", :default => "foo")
     date_format = DATE_FORMATS.first unless DATE_FORMATS.include?(date_format)
diff --git a/lib/redmine/codeset_util.rb b/lib/redmine/codeset_util.rb
index 8261e572b..875689de2 100644
--- a/lib/redmine/codeset_util.rb
+++ b/lib/redmine/codeset_util.rb
@@ -75,5 +75,25 @@ module Redmine
         str = self.replace_invalid_utf8(str)
       end
     end
+
+    # Returns the first encoding listed in Setting.repositories_encodings
+    # for which str is a valid byte sequence, or nil if none matches.
+    def self.guess_encoding(str)
+      return if str.nil?
+
+      str = str.dup
+      encodings = Setting.repositories_encodings.split(',').collect(&:strip)
+      encodings = encodings.presence || ['UTF-8']
+
+      encodings.each do |encoding|
+        begin
+          str.force_encoding(encoding)
+          return encoding if str.valid_encoding?
+        rescue ArgumentError
+          # unknown encoding name in the setting: try the next one
+        end
+      end
+      nil
+    end
   end
 end
diff --git a/test/unit/issue_import_test.rb b/test/unit/issue_import_test.rb
index b08629b55..3f98f0372 100644
--- a/test/unit/issue_import_test.rb
+++ b/test/unit/issue_import_test.rb
@@ -411,4 +411,31 @@ class IssueImportTest < ActiveSupport::TestCase
 
     assert_empty import.mapping
   end
+
+  def test_set_default_settings_should_guess_encoding
+    import = generate_import('import_iso8859-1.csv')
+    with_settings :repositories_encodings => 'UTF-8,ISO-8859-1' do
+      import.set_default_settings
+      guessed_encoding = import.settings['encoding']
+      assert_equal 'ISO-8859-1', guessed_encoding
+    end
+    with_settings :repositories_encodings => 'UTF-8,iso8859-1' do
+      import.set_default_settings
+      guessed_encoding = import.settings['encoding']
+      assert_equal 'ISO-8859-1', guessed_encoding
+      assert_includes Setting::ENCODINGS, guessed_encoding
+    end
+  end
+
+  def test_set_default_settings_should_use_general_csv_encoding_when_cannot_guess_encoding
+    import = generate_import('import_iso8859-1.csv')
+    user = User.generate!(:language => 'ja')
+    import.user = user
+    with_settings :repositories_encodings => 'UTF-8' do
+      import.set_default_settings
+      guessed_encoding = import.settings['encoding']
+      assert_equal 'CP932', lu(user, :general_csv_encoding)
+      assert_equal 'CP932', guessed_encoding
+    end
+  end
 end
diff --git a/test/unit/lib/redmine/codeset_util_test.rb b/test/unit/lib/redmine/codeset_util_test.rb
index aaf664047..56094ecfa 100644
--- a/test/unit/lib/redmine/codeset_util_test.rb
+++ b/test/unit/lib/redmine/codeset_util_test.rb
@@ -101,4 +101,21 @@ class Redmine::CodesetUtilTest < ActiveSupport::TestCase
     assert_equal "UTF-8", s2.encoding.to_s
     assert_equal 'こんにち?', s2
   end
+
+  def test_guess_encoding_should_return_guessed_encoding
+    str = '日本語'.encode('Windows-31J').b
+    with_settings :repositories_encodings => 'UTF-8,Windows-31J' do
+      assert_equal 'Windows-31J', Redmine::CodesetUtil.guess_encoding(str)
+    end
+    with_settings :repositories_encodings => 'UTF-8,csWindows31J' do
+      assert_equal 'csWindows31J', Redmine::CodesetUtil.guess_encoding(str)
+    end
+  end
+
+  def test_guess_encoding_should_return_nil_if_cannot_guess_encoding
+    str = '日本語'.encode('Windows-31J').b
+    with_settings :repositories_encodings => 'UTF-8,EUC-JP' do
+      assert_nil Redmine::CodesetUtil.guess_encoding(str)
+    end
+  end
 end
-- 
2.39.5