From 1d46be8b0f037e173ba4bf2c68ab0d0c33f1eb48 Mon Sep 17 00:00:00 2001
From: Go MAEDA
Date: Sun, 20 Oct 2024 06:47:28 +0000
Subject: [PATCH] Fix CSV import file encoding auto-detection failure with multibyte characters (#41464).

Patch by Go MAEDA (user:maeda).

git-svn-id: https://svn.redmine.org/redmine/trunk@23150 e93f8b46-1217-0410-a6f0-8f06a7374b81
---
 app/models/import.rb                        | 23 ++++++++++++++++++++++-
 test/fixtures/files/mbcs-multiline-text.txt | 17 +++++++++++++++++
 test/unit/issue_import_test.rb              | 17 +++++++++++++++++
 3 files changed, 56 insertions(+), 1 deletion(-)
 create mode 100644 test/fixtures/files/mbcs-multiline-text.txt

diff --git a/app/models/import.rb b/app/models/import.rb
index 94e44c5e2..caf673e9a 100644
--- a/app/models/import.rb
+++ b/app/models/import.rb
@@ -69,7 +69,7 @@ class Import < ApplicationRecord
     encoding = lu(user, :general_csv_encoding)
     if file_exists?
       begin
-        content = File.read(filepath, 256)
+        content = read_file_head
 
         separator = [',', ';'].max_by {|sep| content.count(sep)}
         wrapper = ['"', "'"].max_by {|quote_char| content.count(quote_char)}
@@ -248,6 +248,27 @@ class Import < ApplicationRecord
 
   private
 
+  # Reads lines from the beginning of the file, up to the specified number
+  # of bytes (max_read_bytes). The file is always read in binary mode.
+  #
+  # max_read_bytes - maximum number of bytes to read (default: 4096).
+  #
+  # Returns '' when file_exists? is false, the whole file when it is no
+  # larger than max_read_bytes, and otherwise the first max_read_bytes
+  # bytes cut back to the last line break they contain (if any).
+  def read_file_head(max_read_bytes = 4096)
+    return '' unless file_exists?
+    return File.binread(filepath) if File.size(filepath) <= max_read_bytes
+
+    # The last byte of the chunk may be part of a multi-byte character,
+    # causing an invalid byte sequence. To avoid this, truncate the chunk
+    # at the last line break, if found. CR is accepted as well as LF so
+    # that files with CR-only line endings are handled too.
+    chunk = File.binread(filepath, max_read_bytes)
+    last_eol_index = chunk.rindex(/[\r\n]/)
+    last_eol_index ? chunk[..last_eol_index] : chunk
+  end
+
   def read_rows
     return unless file_exists?
 
diff --git a/test/fixtures/files/mbcs-multiline-text.txt b/test/fixtures/files/mbcs-multiline-text.txt new file mode 100644 index 000000000..f847113f2 --- /dev/null +++ b/test/fixtures/files/mbcs-multiline-text.txt @@ -0,0 +1,17 @@ +An emoticon is represented by 4 bytes in UTF-8 encoding. + +If you simply read the first 4096 bytes of this file, the trailing characters of a multi-byte sequence might be cut off, resulting in an invalid UTF-8 string. + +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 +😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏 diff --git a/test/unit/issue_import_test.rb b/test/unit/issue_import_test.rb index ed913fe44..a47dd6e23 100644 --- a/test/unit/issue_import_test.rb +++ b/test/unit/issue_import_test.rb @@ -464,6 +464,23 @@ class IssueImportTest < ActiveSupport::TestCase end end + def test_encoding_guessing_respects_multibyte_boundaries + # Reading a specified number of bytes from the beginning of this file + # may stop in the middle of a multi-byte character, which can lead to + # an invalid 
UTF-8 string. + test_file = 'mbcs-multiline-text.txt' + chunk = File.read(Rails.root.join('test', 'fixtures', 'files', test_file), 4096) + chunk.force_encoding('UTF-8') # => "...😃😄😅\xF0\x9F" + assert_not chunk.valid_encoding? + + import = generate_import(test_file) + with_settings :repositories_encodings => 'UTF-8,ISO-8859-1' do + import.set_default_settings + guessed_encoding = import.settings['encoding'] + assert_equal 'UTF-8', guessed_encoding + end + end + def test_set_default_settings_should_detect_field_wrapper to_test = { 'import_issues.csv' => '"', -- 2.39.5