summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Go MAEDA <maeda@farend.jp> 2024-10-20 06:47:28 +0000
committer Go MAEDA <maeda@farend.jp> 2024-10-20 06:47:28 +0000
commit 1d46be8b0f037e173ba4bf2c68ab0d0c33f1eb48 (patch)
tree 9be98ebfb892e7726543079138f521997cfbf23b
parent 7c66cdaaaf22dae1a9089dba96abf591d1852d78 (diff)
download redmine-1d46be8b0f037e173ba4bf2c68ab0d0c33f1eb48.tar.gz
redmine-1d46be8b0f037e173ba4bf2c68ab0d0c33f1eb48.zip
Fix CSV import file encoding auto-detection failure with multibyte characters (#41464).
Patch by Go MAEDA (user:maeda). git-svn-id: https://svn.redmine.org/redmine/trunk@23150 e93f8b46-1217-0410-a6f0-8f06a7374b81
-rw-r--r-- app/models/import.rb 16
-rw-r--r-- test/fixtures/files/mbcs-multiline-text.txt 17
-rw-r--r-- test/unit/issue_import_test.rb 17
3 files changed, 49 insertions, 1 deletions
diff --git a/app/models/import.rb b/app/models/import.rb
index 94e44c5e2..caf673e9a 100644
--- a/app/models/import.rb
+++ b/app/models/import.rb
@@ -69,7 +69,7 @@ class Import < ApplicationRecord
encoding = lu(user, :general_csv_encoding)
if file_exists?
begin
- content = File.read(filepath, 256)
+ content = read_file_head
separator = [',', ';'].max_by {|sep| content.count(sep)}
wrapper = ['"', "'"].max_by {|quote_char| content.count(quote_char)}
@@ -248,6 +248,20 @@ class Import < ApplicationRecord
private
+ # Returns the head of the file: the whole file when it is at most max_read_bytes
+ # bytes long, otherwise its first max_read_bytes bytes cut after the last LF ('' if no file).
+ def read_file_head(max_read_bytes = 4096)
+ return '' unless file_exists?
+ return File.read(filepath, mode: 'rb') if File.size(filepath) <= max_read_bytes
+
+ # A fixed-size read can stop in the middle of a multi-byte character,
+ # leaving an invalid byte sequence at the end of the chunk. To avoid this, keep only
+ # the part up to and including the last LF; if there is no LF, return the chunk as is.
+ chunk = File.read(filepath, max_read_bytes)
+ last_lf_index = chunk.rindex("\n")
+ last_lf_index ? chunk[..last_lf_index] : chunk
+ end
+
def read_rows
return unless file_exists?
diff --git a/test/fixtures/files/mbcs-multiline-text.txt b/test/fixtures/files/mbcs-multiline-text.txt
new file mode 100644
index 000000000..f847113f2
--- /dev/null
+++ b/test/fixtures/files/mbcs-multiline-text.txt
@@ -0,0 +1,17 @@
+An emoticon is represented by 4 bytes in UTF-8 encoding.
+
+If you simply read the first 4096 bytes of this file, the trailing characters of a multi-byte sequence might be cut off, resulting in an invalid UTF-8 string.
+
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
diff --git a/test/unit/issue_import_test.rb b/test/unit/issue_import_test.rb
index ed913fe44..a47dd6e23 100644
--- a/test/unit/issue_import_test.rb
+++ b/test/unit/issue_import_test.rb
@@ -464,6 +464,23 @@ class IssueImportTest < ActiveSupport::TestCase
end
end
+ def test_encoding_guessing_respects_multibyte_boundaries
+ # Precondition check: a plain read of the first 4096 bytes of the fixture
+ # stops in the middle of a multi-byte character, so the chunk is not
+ # a valid UTF-8 string. The import must still guess UTF-8 for this file.
+ test_file = 'mbcs-multiline-text.txt'
+ chunk = File.read(Rails.root.join('test', 'fixtures', 'files', test_file), 4096)
+ chunk.force_encoding('UTF-8') # => "...😃😄😅\xF0\x9F"
+ assert_not chunk.valid_encoding?
+
+ import = generate_import(test_file)
+ with_settings :repositories_encodings => 'UTF-8,ISO-8859-1' do
+ import.set_default_settings
+ guessed_encoding = import.settings['encoding']
+ assert_equal 'UTF-8', guessed_encoding
+ end
+ end
+
def test_set_default_settings_should_detect_field_wrapper
to_test = {
'import_issues.csv' => '"',