source.dussan.org Git - redmine.git/commitdiff
Fix CSV import file encoding auto-detection failure with multibyte characters (#41464).
author Go MAEDA <maeda@farend.jp>
Sun, 20 Oct 2024 06:47:28 +0000 (06:47 +0000)
committer Go MAEDA <maeda@farend.jp>
Sun, 20 Oct 2024 06:47:28 +0000 (06:47 +0000)
Patch by Go MAEDA (user:maeda).

git-svn-id: https://svn.redmine.org/redmine/trunk@23150 e93f8b46-1217-0410-a6f0-8f06a7374b81

app/models/import.rb
test/fixtures/files/mbcs-multiline-text.txt [new file with mode: 0644]
test/unit/issue_import_test.rb

index 94e44c5e24ac57817f3962f6056de790b15493cd..caf673e9aa5eac2078e1e94b8c6b46c149c1ea64 100644 (file)
@@ -69,7 +69,7 @@ class Import < ApplicationRecord
     encoding = lu(user, :general_csv_encoding)
     if file_exists?
       begin
-        content = File.read(filepath, 256)
+        content = read_file_head
 
         separator = [',', ';'].max_by {|sep| content.count(sep)}
         wrapper = ['"', "'"].max_by {|quote_char| content.count(quote_char)}
@@ -248,6 +248,20 @@ class Import < ApplicationRecord
 
   private
 
+  # Returns '' unless file_exists?; the whole file if its size is at most
+  # max_read_bytes; otherwise the first max_read_bytes bytes (see below).
+  def read_file_head(max_read_bytes = 4096)
+    return '' unless file_exists?
+    return File.read(filepath, mode: 'rb') if File.size(filepath) <= max_read_bytes
+
+    # The chunk may end in the middle of a multi-byte character, which would
+    # be an invalid byte sequence. Keep only the bytes up to and including
+    # the last LF; with no LF in the chunk, it is returned unchanged.
+    chunk = File.read(filepath, max_read_bytes)
+    last_lf_index = chunk.rindex("\n")
+    last_lf_index ? chunk[..last_lf_index] : chunk
+  end
+
   def read_rows
     return unless file_exists?
 
diff --git a/test/fixtures/files/mbcs-multiline-text.txt b/test/fixtures/files/mbcs-multiline-text.txt
new file mode 100644 (file)
index 0000000..f847113
--- /dev/null
@@ -0,0 +1,17 @@
+An emoticon is represented by 4 bytes in UTF-8 encoding.
+
+If you simply read the first 4096 bytes of this file, the trailing characters of a multi-byte sequence might be cut off, resulting in an invalid UTF-8 string.
+
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
+😀😁😂😃😄😅😆😇😈😉😊😋😌😍😎😏😐😑😒😓😔😕😖😗😘😙😚😛😜😝😞😟😠😡😢😣😤😥😦😧😨😩😪😫😬😭😮😯😰😱😲😳😴😵😶😷😸😹😺😻😼😽😾😿🙀🙁🙂🙃🙄🙅🙆🙇🙈🙉🙊🙋🙌🙍🙎🙏
index ed913fe44606cdb3eb4fe4a124aeec59e88352eb..a47dd6e23a9fdf4e3715dec9004730cd9eee0368 100644 (file)
@@ -464,6 +464,23 @@ class IssueImportTest < ActiveSupport::TestCase
     end
   end
 
+  def test_encoding_guessing_respects_multibyte_boundaries
+    # Precondition: the first 4096 bytes of this fixture stop in the middle
+    # of a multi-byte character, so the raw chunk is not valid UTF-8. The
+    # guessed encoding is nevertheless expected to be UTF-8.
+    test_file = 'mbcs-multiline-text.txt'
+    chunk = File.read(Rails.root.join('test', 'fixtures', 'files', test_file), 4096)
+    chunk.force_encoding('UTF-8') # => "...😃😄😅\xF0\x9F"
+    assert_not chunk.valid_encoding?
+
+    import = generate_import(test_file)
+    with_settings :repositories_encodings => 'UTF-8,ISO-8859-1' do
+      import.set_default_settings
+      guessed_encoding = import.settings['encoding']
+      assert_equal 'UTF-8', guessed_encoding
+    end
+  end
+
   def test_set_default_settings_should_detect_field_wrapper
     to_test = {
       'import_issues.csv' => '"',