Fix bug #49441 - Allow overriding and guessing of HSMF non-unicode string encodings

author Nick Burch <nick@apache.org>

Tue, 3 Aug 2010 16:06:21 +0000 (16:06 +0000)

committer Nick Burch <nick@apache.org>

Tue, 3 Aug 2010 16:06:21 +0000 (16:06 +0000)
author Nick Burch <nick@apache.org>
Tue, 3 Aug 2010 16:06:21 +0000 (16:06 +0000)
committer Nick Burch <nick@apache.org>
Tue, 3 Aug 2010 16:06:21 +0000 (16:06 +0000)
diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml

index 3c6ba2742c4f9d6cb5b88fd3389ef4dc84bc3868..840486d617f969bf5310b5dc597594f55110584b 100644 (file)
--- a/src/documentation/content/xdocs/status.xml
+++ b/src/documentation/content/xdocs/status.xml
@@ -34,6 +34,7 @@
  
      <changes>
          <release version="3.7-beta2" date="2010-??-??">
+           <action dev="POI-DEVELOPERS" type="add">49441 - Allow overriding and guessing of HSMF non-unicode string encodings</action>
             <action dev="POI-DEVELOPERS" type="fix">49689 - Allow the setting of user style names on newly created HSSF cell styles</action>
             <action dev="POI-DEVELOPERS" type="add">Make it easier to tell which content types each POIXMLTextExtractor handles</action>
             <action dev="POI-DEVELOPERS" type="fix">49649 - Added clone support for UserSView* and Feat* families of records</action>
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java

index e04d299b136e944ef459edaf6c5e3d968166ee38..3db17180fd7a1dbd50f25d1ab94741dcd87f2c57 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
@@ -25,10 +25,13 @@ import java.io.OutputStream;
  import java.util.ArrayList;
  import java.util.Arrays;
  import java.util.Calendar;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
  
  import org.apache.poi.POIDocument;
  import org.apache.poi.hsmf.datatypes.AttachmentChunks;
  import org.apache.poi.hsmf.datatypes.AttachmentChunks.AttachmentChunksSorter;
+import org.apache.poi.hsmf.datatypes.Chunk;
  import org.apache.poi.hsmf.datatypes.ChunkGroup;
  import org.apache.poi.hsmf.datatypes.Chunks;
  import org.apache.poi.hsmf.datatypes.NameIdChunks;
@@ -286,10 +289,58 @@ public class MAPIMessage extends POIDocument {
  
        return names;
     }
+   
+   /**
+    * Many messages store their strings as unicode, which is
+    *  nice and easy. Some use one-byte encodings for their
+    *  strings, but don't easily store the encoding anywhere
+    *  in the file!
+    * This method looks through the message headers for a
+    *  Content-Type with a charset. The first one found is set
+    *  on every string chunk; otherwise nothing is changed.
+    * Bug #49441 has more on why this is needed
+    */
+   public void guess7BitEncoding() {
+      try {
+         String[] headers = getHeaders();
+         if(headers == null || headers.length == 0) {
+            return;
+         }
  
+         // The capture stops at any quote, whitespace or ';', so
+         //  parameters after the charset aren't taken as part of it
+         Pattern p = Pattern.compile("Content-Type:.*?charset=[\"']?([^\\s;\"']+)", Pattern.CASE_INSENSITIVE);
+         for(String header : headers) {
+            Matcher m = p.matcher(header);
+            if(m.lookingAt()) {
+               String charset = m.group(1);
+               for(Chunk c : mainChunks.getAll()) {
+                  if(c instanceof StringChunk) {
+                     ((StringChunk)c).set7BitEncoding(charset);
+                  }
+               }
+               for(Chunk c : nameIdChunks.getAll()) {
+                  if(c instanceof StringChunk) {
+                     ((StringChunk)c).set7BitEncoding(charset);
+                  }
+               }
+               for(RecipientChunks rc : recipientChunks) {
+                  for(Chunk c : rc.getAll()) {
+                     if(c instanceof StringChunk) {
+                        ((StringChunk)c).set7BitEncoding(charset);
+                     }
+                  }
+               }
+               return; // The first charset found wins
+            }
+         }
+      } catch(ChunkNotFoundException e) {
+         // Headers couldn't be fetched, so there's nothing to guess from
+      }
+   }
     
     /**
-    * 
+    * Returns all the headers, one entry per line
      */
     public String[] getHeaders() throws ChunkNotFoundException {
        String headers = getStringFromChunk(mainChunks.messageHeaders);
diff --git a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java

index b735058dc7a953cbad7483003a97a16750bb151e..057bb07aad82237ac4d02346c1bf829a12029d5c 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
+++ b/src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
@@ -30,8 +30,8 @@ import org.apache.poi.util.StringUtil;
   * A Chunk made up of a single string.
   */
  public class StringChunk extends Chunk {
-
         private String value;
+       private String encoding7Bit = "CP1252";
  
         /**
          * Creates a String Chunk.
@@ -48,13 +48,33 @@ public class StringChunk extends Chunk {
            super(chunkId, type);
         }
         
+       /**
+        * Gives the name of the encoding currently in use
+        *  for decoding "7 bit" (non unicode) string data.
+        * Unless changed, this is CP1252
+        */
+       public String get7BitEncoding() {
+          return this.encoding7Bit;
+       }
+       
+       /**
+        * Changes the name of the encoding used when decoding
+        *  (and re-encoding) "7 bit" (non unicode) string data.
+        * The file doesn't seem to record this anywhere
+        *  specific, so callers may have to guess it, eg
+        *  from the message headers
+        */
+       public void set7BitEncoding(String encoding) {
+          encoding7Bit = encoding;
+       }
+       
         public void readValue(InputStream value) throws IOException {
        String tmpValue;
        byte[] data = IOUtils.toByteArray(value);
        
            switch(type) {
            case Types.ASCII_STRING:
-             tmpValue = parseAs7BitData(data);
+             tmpValue = parseAs7BitData(data, encoding7Bit);
           break;
            case Types.UNICODE_STRING:
               tmpValue = StringUtil.getFromUnicodeLE(data);
@@ -73,9 +93,9 @@ public class StringChunk extends Chunk {
        switch(type) {
        case Types.ASCII_STRING:
           try {
-            data = value.getBytes("CP1252");
+            data = value.getBytes(encoding7Bit);
           } catch (UnsupportedEncodingException e) {
-            throw new RuntimeException("Core encoding not found, JVM broken?", e);
+            throw new RuntimeException("Encoding not found - " + encoding7Bit, e);
           }
           break;
        case Types.UNICODE_STRING:
@@ -101,10 +121,17 @@ public class StringChunk extends Chunk {
      *  and returns the string that that yields.
      */
     protected static String parseAs7BitData(byte[] data) {
+      return parseAs7BitData(data, "CP1252");
+   }
+   /**
+    * Parses as non-unicode, supposedly 7 bit data using
+    *  the named encoding, and returns the resulting string.
+    */
+   protected static String parseAs7BitData(byte[] data, String encoding) {
        try {
-         return new String(data, "CP1252");
+         return new String(data, encoding);
        } catch (UnsupportedEncodingException e) {
-         throw new RuntimeException("Core encoding not found, JVM broken?", e);
+         throw new RuntimeException("Encoding not found - " + encoding, e);
        }
     }
  }
diff --git a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java

index 18a316491f929854576d177b6cf117f610d1c59f..bb7c87262cdac746901d4b871d11307dae9255d4 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
+++ b/src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
@@ -34,6 +34,7 @@ public final class TestBasics extends TestCase {
     private MAPIMessage outlook30;
     private MAPIMessage attachments;
     private MAPIMessage noRecipientAddress;
+   private MAPIMessage cyrillic;
  
         /**
          * Initialize this test, load up the blank.msg mapi message.
@@ -46,6 +47,7 @@ public final class TestBasics extends TestCase {
        outlook30  = new MAPIMessage(samples.openResourceAsStream("outlook_30_msg.msg"));
        attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
        noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
+      cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg"));
         }
         
         /**
@@ -177,4 +179,21 @@ public final class TestBasics extends TestCase {
        
        noRecipientAddress.setReturnNullOnMissingChunk(false);
         }
+       
+   /**
+    * String chunks decode as CP1252 until told otherwise,
+    *  but the headers can sometimes suggest better.
+    * The person who submitted this file in bug #49441
+    *  says that it is really CP1251
+    */
+   public void testEncoding() throws Exception {
+      assertEquals(2, cyrillic.getRecipientDetailsChunks().length);
+      for(int i=0; i<2; i++) {
+         assertEquals("CP1252", cyrillic.getRecipientDetailsChunks()[i].recipientDisplayNameChunk.get7BitEncoding());
+      }
+      cyrillic.guess7BitEncoding();
+      for(int i=0; i<2; i++) {
+         assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[i].recipientDisplayNameChunk.get7BitEncoding());
+      }
+   }
  }
author	Nick Burch <nick@apache.org>
	Tue, 3 Aug 2010 16:06:21 +0000 (16:06 +0000)
committer	Nick Burch <nick@apache.org>
	Tue, 3 Aug 2010 16:06:21 +0000 (16:06 +0000)
src/documentation/content/xdocs/status.xml		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java		patch \| blob \| history