]> source.dussan.org Git - poi.git/commitdiff
Improve HSMF encoding guessing for 7 bit fields, and allow HSMF access to the HTML...
authorNick Burch <nick@apache.org>
Fri, 1 Apr 2011 14:51:45 +0000 (14:51 +0000)
committerNick Burch <nick@apache.org>
Fri, 1 Apr 2011 14:51:45 +0000 (14:51 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1087726 13f79535-47bb-0310-9956-ffa450edef68

src/documentation/content/xdocs/status.xml
src/scratchpad/src/org/apache/poi/hsmf/MAPIMessage.java
src/scratchpad/src/org/apache/poi/hsmf/datatypes/Chunks.java
src/scratchpad/src/org/apache/poi/hsmf/datatypes/StringChunk.java
src/scratchpad/testcases/org/apache/poi/hsmf/TestBasics.java
test-data/hsmf/chinese-traditional.msg [new file with mode: 0644]

index 2730545cceef510fde4a86a1995fe9f1a46bfbfb..c619413f2025fec16cf48aeab05b6c5ab4bbe535 100644 (file)
     </developers>
 
     <changes>
+        <release version="3.8-beta3" date="2011-??-??">
+           <action dev="poi-developers" type="add">Improve HSMF encoding guessing for 7 bit fields in MAPIMessage</action>
+           <action dev="poi-developers" type="add">Allow HSMF access to the HTML body contents in MAPIMessage</action>
+        </release>
         <release version="3.8-beta2" date="2011-??-??">
            <action dev="poi-developers" type="add">Implement the load method on MemoryPackagePart</action>
            <action dev="poi-developers" type="add">50967 - Support for continued ExtSSTRecords</action>
index f220ebfe5581ff906ce2add14408c0156e5f554b..e9cc82d14d134b2cb416eadb3786fd49fd93d3c6 100644 (file)
@@ -176,6 +176,16 @@ public class MAPIMessage extends POIDocument {
       return getStringFromChunk(mainChunks.textBodyChunk);
    }
 
+   /**
+    * Gets the HTML body of this Outlook Message, if this email
+    *  contains an HTML version. The text is read from the
+    *  HTML body chunk held in the main chunks.
+    * Note that the method name is spelled "Hmtl"; it is kept as-is
+    *  because existing callers use that name.
+    * @return The string representation of the HTML version of the body, if available.
+    * @throws ChunkNotFoundException presumably when the message has
+    *  no HTML body chunk - TODO confirm against getStringFromChunk
+    */
+   public String getHmtlBody() throws ChunkNotFoundException {
+      return getStringFromChunk(mainChunks.htmlBodyChunk);
+   }
+
    /**
     * Gets the subject line of the Outlook Message
     * @throws ChunkNotFoundException
@@ -331,28 +341,59 @@ public class MAPIMessage extends POIDocument {
                if(m.matches()) {
                   // Found it! Tell all the string chunks
                   String charset = m.group(1);
-                  
-                  for(Chunk c : mainChunks.getAll()) {
-                     if(c instanceof StringChunk) {
-                        ((StringChunk)c).set7BitEncoding(charset);
-                     }
-                  }
-                  for(Chunk c : nameIdChunks.getAll()) {
-                     if(c instanceof StringChunk) {
-                        ((StringChunk)c).set7BitEncoding(charset);
-                     }
-                  }
-                  for(RecipientChunks rc : recipientChunks) {
-                     for(Chunk c : rc.getAll()) {
-                        if(c instanceof StringChunk) {
-                           ((StringChunk)c).set7BitEncoding(charset);
-                        }
-                     }
-                  }
+                  set7BitEncoding(charset);
+                  return;
                }
             }
          }
       } catch(ChunkNotFoundException e) {}
+      
+      // Nothing suitable in the headers, try HTML
+      try {
+         String html = getHmtlBody();
+         
+         // Look for a content type in the meta headers
+         Pattern p = Pattern.compile(
+               "<META\\s+HTTP-EQUIV=\"Content-Type\"\\s+CONTENT=\"text/html;\\s+charset=(.*?)\""
+         );
+         Matcher m = p.matcher(html);
+         if(m.find()) {
+            // Found it! Tell all the string chunks
+            String charset = m.group(1);
+            set7BitEncoding(charset);
+            return;
+         }
+      } catch(ChunkNotFoundException e) {}
+   }
+
+   /**
+    * Many messages store their strings as unicode, which is
+    *  nice and easy. Some use one-byte encodings for their
+    *  strings, but don't easily store the encoding anywhere
+    *  in the file!
+    * If you know what the encoding of your file is, you can
+    *  use this method to set the 7 bit encoding for all
+    *  the non unicode strings in the file.
+    * The charset is passed on to every StringChunk found in the
+    *  main chunks, the name id chunks and each set of
+    *  recipient chunks; chunks of any other kind are left alone.
+    * @param charset the name of the encoding to hand to each StringChunk
+    * @see #guess7BitEncoding()
+    */
+   public void set7BitEncoding(String charset) {
+      for(Chunk c : mainChunks.getAll()) {
+         if(c instanceof StringChunk) {
+            ((StringChunk)c).set7BitEncoding(charset);
+         }
+      }
+      for(Chunk c : nameIdChunks.getAll()) {
+         if(c instanceof StringChunk) {
+            ((StringChunk)c).set7BitEncoding(charset);
+         }
+      }
+      for(RecipientChunks rc : recipientChunks) {
+         for(Chunk c : rc.getAll()) {
+            if(c instanceof StringChunk) {
+               ((StringChunk)c).set7BitEncoding(charset);
+            }
+         }
+      }
+   }
    
    /**
index 08b7e98988a75a7d5784f0202405f631ece73805..d9a060fe4ad0df56d614b6b62d7ca794daf2aa1e 100644 (file)
@@ -37,6 +37,8 @@ public final class Chunks implements ChunkGroup {
    public StringChunk messageClass;
    /** BODY Chunk, for plain/text messages */
    public StringChunk textBodyChunk;
+   /** BODY Html Chunk, for html messages */
+   public StringChunk htmlBodyChunk;
    /** Subject line chunk, in plain/text */
    public StringChunk subjectChunk;
    /** Value that is in the TO field (not actually the addresses, as they are stored in recip directory nodes) */
@@ -117,6 +119,10 @@ public final class Chunks implements ChunkGroup {
       else if(chunk.getChunkId() == MAPIProperty.BODY.id) {
          textBodyChunk = (StringChunk)chunk;
       }
+      else if(chunk.getChunkId() == MAPIProperty.BODY_HTML.id && 
+              chunk instanceof StringChunk) {
+         htmlBodyChunk = (StringChunk)chunk;
+      }
       
       // And add to the main list
       allChunks.add(chunk);
index 057bb07aad82237ac4d02346c1bf829a12029d5c..19a2f13efafe6d510e1bb7b3e66d37fca5c3ee24 100644 (file)
@@ -30,8 +30,11 @@ import org.apache.poi.util.StringUtil;
  * A Chunk made up of a single string.
  */
 public class StringChunk extends Chunk {
-       private String value;
-       private String encoding7Bit = "CP1252";
+   private static final String DEFAULT_ENCODING = "CP1252"; 
+   private String encoding7Bit = DEFAULT_ENCODING;
+   private String value;
+   /** Only kept around for 7 bit strings */
+   private byte[] rawValue;
 
        /**
         * Creates a String Chunk.
@@ -56,7 +59,7 @@ public class StringChunk extends Chunk {
        public String get7BitEncoding() {
           return encoding7Bit;
        }
-       
+
        /**
         * Sets the Encoding that will be used to
         *  decode any "7 bit" (non unicode) data.
@@ -66,25 +69,33 @@ public class StringChunk extends Chunk {
         */
        public void set7BitEncoding(String encoding) {
           this.encoding7Bit = encoding;
+
+          // Re-read the String if we're a 7 bit one
+          if(type == Types.ASCII_STRING) {
+             parseString(rawValue);
+          }
        }
-       
+
        public void readValue(InputStream value) throws IOException {
-      String tmpValue;
-      byte[] data = IOUtils.toByteArray(value);
-      
+          byte[] data = IOUtils.toByteArray(value);
+          parseString(data);
+       }
+       private void parseString(byte[] data) {
+          String tmpValue;
           switch(type) {
           case Types.ASCII_STRING:
              tmpValue = parseAs7BitData(data, encoding7Bit);
-         break;
+             this.rawValue = data;
+             break;
           case Types.UNICODE_STRING:
              tmpValue = StringUtil.getFromUnicodeLE(data);
              break;
           default:
              throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
           }
-          
+
           // Clean up
-               this.value = tmpValue.replace("\0", "");
+          this.value = tmpValue.replace("\0", "");
        }
        
        public void writeValue(OutputStream out) throws IOException {
@@ -121,7 +132,7 @@ public class StringChunk extends Chunk {
     *  and returns the string that that yields.
     */
    protected static String parseAs7BitData(byte[] data) {
-      return parseAs7BitData(data, "CP1252");
+      return parseAs7BitData(data, DEFAULT_ENCODING);
    }
    /**
     * Parses as non-unicode, supposedly 7 bit data
index bb7c87262cdac746901d4b871d11307dae9255d4..e2f6fe39d4d2939fe15f5acc75944c12bb18e3c5 100644 (file)
@@ -35,6 +35,7 @@ public final class TestBasics extends TestCase {
    private MAPIMessage attachments;
    private MAPIMessage noRecipientAddress;
    private MAPIMessage cyrillic;
+   private MAPIMessage chinese;
 
        /**
         * Initialize this test, load up the blank.msg mapi message.
@@ -48,6 +49,7 @@ public final class TestBasics extends TestCase {
       attachments = new MAPIMessage(samples.openResourceAsStream("attachment_test_msg.msg"));
       noRecipientAddress = new MAPIMessage(samples.openResourceAsStream("no_recipient_address.msg"));
       cyrillic = new MAPIMessage(samples.openResourceAsStream("cyrillic_message.msg"));
+      chinese = new MAPIMessage(samples.openResourceAsStream("chinese-traditional.msg"));
        }
        
        /**
@@ -195,5 +197,27 @@ public final class TestBasics extends TestCase {
       
       assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
       assertEquals("Cp1251", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
+      
+      // Override it, check it's taken
+      cyrillic.set7BitEncoding("UTF-8");
+      assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
+      assertEquals("UTF-8", cyrillic.getRecipientDetailsChunks()[1].recipientDisplayNameChunk.get7BitEncoding());
+      
+      
+      // Check with a file that has no headers
+      try {
+         chinese.getHeaders();
+         fail("File doesn't have headers!");
+      } catch(ChunkNotFoundException e) {}
+      
+      String html = chinese.getHmtlBody();
+      assertTrue("Charset not found:\n" + html, html.contains("text/html; charset=big5"));
+      
+      // Defaults to CP1252
+      assertEquals("CP1252", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
+      
+      // But after guessing goes to the correct one, Big 5
+      chinese.guess7BitEncoding();
+      assertEquals("big5", chinese.getRecipientDetailsChunks()[0].recipientDisplayNameChunk.get7BitEncoding());
    }
 }
diff --git a/test-data/hsmf/chinese-traditional.msg b/test-data/hsmf/chinese-traditional.msg
new file mode 100644 (file)
index 0000000..c2b84c0
Binary files /dev/null and b/test-data/hsmf/chinese-traditional.msg differ