public class StringChunk extends Chunk {
private static final String DEFAULT_ENCODING = "CP1252";
private String encoding7Bit = DEFAULT_ENCODING;
- private String value;
- /** Only kept around for 7 bit strings */
private byte[] rawValue;
+ private String value;
/**
* Creates a String Chunk.
// Re-read the String if we're a 7 bit one
if(type == Types.ASCII_STRING) {
- parseString(rawValue);
+ parseString();
}
}
public void readValue(InputStream value) throws IOException {
- byte[] data = IOUtils.toByteArray(value);
- parseString(data);
+ rawValue = IOUtils.toByteArray(value); // keep the bytes read from the stream in the rawValue field
+ parseString(); // decode the text from rawValue according to the chunk type
}
- private void parseString(byte[] data) {
+ private void parseString() {
String tmpValue;
switch(type) {
case Types.ASCII_STRING:
- tmpValue = parseAs7BitData(data, encoding7Bit);
- this.rawValue = data;
+ tmpValue = parseAs7BitData(rawValue, encoding7Bit);
break;
case Types.UNICODE_STRING:
- tmpValue = StringUtil.getFromUnicodeLE(data);
+ tmpValue = StringUtil.getFromUnicodeLE(rawValue);
break;
default:
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
}
public void writeValue(OutputStream out) throws IOException {
- byte[] data;
-
+ out.write(rawValue); // emit the stored bytes as-is; any encoding happens in storeString()
+ }
+ private void storeString() { // rebuilds rawValue from the current text value, based on the chunk type
switch(type) {
case Types.ASCII_STRING:
try {
- data = value.getBytes(encoding7Bit);
+ rawValue = value.getBytes(encoding7Bit); // encode with the currently configured 7 bit encoding
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("Encoding not found - " + encoding7Bit, e);
}
break;
case Types.UNICODE_STRING:
- data = new byte[value.length()*2];
- StringUtil.putUnicodeLE(value, data, 0);
+ rawValue = new byte[value.length()*2]; // two bytes are reserved per char of the value
+ StringUtil.putUnicodeLE(value, rawValue, 0); // fill the buffer starting at offset 0 (presumably UTF-16LE, going by the helper's name)
break;
default:
throw new IllegalArgumentException("Invalid type " + type + " for String Chunk");
}
-
- out.write(data);
}
+ /**
+ * Returns the text value of the chunk.
+ *
+ * @return the current contents of the value field
+ */
public String getValue() {
return this.value;
}
- public String toString() {
- return this.value;
- }
+
+ /**
+ * Returns the raw bytes held by the chunk.
+ * The internal array itself is handed back, not a copy.
+ *
+ * @return the rawValue array
+ */
+ public byte[] getRawValue() {
+ return rawValue;
+ }
+ /**
+ * Replaces the text value of the chunk, then rebuilds the
+ * raw bytes from it by calling storeString().
+ *
+ * @param str the new text value
+ */
+ public void setValue(String str) {
+ value = str;
+ storeString();
+ }
+
+ /**
+ * Returns the text value of the chunk, i.e. the value field.
+ */
+ @Override
+ public String toString() {
+ return value;
+ }
+
/**
* Parses as non-unicode, supposedly 7 bit CP1252 data
* and returns the resulting string.
==================================================================== */
package org.apache.poi.hsmf.extractor;
+import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.text.SimpleDateFormat;
public OutlookTextExtactor(InputStream inp) throws IOException {
this(new MAPIMessage(inp)); // wrap the stream in a MAPIMessage and delegate to the constructor that accepts one
}
+
+ /**
+ * Command line entry point: for each file named in the arguments,
+ * opens it, extracts its text and prints that to standard output.
+ *
+ * @param args paths of the files to extract text from
+ */
+ public static void main(String[] args) throws Exception {
+ for(int i = 0; i < args.length; i++) {
+ File msgFile = new File(args[i]);
+ NPOIFSFileSystem fs = new NPOIFSFileSystem(msgFile);
+ OutlookTextExtactor extractor = new OutlookTextExtactor(fs);
+ String text = extractor.getText();
+ System.out.println(text);
+ }
+ }
/**
* Returns the underlying MAPI message
MAPIMessage msg = (MAPIMessage)document;
StringBuffer s = new StringBuffer();
+ // See if we can get a suitable encoding for any
+ // non unicode text in the file
+ msg.guess7BitEncoding();
+
+ // Off we go
StringsIterator emails;
try {
emails = new StringsIterator(
// Embedded bits are checked in
// TestExtractorFactory
}
+
+ /**
+ * Extracts the text of the chinese-traditional.msg sample and checks
+ * that both its English and its Chinese parts show up in the output.
+ */
+ public void testEncodings() throws Exception {
+ FileInputStream sampleStream = new FileInputStream(samples.getFile("chinese-traditional.msg"));
+ MAPIMessage chineseMsg = new MAPIMessage(new POIFSFileSystem(sampleStream));
+ String extracted = new OutlookTextExtactor(chineseMsg).getText();
+
+ // The plain English parts must be present
+ assertContains(extracted, "From: Tests Chang@FT");
+ assertContains(extracted, "tests.chang@fengttt.com");
+
+ // ... and so must some of the Chinese parts
+ assertContains(extracted, "(\u5f35\u6bd3\u502b)");
+ assertContains(extracted, "( MSG \u683c\u5f0f\u6e2c\u8a66 )");
+ }
}