From 01f14a7548bd357557d0fa18589b8c9de2f11c20 Mon Sep 17 00:00:00 2001 From: PJ Fanning Date: Fri, 15 Jul 2022 22:45:56 +0000 Subject: [github-361] reuse regex pattern instances. Thanks to XenoAmess. This closes #361 git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@1902756 13f79535-47bb-0310-9956-ffa450edef68 --- .../apache/poi/xslf/usermodel/XMLSlideShow.java | 6 +- .../apache/poi/xssf/usermodel/XSSFWorkbook.java | 3 +- .../main/java/org/apache/poi/hsmf/MAPIMessage.java | 1317 ++++++++++---------- 3 files changed, 676 insertions(+), 650 deletions(-) diff --git a/poi-ooxml/src/main/java/org/apache/poi/xslf/usermodel/XMLSlideShow.java b/poi-ooxml/src/main/java/org/apache/poi/xslf/usermodel/XMLSlideShow.java index efdc7f057c..5d861393d2 100644 --- a/poi-ooxml/src/main/java/org/apache/poi/xslf/usermodel/XMLSlideShow.java +++ b/poi-ooxml/src/main/java/org/apache/poi/xslf/usermodel/XMLSlideShow.java @@ -77,6 +77,8 @@ public class XMLSlideShow extends POIXMLDocument //arbitrarily selected; may need to increase private static final int DEFAULT_MAX_RECORD_LENGTH = 1_000_000; private static int MAX_RECORD_LENGTH = DEFAULT_MAX_RECORD_LENGTH; + private static final Pattern GET_ALL_EMBEDDED_PARTS_PATTERN = Pattern.compile("/ppt/embeddings/.*?"); + private static final Pattern GET_PICTURE_DATA_PATTERN = Pattern.compile("/ppt/media/.*?"); private CTPresentation _presentation; private final List _slides = new ArrayList<>(); @@ -222,14 +224,14 @@ public class XMLSlideShow extends POIXMLDocument @Override public List getAllEmbeddedParts() { return Collections.unmodifiableList( - getPackage().getPartsByName(Pattern.compile("/ppt/embeddings/.*?")) + getPackage().getPartsByName(GET_ALL_EMBEDDED_PARTS_PATTERN) ); } @Override public List getPictureData() { if (_pictures.isEmpty()) { - getPackage().getPartsByName(Pattern.compile("/ppt/media/.*?")).forEach(part -> { + getPackage().getPartsByName(GET_PICTURE_DATA_PATTERN).forEach(part -> { XSLFPictureData pd = new XSLFPictureData(part); pd.setIndex(_pictures.size()); _pictures.add(pd); diff --git a/poi-ooxml/src/main/java/org/apache/poi/xssf/usermodel/XSSFWorkbook.java b/poi-ooxml/src/main/java/org/apache/poi/xssf/usermodel/XSSFWorkbook.java index f5ac4e18b6..38ade18345 100644 --- a/poi-ooxml/src/main/java/org/apache/poi/xssf/usermodel/XSSFWorkbook.java +++ b/poi-ooxml/src/main/java/org/apache/poi/xssf/usermodel/XSSFWorkbook.java @@ -107,6 +107,7 @@ import org.openxmlformats.schemas.spreadsheetml.x2006.main.*; */ public class XSSFWorkbook extends POIXMLDocument implements Workbook, Date1904Support { private static final Pattern COMMA_PATTERN = Pattern.compile(","); + private static final Pattern GET_ALL_PICTURES_PATTERN = Pattern.compile("/xl/media/.*?"); /** * Images formats supported by XSSF but not by HSSF @@ -993,7 +994,7 @@ public class XSSFWorkbook extends POIXMLDocument implements Workbook, Date1904Su @Override public List getAllPictures() { if(pictures == null){ - List mediaParts = getPackage().getPartsByName(Pattern.compile("/xl/media/.*?")); + List mediaParts = getPackage().getPartsByName(GET_ALL_PICTURES_PATTERN); pictures = new ArrayList<>(mediaParts.size()); for(PackagePart part : mediaParts){ pictures.add(new XSSFPictureData(part)); diff --git a/poi-scratchpad/src/main/java/org/apache/poi/hsmf/MAPIMessage.java b/poi-scratchpad/src/main/java/org/apache/poi/hsmf/MAPIMessage.java index c71f2a8e9b..5ffa6d0191 100644 --- a/poi-scratchpad/src/main/java/org/apache/poi/hsmf/MAPIMessage.java +++ b/poi-scratchpad/src/main/java/org/apache/poi/hsmf/MAPIMessage.java @@ -58,668 +58,691 @@ import static org.apache.logging.log4j.util.Unbox.box; /** * Reads an Outlook MSG File in and provides hooks into its data structure. - * + *

* If you want to develop with HSMF, you might find it worth getting - * some of the Microsoft public documentation, such as: - * + * some of the Microsoft public documentation, such as: + *

* [MS-OXCMSG]: Message and Attachment Object Protocol Specification */ public class MAPIMessage extends POIReadOnlyDocument { - /** - * A MAPI file can be an email (NOTE) or a number of other types - */ - public enum MESSAGE_CLASS { - APPOINTMENT, - CONTACT, - NOTE, - POST, - STICKY_NOTE, - TASK, - UNKNOWN, - UNSPECIFIED - } - - /** For logging problems we spot with the file */ - private static final Logger LOG = LogManager.getLogger(MAPIMessage.class); + /** + * A MAPI file can be an email (NOTE) or a number of other types + */ + public enum MESSAGE_CLASS { + APPOINTMENT, + CONTACT, + NOTE, + POST, + STICKY_NOTE, + TASK, + UNKNOWN, + UNSPECIFIED + } + + /** + * For logging problems we spot with the file + */ + private static final Logger LOG = LogManager.getLogger(MAPIMessage.class); + + private static final Pattern GUESS_7_BIT_ENCODING_PATTERN = Pattern.compile("content-type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); + private Chunks mainChunks; - private NameIdChunks nameIdChunks; - private RecipientChunks[] recipientChunks; - private AttachmentChunks[] attachmentChunks; - - private boolean returnNullOnMissingChunk; - - /** - * Constructor for creating new files. - */ - public MAPIMessage() { - // TODO - make writing possible - super(new POIFSFileSystem()); - } - - - /** - * Constructor for reading MSG Files from the file system. - * - * @param filename Name of the file to read - * @throws IOException on errors reading, or invalid data - * @throws RuntimeException a number of runtime exceptions can be thrown, especially if there are problems with the - * input format - */ - public MAPIMessage(String filename) throws IOException { - this(new File(filename)); - } - /** - * Constructor for reading MSG Files from the file system. - * - * @param file The file to read from - * @throws IOException on errors reading, or invalid data - * @throws RuntimeException a number of runtime exceptions can be thrown, especially if there are problems with the - * input format - */ - public MAPIMessage(File file) throws IOException { - this(new POIFSFileSystem(file)); - } - - /** - * Constructor for reading MSG Files from an input stream. - * - *

Note - this will buffer the whole message into memory - * in order to process. For lower memory use, use {@link #MAPIMessage(File)} - * - * @param in The InputStream to buffer and then read from - * @throws IOException on errors reading, or invalid data - * @throws RuntimeException a number of runtime exceptions can be thrown, especially if there are problems with the - * input format - */ - public MAPIMessage(InputStream in) throws IOException { - this(new POIFSFileSystem(in)); - } - /** - * Constructor for reading MSG Files from a POIFS filesystem - * - * @param fs Open POIFS FileSystem containing the message - * @throws IOException on errors reading, or invalid data - * @throws RuntimeException a number of runtime exceptions can be thrown, especially if there are problems with the - * input format - */ - public MAPIMessage(POIFSFileSystem fs) throws IOException { - this(fs.getRoot()); - } - /** - * Constructor for reading MSG Files from a certain - * point within a POIFS filesystem - * @param poifsDir Directory containing the message - * @throws IOException on errors reading, or invalid data - * @throws RuntimeException a number of runtime exceptions can be thrown, especially if there are problems with the - * input format - */ - public MAPIMessage(DirectoryNode poifsDir) throws IOException { - super(poifsDir); - - // Grab all the chunks - ChunkGroup[] chunkGroups = POIFSChunkParser.parse(poifsDir); - - // Grab interesting bits - ArrayList attachments = new ArrayList<>(); - ArrayList recipients = new ArrayList<>(); - for(ChunkGroup group : chunkGroups) { - // Should only ever be one of each of these - if(group instanceof Chunks) { - mainChunks = (Chunks)group; - } else if(group instanceof NameIdChunks) { - nameIdChunks = (NameIdChunks)group; - } else if(group instanceof RecipientChunks) { - recipients.add( (RecipientChunks)group ); - } - - // Can be multiple of these - add to list(s) - if(group instanceof AttachmentChunks) { - attachments.add( (AttachmentChunks)group ); - } - } - attachmentChunks = attachments.toArray(new AttachmentChunks[0]); - recipientChunks = recipients.toArray(new RecipientChunks[0]); - - // Now sort these chunks lists so they're in ascending order, - // rather than in random filesystem order - Arrays.sort(attachmentChunks, new AttachmentChunksSorter()); - Arrays.sort(recipientChunks, new RecipientChunksSorter()); - } - - - /** - * Gets a string value based on the passed chunk. - * @throws ChunkNotFoundException if the chunk isn't there - */ - public String getStringFromChunk(StringChunk chunk) throws ChunkNotFoundException { - if(chunk == null) { - if(returnNullOnMissingChunk) { - return null; - } else { - throw new ChunkNotFoundException(); - } - } - return chunk.getValue(); - } - - - /** - * Gets the plain text body of this Outlook Message - * @return The string representation of the 'text' version of the body, if available. - * @throws ChunkNotFoundException If the text-body chunk does not exist and - * returnNullOnMissingChunk is set - */ - public String getTextBody() throws ChunkNotFoundException { - return getStringFromChunk(mainChunks.getTextBodyChunk()); - } - - /** - * Gets the html body of this Outlook Message, if this email - * contains a html version. - * @return The string representation of the 'html' version of the body, if available. - * @throws ChunkNotFoundException If the html-body chunk does not exist and - * returnNullOnMissingChunk is set - */ - public String getHtmlBody() throws ChunkNotFoundException { - ByteChunk htmlBodyBinaryChunk = mainChunks.getHtmlBodyChunkBinary(); - if (htmlBodyBinaryChunk != null) { - List cpid = mainChunks.getProperties().get(MAPIProperty.INTERNET_CPID); - if (cpid != null && !cpid.isEmpty()) { - int codepage = ((LongPropertyValue) cpid.get(0)).getValue(); - try { - String encoding = CodePageUtil.codepageToEncoding(codepage, true); - byte[] htmlBodyBinary = htmlBodyBinaryChunk.getValue(); - return new String(htmlBodyBinary, encoding); - } catch (UnsupportedEncodingException e) { - LOG.atWarn().log("HTML body binary: Invalid codepage ID {} set for the message via {}, ignoring", box(codepage), MAPIProperty.INTERNET_CPID); + private NameIdChunks nameIdChunks; + private RecipientChunks[] recipientChunks; + private AttachmentChunks[] attachmentChunks; + + private boolean returnNullOnMissingChunk; + + /** + * Constructor for creating new files. + */ + public MAPIMessage() { + // TODO - make writing possible + super(new POIFSFileSystem()); + } + + + /** + * Constructor for reading MSG Files from the file system. + * + * @param filename Name of the file to read + * @throws IOException on errors reading, or invalid data + * @throws RuntimeException a number of runtime exceptions can be thrown, especially if there are problems with the + * input format + */ + public MAPIMessage(String filename) throws IOException { + this(new File(filename)); + } + + /** + * Constructor for reading MSG Files from the file system. + * + * @param file The file to read from + * @throws IOException on errors reading, or invalid data + * @throws RuntimeException a number of runtime exceptions can be thrown, especially if there are problems with the + * input format + */ + public MAPIMessage(File file) throws IOException { + this(new POIFSFileSystem(file)); + } + + /** + * Constructor for reading MSG Files from an input stream. + * + *

Note - this will buffer the whole message into memory + * in order to process. For lower memory use, use {@link #MAPIMessage(File)} + * + * @param in The InputStream to buffer and then read from + * @throws IOException on errors reading, or invalid data + * @throws RuntimeException a number of runtime exceptions can be thrown, especially if there are problems with the + * input format + */ + public MAPIMessage(InputStream in) throws IOException { + this(new POIFSFileSystem(in)); + } + + /** + * Constructor for reading MSG Files from a POIFS filesystem + * + * @param fs Open POIFS FileSystem containing the message + * @throws IOException on errors reading, or invalid data + * @throws RuntimeException a number of runtime exceptions can be thrown, especially if there are problems with the + * input format + */ + public MAPIMessage(POIFSFileSystem fs) throws IOException { + this(fs.getRoot()); + } + + /** + * Constructor for reading MSG Files from a certain + * point within a POIFS filesystem + * + * @param poifsDir Directory containing the message + * @throws IOException on errors reading, or invalid data + * @throws RuntimeException a number of runtime exceptions can be thrown, especially if there are problems with the + * input format + */ + public MAPIMessage(DirectoryNode poifsDir) throws IOException { + super(poifsDir); + + // Grab all the chunks + ChunkGroup[] chunkGroups = POIFSChunkParser.parse(poifsDir); + + // Grab interesting bits + ArrayList attachments = new ArrayList<>(); + ArrayList recipients = new ArrayList<>(); + for (ChunkGroup group : chunkGroups) { + // Should only ever be one of each of these + if (group instanceof Chunks) { + mainChunks = (Chunks) group; + } else if (group instanceof NameIdChunks) { + nameIdChunks = (NameIdChunks) group; + } else if (group instanceof RecipientChunks) { + recipients.add((RecipientChunks) group); } - } - return htmlBodyBinaryChunk.getAs7bitString(); - } - return getStringFromChunk(mainChunks.getHtmlBodyChunkString()); - } - - /** - * Gets the RTF Rich Message body of this Outlook Message, if this email - * contains a RTF (rich) version. - * @return The string representation of the 'RTF' version of the body, if available. - * @throws ChunkNotFoundException If the rtf-body chunk does not exist and - * returnNullOnMissingChunk is set - */ - public String getRtfBody() throws ChunkNotFoundException { - ByteChunk chunk = mainChunks.getRtfBodyChunk(); - if(chunk == null) { - if(returnNullOnMissingChunk) { - return null; - } else { - throw new ChunkNotFoundException(); - } - } - - try { - MAPIRtfAttribute rtf = new MAPIRtfAttribute( - MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue() - ); - return rtf.getDataString(); - } catch(IOException e) { - throw new RuntimeException("Shouldn't happen", e); - } - } - - /** - * Gets the subject line of the Outlook Message - * @throws ChunkNotFoundException If the subject-chunk does not exist and - * returnNullOnMissingChunk is set - */ - public String getSubject() throws ChunkNotFoundException { - return getStringFromChunk(mainChunks.getSubjectChunk()); - } - - /** - * Gets the display value of the "FROM" line of the outlook message - * This is not the actual address that was sent from but the formatted display of the user name. - * @throws ChunkNotFoundException If the from-chunk does not exist and - * returnNullOnMissingChunk is set - */ - public String getDisplayFrom() throws ChunkNotFoundException { - return getStringFromChunk(mainChunks.getDisplayFromChunk()); - } - - /** - * Gets the display value of the "TO" line of the outlook message. - * If there are multiple recipients, they will be separated - * by semicolons. - * This is not the actual list of addresses/values that will be - * sent to if you click Reply in the email - those are stored - * in {@link RecipientChunks}. - * @throws ChunkNotFoundException If the to-chunk does not exist and - * returnNullOnMissingChunk is set - */ - public String getDisplayTo() throws ChunkNotFoundException { - return getStringFromChunk(mainChunks.getDisplayToChunk()); - } - - /** - * Gets the display value of the "CC" line of the outlook message. - * If there are multiple recipients, they will be separated - * by semicolons. - * This is not the actual list of addresses/values that will be - * sent to if you click Reply in the email - those are stored - * in {@link RecipientChunks}. - * @throws ChunkNotFoundException If the cc-chunk does not exist and - * returnNullOnMissingChunk is set - */ - public String getDisplayCC() throws ChunkNotFoundException { - return getStringFromChunk(mainChunks.getDisplayCCChunk()); - } - - /** - * Gets the display value of the "BCC" line of the outlook message. - * If there are multiple recipients, they will be separated - * by semicolons. - * This is not the actual list of addresses/values that will be - * sent to if you click Reply in the email - those are stored - * in {@link RecipientChunks}. - * This will only be present in sent emails, not received ones! - * @throws ChunkNotFoundException If the bcc-chunk does not exist and - * returnNullOnMissingChunk is set - */ - public String getDisplayBCC() throws ChunkNotFoundException { - return getStringFromChunk(mainChunks.getDisplayBCCChunk()); - } - - /** - * Returns all the recipients' email address, separated by - * semicolons. Checks all the likely chunks in search of - * the addresses. - */ - public String getRecipientEmailAddress() throws ChunkNotFoundException { - return toSemicolonList(getRecipientEmailAddressList()); - } - /** - * Returns an array of all the recipient's email address, normally - * in TO then CC then BCC order. - * Checks all the likely chunks in search of the addresses. - */ - public String[] getRecipientEmailAddressList() throws ChunkNotFoundException { - if(recipientChunks == null || recipientChunks.length == 0) { - throw new ChunkNotFoundException("No recipients section present"); - } - - String[] emails = new String[recipientChunks.length]; - for(int i=0; iMany messages store their strings as unicode, which is - * nice and easy. Some use one-byte encodings for their - * strings, but don't always store the encoding anywhere - * helpful in the file.

- *

This method checks for codepage properties, and failing that - * looks at the headers for the message, and uses these to - * guess the correct encoding for your file.

- *

Bug #49441 has more on why this is needed

- */ - public void guess7BitEncoding() { - String generalcodepage = null; - String htmlbodycodepage = null; - String bodycodepage = null; - // - // General codepage: Message codepage property. - // - List val = mainChunks.getProperties().get(MAPIProperty.MESSAGE_CODEPAGE); - if (val != null && !val.isEmpty()) { - int codepage = ((LongPropertyValue) val.get(0)).getValue(); - try { - String encoding = CodePageUtil.codepageToEncoding(codepage, true); - generalcodepage = encoding; - } catch (UnsupportedEncodingException e) { - LOG.atWarn().log("Invalid codepage ID {} set for the message via {}, ignoring", box(codepage), MAPIProperty.MESSAGE_CODEPAGE); - } - } - // - // General codepage fallback: Message locale ID property. - // - if (generalcodepage == null) { - val = mainChunks.getProperties().get(MAPIProperty.MESSAGE_LOCALE_ID); - if (val != null && !val.isEmpty()) { - int lcid = ((LongPropertyValue) val.get(0)).getValue(); - int codepage = LocaleUtil.getDefaultCodePageFromLCID(lcid); - try { - if (codepage != 0) { - String encoding = CodePageUtil.codepageToEncoding(codepage, true); - generalcodepage = encoding; - } - } catch (UnsupportedEncodingException e) { - LOG.atWarn().log("Invalid codepage ID {}from locale ID{} set for the message via {}, ignoring", box(codepage),box(lcid), MAPIProperty.MESSAGE_LOCALE_ID); - } - } - } - // - // General codepage fallback: Charset on a content type header. - // - if (generalcodepage == null) { - try { - String[] headers = getHeaders(); - if (headers != null && headers.length > 0) { - Pattern p = Pattern.compile("content-type:.*?charset=[\"']?([^;'\"]+)[\"']?", Pattern.CASE_INSENSITIVE); - for (String header : headers) { - if (header.toLowerCase(LocaleUtil.getUserLocale()).startsWith("content-type")) { - Matcher m = p.matcher(header); - if (m.matches()) { - String encoding = m.group(1); - generalcodepage = encoding; - } - } - } - } - } catch (ChunkNotFoundException e) { - } - } - // - // HTML and text body encoding: Internet CPID property. - // UTF-8 is ignored for text body. This seems to be a special Outlook behavior. - // - val = mainChunks.getProperties().get(MAPIProperty.INTERNET_CPID); - if (val != null && !val.isEmpty()) { - int codepage = ((LongPropertyValue) val.get(0)).getValue(); - try { - String encoding = CodePageUtil.codepageToEncoding(codepage, true); - htmlbodycodepage = encoding; - if (!encoding.equalsIgnoreCase("utf-8")) { - bodycodepage = encoding; - } - } catch (UnsupportedEncodingException e) { - LOG.atWarn().log("Invalid codepage ID {} set for the message via {}, ignoring", box(codepage), MAPIProperty.INTERNET_CPID); - } - } - // - // Apply encoding - // - set7BitEncoding(generalcodepage, htmlbodycodepage, bodycodepage); - } - - /** - * Many messages store their strings as unicode, which is - * nice and easy. Some use one-byte encodings for their - * strings, but don't easily store the encoding anywhere - * in the file! - * If you know what the encoding is of your file, you can - * use this method to set the 7 bit encoding for all - * the non unicode strings in the file. - * @see #guess7BitEncoding() - */ - public void set7BitEncoding(String charset) { - set7BitEncoding(charset, charset, charset); - } - public void set7BitEncoding(String generalcharset, String htmlbodycharset, String bodycharset) { - for(Chunk c : mainChunks.getChunks()) { - if(c instanceof StringChunk) { - if (c.getChunkId() == MAPIProperty.BODY_HTML.id) { - if (htmlbodycharset != null) { - ((StringChunk)c).set7BitEncoding(htmlbodycharset); - } - } - else if (c.getChunkId() == MAPIProperty.BODY.id) { - if (bodycharset != null) { - ((StringChunk)c).set7BitEncoding(bodycharset); - } - } - else if (generalcharset != null) { - ((StringChunk)c).set7BitEncoding(generalcharset); - } - } - } - if (generalcharset != null) { - if (nameIdChunks!=null) { - for(Chunk c : nameIdChunks.getChunks()) { - if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(generalcharset); - } - } } - for(RecipientChunks rc : recipientChunks) { - for(Chunk c : rc.getAll()) { - if(c instanceof StringChunk) { - ((StringChunk)c).set7BitEncoding(generalcharset); - } - } + return chunk.getValue(); + } + + + /** + * Gets the plain text body of this Outlook Message + * + * @return The string representation of the 'text' version of the body, if available. + * @throws ChunkNotFoundException If the text-body chunk does not exist and + * returnNullOnMissingChunk is set + */ + public String getTextBody() throws ChunkNotFoundException { + return getStringFromChunk(mainChunks.getTextBodyChunk()); + } + + /** + * Gets the html body of this Outlook Message, if this email + * contains a html version. + * + * @return The string representation of the 'html' version of the body, if available. + * @throws ChunkNotFoundException If the html-body chunk does not exist and + * returnNullOnMissingChunk is set + */ + public String getHtmlBody() throws ChunkNotFoundException { + ByteChunk htmlBodyBinaryChunk = mainChunks.getHtmlBodyChunkBinary(); + if (htmlBodyBinaryChunk != null) { + List cpid = mainChunks.getProperties().get(MAPIProperty.INTERNET_CPID); + if (cpid != null && !cpid.isEmpty()) { + int codepage = ((LongPropertyValue) cpid.get(0)).getValue(); + try { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + byte[] htmlBodyBinary = htmlBodyBinaryChunk.getValue(); + return new String(htmlBodyBinary, encoding); + } catch (UnsupportedEncodingException e) { + LOG.atWarn().log("HTML body binary: Invalid codepage ID {} set for the message via {}, ignoring", box(codepage), MAPIProperty.INTERNET_CPID); + } + } + return htmlBodyBinaryChunk.getAs7bitString(); } - } - } - - /** - * Does this file contain any strings that - * are stored as 7 bit rather than unicode? - */ - public boolean has7BitEncodingStrings() { - for(Chunk c : mainChunks.getChunks()) { - if(c instanceof StringChunk) { - if( c.getType() == Types.ASCII_STRING ) { - return true; + return getStringFromChunk(mainChunks.getHtmlBodyChunkString()); + } + + /** + * Gets the RTF Rich Message body of this Outlook Message, if this email + * contains a RTF (rich) version. + * + * @return The string representation of the 'RTF' version of the body, if available. + * @throws ChunkNotFoundException If the rtf-body chunk does not exist and + * returnNullOnMissingChunk is set + */ + public String getRtfBody() throws ChunkNotFoundException { + ByteChunk chunk = mainChunks.getRtfBodyChunk(); + if (chunk == null) { + if (returnNullOnMissingChunk) { + return null; + } else { + throw new ChunkNotFoundException(); } - } - } - - if (nameIdChunks!=null) { - for(Chunk c : nameIdChunks.getChunks()) { - if(c instanceof StringChunk) { - if( c.getType() == Types.ASCII_STRING ) { - return true; - } + } + + try { + MAPIRtfAttribute rtf = new MAPIRtfAttribute( + MAPIProperty.RTF_COMPRESSED, Types.BINARY.getId(), chunk.getValue() + ); + return rtf.getDataString(); + } catch (IOException e) { + throw new RuntimeException("Shouldn't happen", e); + } + } + + /** + * Gets the subject line of the Outlook Message + * + * @throws ChunkNotFoundException If the subject-chunk does not exist and + * returnNullOnMissingChunk is set + */ + public String getSubject() throws ChunkNotFoundException { + return getStringFromChunk(mainChunks.getSubjectChunk()); + } + + /** + * Gets the display value of the "FROM" line of the outlook message + * This is not the actual address that was sent from but the formatted display of the user name. + * + * @throws ChunkNotFoundException If the from-chunk does not exist and + * returnNullOnMissingChunk is set + */ + public String getDisplayFrom() throws ChunkNotFoundException { + return getStringFromChunk(mainChunks.getDisplayFromChunk()); + } + + /** + * Gets the display value of the "TO" line of the outlook message. + * If there are multiple recipients, they will be separated + * by semicolons. + * This is not the actual list of addresses/values that will be + * sent to if you click Reply in the email - those are stored + * in {@link RecipientChunks}. + * + * @throws ChunkNotFoundException If the to-chunk does not exist and + * returnNullOnMissingChunk is set + */ + public String getDisplayTo() throws ChunkNotFoundException { + return getStringFromChunk(mainChunks.getDisplayToChunk()); + } + + /** + * Gets the display value of the "CC" line of the outlook message. + * If there are multiple recipients, they will be separated + * by semicolons. + * This is not the actual list of addresses/values that will be + * sent to if you click Reply in the email - those are stored + * in {@link RecipientChunks}. + * + * @throws ChunkNotFoundException If the cc-chunk does not exist and + * returnNullOnMissingChunk is set + */ + public String getDisplayCC() throws ChunkNotFoundException { + return getStringFromChunk(mainChunks.getDisplayCCChunk()); + } + + /** + * Gets the display value of the "BCC" line of the outlook message. + * If there are multiple recipients, they will be separated + * by semicolons. + * This is not the actual list of addresses/values that will be + * sent to if you click Reply in the email - those are stored + * in {@link RecipientChunks}. + * This will only be present in sent emails, not received ones! + * + * @throws ChunkNotFoundException If the bcc-chunk does not exist and + * returnNullOnMissingChunk is set + */ + public String getDisplayBCC() throws ChunkNotFoundException { + return getStringFromChunk(mainChunks.getDisplayBCCChunk()); + } + + /** + * Returns all the recipients' email address, separated by + * semicolons. Checks all the likely chunks in search of + * the addresses. + */ + public String getRecipientEmailAddress() throws ChunkNotFoundException { + return toSemicolonList(getRecipientEmailAddressList()); + } + + /** + * Returns an array of all the recipient's email address, normally + * in TO then CC then BCC order. + * Checks all the likely chunks in search of the addresses. + */ + public String[] getRecipientEmailAddressList() throws ChunkNotFoundException { + if (recipientChunks == null || recipientChunks.length == 0) { + throw new ChunkNotFoundException("No recipients section present"); + } + + String[] emails = new String[recipientChunks.length]; + for (int i = 0; i < emails.length; i++) { + RecipientChunks rc = recipientChunks[i]; + String email = rc.getRecipientEmailAddress(); + if (email != null) { + emails[i] = email; + } else { + if (returnNullOnMissingChunk) { + emails[i] = null; + } else { + throw new ChunkNotFoundException("No email address holding chunks found for the " + (i + 1) + "th recipient"); + } } - } - } - - for(RecipientChunks rc : recipientChunks) { - for(Chunk c : rc.getAll()) { - if(c instanceof StringChunk) { - if( c.getType() == Types.ASCII_STRING ) { - return true; - } + } + + return emails; + } + + + /** + * Returns all the recipients' names, separated by + * semicolons. Checks all the likely chunks in search of + * the names. + * See also {@link #getDisplayTo()}, {@link #getDisplayCC()} + * and {@link #getDisplayBCC()}. + */ + public String getRecipientNames() throws ChunkNotFoundException { + return toSemicolonList(getRecipientNamesList()); + } + + /** + * Returns an array of all the recipient's names, normally + * in TO then CC then BCC order. + * Checks all the likely chunks in search of the names. + * See also {@link #getDisplayTo()}, {@link #getDisplayCC()} + * and {@link #getDisplayBCC()}. + */ + public String[] getRecipientNamesList() throws ChunkNotFoundException { + if (recipientChunks == null || recipientChunks.length == 0) { + throw new ChunkNotFoundException("No recipients section present"); + } + + String[] names = new String[recipientChunks.length]; + for (int i = 0; i < names.length; i++) { + RecipientChunks rc = recipientChunks[i]; + String name = rc.getRecipientName(); + if (name != null) { + names[i] = name; + } else { + throw new ChunkNotFoundException("No display name holding chunks found for the " + (i + 1) + "th recipient"); } - } - } - return false; - } - - /** - * Returns all the headers, one entry per line - */ - public String[] getHeaders() throws ChunkNotFoundException { - String headers = getStringFromChunk(mainChunks.getMessageHeaders()); - if(headers == null) { - return null; - } - return headers.split("\\r?\\n"); - } - - /** - * Gets the conversation topic of the parsed Outlook Message. - * This is the part of the subject line that is after the RE: and FWD: - * @throws ChunkNotFoundException If the conversation-topic chunk does not exist and - * returnNullOnMissingChunk is set - */ - public String getConversationTopic() throws ChunkNotFoundException { - return getStringFromChunk(mainChunks.getConversationTopic()); - } - - /** - * Gets the message class of the parsed Outlook Message. - * (Yes, you can use this to determine if a message is a calendar - * item, note, or actual outlook Message) - * For emails the class will be IPM.Note - * - * @throws ChunkNotFoundException If the message-class chunk does not exist and - * returnNullOnMissingChunk is set - */ - public MESSAGE_CLASS getMessageClassEnum() throws ChunkNotFoundException { - String mc = getStringFromChunk(mainChunks.getMessageClass()); - if (mc == null || mc.trim().length() == 0) { - return MESSAGE_CLASS.UNSPECIFIED; - } else if (mc.equalsIgnoreCase("IPM.Note")) { - return MESSAGE_CLASS.NOTE; - } else if (mc.equalsIgnoreCase("IPM.Contact")) { - return MESSAGE_CLASS.CONTACT; - } else if (mc.equalsIgnoreCase("IPM.Appointment")) { - return MESSAGE_CLASS.APPOINTMENT; - } else if (mc.equalsIgnoreCase("IPM.StickyNote")) { - return MESSAGE_CLASS.STICKY_NOTE; - } else if (mc.equalsIgnoreCase("IPM.Task")) { - return MESSAGE_CLASS.TASK; - } else if (mc.equalsIgnoreCase("IPM.Post")) { - return MESSAGE_CLASS.POST; - } else { - LOG.atWarn().log("I don't recognize message class '{}'. Please open an issue on POI's bugzilla", mc); - return MESSAGE_CLASS.UNKNOWN; - } - } - /** - * Gets the date that the message was accepted by the - * server on. - */ - public Calendar getMessageDate() throws ChunkNotFoundException { - if (mainChunks.getSubmissionChunk() != null) { - return mainChunks.getSubmissionChunk().getAcceptedAtTime(); - } - else { - // Try a few likely suspects... - for (MAPIProperty prop : new MAPIProperty[] { - MAPIProperty.CLIENT_SUBMIT_TIME, MAPIProperty.LAST_MODIFICATION_TIME, - MAPIProperty.CREATION_TIME - }) { - List val = mainChunks.getProperties().get(prop); + } + + return names; + } + + /** + * Tries to identify the correct encoding for 7-bit (non-unicode) + * strings in the file. + *

Many messages store their strings as unicode, which is + * nice and easy. Some use one-byte encodings for their + * strings, but don't always store the encoding anywhere + * helpful in the file.

+ *

This method checks for codepage properties, and failing that + * looks at the headers for the message, and uses these to + * guess the correct encoding for your file.

+ *

Bug #49441 has more on why this is needed

+ */ + public void guess7BitEncoding() { + String generalcodepage = null; + String htmlbodycodepage = null; + String bodycodepage = null; + // + // General codepage: Message codepage property. + // + List val = mainChunks.getProperties().get(MAPIProperty.MESSAGE_CODEPAGE); + if (val != null && !val.isEmpty()) { + int codepage = ((LongPropertyValue) val.get(0)).getValue(); + try { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + generalcodepage = encoding; + } catch (UnsupportedEncodingException e) { + LOG.atWarn().log("Invalid codepage ID {} set for the message via {}, ignoring", box(codepage), MAPIProperty.MESSAGE_CODEPAGE); + } + } + // + // General codepage fallback: Message locale ID property. + // + if (generalcodepage == null) { + val = mainChunks.getProperties().get(MAPIProperty.MESSAGE_LOCALE_ID); if (val != null && !val.isEmpty()) { - return ((TimePropertyValue)val.get(0)).getValue(); + int lcid = ((LongPropertyValue) val.get(0)).getValue(); + int codepage = LocaleUtil.getDefaultCodePageFromLCID(lcid); + try { + if (codepage != 0) { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + generalcodepage = encoding; + } + } catch (UnsupportedEncodingException e) { + LOG.atWarn().log("Invalid codepage ID {}from locale ID{} set for the message via {}, ignoring", box(codepage), box(lcid), MAPIProperty.MESSAGE_LOCALE_ID); + } } - } - } - - if(returnNullOnMissingChunk) - return null; - throw new ChunkNotFoundException(); - } - - - /** - * Gets the main, core details chunks - */ - public Chunks getMainChunks() { - return mainChunks; - } - /** - * Gets all the recipient details chunks. - * These will normally be in the order of: - * * TO recipients, in the order returned by {@link #getDisplayTo()} - * * CC recipients, in the order returned by {@link #getDisplayCC()} - * * BCC recipients, in the order returned by {@link #getDisplayBCC()} - */ - public RecipientChunks[] getRecipientDetailsChunks() { - return recipientChunks; - } - /** - * Gets the Name ID chunks, or - * null if there aren't any - */ - public NameIdChunks getNameIdChunks() { - return nameIdChunks; - } - /** - * Gets the message attachments. - */ - public AttachmentChunks[] getAttachmentFiles() { - return attachmentChunks; - } - - - /** - * Will you get a null on a missing chunk, or a - * {@link ChunkNotFoundException} (default is the - * exception). - */ - public boolean isReturnNullOnMissingChunk() { - return returnNullOnMissingChunk; - } - - /** - * Sets whether on asking for a missing chunk, - * you get back null or a {@link ChunkNotFoundException} - * (default is the exception). - */ - public void setReturnNullOnMissingChunk(boolean returnNullOnMissingChunk) { - this.returnNullOnMissingChunk = returnNullOnMissingChunk; - } - - - private String toSemicolonList(String[] l) { - StringBuilder list = new StringBuilder(); - boolean first = true; - - for(String s : l) { - if(s == null) continue; - if(first) { - first = false; - } else { - list.append("; "); - } - list.append(s); - } - - return list.toString(); - } + } + // + // General codepage fallback: Charset on a content type header. + // + if (generalcodepage == null) { + try { + String[] headers = getHeaders(); + if (headers != null && headers.length > 0) { + for (String header : headers) { + if (header.toLowerCase(LocaleUtil.getUserLocale()).startsWith("content-type")) { + Matcher m = GUESS_7_BIT_ENCODING_PATTERN.matcher(header); + if (m.matches()) { + String encoding = m.group(1); + generalcodepage = encoding; + } + } + } + } + } catch (ChunkNotFoundException e) { + } + } + // + // HTML and text body encoding: Internet CPID property. + // UTF-8 is ignored for text body. This seems to be a special Outlook behavior. + // + val = mainChunks.getProperties().get(MAPIProperty.INTERNET_CPID); + if (val != null && !val.isEmpty()) { + int codepage = ((LongPropertyValue) val.get(0)).getValue(); + try { + String encoding = CodePageUtil.codepageToEncoding(codepage, true); + htmlbodycodepage = encoding; + if (!encoding.equalsIgnoreCase("utf-8")) { + bodycodepage = encoding; + } + } catch (UnsupportedEncodingException e) { + LOG.atWarn().log("Invalid codepage ID {} set for the message via {}, ignoring", box(codepage), MAPIProperty.INTERNET_CPID); + } + } + // + // Apply encoding + // + set7BitEncoding(generalcodepage, htmlbodycodepage, bodycodepage); + } + + /** + * Many messages store their strings as unicode, which is + * nice and easy. Some use one-byte encodings for their + * strings, but don't easily store the encoding anywhere + * in the file! + * If you know what the encoding is of your file, you can + * use this method to set the 7 bit encoding for all + * the non unicode strings in the file. + * + * @see #guess7BitEncoding() + */ + public void set7BitEncoding(String charset) { + set7BitEncoding(charset, charset, charset); + } + + public void set7BitEncoding(String generalcharset, String htmlbodycharset, String bodycharset) { + for (Chunk c : mainChunks.getChunks()) { + if (c instanceof StringChunk) { + if (c.getChunkId() == MAPIProperty.BODY_HTML.id) { + if (htmlbodycharset != null) { + ((StringChunk) c).set7BitEncoding(htmlbodycharset); + } + } else if (c.getChunkId() == MAPIProperty.BODY.id) { + if (bodycharset != null) { + ((StringChunk) c).set7BitEncoding(bodycharset); + } + } else if (generalcharset != null) { + ((StringChunk) c).set7BitEncoding(generalcharset); + } + } + } + if (generalcharset != null) { + if (nameIdChunks != null) { + for (Chunk c : nameIdChunks.getChunks()) { + if (c instanceof StringChunk) { + ((StringChunk) c).set7BitEncoding(generalcharset); + } + } + } + for (RecipientChunks rc : recipientChunks) { + for (Chunk c : rc.getAll()) { + if (c instanceof StringChunk) { + ((StringChunk) c).set7BitEncoding(generalcharset); + } + } + } + } + } + + /** + * Does this file contain any strings that + * are stored as 7 bit rather than unicode? + */ + public boolean has7BitEncodingStrings() { + for (Chunk c : mainChunks.getChunks()) { + if (c instanceof StringChunk) { + if (c.getType() == Types.ASCII_STRING) { + return true; + } + } + } + + if (nameIdChunks != null) { + for (Chunk c : nameIdChunks.getChunks()) { + if (c instanceof StringChunk) { + if (c.getType() == Types.ASCII_STRING) { + return true; + } + } + } + } + + for (RecipientChunks rc : recipientChunks) { + for (Chunk c : rc.getAll()) { + if (c instanceof StringChunk) { + if (c.getType() == Types.ASCII_STRING) { + return true; + } + } + } + } + return false; + } + + /** + * Returns all the headers, one entry per line + */ + public String[] getHeaders() throws ChunkNotFoundException { + String headers = getStringFromChunk(mainChunks.getMessageHeaders()); + if (headers == null) { + return null; + } + return headers.split("\\r?\\n"); + } + + /** + * Gets the conversation topic of the parsed Outlook Message. + * This is the part of the subject line that is after the RE: and FWD: + * + * @throws ChunkNotFoundException If the conversation-topic chunk does not exist and + * returnNullOnMissingChunk is set + */ + public String getConversationTopic() throws ChunkNotFoundException { + return getStringFromChunk(mainChunks.getConversationTopic()); + } + + /** + * Gets the message class of the parsed Outlook Message. + * (Yes, you can use this to determine if a message is a calendar + * item, note, or actual outlook Message) + * For emails the class will be IPM.Note + * + * @throws ChunkNotFoundException If the message-class chunk does not exist and + * returnNullOnMissingChunk is set + */ + public MESSAGE_CLASS getMessageClassEnum() throws ChunkNotFoundException { + String mc = getStringFromChunk(mainChunks.getMessageClass()); + if (mc == null || mc.trim().length() == 0) { + return MESSAGE_CLASS.UNSPECIFIED; + } else if (mc.equalsIgnoreCase("IPM.Note")) { + return MESSAGE_CLASS.NOTE; + } else if (mc.equalsIgnoreCase("IPM.Contact")) { + return MESSAGE_CLASS.CONTACT; + } else if (mc.equalsIgnoreCase("IPM.Appointment")) { + return MESSAGE_CLASS.APPOINTMENT; + } else if (mc.equalsIgnoreCase("IPM.StickyNote")) { + return MESSAGE_CLASS.STICKY_NOTE; + } else if (mc.equalsIgnoreCase("IPM.Task")) { + return MESSAGE_CLASS.TASK; + } else if (mc.equalsIgnoreCase("IPM.Post")) { + return MESSAGE_CLASS.POST; + } else { + LOG.atWarn().log("I don't recognize message class '{}'. Please open an issue on POI's bugzilla", mc); + return MESSAGE_CLASS.UNKNOWN; + } + } + + /** + * Gets the date that the message was accepted by the + * server on. + */ + public Calendar getMessageDate() throws ChunkNotFoundException { + if (mainChunks.getSubmissionChunk() != null) { + return mainChunks.getSubmissionChunk().getAcceptedAtTime(); + } else { + // Try a few likely suspects... + for (MAPIProperty prop : new MAPIProperty[]{ + MAPIProperty.CLIENT_SUBMIT_TIME, MAPIProperty.LAST_MODIFICATION_TIME, + MAPIProperty.CREATION_TIME + }) { + List val = mainChunks.getProperties().get(prop); + if (val != null && !val.isEmpty()) { + return ((TimePropertyValue) val.get(0)).getValue(); + } + } + } + + if (returnNullOnMissingChunk) + return null; + throw new ChunkNotFoundException(); + } + + + /** + * Gets the main, core details chunks + */ + public Chunks getMainChunks() { + return mainChunks; + } + + /** + * Gets all the recipient details chunks. + * These will normally be in the order of: + * * TO recipients, in the order returned by {@link #getDisplayTo()} + * * CC recipients, in the order returned by {@link #getDisplayCC()} + * * BCC recipients, in the order returned by {@link #getDisplayBCC()} + */ + public RecipientChunks[] getRecipientDetailsChunks() { + return recipientChunks; + } + + /** + * Gets the Name ID chunks, or + * null if there aren't any + */ + public NameIdChunks getNameIdChunks() { + return nameIdChunks; + } + + /** + * Gets the message attachments. + */ + public AttachmentChunks[] getAttachmentFiles() { + return attachmentChunks; + } + + + /** + * Will you get a null on a missing chunk, or a + * {@link ChunkNotFoundException} (default is the + * exception). + */ + public boolean isReturnNullOnMissingChunk() { + return returnNullOnMissingChunk; + } + + /** + * Sets whether on asking for a missing chunk, + * you get back null or a {@link ChunkNotFoundException} + * (default is the exception). + */ + public void setReturnNullOnMissingChunk(boolean returnNullOnMissingChunk) { + this.returnNullOnMissingChunk = returnNullOnMissingChunk; + } + + + private String toSemicolonList(String[] l) { + StringBuilder list = new StringBuilder(); + boolean first = true; + + for (String s : l) { + if (s == null) continue; + if (first) { + first = false; + } else { + list.append("; "); + } + list.append(s); + } + + return list.toString(); + } } -- cgit v1.2.3