<action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action>
</release>
<release version="3.1.1-alpha1" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="fix">45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text)</action>
+ <action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
properties. Chances are that you will find here what you need and don't
have to read the other sections.</note>
+ <p>If all you are interested in is getting the textual content of
+ all the document properties, such as for full text indexing, then
+ take a look at
+ <code>org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</code>. However,
+ if you want full access to the properties, please read on!</p>
+
<p>The first thing you should understand is that a Microsoft Office file is
not one large bunch of bytes but has an internal filesystem structure with
files and directories. You can access these files and directories using
</p>
</section>
+ <section><title>Headers and Footers</title>
+ <p>To get at the headers and footers of a Word document, first create a
+<code>org.apache.poi.hwpf.HWPFDocument</code>. Next, you need to create a
+<code>org.apache.poi.hwpf.usermodel.HeaderStories</code>, passing it your
+HWPFDocument. Finally, the HeaderStories gives you access to the headers and
+footers, including first / even / odd page ones if defined in your
+document. Additionally, HeaderStories provides a method for removing
+any macros in the text, which is helpful as many headers and footers
+do end up with macros in them.</p>
+ </section>
+
<section><title>Changing Text</title>
<p>It is possible to change the text via
<code>insertBefore()</code> and <code>insertAfter()</code>
on a <code>Range</code> object (either a <code>Range</code>,
<code>Paragraph</code> or <code>CharacterRun</code>).
- It is also possible to delete a <code>Range</code>, but this
- code is know to have bugs in it.
+ It is also possible to delete a <code>Range</code>.
+ This code will work in many, but not all cases, and patches to
+ improve it are gratefully received!
</p>
</section>
<action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action>
</release>
<release version="3.1.1-alpha1" date="2008-??-??">
+ <action dev="POI-DEVELOPERS" type="fix">45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text)</action>
+ <action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
/**
* Common Parent for OLE2 based Text Extractors
public SummaryInformation getSummaryInformation() {
return document.getSummaryInformation();
}
+
+ /**
+ * Returns an HPSF powered text extractor for the
+ * document properties metadata, such as title and author.
+ *
+ * @return a new {@link HPSFPropertiesExtractor} created from this extractor
+ */
+ public POITextExtractor getMetadataTextExtractor() {
+ return new HPSFPropertiesExtractor(this);
+ }
}
public POITextExtractor(POIDocument document) {
this.document = document;
}
+ /**
+ * Creates a new text extractor, using the same
+ * document as another text extractor. Normally
+ * only used by properties extractors.
+ *
+ * @param otherExtractor the extractor whose document this one will share
+ */
+ protected POITextExtractor(POITextExtractor otherExtractor) {
+ this.document = otherExtractor.document;
+ }
/**
* Retrieves all the text from the document.
* @return All the text from the document
*/
public abstract String getText();
+
+ /**
+ * Returns another text extractor, which is able to
+ * output the textual content of the document
+ * metadata / properties, such as author and title.
+ * Note that some implementations throw an unchecked
+ * exception rather than returning an extractor.
+ *
+ * @return a text extractor for the document metadata
+ */
+ public abstract POITextExtractor getMetadataTextExtractor();
}
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
+import java.util.Set;
import org.apache.poi.hpsf.wellknown.PropertyIDMap;
final CustomProperty cp = new CustomProperty(p, name);
return put(cp);
}
-
+
/**
+ * Returns a read-only set of all the names of our
+ * custom properties.
+ *
+ * @return an unmodifiable view of the custom property names
+ */
+ public Set keySet() {
+     // The key set of a map is a live view which supports removal,
+     //  so wrap it to stop callers changing the name dictionary
+     return java.util.Collections.unmodifiableSet(dictionaryNameToID.keySet());
+ }
+
+
+
+ /**
* <p>Sets the codepage.</p>
*
* @param codepage the codepage
public static final String DEFAULT_STREAM_NAME =
"\005DocumentSummaryInformation";
+ /**
+ * Returns the id to name mapping for the properties
+ * in this set.
+ *
+ * @return the result of {@link PropertyIDMap#getDocumentSummaryInformationProperties()}
+ */
+ public PropertyIDMap getPropertySetIDMap() {
+ return PropertyIDMap.getDocumentSummaryInformationProperties();
+ }
/**
import java.io.OutputStream;
import java.util.List;
+import org.apache.poi.hpsf.wellknown.PropertyIDMap;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
/**
*/
public abstract class SpecialPropertySet extends MutablePropertySet
{
+ /**
+ * The id to name mapping of the properties
+ * in this set.
+ *
+ * @return the {@link PropertyIDMap} used to turn the property ids of this set into names
+ */
+ public abstract PropertyIDMap getPropertySetIDMap();
/**
* <p>The "real" property set <code>SpecialPropertySet</code>
*/
public static final String DEFAULT_STREAM_NAME = "\005SummaryInformation";
+ /**
+ * Returns the id to name mapping for the properties
+ * in this set.
+ *
+ * @return the result of {@link PropertyIDMap#getSummaryInformationProperties()}
+ */
+ public PropertyIDMap getPropertySetIDMap() {
+ return PropertyIDMap.getSummaryInformationProperties();
+ }
/**
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hpsf.extractor;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Iterator;
+
+import org.apache.poi.POIDocument;
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hpsf.CustomProperties;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.Property;
+import org.apache.poi.hpsf.SpecialPropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.wellknown.PropertyIDMap;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Extracts all of the HPSF properties, both
+ *  built in and custom, returning them in
+ *  textual form.
+ */
+public class HPSFPropertiesExtractor extends POITextExtractor {
+    /**
+     * Creates a properties extractor which works on the same
+     *  document as an existing text extractor.
+     *
+     * @param mainExtractor the extractor whose document is shared
+     */
+    public HPSFPropertiesExtractor(POITextExtractor mainExtractor) {
+        super(mainExtractor);
+    }
+    /**
+     * Creates a properties extractor for the given document.
+     *
+     * @param doc the document to read the properties from
+     */
+    public HPSFPropertiesExtractor(POIDocument doc) {
+        super(doc);
+    }
+    /**
+     * Creates a properties extractor directly on a filesystem,
+     *  wrapping it in a properties-only document.
+     *
+     * @param fs the filesystem to read the properties from
+     */
+    public HPSFPropertiesExtractor(POIFSFileSystem fs) {
+        super(new PropertiesOnlyDocument(fs));
+    }
+
+    /**
+     * Returns the text of the document summary information
+     *  properties, followed by any custom properties, as one
+     *  "name = value" line per property.
+     *
+     * @return the properties text, empty if the document has no
+     *  document summary information
+     */
+    public String getDocumentSummaryInformationText() {
+        DocumentSummaryInformation dsi = document.getDocumentSummaryInformation();
+        StringBuffer text = new StringBuffer();
+
+        // Normal properties (copes with the stream being missing)
+        text.append( getPropertiesText(dsi) );
+
+        // Now custom ones. Don't assume that either the stream
+        //  or the custom properties within it are there
+        CustomProperties cps = (dsi == null) ? null : dsi.getCustomProperties();
+        if(cps != null) {
+            Iterator keys = cps.keySet().iterator();
+            while(keys.hasNext()) {
+                String key = (String)keys.next();
+                String val = getPropertyValueText( cps.get(key) );
+                text.append(key).append(" = ").append(val).append("\n");
+            }
+        }
+
+        // All done
+        return text.toString();
+    }
+    /**
+     * Returns the text of the summary information properties,
+     *  as one "name = value" line per property.
+     *
+     * @return the properties text, empty if the document has no
+     *  summary information
+     */
+    public String getSummaryInformationText() {
+        SummaryInformation si = document.getSummaryInformation();
+
+        // Just normal properties
+        return getPropertiesText(si);
+    }
+
+    /**
+     * Turns every property of the set into a "name = value" line.
+     *  The name comes from the id map of the set, falling back to
+     *  the numeric property id when the map has no entry for it.
+     *
+     * @param ps the property set, may be null
+     * @return the text of the properties, empty for a null set
+     */
+    private static String getPropertiesText(SpecialPropertySet ps) {
+        if(ps == null) {
+            // Not defined, oh well
+            return "";
+        }
+
+        StringBuffer text = new StringBuffer();
+
+        PropertyIDMap idMap = ps.getPropertySetIDMap();
+        Property[] props = ps.getProperties();
+        for(int i=0; i<props.length; i++) {
+            String type = Long.toString( props[i].getID() );
+            Object typeObj = idMap.get(props[i].getID());
+            if(typeObj != null) {
+                type = typeObj.toString();
+            }
+
+            String val = getPropertyValueText( props[i].getValue() );
+            text.append(type).append(" = ").append(val).append("\n");
+        }
+
+        return text.toString();
+    }
+    /**
+     * Turns a property value into text. Byte arrays of 1, 2 or 4
+     *  bytes are shown as numbers, other byte arrays are decoded
+     *  as a string, and everything else uses toString().
+     *
+     * @param val the property value, may be null
+     * @return the textual form, "(not set)" for a null value
+     */
+    private static String getPropertyValueText(Object val) {
+        if(val == null) {
+            return "(not set)";
+        }
+        if(val instanceof byte[]) {
+            byte[] b = (byte[])val;
+            if(b.length == 0) {
+                return "";
+            }
+            if(b.length == 1) {
+                return Byte.toString(b[0]);
+            }
+            if(b.length == 2) {
+                return Integer.toString( LittleEndian.getUShort(b) );
+            }
+            if(b.length == 4) {
+                return Long.toString( LittleEndian.getUInt(b) );
+            }
+            // Maybe it's a string? who knows!
+            // NOTE(review): this decodes with the platform default
+            //  charset, so the text can differ between machines
+            return new String(b);
+        }
+        return val.toString();
+    }
+
+    /**
+     * Return the text of all the properties defined in
+     *  the document.
+     *
+     * @return the summary information text followed by the
+     *  document summary information text
+     */
+    public String getText() {
+        return getSummaryInformationText() + getDocumentSummaryInformationText();
+    }
+
+    /**
+     * Prevent recursion!
+     *
+     * @return never returns normally
+     * @throws IllegalStateException always, as this already is
+     *  the metadata text extractor
+     */
+    public POITextExtractor getMetadataTextExtractor() {
+        throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
+    }
+
+    /**
+     * So we can get at the properties of any
+     *  random OLE2 document.
+     */
+    private static class PropertiesOnlyDocument extends POIDocument {
+        private PropertiesOnlyDocument(POIFSFileSystem fs) {
+            super(fs);
+        }
+
+        /**
+         * Not supported, this document type is read only.
+         *
+         * @throws IllegalStateException always
+         */
+        public void write(OutputStream out) throws IOException {
+            throw new IllegalStateException("Unable to write, only for properties!");
+        }
+    }
+}
* Creates a new text extractor for the given document
*/
public POIXMLTextExtractor(POIXMLDocument document) {
- super(null);
+ super((POIDocument)null);
this.document = document;
}
public POIXMLDocument getDocument(){
return document;
}
+
+
+ /**
+ * Would return a text extractor for the document properties
+ * metadata, such as title and author, but this is not
+ * yet implemented for OOXML documents.
+ *
+ * @return never returns normally
+ * @throws UnsupportedOperationException always
+ */
+ public POITextExtractor getMetadataTextExtractor() {
+     // Still a RuntimeException, so existing catch blocks keep working
+     throw new UnsupportedOperationException("Not yet supported for OOXML!");
+ }
}
return ret.toString();
}
+
+ /**
+ * Removes any fields (eg macros, page markers etc)
+ * from the string.
+ * Simply delegates to {@link Range#stripFields(String)}.
+ *
+ * @param text the text to remove the fields from
+ * @return the text as returned by {@link Range#stripFields(String)}
+ */
+ public static String stripFields(String text) {
+ return Range.stripFields(text);
+ }
}
private Range headerStories;
private PlexOfCps plcfHdd;
+ private boolean stripFields = false;
+
public HeaderStories(HWPFDocument doc) {
this.headerStories = doc.getHeaderStoryRange();
FileInformationBlock fib = doc.getFileInformationBlock();
return "";
}
- // Return the contents
- return headerStories.text().substring(prop.getStart(), prop.getEnd());
+ // Grab the contents
+ String text =
+ headerStories.text().substring(prop.getStart(), prop.getEnd());
+
+ // Strip off fields and macros if requested
+ if(stripFields) {
+ return Range.stripFields(text);
+ }
+ return text;
}
public Range getRange() {
protected PlexOfCps getPlcfHdd() {
return plcfHdd;
}
+
+ /**
+ * Are fields currently being stripped from
+ * the text that this {@link HeaderStories} returns?
+ * Default is false, but can be changed
+ * with {@link #setAreFieldsStripped(boolean)}.
+ *
+ * @return true if fields are being stripped, false otherwise
+ */
+ public boolean areFieldsStripped() {
+ return stripFields;
+ }
+ /**
+ * Should fields (eg macros) be stripped from
+ * the text that this class returns?
+ * Default is not to strip.
+ * @param stripFields true to strip fields from the returned text, false to return the text as it is
+ */
+ public void setAreFieldsStripped(boolean stripFields) {
+ this.stripFields = stripFields;
+ }
}
}
return sb.toString();
}
+
+ /**
+ * Removes any fields (eg macros, page markers etc)
+ * from the string.
+ * Normally used to make some text suitable for showing
+ * to humans, and the resultant text should not normally
+ * be saved back into the document!
+ * <p>A field is either 0x13 [contents] 0x15, or
+ * 0x13 [contents] 0x14 [real text] 0x15, and fields can be
+ * nested. The contents are removed, while the real text is
+ * kept, unless the field sits inside the contents of
+ * another field. Several fields one after another are each
+ * handled on their own, so the text between them survives.</p>
+ * <p>If the markers don't pair up properly, the text is
+ * processed on a best-effort basis, and may be returned
+ * unchanged.</p>
+ *
+ * @param text the text to remove the fields from, must not be null
+ * @return the text with the fields removed
+ */
+ public static String stripFields(String text) {
+     // If there are no fields, all easy
+     if(text.indexOf('\u0013') == -1) return text;
+
+     // Properly paired markers can be handled exactly
+     String stripped = stripWellFormedFields(text);
+     if(stripped != null) {
+         return stripped;
+     }
+
+     // Otherwise fall back to guessing, as we always used to
+     return stripMalformedFields(text);
+ }
+
+ /**
+ * Strips the fields from text in which every 0x13 has a
+ * matching 0x15, and every 0x14 belongs to exactly one field.
+ *
+ * @return the stripped text, or null if the markers in the
+ *  text don't pair up properly
+ */
+ private static String stripWellFormedFields(String text) {
+     int length = text.length();
+     StringBuffer kept = new StringBuffer(length);
+
+     // One entry per field that is currently open. True while
+     //  we are in the contents of that field, false once we
+     //  are past its 0x14 and into the real text
+     boolean[] inContents = new boolean[length];
+     int depth = 0;
+     // How many of the open fields are still in their contents.
+     // Text is only kept when none of them are
+     int hiding = 0;
+
+     for(int i=0; i<length; i++) {
+         char c = text.charAt(i);
+         if(c == '\u0013') {
+             inContents[depth++] = true;
+             hiding++;
+         } else if(c == '\u0014') {
+             // Needs an open field which hasn't had a 14 yet
+             if(depth == 0 || !inContents[depth-1]) {
+                 return null;
+             }
+             inContents[depth-1] = false;
+             hiding--;
+         } else if(c == '\u0015') {
+             if(depth == 0) {
+                 return null;
+             }
+             if(inContents[--depth]) {
+                 hiding--;
+             }
+         } else if(hiding == 0) {
+             kept.append(c);
+         }
+     }
+
+     // Anything still open means a 15 was missing
+     return (depth == 0) ? kept.toString() : null;
+ }
+
+ /**
+ * Best-effort stripping for text whose field markers
+ * don't pair up. Works from the first 0x13 and the last
+ * 0x15, and gives up if they're the wrong way around.
+ */
+ private static String stripMalformedFields(String text) {
+     // Loop over until they're all gone
+     // That's when we're out of both 0x13s and 0x15s
+     while( text.indexOf('\u0013') > -1 &&
+             text.indexOf('\u0015') > -1) {
+         int first13 = text.indexOf('\u0013');
+         int next13 = text.indexOf('\u0013', first13+1);
+         int first14 = text.indexOf('\u0014', first13+1);
+         int last15 = text.lastIndexOf('\u0015');
+
+         // If they're the wrong way around, give up
+         if(last15 < first13) {
+             break;
+         }
+
+         // If no more 13s and 14s, just zap
+         if(next13 == -1 && first14 == -1) {
+             text = text.substring(0, first13) +
+                 text.substring(last15+1);
+             break;
+         }
+
+         // If a 14 comes before the next 13, then
+         //  zap from the 13 to the 14, and remove
+         //  the 15
+         if(first14 != -1 && (first14 < next13 || next13 == -1)) {
+             text = text.substring(0, first13) +
+                 text.substring(first14+1, last15) +
+                 text.substring(last15+1);
+             continue;
+         }
+
+         // Another 13 comes before the next 14.
+         // This means there's nested stuff, so we
+         //  can just zap the lot
+         text = text.substring(0, first13) +
+             text.substring(last15+1);
+     }
+
+     return text;
+ }
/**
* Used to get the number of sections in a range. If this range is smaller
private HWPFDocument oddEven;
private HWPFDocument diffFirst;
private HWPFDocument unicode;
+ private HWPFDocument withFields;
protected void setUp() throws Exception {
String dirname = System.getProperty("HWPF.testdata.path");
unicode = new HWPFDocument(
new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
);
+ withFields = new HWPFDocument(
+ new FileInputStream(new File(dirname, "HeaderWithMacros.doc"))
+ );
}
public void testNone() throws Exception {
assertEquals("\r\r", hs.getEvenFooter());
assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
}
+
+ public void testWithFields() throws Exception {
+     final String rawHeader = "HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r";
+     final String strippedHeader = "HEADER GOES HERE. 8/12/2008 Eric Roch\r\r\r";
+
+     HeaderStories stories = new HeaderStories(withFields);
+
+     // Stripping is off to start with, so the field markers show up
+     assertFalse(stories.areFieldsStripped());
+     assertEquals(rawHeader, stories.getOddHeader());
+
+     // Once stripping is on, only the field's real text is left
+     stories.setAreFieldsStripped(true);
+     assertEquals(strippedHeader, stories.getOddHeader());
+ }
}
import java.io.File;
import java.io.FileInputStream;
-import java.io.FileOutputStream;
import junit.framework.TestCase;
--- /dev/null
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.poi.hwpf.usermodel;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for the parts of Range which have nothing to do
+ *  with deletion, insertion, text replacement or textual
+ *  contents
+ */
+public class TestRange extends TestCase {
+    /**
+     * Checks field stripping on plain text, on single, 0x14
+     *  separated and nested fields, and on text whose markers
+     *  are the wrong way around.
+     */
+    public void testFieldStripping() throws Exception {
+        String exp = "This is some text.";
+
+        // All of these should come out as the expected text
+        String[] strippable = new String[] {
+            exp,
+            "This is some \u0013Blah!\u0015text.",
+            "This is \u0013Blah!\u0014some\u0015 text.",
+            "This is \u0013Blah!\u0013Blah!\u0015\u0015some text.",
+            "This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text.",
+            "This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text."
+        };
+        for(int i=0; i<strippable.length; i++) {
+            assertEquals(exp, Range.stripFields(strippable[i]));
+        }
+
+        // Ones that are odd and we won't change
+        String[] odd = new String[] {
+            "This\u0015 is \u0013 odd",
+            "This\u0015 is \u0014 also \u0013 odd"
+        };
+        for(int i=0; i<odd.length; i++) {
+            assertEquals(odd[i], Range.stripFields(odd[i]));
+        }
+    }
+}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hpsf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests that {@link HPSFPropertiesExtractor} turns the built in
+ *  and custom properties of the sample files into text.
+ */
+public class TestHPSFPropertiesExtractor extends TestCase {
+    private String dir;
+
+    protected void setUp() throws Exception {
+        dir = System.getProperty("HPSF.testdata.path");
+        assertNotNull("HPSF.testdata.path not set", dir);
+    }
+
+    /**
+     * Opens one of the test files as a filesystem, making sure
+     *  the underlying file stream is closed again afterwards.
+     */
+    private POIFSFileSystem openFileSystem(String fileName) throws Exception {
+        FileInputStream in = new FileInputStream(new File(dir, fileName));
+        try {
+            return new POIFSFileSystem(in);
+        } finally {
+            in.close();
+        }
+    }
+
+    /**
+     * Fails, showing the text that was searched, unless the
+     *  expected snippet occurs somewhere in the text.
+     */
+    private static void assertContains(String text, String expected) {
+        assertTrue(
+                "Expected to find '" + expected + "' in:\n" + text,
+                text.indexOf(expected) > -1
+        );
+    }
+
+    public void testNormalProperties() throws Exception {
+        POIFSFileSystem fs = openFileSystem("TestMickey.doc");
+        HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
+        // Result not needed, just check it doesn't blow up
+        ext.getText();
+
+        // Check each bit in turn
+        String sinfText = ext.getSummaryInformationText();
+        String dinfText = ext.getDocumentSummaryInformationText();
+
+        assertContains(sinfText, "TEMPLATE = Normal");
+        assertContains(sinfText, "SUBJECT = sample subject");
+        assertContains(dinfText, "MANAGER = sample manager");
+        assertContains(dinfText, "COMPANY = sample company");
+
+        // Now overall
+        String text = ext.getText();
+        assertContains(text, "TEMPLATE = Normal");
+        assertContains(text, "SUBJECT = sample subject");
+        assertContains(text, "MANAGER = sample manager");
+        assertContains(text, "COMPANY = sample company");
+    }
+    public void testNormalUnicodeProperties() throws Exception {
+        POIFSFileSystem fs = openFileSystem("TestUnicode.xls");
+        HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
+        // Result not needed, just check it doesn't blow up
+        ext.getText();
+
+        // Check each bit in turn
+        String sinfText = ext.getSummaryInformationText();
+        String dinfText = ext.getDocumentSummaryInformationText();
+
+        assertContains(sinfText, "AUTHOR = marshall");
+        assertContains(sinfText, "TITLE = Titel: \u00c4h");
+        assertContains(dinfText, "COMPANY = Schreiner");
+        assertContains(dinfText, "SCALE = false");
+
+        // Now overall
+        String text = ext.getText();
+        assertContains(text, "AUTHOR = marshall");
+        assertContains(text, "TITLE = Titel: \u00c4h");
+        assertContains(text, "COMPANY = Schreiner");
+        assertContains(text, "SCALE = false");
+    }
+    public void testCustomProperties() throws Exception {
+        POIFSFileSystem fs = openFileSystem("TestMickey.doc");
+        HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
+
+        // Custom properties are part of the document info stream
+        String dinfText = ext.getDocumentSummaryInformationText();
+        assertContains(dinfText, "Client = sample client");
+        assertContains(dinfText, "Division = sample division");
+
+        String text = ext.getText();
+        assertContains(text, "Client = sample client");
+        assertContains(text, "Division = sample division");
+    }
+
+    public void testConstructors() throws Exception {
+        POIFSFileSystem fs = openFileSystem("TestUnicode.xls");
+        HSSFWorkbook wb = new HSSFWorkbook(fs);
+        ExcelExtractor excelExt = new ExcelExtractor(wb);
+
+        // All three ways in should give the same text
+        String fsText = (new HPSFPropertiesExtractor(fs)).getText();
+        String hwText = (new HPSFPropertiesExtractor(wb)).getText();
+        String eeText = (new HPSFPropertiesExtractor(excelExt)).getText();
+
+        assertEquals(fsText, hwText);
+        assertEquals(fsText, eeText);
+
+        assertContains(fsText, "AUTHOR = marshall");
+        assertContains(fsText, "TITLE = Titel: \u00c4h");
+    }
+}