<!-- Don't forget to update status.xml too! -->
<release version="3.1.1-alpha1" date="2008-??-??">
- <action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</action>
+ <action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
<p>If all you are interested in is getting the textual content of
all the document properties, such as for full text indexing, then
take a look at
- <code>org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</code>. However,
+ <code>org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</code>. However,
if you want full access to the properties, please read on!</p>
<p>The first thing you should understand is that a Microsoft Office file is
<!-- Don't forget to update changes.xml too! -->
<changes>
<release version="3.1.1-alpha1" date="2008-??-??">
- <action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor</action>
+ <action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
<action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
<action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
<action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
/**
* Common Parent for OLE2 based Text Extractors
public SummaryInformation getSummaryInformation() {
return document.getSummaryInformation();
}
+
+ /**
+ * Returns an HPSF powered text extractor for the
+ * document properties metadata, such as title and author.
+ */
+ public POITextExtractor getMetadataTextExtractor() {
+ return new HPSFPropertiesExtractor(this);
+ }
}
* @return All the text from the document
*/
public abstract String getText();
+
+ /**
+ * Returns another text extractor, which is able to
+ * output the textual content of the document
+ * metadata / properties, such as author and title.
+ *
+ * @return a text extractor for the document metadata / properties
+ * @throws IllegalStateException may be thrown by an extractor which is
+ * already a metadata extractor (HPSFPropertiesExtractor does this
+ * to prevent recursion)
+ */
+ public abstract POITextExtractor getMetadataTextExtractor();
}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hpsf.extractor;
-
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.Iterator;
-
-import org.apache.poi.POIDocument;
-import org.apache.poi.POITextExtractor;
-import org.apache.poi.hpsf.CustomProperties;
-import org.apache.poi.hpsf.DocumentSummaryInformation;
-import org.apache.poi.hpsf.Property;
-import org.apache.poi.hpsf.SpecialPropertySet;
-import org.apache.poi.hpsf.SummaryInformation;
-import org.apache.poi.hpsf.wellknown.PropertyIDMap;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.LittleEndian;
-
-/**
- * Extracts all of the HPSF properties, both
- * build in and custom, returning them in
- * textual form.
- */
-public class HPFSPropertiesExtractor extends POITextExtractor {
- public HPFSPropertiesExtractor(POITextExtractor mainExtractor) {
- super(mainExtractor);
- }
- public HPFSPropertiesExtractor(POIDocument doc) {
- super(doc);
- }
- public HPFSPropertiesExtractor(POIFSFileSystem fs) {
- super(new PropertiesOnlyDocument(fs));
- }
-
- public String getDocumentSummaryInformationText() {
- DocumentSummaryInformation dsi = document.getDocumentSummaryInformation();
- StringBuffer text = new StringBuffer();
-
- // Normal properties
- text.append( getPropertiesText(dsi) );
-
- // Now custom ones
- CustomProperties cps = dsi.getCustomProperties();
- Iterator keys = cps.keySet().iterator();
- while(keys.hasNext()) {
- String key = (String)keys.next();
- String val = getPropertyValueText( cps.get(key) );
- text.append(key + " = " + val + "\n");
- }
-
- // All done
- return text.toString();
- }
- public String getSummaryInformationText() {
- SummaryInformation si = document.getSummaryInformation();
-
- // Just normal properties
- return getPropertiesText(si);
- }
-
- private static String getPropertiesText(SpecialPropertySet ps) {
- if(ps == null) {
- // Not defined, oh well
- return "";
- }
-
- StringBuffer text = new StringBuffer();
-
- PropertyIDMap idMap = ps.getPropertySetIDMap();
- Property[] props = ps.getProperties();
- for(int i=0; i<props.length; i++) {
- String type = Long.toString( props[i].getID() );
- Object typeObj = idMap.get(props[i].getID());
- if(typeObj != null) {
- type = typeObj.toString();
- }
-
- String val = getPropertyValueText( props[i].getValue() );
- text.append(type + " = " + val + "\n");
- }
-
- return text.toString();
- }
- private static String getPropertyValueText(Object val) {
- if(val == null) {
- return "(not set)";
- }
- if(val instanceof byte[]) {
- byte[] b = (byte[])val;
- if(b.length == 0) {
- return "";
- }
- if(b.length == 1) {
- return Byte.toString(b[0]);
- }
- if(b.length == 2) {
- return Integer.toString( LittleEndian.getUShort(b) );
- }
- if(b.length == 4) {
- return Long.toString( LittleEndian.getUInt(b) );
- }
- // Maybe it's a string? who knows!
- return new String(b);
- }
- return val.toString();
- }
-
- /**
- * Return the text of all the properties defined in
- * the document.
- */
- public String getText() {
- return getSummaryInformationText() + getDocumentSummaryInformationText();
- }
-
- /**
- * So we can get at the properties of any
- * random OLE2 document.
- */
- private static class PropertiesOnlyDocument extends POIDocument {
- private PropertiesOnlyDocument(POIFSFileSystem fs) {
- super(fs);
- }
-
- public void write(OutputStream out) throws IOException {
- throw new IllegalStateException("Unable to write, only for properties!");
- }
- }
-}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hpsf.extractor;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Iterator;
+
+import org.apache.poi.POIDocument;
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hpsf.CustomProperties;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.Property;
+import org.apache.poi.hpsf.SpecialPropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.wellknown.PropertyIDMap;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Extracts all of the HPSF properties, both
+ *  built in and custom, returning them in
+ *  textual form, one "name = value" line per property.
+ */
+public class HPSFPropertiesExtractor extends POITextExtractor {
+    /**
+     * Creates a properties extractor on top of another text extractor.
+     *
+     * @param mainExtractor the extractor to build this one from
+     */
+    public HPSFPropertiesExtractor(POITextExtractor mainExtractor) {
+        super(mainExtractor);
+    }
+    /**
+     * Creates a properties extractor for an already opened document.
+     *
+     * @param doc the document whose properties should be extracted
+     */
+    public HPSFPropertiesExtractor(POIDocument doc) {
+        super(doc);
+    }
+    /**
+     * Creates a properties extractor directly from a POIFS filesystem,
+     *  wrapping it in a minimal properties-only document.
+     *
+     * @param fs the filesystem to read the properties from
+     */
+    public HPSFPropertiesExtractor(POIFSFileSystem fs) {
+        super(new PropertiesOnlyDocument(fs));
+    }
+
+    /**
+     * Returns the text of the document summary information
+     *  properties, followed by any custom properties.
+     *
+     * @return one "name = value" line per property, or an empty
+     *  string if the document has no document summary information
+     */
+    public String getDocumentSummaryInformationText() {
+        DocumentSummaryInformation dsi = document.getDocumentSummaryInformation();
+        StringBuffer text = new StringBuffer();
+
+        // Normal properties (getPropertiesText copes with a null set)
+        text.append( getPropertiesText(dsi) );
+
+        // Now custom ones. Both the summary stream and its custom
+        //  section may be absent, so guard against null for each
+        //  rather than failing with a NullPointerException
+        CustomProperties cps = (dsi == null) ? null : dsi.getCustomProperties();
+        if(cps != null) {
+            Iterator keys = cps.keySet().iterator();
+            while(keys.hasNext()) {
+                String key = (String)keys.next();
+                String val = getPropertyValueText( cps.get(key) );
+                text.append(key + " = " + val + "\n");
+            }
+        }
+
+        // All done
+        return text.toString();
+    }
+    /**
+     * Returns the text of the summary information properties.
+     *
+     * @return one "name = value" line per property, or an empty
+     *  string if the document has no summary information
+     */
+    public String getSummaryInformationText() {
+        SummaryInformation si = document.getSummaryInformation();
+
+        // Just normal properties
+        return getPropertiesText(si);
+    }
+
+    /**
+     * Renders every property of the given set as a "name = value"
+     *  line. The name comes from the set's ID map where it has an
+     *  entry for the property, otherwise the numeric ID is used.
+     *
+     * @param ps the property set, may be null
+     * @return the rendered properties, empty if ps is null
+     */
+    private static String getPropertiesText(SpecialPropertySet ps) {
+        if(ps == null) {
+            // Not defined, oh well
+            return "";
+        }
+
+        StringBuffer text = new StringBuffer();
+
+        PropertyIDMap idMap = ps.getPropertySetIDMap();
+        Property[] props = ps.getProperties();
+        for(int i=0; i<props.length; i++) {
+            // Fall back to the raw ID if the map has no name for it
+            String type = Long.toString( props[i].getID() );
+            Object typeObj = idMap.get(props[i].getID());
+            if(typeObj != null) {
+                type = typeObj.toString();
+            }
+
+            String val = getPropertyValueText( props[i].getValue() );
+            text.append(type + " = " + val + "\n");
+        }
+
+        return text.toString();
+    }
+    /**
+     * Turns a single property value into text. Byte arrays of
+     *  length 1, 2 and 4 are rendered as numbers, other byte
+     *  arrays as a string, and everything else via toString().
+     *
+     * @param val the property value, may be null
+     * @return the textual form, "(not set)" if val is null
+     */
+    private static String getPropertyValueText(Object val) {
+        if(val == null) {
+            return "(not set)";
+        }
+        if(val instanceof byte[]) {
+            byte[] b = (byte[])val;
+            if(b.length == 0) {
+                return "";
+            }
+            if(b.length == 1) {
+                return Byte.toString(b[0]);
+            }
+            if(b.length == 2) {
+                return Integer.toString( LittleEndian.getUShort(b) );
+            }
+            if(b.length == 4) {
+                return Long.toString( LittleEndian.getUInt(b) );
+            }
+            // Maybe it's a string? who knows!
+            // NOTE(review): this decodes with the platform default
+            //  charset, so output can differ between machines
+            return new String(b);
+        }
+        return val.toString();
+    }
+
+    /**
+     * Return the text of all the properties defined in
+     *  the document.
+     *
+     * @return the summary information text followed by the
+     *  document summary information text
+     */
+    public String getText() {
+        return getSummaryInformationText() + getDocumentSummaryInformationText();
+    }
+
+    /**
+     * Prevent recursion!
+     *
+     * @throws IllegalStateException always, as this already is
+     *  the metadata text extractor
+     */
+    public POITextExtractor getMetadataTextExtractor() {
+        throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
+    }
+
+    /**
+     * So we can get at the properties of any
+     *  random OLE2 document.
+     */
+    private static class PropertiesOnlyDocument extends POIDocument {
+        private PropertiesOnlyDocument(POIFSFileSystem fs) {
+            super(fs);
+        }
+
+        /**
+         * Not supported, this document only exists to read properties.
+         *
+         * @throws IllegalStateException always
+         */
+        public void write(OutputStream out) throws IOException {
+            throw new IllegalStateException("Unable to write, only for properties!");
+        }
+    }
+}
+++ /dev/null
-/* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
-==================================================================== */
-package org.apache.poi.hpsf.extractor;
-
-import java.io.File;
-import java.io.FileInputStream;
-
-import org.apache.poi.hssf.extractor.ExcelExtractor;
-import org.apache.poi.hssf.usermodel.HSSFWorkbook;
-import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-
-import junit.framework.TestCase;
-
-public class TestHPFSPropertiesExtractor extends TestCase {
- private String dir;
-
- protected void setUp() throws Exception {
- dir = System.getProperty("HPSF.testdata.path");
- assertNotNull("HPSF.testdata.path not set", dir);
- }
-
- public void testNormalProperties() throws Exception {
- POIFSFileSystem fs = new POIFSFileSystem(
- new FileInputStream(new File(dir, "TestMickey.doc"))
- );
- HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs);
- ext.getText();
-
- // Check each bit in turn
- String sinfText = ext.getSummaryInformationText();
- String dinfText = ext.getDocumentSummaryInformationText();
-
- assertTrue(sinfText.indexOf("TEMPLATE = Normal") > -1);
- assertTrue(sinfText.indexOf("SUBJECT = sample subject") > -1);
- assertTrue(dinfText.indexOf("MANAGER = sample manager") > -1);
- assertTrue(dinfText.indexOf("COMPANY = sample company") > -1);
-
- // Now overall
- String text = ext.getText();
- assertTrue(text.indexOf("TEMPLATE = Normal") > -1);
- assertTrue(text.indexOf("SUBJECT = sample subject") > -1);
- assertTrue(text.indexOf("MANAGER = sample manager") > -1);
- assertTrue(text.indexOf("COMPANY = sample company") > -1);
- }
- public void testNormalUnicodeProperties() throws Exception {
- POIFSFileSystem fs = new POIFSFileSystem(
- new FileInputStream(new File(dir, "TestUnicode.xls"))
- );
- HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs);
- ext.getText();
-
- // Check each bit in turn
- String sinfText = ext.getSummaryInformationText();
- String dinfText = ext.getDocumentSummaryInformationText();
-
- assertTrue(sinfText.indexOf("AUTHOR = marshall") > -1);
- assertTrue(sinfText.indexOf("TITLE = Titel: \u00c4h") > -1);
- assertTrue(dinfText.indexOf("COMPANY = Schreiner") > -1);
- assertTrue(dinfText.indexOf("SCALE = false") > -1);
-
- // Now overall
- String text = ext.getText();
- assertTrue(text.indexOf("AUTHOR = marshall") > -1);
- assertTrue(text.indexOf("TITLE = Titel: \u00c4h") > -1);
- assertTrue(text.indexOf("COMPANY = Schreiner") > -1);
- assertTrue(text.indexOf("SCALE = false") > -1);
- }
- public void testCustomProperties() throws Exception {
- POIFSFileSystem fs = new POIFSFileSystem(
- new FileInputStream(new File(dir, "TestMickey.doc"))
- );
- HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs);
-
- // Custom properties are part of the document info stream
- String dinfText = ext.getDocumentSummaryInformationText();
- assertTrue(dinfText.indexOf("Client = sample client") > -1);
- assertTrue(dinfText.indexOf("Division = sample division") > -1);
-
- String text = ext.getText();
- assertTrue(text.indexOf("Client = sample client") > -1);
- assertTrue(text.indexOf("Division = sample division") > -1);
- }
-
- public void testConstructors() throws Exception {
- POIFSFileSystem fs = new POIFSFileSystem(
- new FileInputStream(new File(dir, "TestUnicode.xls"))
- );
- HSSFWorkbook wb = new HSSFWorkbook(fs);
- ExcelExtractor excelExt = new ExcelExtractor(wb);
-
- String fsText = (new HPFSPropertiesExtractor(fs)).getText();
- String hwText = (new HPFSPropertiesExtractor(wb)).getText();
- String eeText = (new HPFSPropertiesExtractor(excelExt)).getText();
-
- assertEquals(fsText, hwText);
- assertEquals(fsText, eeText);
-
- assertTrue(fsText.indexOf("AUTHOR = marshall") > -1);
- assertTrue(fsText.indexOf("TITLE = Titel: \u00c4h") > -1);
- }
-}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hpsf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests that HPSFPropertiesExtractor renders the built in and
+ *  custom properties of the sample documents as expected.
+ */
+public class TestHPSFPropertiesExtractor extends TestCase {
+    private String dir;
+
+    protected void setUp() throws Exception {
+        dir = System.getProperty("HPSF.testdata.path");
+        assertNotNull("HPSF.testdata.path not set", dir);
+    }
+
+    /** Opens the named sample file from the test data directory */
+    private POIFSFileSystem openSample(String fileName) throws Exception {
+        File sample = new File(dir, fileName);
+        return new POIFSFileSystem(new FileInputStream(sample));
+    }
+
+    /** Asserts that the expected snippet occurs somewhere in the text */
+    private static void assertContains(String text, String expected) {
+        assertTrue(text.indexOf(expected) > -1);
+    }
+
+    public void testNormalProperties() throws Exception {
+        HPSFPropertiesExtractor ext =
+            new HPSFPropertiesExtractor(openSample("TestMickey.doc"));
+        ext.getText();
+
+        // Each stream on its own first
+        String sinfText = ext.getSummaryInformationText();
+        String dinfText = ext.getDocumentSummaryInformationText();
+
+        assertContains(sinfText, "TEMPLATE = Normal");
+        assertContains(sinfText, "SUBJECT = sample subject");
+        assertContains(dinfText, "MANAGER = sample manager");
+        assertContains(dinfText, "COMPANY = sample company");
+
+        // Then the combined text
+        String text = ext.getText();
+        assertContains(text, "TEMPLATE = Normal");
+        assertContains(text, "SUBJECT = sample subject");
+        assertContains(text, "MANAGER = sample manager");
+        assertContains(text, "COMPANY = sample company");
+    }
+
+    public void testNormalUnicodeProperties() throws Exception {
+        HPSFPropertiesExtractor ext =
+            new HPSFPropertiesExtractor(openSample("TestUnicode.xls"));
+        ext.getText();
+
+        // Each stream on its own first
+        String sinfText = ext.getSummaryInformationText();
+        String dinfText = ext.getDocumentSummaryInformationText();
+
+        assertContains(sinfText, "AUTHOR = marshall");
+        assertContains(sinfText, "TITLE = Titel: \u00c4h");
+        assertContains(dinfText, "COMPANY = Schreiner");
+        assertContains(dinfText, "SCALE = false");
+
+        // Then the combined text
+        String text = ext.getText();
+        assertContains(text, "AUTHOR = marshall");
+        assertContains(text, "TITLE = Titel: \u00c4h");
+        assertContains(text, "COMPANY = Schreiner");
+        assertContains(text, "SCALE = false");
+    }
+
+    public void testCustomProperties() throws Exception {
+        HPSFPropertiesExtractor ext =
+            new HPSFPropertiesExtractor(openSample("TestMickey.doc"));
+
+        // Custom properties live in the document info stream
+        String dinfText = ext.getDocumentSummaryInformationText();
+        assertContains(dinfText, "Client = sample client");
+        assertContains(dinfText, "Division = sample division");
+
+        String text = ext.getText();
+        assertContains(text, "Client = sample client");
+        assertContains(text, "Division = sample division");
+    }
+
+    public void testConstructors() throws Exception {
+        POIFSFileSystem fs = openSample("TestUnicode.xls");
+        HSSFWorkbook wb = new HSSFWorkbook(fs);
+        ExcelExtractor excelExt = new ExcelExtractor(wb);
+
+        // All three ways of building the extractor must agree
+        String fsText = new HPSFPropertiesExtractor(fs).getText();
+        String hwText = new HPSFPropertiesExtractor(wb).getText();
+        String eeText = new HPSFPropertiesExtractor(excelExt).getText();
+
+        assertEquals(fsText, hwText);
+        assertEquals(fsText, eeText);
+
+        assertContains(fsText, "AUTHOR = marshall");
+        assertContains(fsText, "TITLE = Titel: \u00c4h");
+    }
+}