From: Nick Burch Date: Tue, 12 Aug 2008 19:02:41 +0000 (+0000) Subject: Fix a typo in the file name, and add a generic method to POITextExtractor to get... X-Git-Tag: REL_3_2_FINAL~161 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=8ad0a4f34ceae63cda3e0e4bef34455e3992291f;p=poi.git Fix a typo in the file name, and add a generic method to POITextExtractor to get the appropriate metadata text extractor git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@685267 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index 6a0cae2672..b1cfff6b91 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -37,7 +37,7 @@ - New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor + New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor Properly update the array of Slide's text runs in HSLF when new text shapes are added 45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007 Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF diff --git a/src/documentation/content/xdocs/hpsf/how-to.xml b/src/documentation/content/xdocs/hpsf/how-to.xml index aadf753a44..964005bf24 100644 --- a/src/documentation/content/xdocs/hpsf/how-to.xml +++ b/src/documentation/content/xdocs/hpsf/how-to.xml @@ -95,7 +95,7 @@

If all you are interested in is getting the textual content of all the document properties, such as for full text indexing, then take a look at - org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor. However, + org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor. However, if you want full access to the properties, please read on!

The first thing you should understand is that a Microsoft Office file is diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 0146f55e79..fbe242aa2a 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -34,7 +34,7 @@ - New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor + New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor Properly update the array of Slide's text runs in HSLF when new text shapes are added 45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007 Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF diff --git a/src/java/org/apache/poi/POIOLE2TextExtractor.java b/src/java/org/apache/poi/POIOLE2TextExtractor.java index f5aee4cc6d..d46c7e4aad 100644 --- a/src/java/org/apache/poi/POIOLE2TextExtractor.java +++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java @@ -18,6 +18,7 @@ package org.apache.poi; import org.apache.poi.hpsf.DocumentSummaryInformation; import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; /** * Common Parent for OLE2 based Text Extractors @@ -50,4 +51,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { public SummaryInformation getSummaryInformation() { return document.getSummaryInformation(); } + + /** + * Returns an HPSF powered text extractor for the + * document properties metadata, such as title and author. + */ + public POITextExtractor getMetadataTextExtractor() { + return new HPSFPropertiesExtractor(this); + } } diff --git a/src/java/org/apache/poi/POITextExtractor.java b/src/java/org/apache/poi/POITextExtractor.java index a7ffd44197..0b69894d08 100644 --- a/src/java/org/apache/poi/POITextExtractor.java +++ b/src/java/org/apache/poi/POITextExtractor.java @@ -54,4 +54,11 @@ public abstract class POITextExtractor { * @return All the text from the document */ public abstract String getText(); + + /** + * Returns another text extractor, which is able to + * output the textual content of the document + * metadata / properties, such as author and title. + */ + public abstract POITextExtractor getMetadataTextExtractor(); } diff --git a/src/java/org/apache/poi/hpsf/extractor/HPFSPropertiesExtractor.java b/src/java/org/apache/poi/hpsf/extractor/HPFSPropertiesExtractor.java deleted file mode 100644 index c85f1bb04c..0000000000 --- a/src/java/org/apache/poi/hpsf/extractor/HPFSPropertiesExtractor.java +++ /dev/null @@ -1,144 +0,0 @@ -/* ==================================================================== - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -==================================================================== */ -package org.apache.poi.hpsf.extractor; - -import java.io.IOException; -import java.io.OutputStream; -import java.util.Iterator; - -import org.apache.poi.POIDocument; -import org.apache.poi.POITextExtractor; -import org.apache.poi.hpsf.CustomProperties; -import org.apache.poi.hpsf.DocumentSummaryInformation; -import org.apache.poi.hpsf.Property; -import org.apache.poi.hpsf.SpecialPropertySet; -import org.apache.poi.hpsf.SummaryInformation; -import org.apache.poi.hpsf.wellknown.PropertyIDMap; -import org.apache.poi.poifs.filesystem.POIFSFileSystem; -import org.apache.poi.util.LittleEndian; - -/** - * Extracts all of the HPSF properties, both - * build in and custom, returning them in - * textual form. - */ -public class HPFSPropertiesExtractor extends POITextExtractor { - public HPFSPropertiesExtractor(POITextExtractor mainExtractor) { - super(mainExtractor); - } - public HPFSPropertiesExtractor(POIDocument doc) { - super(doc); - } - public HPFSPropertiesExtractor(POIFSFileSystem fs) { - super(new PropertiesOnlyDocument(fs)); - } - - public String getDocumentSummaryInformationText() { - DocumentSummaryInformation dsi = document.getDocumentSummaryInformation(); - StringBuffer text = new StringBuffer(); - - // Normal properties - text.append( getPropertiesText(dsi) ); - - // Now custom ones - CustomProperties cps = dsi.getCustomProperties(); - Iterator keys = cps.keySet().iterator(); - while(keys.hasNext()) { - String key = (String)keys.next(); - String val = getPropertyValueText( cps.get(key) ); - text.append(key + " = " + val + "\n"); - } - - // All done - return text.toString(); - } - public String getSummaryInformationText() { - SummaryInformation si = document.getSummaryInformation(); - - // Just normal properties - return getPropertiesText(si); - } - - private static String getPropertiesText(SpecialPropertySet ps) { - if(ps == null) { - // Not defined, oh well - return ""; - } - - StringBuffer text = new StringBuffer(); - - PropertyIDMap idMap = ps.getPropertySetIDMap(); - Property[] props = ps.getProperties(); - for(int i=0; i -1); - assertTrue(sinfText.indexOf("SUBJECT = sample subject") > -1); - assertTrue(dinfText.indexOf("MANAGER = sample manager") > -1); - assertTrue(dinfText.indexOf("COMPANY = sample company") > -1); - - // Now overall - String text = ext.getText(); - assertTrue(text.indexOf("TEMPLATE = Normal") > -1); - assertTrue(text.indexOf("SUBJECT = sample subject") > -1); - assertTrue(text.indexOf("MANAGER = sample manager") > -1); - assertTrue(text.indexOf("COMPANY = sample company") > -1); - } - public void testNormalUnicodeProperties() throws Exception { - POIFSFileSystem fs = new POIFSFileSystem( - new FileInputStream(new File(dir, "TestUnicode.xls")) - ); - HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs); - ext.getText(); - - // Check each bit in turn - String sinfText = ext.getSummaryInformationText(); - String dinfText = ext.getDocumentSummaryInformationText(); - - assertTrue(sinfText.indexOf("AUTHOR = marshall") > -1); - assertTrue(sinfText.indexOf("TITLE = Titel: \u00c4h") > -1); - assertTrue(dinfText.indexOf("COMPANY = Schreiner") > -1); - assertTrue(dinfText.indexOf("SCALE = false") > -1); - - // Now overall - String text = ext.getText(); - assertTrue(text.indexOf("AUTHOR = marshall") > -1); - assertTrue(text.indexOf("TITLE = Titel: \u00c4h") > -1); - assertTrue(text.indexOf("COMPANY = Schreiner") > -1); - assertTrue(text.indexOf("SCALE = false") > -1); - } - public void testCustomProperties() throws Exception { - POIFSFileSystem fs = new POIFSFileSystem( - new FileInputStream(new File(dir, "TestMickey.doc")) - ); - HPFSPropertiesExtractor ext = new HPFSPropertiesExtractor(fs); - - // Custom properties are part of the document info stream - String dinfText = ext.getDocumentSummaryInformationText(); - assertTrue(dinfText.indexOf("Client = sample client") > -1); - assertTrue(dinfText.indexOf("Division = sample division") > -1); - - String text = ext.getText(); - assertTrue(text.indexOf("Client = sample client") > -1); - assertTrue(text.indexOf("Division = sample division") > -1); - } - - public void testConstructors() throws Exception { - POIFSFileSystem fs = new POIFSFileSystem( - new FileInputStream(new File(dir, "TestUnicode.xls")) - ); - HSSFWorkbook wb = new HSSFWorkbook(fs); - ExcelExtractor excelExt = new ExcelExtractor(wb); - - String fsText = (new HPFSPropertiesExtractor(fs)).getText(); - String hwText = (new HPFSPropertiesExtractor(wb)).getText(); - String eeText = (new HPFSPropertiesExtractor(excelExt)).getText(); - - assertEquals(fsText, hwText); - assertEquals(fsText, eeText); - - assertTrue(fsText.indexOf("AUTHOR = marshall") > -1); - assertTrue(fsText.indexOf("TITLE = Titel: \u00c4h") > -1); - } -} diff --git a/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java b/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java new file mode 100644 index 0000000000..3a189353d0 --- /dev/null +++ b/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java @@ -0,0 +1,115 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hpsf.extractor; + +import java.io.File; +import java.io.FileInputStream; + +import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +import junit.framework.TestCase; + +public class TestHPSFPropertiesExtractor extends TestCase { + private String dir; + + protected void setUp() throws Exception { + dir = System.getProperty("HPSF.testdata.path"); + assertNotNull("HPSF.testdata.path not set", dir); + } + + public void testNormalProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestMickey.doc")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + ext.getText(); + + // Check each bit in turn + String sinfText = ext.getSummaryInformationText(); + String dinfText = ext.getDocumentSummaryInformationText(); + + assertTrue(sinfText.indexOf("TEMPLATE = Normal") > -1); + assertTrue(sinfText.indexOf("SUBJECT = sample subject") > -1); + assertTrue(dinfText.indexOf("MANAGER = sample manager") > -1); + assertTrue(dinfText.indexOf("COMPANY = sample company") > -1); + + // Now overall + String text = ext.getText(); + assertTrue(text.indexOf("TEMPLATE = Normal") > -1); + assertTrue(text.indexOf("SUBJECT = sample subject") > -1); + assertTrue(text.indexOf("MANAGER = sample manager") > -1); + assertTrue(text.indexOf("COMPANY = sample company") > -1); + } + public void testNormalUnicodeProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestUnicode.xls")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + ext.getText(); + + // Check each bit in turn + String sinfText = ext.getSummaryInformationText(); + String dinfText = ext.getDocumentSummaryInformationText(); + + assertTrue(sinfText.indexOf("AUTHOR = marshall") > -1); + assertTrue(sinfText.indexOf("TITLE = Titel: \u00c4h") > -1); + assertTrue(dinfText.indexOf("COMPANY = Schreiner") > -1); + assertTrue(dinfText.indexOf("SCALE = false") > -1); + + // Now overall + String text = ext.getText(); + assertTrue(text.indexOf("AUTHOR = marshall") > -1); + assertTrue(text.indexOf("TITLE = Titel: \u00c4h") > -1); + assertTrue(text.indexOf("COMPANY = Schreiner") > -1); + assertTrue(text.indexOf("SCALE = false") > -1); + } + public void testCustomProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestMickey.doc")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + + // Custom properties are part of the document info stream + String dinfText = ext.getDocumentSummaryInformationText(); + assertTrue(dinfText.indexOf("Client = sample client") > -1); + assertTrue(dinfText.indexOf("Division = sample division") > -1); + + String text = ext.getText(); + assertTrue(text.indexOf("Client = sample client") > -1); + assertTrue(text.indexOf("Division = sample division") > -1); + } + + public void testConstructors() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestUnicode.xls")) + ); + HSSFWorkbook wb = new HSSFWorkbook(fs); + ExcelExtractor excelExt = new ExcelExtractor(wb); + + String fsText = (new HPSFPropertiesExtractor(fs)).getText(); + String hwText = (new HPSFPropertiesExtractor(wb)).getText(); + String eeText = (new HPSFPropertiesExtractor(excelExt)).getText(); + + assertEquals(fsText, hwText); + assertEquals(fsText, eeText); + + assertTrue(fsText.indexOf("AUTHOR = marshall") > -1); + assertTrue(fsText.indexOf("TITLE = Titel: \u00c4h") > -1); + } +}