From: Nick Burch Date: Tue, 12 Aug 2008 20:08:14 +0000 (+0000) Subject: Merged revisions 638786-638802,638805-638811,638813-638814,638816-639230,639233-63924... X-Git-Tag: REL_3_5_BETA2~5 X-Git-Url: https://source.dussan.org/?a=commitdiff_plain;h=3638f76a8aa229cae91dc0f16dbf7297f0cec725;p=poi.git Merged revisions 638786-638802,638805-638811,638813-638814,638816-639230,639233-639241,639243-639253,639255-639486,639488-639601,639603-639835,639837-639917,639919-640056,640058-640710,640712-641156,641158-641184,641186-641795,641797-641798,641800-641933,641935-641963,641965-641966,641968-641995,641997-642230,642232-642562,642564-642565,642568-642570,642572-642573,642576-642736,642739-642877,642879,642881-642890,642892-642903,642905-642945,642947-643624,643626-643653,643655-643669,643671,643673-643830,643832-643833,643835-644342,644344-644472,644474-644508,644510-645347,645349-645351,645353-645559,645561-645565,645568-645951,645953-646193,646195-646311,646313-646404,646406-646665,646667-646853,646855-646869,646871-647151,647153-647185,647187-647277,647279-647566,647568-647573,647575,647578-647711,647714-647737,647739-647823,647825-648155,648157-648202,648204-648273,648275,648277-648302,648304-648333,648335-648588,648590-648622,648625-648673,648675-649141,649144,649146-649556,649558-649795,649799,649801-649910,649912-649913,649915-650128,650131-650132,650134-650137,650140-650914,650916-651991,651993-652284,652286-652287,652289,652291,652293-652297,652299-652328,652330-652425,652427-652445,652447-652560,652562-652933,652935,652937-652993,652995-653116,653118-653124,653126-653483,653487-653519,653522-653550,653552-653607,653609-653667,653669-653674,653676-653814,653817-653830,653832-653891,653893-653944,653946-654055,654057-654355,654357-654365,654367-654648,654651-655215,655217-655277,655279-655281,655283-655911,655913-656212,656214,656216-656251,656253-656698,656700-656756,656758-656892,656894-657135,657137-657165,657168-657179,657181-657354,657356-657357,657359-657701,65770
3-657874,657876-658032,658034-658284,658286,658288-658301,658303-658307,658309-658321,658323-658335,658337-658348,658351,658353-658832,658834-658983,658985,658987-659066,659068-659402,659404-659428,659430-659451,659453-659454,659456-659461,659463-659477,659479-659524,659526-659571,659574,659576-660255,660257-660262,660264-660279,660281-660343,660345-660473,660475-660827,660829-660833,660835-660888,660890-663321,663323-663435,663437-663764,663766-663854,663856-664219,664221-664489,664494-664514,664516-668013,668015-668142,668144-668152,668154,668156-668256,668258,668260-669139,669141-669455,669457-669657,669659-669808,669810-670189,670191-671321,671323-672229,672231-672549,672551-672552,672554-672561,672563-672566,672568,672571-673049,673051-673852,673854-673862,673864-673986,673988-673996,673998-674347,674349-674890,674892-674910,674912-674936,674938-674952,674954-675078,675080-675085,675087-675217,675219-675660,675662-675670,675672-675716,675718-675726,675728-675733,675735-675775,675777-675782,675784,675786-675791,675794-675852,675854-676200,676202,676204,676206-676220,676222-676309,676311-676456,676458-676994,676996-677027,677030-677040,677042-677056,677058-677375,677377-677968,677970-677971,677973,677975-677994,677996-678286,678288-678538,678540-680393,680395-680469,680471-680529,680531-680852,680854-681529,681531-681571,681573-682224,682226,682228,682231-682281,682283-682335,682337-682507,682509,682512-682517,682519-682532,682534-682619,682622-682777,682779-682998,683000-683019,683021-683022,683024-683080,683082-683092,683094-683095,683097-683127,683129-683131,683133-683166,683168-683698,683700-683705,683707-683757,683759-683787,683789-683870,683872-683879,683881-683900,683902-684066,684068-684074,684076-684222,684224-684254,684257-684281,684283-684286,684288-684292,684294-684298,684300-684301,684303-684308,684310-684317,684320,684323-684335,684337-684348,684350-684354,684356-684361,684363-684369,684371-684453,684455-684883,684885-684937,684940-684958,684960-684
970,684972-684985,684987-685053,685055-685063,685065-685284 via svnmerge from https://svn.apache.org/repos/asf/poi/trunk ........ r685260 | nick | 2008-08-12 19:44:50 +0100 (Tue, 12 Aug 2008) | 1 line New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor ........ r685263 | nick | 2008-08-12 19:55:47 +0100 (Tue, 12 Aug 2008) | 1 line Few documentation updates for recent new code ........ r685267 | nick | 2008-08-12 20:02:41 +0100 (Tue, 12 Aug 2008) | 1 line Fix a typo in the file name, and add a generic method to POITextExtractor to get the appropriate metadata text extractor ........ r685283 | nick | 2008-08-12 20:57:04 +0100 (Tue, 12 Aug 2008) | 1 line Add HWPF support for stripping out fields (eg macros), and make this optionally happen always for headers and footers ........ r685284 | nick | 2008-08-12 20:59:35 +0100 (Tue, 12 Aug 2008) | 1 line Update changelog ........ git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@685288 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/src/documentation/content/xdocs/changes.xml b/src/documentation/content/xdocs/changes.xml index e9164fb46a..020c6c9602 100644 --- a/src/documentation/content/xdocs/changes.xml +++ b/src/documentation/content/xdocs/changes.xml @@ -58,6 +58,8 @@ Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx + 45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text) + New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor Properly update the array of Slide's text runs in HSLF when new text shapes are added 45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007 Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF diff --git a/src/documentation/content/xdocs/hpsf/how-to.xml b/src/documentation/content/xdocs/hpsf/how-to.xml index 
0073126c95..964005bf24 100644 --- a/src/documentation/content/xdocs/hpsf/how-to.xml +++ b/src/documentation/content/xdocs/hpsf/how-to.xml @@ -92,6 +92,12 @@ properties. Chances are that you will find here what you need and don't have to read the other sections. +

If all you are interested in is getting the textual content of + all the document properties, such as for full text indexing, then + take a look at + org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor. However, + if you want full access to the properties, please read on!

+

The first thing you should understand is that a Microsoft Office file is not one large bunch of bytes but has an internal filesystem structure with files and directories. You can access these files and directories using diff --git a/src/documentation/content/xdocs/hwpf/quick-guide.xml b/src/documentation/content/xdocs/hwpf/quick-guide.xml index bf046258e7..d717b0ef09 100644 --- a/src/documentation/content/xdocs/hwpf/quick-guide.xml +++ b/src/documentation/content/xdocs/hwpf/quick-guide.xml @@ -55,13 +55,25 @@ can then get text and other properties.

+
Headers and Footers +

To get at the headers and footers of a Word document, first create a +org.apache.poi.hwpf.HWPFDocument. Next, you need to create a +org.apache.poi.hwpf.usermodel.HeaderStories, passing it your +HWPFDocument. Finally, the HeaderStories gives you access to the headers and +footers, including first / even / odd page ones if defined in your +document. Additionally, HeaderStories provides a method for removing +any macros in the text, which is helpful as many headers and footers +do end up with macros in them.

+
+
Changing Text

It is possible to change the text via insertBefore() and insertAfter() on a Range object (either a Range, Paragraph or CharacterRun). - It is also possible to delete a Range, but this - code is know to have bugs in it. + It is also possible to delete a Range. + This code will work in many, but not all cases, and patches to + improve it are gratefully received!

diff --git a/src/documentation/content/xdocs/status.xml b/src/documentation/content/xdocs/status.xml index 5c4ffadb15..998263d8dd 100644 --- a/src/documentation/content/xdocs/status.xml +++ b/src/documentation/content/xdocs/status.xml @@ -55,6 +55,8 @@ Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx
+ 45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text) + New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor Properly update the array of Slide's text runs in HSLF when new text shapes are added 45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007 Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF diff --git a/src/java/org/apache/poi/POIOLE2TextExtractor.java b/src/java/org/apache/poi/POIOLE2TextExtractor.java index f5aee4cc6d..d46c7e4aad 100644 --- a/src/java/org/apache/poi/POIOLE2TextExtractor.java +++ b/src/java/org/apache/poi/POIOLE2TextExtractor.java @@ -18,6 +18,7 @@ package org.apache.poi; import org.apache.poi.hpsf.DocumentSummaryInformation; import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor; /** * Common Parent for OLE2 based Text Extractors @@ -50,4 +51,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor { public SummaryInformation getSummaryInformation() { return document.getSummaryInformation(); } + + /** + * Returns an HPSF powered text extractor for the + * document properties metadata, such as title and author. + */ + public POITextExtractor getMetadataTextExtractor() { + return new HPSFPropertiesExtractor(this); + } } diff --git a/src/java/org/apache/poi/POITextExtractor.java b/src/java/org/apache/poi/POITextExtractor.java index 3ba71880eb..0b69894d08 100644 --- a/src/java/org/apache/poi/POITextExtractor.java +++ b/src/java/org/apache/poi/POITextExtractor.java @@ -37,6 +37,14 @@ public abstract class POITextExtractor { public POITextExtractor(POIDocument document) { this.document = document; } + /** + * Creates a new text extractor, using the same + * document as another text extractor. Normally + * only used by properties extractors. 
+ */ + protected POITextExtractor(POITextExtractor otherExtractor) { + this.document = otherExtractor.document; + } /** * Retrieves all the text from the document. @@ -46,4 +54,11 @@ public abstract class POITextExtractor { * @return All the text from the document */ public abstract String getText(); + + /** + * Returns another text extractor, which is able to + * output the textual content of the document + * metadata / properties, such as author and title. + */ + public abstract POITextExtractor getMetadataTextExtractor(); } diff --git a/src/java/org/apache/poi/hpsf/CustomProperties.java b/src/java/org/apache/poi/hpsf/CustomProperties.java index 24b19e5d04..420fc2f9bb 100644 --- a/src/java/org/apache/poi/hpsf/CustomProperties.java +++ b/src/java/org/apache/poi/hpsf/CustomProperties.java @@ -21,6 +21,7 @@ import java.util.Date; import java.util.HashMap; import java.util.Iterator; import java.util.Map; +import java.util.Set; import org.apache.poi.hpsf.wellknown.PropertyIDMap; @@ -293,8 +294,18 @@ public class CustomProperties extends HashMap final CustomProperty cp = new CustomProperty(p, name); return put(cp); } - + /** + * Returns a set of all the names of our + * custom properties + */ + public Set keySet() { + return dictionaryNameToID.keySet(); + } + + + + /** *

Sets the codepage.

* * @param codepage the codepage diff --git a/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java b/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java index b7a7c9ae6d..62c6127ee4 100644 --- a/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java +++ b/src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java @@ -45,6 +45,9 @@ public class DocumentSummaryInformation extends SpecialPropertySet public static final String DEFAULT_STREAM_NAME = "\005DocumentSummaryInformation"; + public PropertyIDMap getPropertySetIDMap() { + return PropertyIDMap.getDocumentSummaryInformationProperties(); + } /** diff --git a/src/java/org/apache/poi/hpsf/SpecialPropertySet.java b/src/java/org/apache/poi/hpsf/SpecialPropertySet.java index 6a02bbc188..f415bd5d12 100644 --- a/src/java/org/apache/poi/hpsf/SpecialPropertySet.java +++ b/src/java/org/apache/poi/hpsf/SpecialPropertySet.java @@ -22,6 +22,7 @@ import java.io.InputStream; import java.io.OutputStream; import java.util.List; +import org.apache.poi.hpsf.wellknown.PropertyIDMap; import org.apache.poi.poifs.filesystem.DirectoryEntry; /** @@ -57,6 +58,11 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry; */ public abstract class SpecialPropertySet extends MutablePropertySet { + /** + * The id to name mapping of the properties + * in this set. + */ + public abstract PropertyIDMap getPropertySetIDMap(); /** *

The "real" property set SpecialPropertySet diff --git a/src/java/org/apache/poi/hpsf/SummaryInformation.java b/src/java/org/apache/poi/hpsf/SummaryInformation.java index 66d9ce0937..a143e2bad0 100644 --- a/src/java/org/apache/poi/hpsf/SummaryInformation.java +++ b/src/java/org/apache/poi/hpsf/SummaryInformation.java @@ -40,6 +40,9 @@ public class SummaryInformation extends SpecialPropertySet */ public static final String DEFAULT_STREAM_NAME = "\005SummaryInformation"; + public PropertyIDMap getPropertySetIDMap() { + return PropertyIDMap.getSummaryInformationProperties(); + } /** diff --git a/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java new file mode 100644 index 0000000000..ecad5c05be --- /dev/null +++ b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java @@ -0,0 +1,151 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+==================================================================== */ +package org.apache.poi.hpsf.extractor; + +import java.io.IOException; +import java.io.OutputStream; +import java.util.Iterator; + +import org.apache.poi.POIDocument; +import org.apache.poi.POITextExtractor; +import org.apache.poi.hpsf.CustomProperties; +import org.apache.poi.hpsf.DocumentSummaryInformation; +import org.apache.poi.hpsf.Property; +import org.apache.poi.hpsf.SpecialPropertySet; +import org.apache.poi.hpsf.SummaryInformation; +import org.apache.poi.hpsf.wellknown.PropertyIDMap; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; +import org.apache.poi.util.LittleEndian; + +/** + * Extracts all of the HPSF properties, both + * build in and custom, returning them in + * textual form. + */ +public class HPSFPropertiesExtractor extends POITextExtractor { + public HPSFPropertiesExtractor(POITextExtractor mainExtractor) { + super(mainExtractor); + } + public HPSFPropertiesExtractor(POIDocument doc) { + super(doc); + } + public HPSFPropertiesExtractor(POIFSFileSystem fs) { + super(new PropertiesOnlyDocument(fs)); + } + + public String getDocumentSummaryInformationText() { + DocumentSummaryInformation dsi = document.getDocumentSummaryInformation(); + StringBuffer text = new StringBuffer(); + + // Normal properties + text.append( getPropertiesText(dsi) ); + + // Now custom ones + CustomProperties cps = dsi.getCustomProperties(); + Iterator keys = cps.keySet().iterator(); + while(keys.hasNext()) { + String key = (String)keys.next(); + String val = getPropertyValueText( cps.get(key) ); + text.append(key + " = " + val + "\n"); + } + + // All done + return text.toString(); + } + public String getSummaryInformationText() { + SummaryInformation si = document.getSummaryInformation(); + + // Just normal properties + return getPropertiesText(si); + } + + private static String getPropertiesText(SpecialPropertySet ps) { + if(ps == null) { + // Not defined, oh well + return ""; + } + + 
StringBuffer text = new StringBuffer(); + + PropertyIDMap idMap = ps.getPropertySetIDMap(); + Property[] props = ps.getProperties(); + for(int i=0; i -1 && + text.indexOf('\u0015') > -1) { + int first13 = text.indexOf('\u0013'); + int next13 = text.indexOf('\u0013', first13+1); + int first14 = text.indexOf('\u0014', first13+1); + int last15 = text.lastIndexOf('\u0015'); + + // If they're the wrong way around, give up + if(last15 < first13) { + break; + } + + // If no more 13s and 14s, just zap + if(next13 == -1 && first14 == -1) { + text = text.substring(0, first13) + + text.substring(last15+1); + break; + } + + // If a 14 comes before the next 13, then + // zap from the 13 to the 14, and remove + // the 15 + if(first14 != -1 && (first14 < next13 || next13 == -1)) { + text = text.substring(0, first13) + + text.substring(first14+1, last15) + + text.substring(last15+1); + continue; + } + + // Another 13 comes before the next 14. + // This means there's nested stuff, so we + // can just zap the lot + text = text.substring(0, first13) + + text.substring(last15+1); + continue; + } + + return text; + } /** * Used to get the number of sections in a range. 
If this range is smaller diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc new file mode 100644 index 0000000000..934970f58b Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc differ diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java index d4d2517f90..404f6e47a4 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java @@ -35,6 +35,7 @@ public class TestHeaderStories extends TestCase { private HWPFDocument oddEven; private HWPFDocument diffFirst; private HWPFDocument unicode; + private HWPFDocument withFields; protected void setUp() throws Exception { String dirname = System.getProperty("HWPF.testdata.path"); @@ -60,6 +61,9 @@ public class TestHeaderStories extends TestCase { unicode = new HWPFDocument( new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc")) ); + withFields = new HWPFDocument( + new FileInputStream(new File(dirname, "HeaderWithMacros.doc")) + ); } public void testNone() throws Exception { @@ -186,4 +190,15 @@ public class TestHeaderStories extends TestCase { assertEquals("\r\r", hs.getEvenFooter()); assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter()); } + + public void testWithFields() throws Exception { + HeaderStories hs = new HeaderStories(withFields); + assertFalse(hs.areFieldsStripped()); + + assertEquals("HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r", hs.getOddHeader()); + + // Now turn on stripping + hs.setAreFieldsStripped(true); + assertEquals("HEADER GOES HERE. 
8/12/2008 Eric Roch\r\r\r", hs.getOddHeader()); + } } diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java index db28cbd456..6b4200a631 100644 --- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java @@ -18,7 +18,6 @@ package org.apache.poi.hwpf.usermodel; import java.io.File; import java.io.FileInputStream; -import java.io.FileOutputStream; import junit.framework.TestCase; diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java new file mode 100644 index 0000000000..bcb03996ab --- /dev/null +++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java @@ -0,0 +1,53 @@ +/* +* Licensed to the Apache Software Foundation (ASF) under one or more +* contributor license agreements. See the NOTICE file distributed with +* this work for additional information regarding copyright ownership. +* The ASF licenses this file to You under the Apache License, Version 2.0 +* (the "License"); you may not use this file except in compliance with +* the License. You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*/ +package org.apache.poi.hwpf.usermodel; + +import junit.framework.TestCase; + +/** + * Tests for Range which aren't around deletion, insertion, + * text replacement or textual contents + */ +public class TestRange extends TestCase { + public void testFieldStripping() throws Exception { + String exp = "This is some text."; + + String single = "This is some \u0013Blah!\u0015text."; + String with14 = "This is \u0013Blah!\u0014some\u0015 text."; + String withNested = + "This is \u0013Blah!\u0013Blah!\u0015\u0015some text."; + String withNested14 = + "This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text."; + String withNestedIn14 = + "This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text."; + + // Check all comes out right + assertEquals(exp, Range.stripFields(exp)); + assertEquals(exp, Range.stripFields(single)); + assertEquals(exp, Range.stripFields(with14)); + assertEquals(exp, Range.stripFields(withNested)); + assertEquals(exp, Range.stripFields(withNested14)); + assertEquals(exp, Range.stripFields(withNestedIn14)); + + // Ones that are odd and we won't change + String odd1 = "This\u0015 is \u0013 odd"; + String odd2 = "This\u0015 is \u0014 also \u0013 odd"; + + assertEquals(odd1, Range.stripFields(odd1)); + assertEquals(odd2, Range.stripFields(odd2)); + } +} diff --git a/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java b/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java new file mode 100644 index 0000000000..3a189353d0 --- /dev/null +++ b/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java @@ -0,0 +1,115 @@ +/* ==================================================================== + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. 
+ The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +==================================================================== */ +package org.apache.poi.hpsf.extractor; + +import java.io.File; +import java.io.FileInputStream; + +import org.apache.poi.hssf.extractor.ExcelExtractor; +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.poifs.filesystem.POIFSFileSystem; + +import junit.framework.TestCase; + +public class TestHPSFPropertiesExtractor extends TestCase { + private String dir; + + protected void setUp() throws Exception { + dir = System.getProperty("HPSF.testdata.path"); + assertNotNull("HPSF.testdata.path not set", dir); + } + + public void testNormalProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestMickey.doc")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + ext.getText(); + + // Check each bit in turn + String sinfText = ext.getSummaryInformationText(); + String dinfText = ext.getDocumentSummaryInformationText(); + + assertTrue(sinfText.indexOf("TEMPLATE = Normal") > -1); + assertTrue(sinfText.indexOf("SUBJECT = sample subject") > -1); + assertTrue(dinfText.indexOf("MANAGER = sample manager") > -1); + assertTrue(dinfText.indexOf("COMPANY = sample company") > -1); + + // Now overall + String text = ext.getText(); + assertTrue(text.indexOf("TEMPLATE = Normal") > -1); + assertTrue(text.indexOf("SUBJECT = sample subject") > -1); + 
assertTrue(text.indexOf("MANAGER = sample manager") > -1); + assertTrue(text.indexOf("COMPANY = sample company") > -1); + } + public void testNormalUnicodeProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestUnicode.xls")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + ext.getText(); + + // Check each bit in turn + String sinfText = ext.getSummaryInformationText(); + String dinfText = ext.getDocumentSummaryInformationText(); + + assertTrue(sinfText.indexOf("AUTHOR = marshall") > -1); + assertTrue(sinfText.indexOf("TITLE = Titel: \u00c4h") > -1); + assertTrue(dinfText.indexOf("COMPANY = Schreiner") > -1); + assertTrue(dinfText.indexOf("SCALE = false") > -1); + + // Now overall + String text = ext.getText(); + assertTrue(text.indexOf("AUTHOR = marshall") > -1); + assertTrue(text.indexOf("TITLE = Titel: \u00c4h") > -1); + assertTrue(text.indexOf("COMPANY = Schreiner") > -1); + assertTrue(text.indexOf("SCALE = false") > -1); + } + public void testCustomProperties() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestMickey.doc")) + ); + HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs); + + // Custom properties are part of the document info stream + String dinfText = ext.getDocumentSummaryInformationText(); + assertTrue(dinfText.indexOf("Client = sample client") > -1); + assertTrue(dinfText.indexOf("Division = sample division") > -1); + + String text = ext.getText(); + assertTrue(text.indexOf("Client = sample client") > -1); + assertTrue(text.indexOf("Division = sample division") > -1); + } + + public void testConstructors() throws Exception { + POIFSFileSystem fs = new POIFSFileSystem( + new FileInputStream(new File(dir, "TestUnicode.xls")) + ); + HSSFWorkbook wb = new HSSFWorkbook(fs); + ExcelExtractor excelExt = new ExcelExtractor(wb); + + String fsText = (new HPSFPropertiesExtractor(fs)).getText(); + String hwText 
= (new HPSFPropertiesExtractor(wb)).getText(); + String eeText = (new HPSFPropertiesExtractor(excelExt)).getText(); + + assertEquals(fsText, hwText); + assertEquals(fsText, eeText); + + assertTrue(fsText.indexOf("AUTHOR = marshall") > -1); + assertTrue(fsText.indexOf("TITLE = Titel: \u00c4h") > -1); + } +}