]> source.dussan.org Git - poi.git/commitdiff
Merged revisions 638786-638802,638805-638811,638813-638814,638816-639230,639233-63924...
authorNick Burch <nick@apache.org>
Tue, 12 Aug 2008 20:08:14 +0000 (20:08 +0000)
committerNick Burch <nick@apache.org>
Tue, 12 Aug 2008 20:08:14 +0000 (20:08 +0000)
https://svn.apache.org/repos/asf/poi/trunk

........
  r685260 | nick | 2008-08-12 19:44:50 +0100 (Tue, 12 Aug 2008) | 1 line

  New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPFSPropertiesExtractor
........
  r685263 | nick | 2008-08-12 19:55:47 +0100 (Tue, 12 Aug 2008) | 1 line

  Few documentation updates for recent new code
........
  r685267 | nick | 2008-08-12 20:02:41 +0100 (Tue, 12 Aug 2008) | 1 line

  Fix a typo in the file name, and add a generic method to POITextExtractor to get the appropriate metadata text extractor
........
  r685283 | nick | 2008-08-12 20:57:04 +0100 (Tue, 12 Aug 2008) | 1 line

  Add HWPF support for stripping out fields (eg macros), and make this optionally happen always for headers and footers
........
  r685284 | nick | 2008-08-12 20:59:35 +0100 (Tue, 12 Aug 2008) | 1 line

  Update changelog
........

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@685288 13f79535-47bb-0310-9956-ffa450edef68

20 files changed:
src/documentation/content/xdocs/changes.xml
src/documentation/content/xdocs/hpsf/how-to.xml
src/documentation/content/xdocs/hwpf/quick-guide.xml
src/documentation/content/xdocs/status.xml
src/java/org/apache/poi/POIOLE2TextExtractor.java
src/java/org/apache/poi/POITextExtractor.java
src/java/org/apache/poi/hpsf/CustomProperties.java
src/java/org/apache/poi/hpsf/DocumentSummaryInformation.java
src/java/org/apache/poi/hpsf/SpecialPropertySet.java
src/java/org/apache/poi/hpsf/SummaryInformation.java
src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc [new file with mode: 0644]
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java [new file with mode: 0644]
src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java [new file with mode: 0644]

index e9164fb46aa5b7c98d18f9eb794189688e763b1b..020c6c9602c7c5efffd2900302b687f10bd07c54 100644 (file)
@@ -58,6 +58,8 @@
            <action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action>
         </release>
         <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="fix">45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text)</action>
+           <action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
            <action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
            <action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
            <action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
index 0073126c95bb788dfa59e21625f0e7d433fa293e..964005bf244963385949d67b6a8e47fc814917bd 100644 (file)
      properties. Chances are that you will find here what you need and don't
      have to read the other sections.</note>
 
+    <p>If all you are interested in is getting the textual content of
+     all the document properties, such as for full text indexing, then
+     take a look at 
+     <code>org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</code>. However,
+     if you want full access to the properties, please read on!</p>
+
     <p>The first thing you should understand is that a Microsoft Office file is
      not one large bunch of bytes but has an internal filesystem structure with
      files and directories. You can access these files and directories using
index bf046258e7f65192e1e14857e28796e00ef761c1..d717b0ef094f540480ee2436b912544092150846 100644 (file)
@@ -55,13 +55,25 @@ can then get text and other properties.
                </p>
                </section>
                
+               <section><title>Headers and Footers</title>
+               <p>To get at the headers and footers of a Word document, first create a
+<code>org.apache.poi.hwpf.HWPFDocument</code>. Next, you need to create a
+<code>org.apache.poi.hwpf.usermodel.HeaderStories</code>, passing it your
+HWPFDocument. Finally, the HeaderStories gives you access to the headers and
+footers, including first / even / odd page ones if defined in your
+document. Additionally, HeaderStories provides a method for removing
+any macros in the text, which is helpful as many headers and footers
+do end up with macros in them.</p>
+               </section>
+               
                <section><title>Changing Text</title>
                <p>It is possible to change the text via 
                <code>insertBefore()</code> and <code>insertAfter()</code>
                on a <code>Range</code> object (either a <code>Range</code>,
                <code>Paragraph</code> or <code>CharacterRun</code>).
-               It is also possible to delete a <code>Range</code>, but this
-               code is know to have bugs in it.
+               It is also possible to delete a <code>Range</code>.
+               This code will work in many, but not all cases, and patches to
+        improve it are gratefully received!
                </p>
                </section>
 
index 5c4ffadb152b9a120b0d1e0f1550fe2bc201c6dc..998263d8dd73c9062169e26a4ff3a3ec65991ab8 100644 (file)
@@ -55,6 +55,8 @@
            <action dev="POI-DEVELOPERS" type="add">Created a common interface for handling Excel files, irrespective of if they are .xls or .xlsx</action>
         </release>
         <release version="3.1.1-alpha1" date="2008-??-??">
+           <action dev="POI-DEVELOPERS" type="fix">45622 - Support stripping HWPF fields (eg macros) out of text, via Range.stripFields(text)</action>
+           <action dev="POI-DEVELOPERS" type="add">New HPSF based TextExtractor for document metadata, org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor</action>
            <action dev="POI-DEVELOPERS" type="fix">Properly update the array of Slide's text runs in HSLF when new text shapes are added</action>
            <action dev="POI-DEVELOPERS" type="fix">45590 - Fix for Header/footer extraction for .ppt files saved in Office 2007</action>
            <action dev="POI-DEVELOPERS" type="fix">Big improvement in how HWPF handles unicode text, and more sanity checking of text ranges within HWPF</action>
index f5aee4cc6dc7a3ee6f59d666f39e6439915739e9..d46c7e4aadd893a74d42736493eedb6b9f8c6776 100644 (file)
@@ -18,6 +18,7 @@ package org.apache.poi;
 
 import org.apache.poi.hpsf.DocumentSummaryInformation;
 import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
 
 /**
  * Common Parent for OLE2 based Text Extractors
@@ -50,4 +51,12 @@ public abstract class POIOLE2TextExtractor extends POITextExtractor {
        public SummaryInformation getSummaryInformation() {
                return document.getSummaryInformation();
        }
+       
+       /**
+        * Returns an HPSF powered text extractor for the 
+        *  document properties metadata, such as title and author.
+        */
+       public POITextExtractor getMetadataTextExtractor() {
+               return new HPSFPropertiesExtractor(this);
+       }
 }
index 3ba71880ebbdeb7316cc3f0b74a0a609a1ac4a8f..0b69894d0853c66fbc86992a45dedd6d1707c997 100644 (file)
@@ -37,6 +37,14 @@ public abstract class POITextExtractor {
        public POITextExtractor(POIDocument document) {
                this.document = document;
        }
+       /**
+        * Creates a new text extractor, using the same
+        *  document as another text extractor. Normally
+        *  only used by properties extractors.
+        */
+       protected POITextExtractor(POITextExtractor otherExtractor) {
+               this.document = otherExtractor.document;
+       }
        
        /**
         * Retrieves all the text from the document.
@@ -46,4 +54,11 @@ public abstract class POITextExtractor {
         * @return All the text from the document
         */
        public abstract String getText();
+       
+       /**
+        * Returns another text extractor, which is able to
+        *  output the textual content of the document
+        *  metadata / properties, such as author and title.
+        */
+       public abstract POITextExtractor getMetadataTextExtractor();
 }
index 24b19e5d046e8cc434fb247d2b32ae034d736a9f..420fc2f9bb991ed47657864b2e130609aac0861f 100644 (file)
@@ -21,6 +21,7 @@ import java.util.Date;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Set;
 
 import org.apache.poi.hpsf.wellknown.PropertyIDMap;
 
@@ -293,8 +294,18 @@ public class CustomProperties extends HashMap
         final CustomProperty cp = new CustomProperty(p, name);
         return put(cp);
     }
-
+    
     /**
+     * Returns a set of all the names of our
+     *  custom properties
+     * <p>NOTE(review): this class extends HashMap, so this method
+     *  replaces the inherited keySet(). What comes back is the
+     *  key set of the dictionaryNameToID lookup rather than the
+     *  key set of the map itself - presumably the two differ
+     *  (names versus IDs), so generic Map code may be surprised.
+     *  TODO confirm.</p>
+     *
+     * @return the key set of the name to ID dictionary
+     */
+    public Set keySet() {
+       return dictionaryNameToID.keySet();
+       }
+
+
+
+       /**
      * <p>Sets the codepage.</p>
      *
      * @param codepage the codepage
index b7a7c9ae6d3e46061383d141c3a46b6c8a088a26..62c6127ee4f543a152a485414d43aa5133f783e6 100644 (file)
@@ -45,6 +45,9 @@ public class DocumentSummaryInformation extends SpecialPropertySet
     public static final String DEFAULT_STREAM_NAME =
         "\005DocumentSummaryInformation";
 
+    public PropertyIDMap getPropertySetIDMap() {
+       return PropertyIDMap.getDocumentSummaryInformationProperties();
+    }
 
 
     /**
index 6a02bbc188218bf9e139f9c9bb83dcbe0a4b903c..f415bd5d128862d5757fb1cb3a49c7cc31ac0c84 100644 (file)
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.List;
 
+import org.apache.poi.hpsf.wellknown.PropertyIDMap;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 
 /**
@@ -57,6 +58,11 @@ import org.apache.poi.poifs.filesystem.DirectoryEntry;
  */
 public abstract class SpecialPropertySet extends MutablePropertySet
 {
+       /**
+        * The id to name mapping of the properties
+        *  in this set.
+        */
+       public abstract PropertyIDMap getPropertySetIDMap();
 
     /**
      * <p>The "real" property set <code>SpecialPropertySet</code>
index 66d9ce0937e9bc06acfb4bf3ba5da222c0c75cfe..a143e2bad0f12c712c77a82bed2d987d75e368d8 100644 (file)
@@ -40,6 +40,9 @@ public class SummaryInformation extends SpecialPropertySet
      */
     public static final String DEFAULT_STREAM_NAME = "\005SummaryInformation";
 
+    public PropertyIDMap getPropertySetIDMap() {
+       return PropertyIDMap.getSummaryInformationProperties();
+    }
 
 
     /**
diff --git a/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java b/src/java/org/apache/poi/hpsf/extractor/HPSFPropertiesExtractor.java
new file mode 100644 (file)
index 0000000..ecad5c0
--- /dev/null
@@ -0,0 +1,151 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hpsf.extractor;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.Iterator;
+
+import org.apache.poi.POIDocument;
+import org.apache.poi.POITextExtractor;
+import org.apache.poi.hpsf.CustomProperties;
+import org.apache.poi.hpsf.DocumentSummaryInformation;
+import org.apache.poi.hpsf.Property;
+import org.apache.poi.hpsf.SpecialPropertySet;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.wellknown.PropertyIDMap;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.util.LittleEndian;
+
+/**
+ * Extracts all of the HPSF properties, both
+ *  built-in and custom, returning them in
+ *  textual form, one "name = value" pair per line.
+ */
+public class HPSFPropertiesExtractor extends POITextExtractor {
+       /**
+        * Creates a properties extractor which works on the same
+        *  document as an existing text extractor.
+        *
+        * @param mainExtractor the extractor whose document is shared
+        */
+       public HPSFPropertiesExtractor(POITextExtractor mainExtractor) {
+               super(mainExtractor);
+       }
+       /**
+        * Creates a properties extractor for an already opened document.
+        *
+        * @param doc the document to read the properties from
+        */
+       public HPSFPropertiesExtractor(POIDocument doc) {
+               super(doc);
+       }
+       /**
+        * Creates a properties extractor straight from an OLE2
+        *  filesystem, whatever kind of document it holds.
+        *
+        * @param fs the filesystem to read the properties from
+        */
+       public HPSFPropertiesExtractor(POIFSFileSystem fs) {
+               super(new PropertiesOnlyDocument(fs));
+       }
+
+       /**
+        * Returns the text of the Document Summary Information
+        *  properties, followed by that of any custom properties.
+        * If the document has no Document Summary Information, or
+        *  no custom properties, the missing part is simply skipped.
+        *
+        * @return the properties as text, never null
+        */
+       public String getDocumentSummaryInformationText() {
+               DocumentSummaryInformation dsi = document.getDocumentSummaryInformation();
+               StringBuffer text = new StringBuffer();
+
+               // Normal properties (copes with a null property set itself)
+               text.append( getPropertiesText(dsi) );
+
+               // Now custom ones. Either the whole property set, or just
+               //  its custom properties, may be missing, so guard against
+               //  both rather than failing with a NullPointerException
+               CustomProperties cps = (dsi == null) ? null : dsi.getCustomProperties();
+               if(cps != null) {
+                       Iterator keys = cps.keySet().iterator();
+                       while(keys.hasNext()) {
+                               String key = (String)keys.next();
+                               String val = getPropertyValueText( cps.get(key) );
+                               text.append(key).append(" = ").append(val).append("\n");
+                       }
+               }
+
+               // All done
+               return text.toString();
+       }
+       /**
+        * Returns the text of the Summary Information properties.
+        *
+        * @return the properties as text, empty if there are none
+        */
+       public String getSummaryInformationText() {
+               SummaryInformation si = document.getSummaryInformation();
+
+               // Just normal properties
+               return getPropertiesText(si);
+       }
+
+       /**
+        * Turns every property of the set into a "name = value" line.
+        * The name comes from the property set's ID map, falling back
+        *  to the numeric ID for properties the map doesn't know.
+        *
+        * @param ps the property set, may be null
+        * @return one line per property, or an empty string for null
+        */
+       private static String getPropertiesText(SpecialPropertySet ps) {
+               if(ps == null) {
+                       // Not defined, oh well
+                       return "";
+               }
+
+               StringBuffer text = new StringBuffer();
+
+               PropertyIDMap idMap = ps.getPropertySetIDMap();
+               Property[] props = ps.getProperties();
+               for(int i=0; i<props.length; i++) {
+                       // Use the numeric ID unless we have a proper name for it
+                       String type = Long.toString( props[i].getID() );
+                       Object typeObj = idMap.get(props[i].getID());
+                       if(typeObj != null) {
+                               type = typeObj.toString();
+                       }
+
+                       String val = getPropertyValueText( props[i].getValue() );
+                       text.append(type).append(" = ").append(val).append("\n");
+               }
+
+               return text.toString();
+       }
+       /**
+        * Turns a single property value into text.
+        * Byte arrays of 1, 2 or 4 bytes are shown as numbers (the
+        *  2 and 4 byte ones read as unsigned little endian), and
+        *  any other non-empty byte array is treated as a string.
+        *
+        * @param val the property value, may be null
+        * @return the value as text, "(not set)" for null
+        */
+       private static String getPropertyValueText(Object val) {
+               if(val == null) {
+                       return "(not set)";
+               }
+               if(val instanceof byte[]) {
+                       byte[] b = (byte[])val;
+                       switch(b.length) {
+                               case 0:
+                                       return "";
+                               case 1:
+                                       return Byte.toString(b[0]);
+                               case 2:
+                                       return Integer.toString( LittleEndian.getUShort(b) );
+                               case 4:
+                                       return Long.toString( LittleEndian.getUInt(b) );
+                               default:
+                                       // Maybe it's a string? who knows!
+                                       // Decoded using the platform's default charset
+                                       return new String(b);
+                       }
+               }
+               return val.toString();
+       }
+
+       /**
+        * Return the text of all the properties defined in
+        *  the document, Summary Information first and then
+        *  Document Summary Information (including custom ones).
+        *
+        * @return the properties as text
+        */
+       public String getText() {
+               return getSummaryInformationText() + getDocumentSummaryInformationText();
+       }
+
+       /**
+        * Prevent recursion!
+        *
+        * @throws IllegalStateException always, as this already
+        *  is the metadata text extractor
+        */
+       public POITextExtractor getMetadataTextExtractor() {
+               throw new IllegalStateException("You already have the Metadata Text Extractor, not recursing!");
+       }
+
+       /**
+        * So we can get at the properties of any
+        *  random OLE2 document.
+        */
+       private static class PropertiesOnlyDocument extends POIDocument {
+               private PropertiesOnlyDocument(POIFSFileSystem fs) {
+                       super(fs);
+               }
+
+               /**
+                * Not supported, this document is read only.
+                *
+                * @throws IllegalStateException always
+                */
+               public void write(OutputStream out) throws IOException {
+                       throw new IllegalStateException("Unable to write, only for properties!");
+               }
+       }
+}
index 8df75d949d17196feb0b15a8ff99b39bc2f86682..7f09e5e99057f8c19d602823466306df30303e51 100644 (file)
@@ -30,7 +30,7 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
         * Creates a new text extractor for the given document
         */
        public POIXMLTextExtractor(POIXMLDocument document) {
-               super(null);
+               super((POIDocument)null);
                
                this.document = document;
        }
@@ -54,4 +54,13 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
        public POIXMLDocument getDocument(){
            return document;
        }
+       
+       
+       /**
+        * Will eventually return an OOXML properties text extractor
+        *  for the document properties metadata, such as title and
+        *  author. That isn't implemented yet, so for now this
+        *  method always fails.
+        *
+        * @return never returns normally at present
+        * @throws UnsupportedOperationException always, as metadata
+        *  text extraction isn't yet supported for OOXML
+        */
+       public POITextExtractor getMetadataTextExtractor() {
+               // A RuntimeException subclass, so code catching
+               //  RuntimeException carries on working as before
+               throw new UnsupportedOperationException("Not yet supported for OOXML!");
+       }
 }
index 63c6a18ca792e41bab1d3ffd5b384c4039a10eff..deaffc79afba891ae1c89d91d1a770203e03f81a 100644 (file)
@@ -230,4 +230,12 @@ public class WordExtractor extends POIOLE2TextExtractor {
                
                return ret.toString();
        }
+       
+       /**
+        * Removes any fields (eg macros, page markers etc)
+        *  from the string.
+        */
+       public static String stripFields(String text) {
+               return Range.stripFields(text);
+       }
 }
index 95cee57d1ba0a787a0cdbcc64eaf80446f3d72bf..83003617201be1cd774efd773f5334bee5a0bb49 100644 (file)
@@ -35,6 +35,8 @@ public class HeaderStories {
        private Range headerStories;
        private PlexOfCps plcfHdd;
        
+       private boolean stripFields = false;
+       
        public HeaderStories(HWPFDocument doc) {
                this.headerStories = doc.getHeaderStoryRange();
                FileInformationBlock fib = doc.getFileInformationBlock();
@@ -157,8 +159,15 @@ public class HeaderStories {
                        return "";
                }
                
-               // Return the contents
-               return headerStories.text().substring(prop.getStart(), prop.getEnd());
+               // Grab the contents
+               String text =
+                       headerStories.text().substring(prop.getStart(), prop.getEnd());
+               
+               // Strip off fields and macros if requested
+               if(stripFields) {
+                       return Range.stripFields(text);
+               }
+               return text;
        }
        
        public Range getRange() {
@@ -167,4 +176,22 @@ public class HeaderStories {
        protected PlexOfCps getPlcfHdd() {
                return plcfHdd;
        }
+       
+       /**
+        * Are fields currently being stripped from
+        *  the text that this {@link HeaderStories} returns?
+        *  Default is false, but can be changed
+        * @return the current value of the strip fields setting
+        */
+       public boolean areFieldsStripped() {
+               return stripFields;
+       }
+       /**
+        * Should fields (eg macros) be stripped from
+        *  the text that this class returns?
+        * Default is not to strip.
+        * @param stripFields the new value of the strip fields
+        *  setting, true to have fields stripped
+        */
+       public void setAreFieldsStripped(boolean stripFields) {
+               this.stripFields = stripFields;
+       }
 }
index 0a145d5fd43130d68ca399d99a1f8a2b911605d1..d9b679e42d72a2e9b2bd1d5deaeaa438159cb282 100644 (file)
@@ -299,6 +299,63 @@ public class Range
     }
     return sb.toString();
   }
+  
+  /**
+   * Removes any fields (eg macros, page markers etc)
+   *  from the string.
+   * Normally used to make some text suitable for showing
+   *  to humans, and the resultant text should not normally
+   *  be saved back into the document!
+   * <p>A field is either <code>0x13 [code] 0x15</code> or
+   *  <code>0x13 [code] 0x14 [result] 0x15</code>, and fields
+   *  can be nested. The code part is dropped (along with
+   *  anything nested inside it), the result part is kept, and
+   *  the marker characters themselves are removed.</p>
+   * <p>Each 0x15 closes the nearest 0x13 which is still open,
+   *  so several fields in the one string are handled
+   *  independently, and the text between them is kept.
+   *  A 0x13 which is never closed is left alone, together
+   *  with everything after it, as is any 0x14 or 0x15 that
+   *  doesn't belong to a field.</p>
+   *
+   * @param text the text to remove the fields from, not null
+   * @return the text without the fields, which is the supplied
+   *  string itself when it holds no 0x13 at all
+   */
+  public static String stripFields(String text) {
+    // If there are no fields, all easy
+    if(text.indexOf('\u0013') == -1) return text;
+
+    int length = text.length();
+    StringBuffer stripped = new StringBuffer(length);
+
+    // One entry per field that is currently open, innermost last.
+    // There can never be more open fields than characters.
+    int[] startedAt = new int[length];        // where in text the 0x13 was
+    int[] strippedSoFar = new int[length];    // output size when it opened
+    boolean[] inCode = new boolean[length];   // still before the field's 0x14?
+    int open = 0;
+
+    // How many of the open fields are still in their code part.
+    // Text is only kept when none of them are.
+    int hiding = 0;
+
+    for(int i=0; i<length; i++) {
+      char c = text.charAt(i);
+
+      if(c == '\u0013') {
+        // A field starts, and it starts with its code
+        startedAt[open] = i;
+        strippedSoFar[open] = stripped.length();
+        inCode[open] = true;
+        open++;
+        hiding++;
+      } else if(c == '\u0014' && open > 0 && inCode[open-1]) {
+        // The innermost field moves from its code to its result
+        inCode[open-1] = false;
+        hiding--;
+      } else if(c == '\u0015' && open > 0) {
+        // The innermost field ends
+        open--;
+        if(inCode[open]) {
+          hiding--;
+        }
+      } else if(hiding == 0) {
+        // Normal text, or a marker that isn't part of any field
+        stripped.append(c);
+      }
+    }
+
+    // If a field never got closed, then we can't tell what was
+    //  meant, so put back everything from the outermost unclosed
+    //  field onwards just as it was
+    if(open > 0) {
+      stripped.setLength(strippedSoFar[0]);
+      stripped.append(text.substring(startedAt[0]));
+    }
+
+    return stripped.toString();
+  }
 
   /**
    * Used to get the number of sections in a range. If this range is smaller
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc
new file mode 100644 (file)
index 0000000..934970f
Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc differ
index d4d2517f90aa58bfb15e00ba6ebe44cc240be864..404f6e47a402b007b4db5344d2d3a6be2bbaac86 100644 (file)
@@ -35,6 +35,7 @@ public class TestHeaderStories extends TestCase {
        private HWPFDocument oddEven; 
        private HWPFDocument diffFirst; 
        private HWPFDocument unicode;
+       private HWPFDocument withFields;
        
     protected void setUp() throws Exception {
                String dirname = System.getProperty("HWPF.testdata.path");
@@ -60,6 +61,9 @@ public class TestHeaderStories extends TestCase {
                unicode = new HWPFDocument(
                                new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
                );
+               withFields = new HWPFDocument(
+                               new FileInputStream(new File(dirname, "HeaderWithMacros.doc"))
+               );
     }
     
     public void testNone() throws Exception {
@@ -186,4 +190,15 @@ public class TestHeaderStories extends TestCase {
                assertEquals("\r\r", hs.getEvenFooter());
                assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
     }
+    
+    public void testWithFields() throws Exception {
+       HeaderStories hs = new HeaderStories(withFields);
+       assertFalse(hs.areFieldsStripped());
+       
+       assertEquals("HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR   \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r", hs.getOddHeader());
+       
+       // Now turn on stripping
+       hs.setAreFieldsStripped(true);
+       assertEquals("HEADER GOES HERE. 8/12/2008 Eric Roch\r\r\r", hs.getOddHeader());
+    }
 }
index db28cbd45609294da86be9dce1b17f10799f2d02..6b4200a631f8fe66763d5ef2aba0fad653bac5f4 100644 (file)
@@ -18,7 +18,6 @@ package org.apache.poi.hwpf.usermodel;
 
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.FileOutputStream;
 
 import junit.framework.TestCase;
 
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java
new file mode 100644 (file)
index 0000000..bcb0399
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.poi.hwpf.usermodel;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for Range which aren't around deletion, insertion,
+ *  text replacement or textual contents
+ */
+public class TestRange extends TestCase {
+       public void testFieldStripping() throws Exception {
+               String exp = "This is some text.";
+               
+               String single = "This is some \u0013Blah!\u0015text.";
+               String with14 = "This is \u0013Blah!\u0014some\u0015 text.";
+               String withNested =
+                       "This is \u0013Blah!\u0013Blah!\u0015\u0015some text.";
+               String withNested14 =
+                       "This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text.";
+               String withNestedIn14 = 
+                       "This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text.";
+               
+               // Check all comes out right
+               assertEquals(exp, Range.stripFields(exp));
+               assertEquals(exp, Range.stripFields(single));
+               assertEquals(exp, Range.stripFields(with14));
+               assertEquals(exp, Range.stripFields(withNested));
+               assertEquals(exp, Range.stripFields(withNested14));
+               assertEquals(exp, Range.stripFields(withNestedIn14));
+               
+               // Ones that are odd and we won't change
+               String odd1 = "This\u0015 is \u0013 odd";
+               String odd2 = "This\u0015 is \u0014 also \u0013 odd";
+               
+               assertEquals(odd1, Range.stripFields(odd1));
+               assertEquals(odd2, Range.stripFields(odd2));
+       }
+}
diff --git a/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java b/src/testcases/org/apache/poi/hpsf/extractor/TestHPSFPropertiesExtractor.java
new file mode 100644 (file)
index 0000000..3a18935
--- /dev/null
@@ -0,0 +1,115 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hpsf.extractor;
+
+import java.io.File;
+import java.io.FileInputStream;
+
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.apache.poi.hssf.usermodel.HSSFWorkbook;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+import junit.framework.TestCase;
+
+public class TestHPSFPropertiesExtractor extends TestCase {
+       private String dir;
+       
+    protected void setUp() throws Exception {
+       dir = System.getProperty("HPSF.testdata.path");
+       assertNotNull("HPSF.testdata.path not set", dir);
+       }
+    
+       public void testNormalProperties() throws Exception {
+               POIFSFileSystem fs = new POIFSFileSystem(
+                               new FileInputStream(new File(dir, "TestMickey.doc"))
+               );
+               HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
+               ext.getText();
+               
+               // Check each bit in turn
+               String sinfText = ext.getSummaryInformationText();
+               String dinfText = ext.getDocumentSummaryInformationText();
+               
+               assertTrue(sinfText.indexOf("TEMPLATE = Normal") > -1);
+               assertTrue(sinfText.indexOf("SUBJECT = sample subject") > -1);
+               assertTrue(dinfText.indexOf("MANAGER = sample manager") > -1);
+               assertTrue(dinfText.indexOf("COMPANY = sample company") > -1);
+               
+               // Now overall
+               String text = ext.getText();
+               assertTrue(text.indexOf("TEMPLATE = Normal") > -1);
+               assertTrue(text.indexOf("SUBJECT = sample subject") > -1);
+               assertTrue(text.indexOf("MANAGER = sample manager") > -1);
+               assertTrue(text.indexOf("COMPANY = sample company") > -1);
+       }
+       public void testNormalUnicodeProperties() throws Exception {
+               POIFSFileSystem fs = new POIFSFileSystem(
+                               new FileInputStream(new File(dir, "TestUnicode.xls"))
+               );
+               HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
+               ext.getText();
+               
+               // Check each bit in turn
+               String sinfText = ext.getSummaryInformationText();
+               String dinfText = ext.getDocumentSummaryInformationText();
+               
+               assertTrue(sinfText.indexOf("AUTHOR = marshall") > -1);
+               assertTrue(sinfText.indexOf("TITLE = Titel: \u00c4h") > -1);
+               assertTrue(dinfText.indexOf("COMPANY = Schreiner") > -1);
+               assertTrue(dinfText.indexOf("SCALE = false") > -1);
+               
+               // Now overall
+               String text = ext.getText();
+               assertTrue(text.indexOf("AUTHOR = marshall") > -1);
+               assertTrue(text.indexOf("TITLE = Titel: \u00c4h") > -1);
+               assertTrue(text.indexOf("COMPANY = Schreiner") > -1);
+               assertTrue(text.indexOf("SCALE = false") > -1);
+       }
+       public void testCustomProperties() throws Exception {
+               POIFSFileSystem fs = new POIFSFileSystem(
+                               new FileInputStream(new File(dir, "TestMickey.doc"))
+               );
+               HPSFPropertiesExtractor ext = new HPSFPropertiesExtractor(fs);
+
+               // Custom properties are part of the document info stream
+               String dinfText = ext.getDocumentSummaryInformationText();
+               assertTrue(dinfText.indexOf("Client = sample client") > -1);
+               assertTrue(dinfText.indexOf("Division = sample division") > -1);
+               
+               String text = ext.getText();
+               assertTrue(text.indexOf("Client = sample client") > -1);
+               assertTrue(text.indexOf("Division = sample division") > -1);
+       }
+    
+    public void testConstructors() throws Exception {
+               POIFSFileSystem fs = new POIFSFileSystem(
+                               new FileInputStream(new File(dir, "TestUnicode.xls"))
+               );
+               HSSFWorkbook wb = new HSSFWorkbook(fs);
+               ExcelExtractor excelExt = new ExcelExtractor(wb);
+               
+               String fsText = (new HPSFPropertiesExtractor(fs)).getText();
+               String hwText = (new HPSFPropertiesExtractor(wb)).getText();
+               String eeText = (new HPSFPropertiesExtractor(excelExt)).getText();
+               
+               assertEquals(fsText, hwText);
+               assertEquals(fsText, eeText);
+               
+               assertTrue(fsText.indexOf("AUTHOR = marshall") > -1);
+               assertTrue(fsText.indexOf("TITLE = Titel: \u00c4h") > -1);
+    }
+}