Add HWPF support for stripping out fields (eg macros), and make this optionally happe...

author Nick Burch <nick@apache.org>

Tue, 12 Aug 2008 19:57:04 +0000 (19:57 +0000)

committer Nick Burch <nick@apache.org>

Tue, 12 Aug 2008 19:57:04 +0000 (19:57 +0000)
author Nick Burch <nick@apache.org>
Tue, 12 Aug 2008 19:57:04 +0000 (19:57 +0000)
committer Nick Burch <nick@apache.org>
Tue, 12 Aug 2008 19:57:04 +0000 (19:57 +0000)
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java

index 63c6a18ca792e41bab1d3ffd5b384c4039a10eff..deaffc79afba891ae1c89d91d1a770203e03f81a 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
@@ -230,4 +230,12 @@ public class WordExtractor extends POIOLE2TextExtractor {
                 
                 return ret.toString();
         }
+       
+       /**
+        * Removes any fields (eg macros, page markers etc)
+        *  from the string.
+        */
+       public static String stripFields(String text) {
+               return Range.stripFields(text);
+       }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java

index 95cee57d1ba0a787a0cdbcc64eaf80446f3d72bf..83003617201be1cd774efd773f5334bee5a0bb49 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
@@ -35,6 +35,8 @@ public class HeaderStories {
         private Range headerStories;
         private PlexOfCps plcfHdd;
         
+       private boolean stripFields = false;
+       
         public HeaderStories(HWPFDocument doc) {
                 this.headerStories = doc.getHeaderStoryRange();
                 FileInformationBlock fib = doc.getFileInformationBlock();
@@ -157,8 +159,15 @@ public class HeaderStories {
                         return "";
                 }
                 
-               // Return the contents
-               return headerStories.text().substring(prop.getStart(), prop.getEnd());
+               // Grab the contents
+               String text =
+                       headerStories.text().substring(prop.getStart(), prop.getEnd());
+               
+               // Strip off fields and macros if requested
+               if(stripFields) {
+                       return Range.stripFields(text);
+               }
+               return text;
         }
         
         public Range getRange() {
@@ -167,4 +176,22 @@ public class HeaderStories {
         protected PlexOfCps getPlcfHdd() {
                 return plcfHdd;
         }
+       
+       /**
+        * Are fields currently being stripped from
+        *  the text that this {@link HeaderStories} returns?
+        *  Default is false, but can be changed
+        */
+       public boolean areFieldsStripped() {
+               return stripFields;
+       }
+       /**
+        * Should fields (eg macros) be stripped from
+        *  the text that this class returns?
+        * Default is not to strip.
+        * @param stripFields true to strip fields from the returned text, false to leave them in
+        */
+       public void setAreFieldsStripped(boolean stripFields) {
+               this.stripFields = stripFields;
+       }
  }
diff --git a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java

index 0a145d5fd43130d68ca399d99a1f8a2b911605d1..d9b679e42d72a2e9b2bd1d5deaeaa438159cb282 100644 (file)
--- a/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
+++ b/src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
@@ -299,6 +299,63 @@ public class Range
      }
      return sb.toString();
    }
+  
+  /**
+   * Removes any fields (eg macros, page markers etc)
+   *  from the string.
+   * Normally used to make some text suitable for showing
+   *  to humans, and the resultant text should not normally
+   *  be saved back into the document!
+   */
+  public static String stripFields(String text) {
+         // First up, fields can be nested...
+         // A field can be 0x13 [contents] 0x15
+         // Or it can be 0x13 [contents] 0x14 [real text] 0x15
+         
+         // If there are no fields, all easy
+         if(text.indexOf('\u0013') == -1) return text;
+         
+         // Loop over until they're all gone
+         // That's when we're out of either 0x13s or 0x15s
+         while( text.indexOf('\u0013') > -1 &&
+                          text.indexOf('\u0015') > -1) {
+                 int first13 = text.indexOf('\u0013');
+                 int next13 = text.indexOf('\u0013', first13+1);
+                 int first14 = text.indexOf('\u0014', first13+1);
+                 int last15 = text.lastIndexOf('\u0015');
+                 
+                 // If they're the wrong way around, give up
+                 if(last15 < first13) {
+                         break;
+                 }
+                 
+                 // If no more 13s and 14s, just zap
+                 if(next13 == -1 && first14 == -1) {
+                         text = text.substring(0, first13) +
+                                               text.substring(last15+1);
+                         break;
+                 }
+                 
+                 // If a 14 comes before the next 13, then
+                 //  zap from the 13 to the 14, and remove
+                 //  the 15
+                 if(first14 != -1 && (first14 < next13 || next13 == -1)) {
+                         text = text.substring(0, first13) +
+                                               text.substring(first14+1, last15) +
+                                               text.substring(last15+1);
+                         continue;
+                 }
+                 
+                 // Another 13 comes before the next 14 (or there is no 14).
+                 // Treat it as nested and zap from the first 13 to the last 15
+                 //  NOTE: this also drops any real text between sibling fields
+                 text = text.substring(0, first13) +
+                       text.substring(last15+1);
+                 continue;
+         }
+
+         return text;
+  }
  
    /**
     * Used to get the number of sections in a range. If this range is smaller
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc

new file mode 100644 (file)

index 0000000..934970f

Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc differ
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java

index d4d2517f90aa58bfb15e00ba6ebe44cc240be864..404f6e47a402b007b4db5344d2d3a6be2bbaac86 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java
@@ -35,6 +35,7 @@ public class TestHeaderStories extends TestCase {
         private HWPFDocument oddEven; 
         private HWPFDocument diffFirst; 
         private HWPFDocument unicode;
+       private HWPFDocument withFields;
         
      protected void setUp() throws Exception {
                 String dirname = System.getProperty("HWPF.testdata.path");
@@ -60,6 +61,9 @@ public class TestHeaderStories extends TestCase {
                 unicode = new HWPFDocument(
                                 new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
                 );
+               withFields = new HWPFDocument(
+                               new FileInputStream(new File(dirname, "HeaderWithMacros.doc"))
+               );
      }
      
      public void testNone() throws Exception {
@@ -186,4 +190,15 @@ public class TestHeaderStories extends TestCase {
                 assertEquals("\r\r", hs.getEvenFooter());
                 assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
      }
+    
+    public void testWithFields() throws Exception {
+       HeaderStories hs = new HeaderStories(withFields);
+       assertFalse(hs.areFieldsStripped());
+       
+       assertEquals("HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR   \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r", hs.getOddHeader());
+       
+       // Now turn on stripping
+       hs.setAreFieldsStripped(true);
+       assertEquals("HEADER GOES HERE. 8/12/2008 Eric Roch\r\r\r", hs.getOddHeader());
+    }
  }
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java

index db28cbd45609294da86be9dce1b17f10799f2d02..6b4200a631f8fe66763d5ef2aba0fad653bac5f4 100644 (file)
--- a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java
@@ -18,7 +18,6 @@ package org.apache.poi.hwpf.usermodel;
  
  import java.io.File;
  import java.io.FileInputStream;
-import java.io.FileOutputStream;
  
  import junit.framework.TestCase;
  
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java

new file mode 100644 (file)

index 0000000..bcb0399
--- /dev/null
+++ b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java
@@ -0,0 +1,53 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.poi.hwpf.usermodel;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for Range which aren't around deletion, insertion,
+ *  text replacement or textual contents
+ */
+public class TestRange extends TestCase {
+       public void testFieldStripping() throws Exception {
+               String exp = "This is some text.";
+               
+               String single = "This is some \u0013Blah!\u0015text.";
+               String with14 = "This is \u0013Blah!\u0014some\u0015 text.";
+               String withNested =
+                       "This is \u0013Blah!\u0013Blah!\u0015\u0015some text.";
+               String withNested14 =
+                       "This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text.";
+               String withNestedIn14 = 
+                       "This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text.";
+               
+               // Check all comes out right
+               assertEquals(exp, Range.stripFields(exp));
+               assertEquals(exp, Range.stripFields(single));
+               assertEquals(exp, Range.stripFields(with14));
+               assertEquals(exp, Range.stripFields(withNested));
+               assertEquals(exp, Range.stripFields(withNested14));
+               assertEquals(exp, Range.stripFields(withNestedIn14));
+               
+               // Ones that are odd and we won't change
+               String odd1 = "This\u0015 is \u0013 odd";
+               String odd2 = "This\u0015 is \u0014 also \u0013 odd";
+               
+               assertEquals(odd1, Range.stripFields(odd1));
+               assertEquals(odd2, Range.stripFields(odd2));
+       }
+}
author	Nick Burch <nick@apache.org>
	Tue, 12 Aug 2008 19:57:04 +0000 (19:57 +0000)
committer	Nick Burch <nick@apache.org>
	Tue, 12 Aug 2008 19:57:04 +0000 (19:57 +0000)
src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java		patch \| blob \| history
src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc	[new file with mode: 0644]	patch \| blob
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java		patch \| blob \| history
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java	[new file with mode: 0644]	patch \| blob