]> source.dussan.org Git - poi.git/commitdiff
Add HWPF support for stripping out fields (eg macros), and make this optionally happe...
author Nick Burch <nick@apache.org>
Tue, 12 Aug 2008 19:57:04 +0000 (19:57 +0000)
committer Nick Burch <nick@apache.org>
Tue, 12 Aug 2008 19:57:04 +0000 (19:57 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@685283 13f79535-47bb-0310-9956-ffa450edef68

src/scratchpad/src/org/apache/poi/hwpf/extractor/WordExtractor.java
src/scratchpad/src/org/apache/poi/hwpf/usermodel/HeaderStories.java
src/scratchpad/src/org/apache/poi/hwpf/usermodel/Range.java
src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc [new file with mode: 0644]
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestHeaderStories.java
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestProblems.java
src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java [new file with mode: 0644]

index 63c6a18ca792e41bab1d3ffd5b384c4039a10eff..deaffc79afba891ae1c89d91d1a770203e03f81a 100644 (file)
@@ -230,4 +230,12 @@ public class WordExtractor extends POIOLE2TextExtractor {
                
                return ret.toString();
        }
+       
+       /**
+        * Removes any fields (eg macros, page markers etc)
+        *  from the string.
+        * This is a convenience wrapper, which simply delegates
+        *  to {@link Range#stripFields(String)}.
+        * @param text the text to remove the fields from
+        * @return whatever {@link Range#stripFields(String)}
+        *  returns for the supplied text
+        */
+       public static String stripFields(String text) {
+               return Range.stripFields(text);
+       }
 }
index 95cee57d1ba0a787a0cdbcc64eaf80446f3d72bf..83003617201be1cd774efd773f5334bee5a0bb49 100644 (file)
@@ -35,6 +35,8 @@ public class HeaderStories {
        private Range headerStories;
        private PlexOfCps plcfHdd;
        
+       private boolean stripFields = false; // when true, returned text is first passed through Range.stripFields; off by default
+       
        public HeaderStories(HWPFDocument doc) {
                this.headerStories = doc.getHeaderStoryRange();
                FileInformationBlock fib = doc.getFileInformationBlock();
@@ -157,8 +159,15 @@ public class HeaderStories {
                        return "";
                }
                
-               // Return the contents
-               return headerStories.text().substring(prop.getStart(), prop.getEnd());
+               // Grab the contents
+               String text =
+                       headerStories.text().substring(prop.getStart(), prop.getEnd());
+               
+               // Strip off fields and macros if requested
+               if(stripFields) {
+                       return Range.stripFields(text);
+               }
+               return text;
        }
        
        public Range getRange() {
@@ -167,4 +176,22 @@ public class HeaderStories {
        protected PlexOfCps getPlcfHdd() {
                return plcfHdd;
        }
+       
+       /**
+        * Are fields currently being stripped from
+        *  the text that this {@link HeaderStories} returns?
+        *  Default is false, but can be changed
+        *  with {@link #setAreFieldsStripped(boolean)}.
+        * @return true if fields are being stripped, false if
+        *  the text is returned as-is
+        */
+       public boolean areFieldsStripped() {
+               return stripFields;
+       }
+       /**
+        * Should fields (eg macros) be stripped from
+        *  the text that this class returns?
+        * Default is not to strip.
+        * @param stripFields true to strip fields from the
+        *  returned text, false to return the text unchanged
+        */
+       public void setAreFieldsStripped(boolean stripFields) {
+               this.stripFields = stripFields;
+       }
 }
index 0a145d5fd43130d68ca399d99a1f8a2b911605d1..d9b679e42d72a2e9b2bd1d5deaeaa438159cb282 100644 (file)
@@ -299,6 +299,63 @@ public class Range
     }
     return sb.toString();
   }
+  
+  /**
+   * Removes any fields (eg macros, page markers etc)
+   *  from the string.
+   * Normally used to make some text suitable for showing
+   *  to humans, and the resultant text should not normally
+   *  be saved back into the document!
+   * <p>A field starts with 0x13 and ends with 0x15, and may
+   *  hold a 0x14 which separates the field contents from the
+   *  real text. Where a 0x14 is found before any further 0x13,
+   *  the text between the 0x14 and the 0x15 is kept; otherwise
+   *  everything from the 0x13 to the 0x15 is removed.</p>
+   * <p>NOTE: the end of a field is always taken to be the
+   *  <em>last</em> 0x15 in the string. For text which holds
+   *  several fields one after another, this means the text
+   *  in between those fields can be removed too. If the last
+   *  0x15 comes before the first 0x13, processing stops and
+   *  the text as it stands at that point is returned.</p>
+   * @param text the text to strip the fields from, must not be null
+   * @return the text with the fields removed, or the text
+   *  unchanged if it contains no 0x13
+   */
+  public static String stripFields(String text) {
+         // First up, fields can be nested...
+         // A field can be 0x13 [contents] 0x15
+         // Or it can be 0x13 [contents] 0x14 [real text] 0x15
+         
+         // If there are no fields, all easy
+         if(text.indexOf('\u0013') == -1) return text;
+         
+         // Loop over until they're all gone
+         // That's when we're out of either 0x13s or 0x15s
+         while( text.indexOf('\u0013') > -1 &&
+                          text.indexOf('\u0015') > -1) {
+                 int first13 = text.indexOf('\u0013');
+                 int next13 = text.indexOf('\u0013', first13+1);
+                 int first14 = text.indexOf('\u0014', first13+1);
+                 int last15 = text.lastIndexOf('\u0015');
+                 
+                 // If they're the wrong way around, give up
+                 if(last15 < first13) {
+                         break;
+                 }
+                 
+                 // If no more 13s and no 14s, just zap from the 13 to the last 15
+                 if(next13 == -1 && first14 == -1) {
+                         text = text.substring(0, first13) +
+                                               text.substring(last15+1);
+                         break;
+                 }
+                 
+                 // If a 14 comes before the next 13, then
+                 //  zap from the 13 to the 14, and remove
+                 //  the (last) 15, keeping the text in between
+                 if(first14 != -1 && (first14 < next13 || next13 == -1)) {
+                         text = text.substring(0, first13) +
+                                               text.substring(first14+1, last15) +
+                                               text.substring(last15+1);
+                         continue;
+                 }
+                 
+                 // Another 13 comes before the next 14 (or there's no 14 at all).
+                 // This means there's nested stuff, so we
+                 //  can just zap the lot
+                 text = text.substring(0, first13) +
+                       text.substring(last15+1);
+                 continue;
+         }
+
+         return text;
+  }
 
   /**
    * Used to get the number of sections in a range. If this range is smaller
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc
new file mode 100644 (file)
index 0000000..934970f
Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/HeaderWithMacros.doc differ
index d4d2517f90aa58bfb15e00ba6ebe44cc240be864..404f6e47a402b007b4db5344d2d3a6be2bbaac86 100644 (file)
@@ -35,6 +35,7 @@ public class TestHeaderStories extends TestCase {
        private HWPFDocument oddEven; 
        private HWPFDocument diffFirst; 
        private HWPFDocument unicode;
+       private HWPFDocument withFields;
        
     protected void setUp() throws Exception {
                String dirname = System.getProperty("HWPF.testdata.path");
@@ -60,6 +61,9 @@ public class TestHeaderStories extends TestCase {
                unicode = new HWPFDocument(
                                new FileInputStream(new File(dirname, "HeaderFooterUnicode.doc"))
                );
+               withFields = new HWPFDocument(
+                               new FileInputStream(new File(dirname, "HeaderWithMacros.doc"))
+               );
     }
     
     public void testNone() throws Exception {
@@ -186,4 +190,15 @@ public class TestHeaderStories extends TestCase {
                assertEquals("\r\r", hs.getEvenFooter());
                assertEquals("The footer, with Moli\u00e8re, has Unicode in it.\r\r", hs.getOddFooter());
     }
+    
+    public void testWithFields() throws Exception { // checks the odd header text both with and without field stripping
+       HeaderStories hs = new HeaderStories(withFields);
+       assertFalse(hs.areFieldsStripped()); // stripping is off by default
+       
+       assertEquals("HEADER GOES HERE. 8/12/2008 \u0013 AUTHOR   \\* MERGEFORMAT \u0014Eric Roch\u0015\r\r\r", hs.getOddHeader());
+       
+       // Now turn on stripping, the field markers and the field code should go, leaving just the field's text
+       hs.setAreFieldsStripped(true);
+       assertEquals("HEADER GOES HERE. 8/12/2008 Eric Roch\r\r\r", hs.getOddHeader());
+    }
 }
index db28cbd45609294da86be9dce1b17f10799f2d02..6b4200a631f8fe66763d5ef2aba0fad653bac5f4 100644 (file)
@@ -18,7 +18,6 @@ package org.apache.poi.hwpf.usermodel;
 
 import java.io.File;
 import java.io.FileInputStream;
-import java.io.FileOutputStream;
 
 import junit.framework.TestCase;
 
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java b/src/scratchpad/testcases/org/apache/poi/hwpf/usermodel/TestRange.java
new file mode 100644 (file)
index 0000000..bcb0399
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.poi.hwpf.usermodel;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for Range which aren't around deletion, insertion,
+ *  text replacement or textual contents.
+ * Currently this covers {@link Range#stripFields(String)}.
+ */
+public class TestRange extends TestCase {
+       public void testFieldStripping() throws Exception {
+               String exp = "This is some text.";
+               
+               String single = "This is some \u0013Blah!\u0015text.";
+               String with14 = "This is \u0013Blah!\u0014some\u0015 text.";
+               String withNested =
+                       "This is \u0013Blah!\u0013Blah!\u0015\u0015some text.";
+               String withNested14 =
+                       "This is \u0013Blah!\u0013Blah!\u0014don't see me\u0015 blah!\u0015some text.";
+               String withNestedIn14 = 
+                       "This is \u0013Blah!\u0014some\u0013Blah!\u0015 \u0015text.";
+               
+               // Check all comes out right, ie every variant strips down to the plain text
+               assertEquals(exp, Range.stripFields(exp));
+               assertEquals(exp, Range.stripFields(single));
+               assertEquals(exp, Range.stripFields(with14));
+               assertEquals(exp, Range.stripFields(withNested));
+               assertEquals(exp, Range.stripFields(withNested14));
+               assertEquals(exp, Range.stripFields(withNestedIn14));
+               
+               // Ones that are odd (the 0x15 comes before the 0x13) and we won't change
+               String odd1 = "This\u0015 is \u0013 odd";
+               String odd2 = "This\u0015 is \u0014 also \u0013 odd";
+               
+               assertEquals(odd1, Range.stripFields(odd1));
+               assertEquals(odd2, Range.stripFields(odd2));
+       }
+}