]> source.dussan.org Git - poi.git/commitdiff
Start shuffling things out of the old src/scratchpad/ooxml-* directories
authorNick Burch <nick@apache.org>
Sat, 8 Mar 2008 17:13:30 +0000 (17:13 +0000)
committerNick Burch <nick@apache.org>
Sat, 8 Mar 2008 17:13:30 +0000 (17:13 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@635019 13f79535-47bb-0310-9956-ffa450edef68

26 files changed:
src/ooxml/java/org/apache/poi/POIXMLDocument.java
src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/dev/OOXMLLister.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/hslf/HSLFXML.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/hslf/usermodel/HSLFXMLSlideShow.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/hwpf/HWPFXML.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/hwpf/extractor/HXFWordExtractor.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java [new file with mode: 0644]
src/ooxml/testcases/org/apache/poi/hslf/TestHSLFXML.java [new file with mode: 0644]
src/ooxml/testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java [new file with mode: 0644]
src/ooxml/testcases/org/apache/poi/hwpf/TestHWPFXML.java [new file with mode: 0644]
src/ooxml/testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java [new file with mode: 0644]
src/scratchpad/ooxml-src/org/apache/poi/POIXMLDocument.java [deleted file]
src/scratchpad/ooxml-src/org/apache/poi/POIXMLTextExtractor.java [deleted file]
src/scratchpad/ooxml-src/org/apache/poi/hslf/HSLFXML.java [deleted file]
src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java [deleted file]
src/scratchpad/ooxml-src/org/apache/poi/hslf/usermodel/HSLFXMLSlideShow.java [deleted file]
src/scratchpad/ooxml-src/org/apache/poi/hwpf/HWPFXML.java [deleted file]
src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java [deleted file]
src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java [deleted file]
src/scratchpad/ooxml-src/org/apache/poi/hxf/dev/HXFLister.java [deleted file]
src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java [deleted file]
src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java [deleted file]
src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java [deleted file]
src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java [deleted file]

index 6f8a7e37d800e6fafd08395cd01a0fe5dc30e247..a9cceb525a14cdf38fb73acfa8f2c33ff29b56a8 100644 (file)
@@ -80,4 +80,37 @@ public abstract class POIXMLDocument {
         }
         return part;
     }
+    
+    /**
+     * Checks that the supplied InputStream (which MUST
+     *  support mark and reset, or be a PushbackInputStream)
+     *  has a OOXML (zip) header at the start of it.
+     * If your InputStream does not support mark / reset,
+     *  then wrap it in a PushbackInputStream, then be
+     *  sure to always use that, and not the original!
+     * The bytes that were peeked at are put back again, so
+     *  the stream can still be read from the start afterwards.
+     * @param inp An InputStream which supports either mark/reset, or is a PushbackInputStream
+     * @return true if the first 4 bytes match the OOXML (zip) signature
+     * @throws IOException if the stream can't be read, or can't be wound back
+     */
+    public static boolean hasOOXMLHeader(InputStream inp) throws IOException {
+        // We want to peek at the first 4 bytes
+        inp.mark(4);
+
+        byte[] header = new byte[4];
+        int bytesRead = IOUtils.readFully(inp, header);
+
+        // Wind back what we read
+        if(inp instanceof PushbackInputStream) {
+            // Only push back the bytes that really came from the
+            //  stream. Unreading the whole buffer for a short stream
+            //  would inject padding bytes that were never there
+            if(bytesRead > 0) {
+                PushbackInputStream pin = (PushbackInputStream)inp;
+                pin.unread(header, 0, bytesRead);
+            }
+        } else {
+            inp.reset();
+        }
+
+        // Too short to hold the signature at all?
+        if(bytesRead < header.length) {
+            return false;
+        }
+
+        // Did it match the ooxml zip signature?
+        for(int i = 0; i < header.length; i++) {
+            if(header[i] != POIFSConstants.OOXML_FILE_HEADER[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
 }
diff --git a/src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java b/src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java
new file mode 100644 (file)
index 0000000..c28eba4
--- /dev/null
@@ -0,0 +1,31 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi;
+
+/**
+ * Base class for text extractors that work from a
+ *  POIXMLDocument. It only holds on to the document;
+ *  subclasses supply the actual text extraction.
+ */
+public abstract class POIXMLTextExtractor extends POITextExtractor {
+       /** The POIXMLDocument that's open */
+       protected POIXMLDocument document;
+
+       /**
+        * Creates a new text extractor for the given document
+        * @param document the document that text will be extracted from
+        */
+       public POIXMLTextExtractor(POIXMLDocument document) {
+               // The parent constructor is handed null; the XML
+               //  document is kept in our own field instead
+               super(null);
+               
+               this.document = document;
+       }
+}
diff --git a/src/ooxml/java/org/apache/poi/dev/OOXMLLister.java b/src/ooxml/java/org/apache/poi/dev/OOXMLLister.java
new file mode 100644 (file)
index 0000000..7084b38
--- /dev/null
@@ -0,0 +1,133 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.dev;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.util.ArrayList;
+
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackageAccess;
+import org.openxml4j.opc.PackagePart;
+import org.openxml4j.opc.PackageRelationship;
+import org.openxml4j.opc.PackageRelationshipCollection;
+
+/**
+ * Prints out the contents of a OOXML container.
+ * Useful for seeing what parts are defined, and how
+ *  they're all related to each other.
+ */
+public class OOXMLLister {
+       /** The OOXML package whose contents get listed */
+       private Package container;
+       /** Where the listing is written to */
+       private PrintStream disp;
+
+       /**
+        * Creates a lister for the given package, which
+        *  reports to System.out
+        */
+       public OOXMLLister(Package container) {
+               this(container, System.out);
+       }
+       /**
+        * Creates a lister for the given package, which
+        *  reports to the supplied stream
+        */
+       public OOXMLLister(Package container, PrintStream disp) {
+               this.container = container;
+               this.disp = disp;
+       }
+
+       /**
+        * Figures out how big a given PackagePart is, by reading
+        *  its contents through to the end and counting the bytes.
+        * The part's stream is always closed again afterwards.
+        * @param part the part to measure
+        * @return the number of bytes read from the part
+        * @throws IOException if the part can't be read
+        */
+       public static long getSize(PackagePart part) throws IOException {
+               InputStream in = part.getInputStream();
+               try {
+                       byte[] b = new byte[8192];
+                       long size = 0;
+                       int read;
+
+                       while((read = in.read(b)) > -1) {
+                               size += read;
+                       }
+
+                       return size;
+               } finally {
+                       // Don't leak the stream, even if the read failed
+                       in.close();
+               }
+       }
+
+       /**
+        * Displays information on all the different
+        *  parts of the OOXML file container.
+        */
+       public void displayParts() throws Exception {
+               ArrayList<PackagePart> parts = container.getParts();
+               for (PackagePart part : parts) {
+                       disp.println(part.getPartName());
+                       disp.println("\t" + part.getContentType());
+
+                       // No size is reported for the core properties part
+                       if(! part.getPartName().toString().equals("/docProps/core.xml")) {
+                               disp.println("\t" + getSize(part) + " bytes");
+                       }
+
+                       // Relationship parts don't get their own relations listed
+                       if(! part.isRelationshipPart()) {
+                               PackageRelationshipCollection rels = part.getRelationships();
+                               disp.println("\t" + rels.size() + " relations");
+                               for(PackageRelationship rel : rels) {
+                                       displayRelation(rel, "\t  ");
+                               }
+                       }
+               }
+       }
+       /**
+        * Displays information on all the different
+        *  relationships between different parts
+        *  of the OOXML file container.
+        */
+       public void displayRelations() throws Exception {
+               PackageRelationshipCollection rels =
+                       container.getRelationships();
+               for (PackageRelationship rel : rels) {
+                       displayRelation(rel, "");
+               }
+       }
+       /**
+        * Prints the details of a single relationship, with
+        *  every line prefixed by the given indent
+        */
+       private void displayRelation(PackageRelationship rel, String indent) {
+               disp.println(indent+"Relationship:");
+               disp.println(indent+"\tFrom: "+ rel.getSourceURI());
+               disp.println(indent+"\tTo:   " + rel.getTargetURI());
+               disp.println(indent+"\tID:   " + rel.getId());
+               disp.println(indent+"\tMode: " + rel.getTargetMode());
+               disp.println(indent+"\tType: " + rel.getRelationshipType());
+       }
+
+       /**
+        * Lists the parts and then the relationships of the
+        *  OOXML file named by the first argument.
+        * Exits with status 1 if no file was given, and
+        *  with status 2 if the file doesn't exist.
+        */
+       public static void main(String[] args) throws Exception {
+               if(args.length == 0) {
+                       System.err.println("Use:");
+                       System.err.println("\tjava OOXMLLister <filename>");
+                       System.exit(1);
+               }
+
+               File f = new File(args[0]);
+               if(! f.exists()) {
+                       System.err.println("Error, file not found!");
+                       System.err.println("\t" + f.toString());
+                       System.exit(2);
+               }
+
+               OOXMLLister lister = new OOXMLLister(
+                               Package.open(f.toString(), PackageAccess.READ)
+               );
+
+               lister.disp.println(f.toString() + "\n");
+               lister.displayParts();
+               lister.disp.println();
+               lister.displayRelations();
+       }
+}
diff --git a/src/ooxml/java/org/apache/poi/hslf/HSLFXML.java b/src/ooxml/java/org/apache/poi/hslf/HSLFXML.java
new file mode 100644 (file)
index 0000000..568cb80
--- /dev/null
@@ -0,0 +1,148 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf;
+
+import java.io.IOException;
+
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.InvalidFormatException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackagePart;
+import org.openxml4j.opc.PackageRelationshipCollection;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTPresentation;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMaster;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdList;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
+import org.openxmlformats.schemas.presentationml.x2006.main.NotesDocument;
+import org.openxmlformats.schemas.presentationml.x2006.main.PresentationDocument;
+import org.openxmlformats.schemas.presentationml.x2006.main.SldDocument;
+import org.openxmlformats.schemas.presentationml.x2006.main.SldMasterDocument;
+
+/**
+ * Experimental low level access to the contents
+ *  of pptx files.
+ *
+ * Anyone working at this level will almost certainly
+ *  want the OOXML specifications to hand, see
+ *  http://www.ecma-international.org/publications/standards/Ecma-376.htm
+ *
+ * WARNING - APIs expected to change rapidly
+ */
+public class HSLFXML extends HXFDocument {
+       public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml";
+       public static final String NOTES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml";
+       public static final String SLIDE_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.slide+xml";
+       public static final String SLIDE_LAYOUT_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout";
+       public static final String NOTES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide";
+
+       /** The parsed main presentation part */
+       private PresentationDocument presentationDoc;
+
+       /**
+        * Opens the presentation held in the given package,
+        *  parsing its main part straight away
+        */
+       public HSLFXML(Package container) throws OpenXML4JException, IOException, XmlException {
+               super(container, MAIN_CONTENT_TYPE);
+
+               this.presentationDoc = PresentationDocument.Factory.parse(
+                               basePart.getInputStream()
+               );
+       }
+
+       /**
+        * Gives the low level presentation base object
+        */
+       public CTPresentation getPresentation() {
+               return presentationDoc.getPresentation();
+       }
+
+       /**
+        * Gives the presentation's list of references to
+        *  its slides.
+        * The list carries the slide ordering, and its entries
+        *  are what you hand over to fetch the slides themselves
+        */
+       public CTSlideIdList getSlideReferences() {
+               CTPresentation presentation = getPresentation();
+               return presentation.getSldIdLst();
+       }
+       /**
+        * Gives the presentation's list of references to
+        *  its slide masters.
+        * Its entries are what you hand over to fetch the
+        *  slide masters themselves
+        */
+       public CTSlideMasterIdList getSlideMasterReferences() {
+               CTPresentation presentation = getPresentation();
+               return presentation.getSldMasterIdLst();
+       }
+
+       /**
+        * Loads the low level slide master object which the
+        *  supplied slide master reference points to
+        */
+       public CTSlideMaster getSlideMaster(CTSlideMasterIdListEntry master) throws IOException, XmlException {
+               PackagePart part = getRelatedPackagePart(master.getId2());
+               return SldMasterDocument.Factory.parse(part.getInputStream()).getSldMaster();
+       }
+
+       /**
+        * Loads the low level slide object which the
+        *  supplied slide reference points to
+        */
+       public CTSlide getSlide(CTSlideIdListEntry slide) throws IOException, XmlException {
+               PackagePart part = getRelatedPackagePart(slide.getId2());
+               return SldDocument.Factory.parse(part.getInputStream()).getSld();
+       }
+
+       /**
+        * Loads the low level notes object belonging to the
+        *  slide that the supplied slide reference points to.
+        * Gives back null if the slide has no notes, and fails
+        *  with an IllegalStateException if it has several.
+        */
+       public CTNotesSlide getNotes(CTSlideIdListEntry slide) throws IOException, XmlException {
+               PackagePart slidePart = getRelatedPackagePart(slide.getId2());
+
+               PackageRelationshipCollection noteRels;
+               try {
+                       noteRels = slidePart.getRelationshipsByType(NOTES_RELATION_TYPE);
+               } catch(InvalidFormatException e) {
+                       throw new IllegalStateException(e);
+               }
+
+               int count = noteRels.size();
+               if(count > 1) {
+                       throw new IllegalStateException("Expecting 0 or 1 notes for a slide, but found " + count);
+               }
+               if(count == 0) {
+                       // This slide simply has no notes
+                       return null;
+               }
+
+               // Exactly one notes part, so follow it and parse
+               PackagePart notesPart = getPackagePart(noteRels.getRelationship(0));
+               return NotesDocument.Factory.parse(notesPart.getInputStream()).getNotes();
+       }
+}
diff --git a/src/ooxml/java/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java b/src/ooxml/java/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java
new file mode 100644 (file)
index 0000000..1d4b1a2
--- /dev/null
@@ -0,0 +1,139 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.extractor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hslf.HSLFXML;
+import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
+import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+
+/**
+ * Helper class to extract the slide text, and optionally
+ *  the notes text, from an OOXML PowerPoint file
+ */
+public class HXFPowerPointExtractor extends POIXMLTextExtractor {
+       private HSLFXMLSlideShow slideshow;
+       private boolean slidesByDefault = true;
+       private boolean notesByDefault = false;
+
+       /**
+        * Creates an extractor for the presentation held
+        *  in the given package
+        */
+       public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+               this(new HSLFXMLSlideShow(
+                               new HSLFXML(container)
+               ));
+       }
+       /**
+        * Creates an extractor for an already opened slideshow
+        */
+       public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) {
+               super(slideshow);
+               this.slideshow = slideshow;
+       }
+
+       /**
+        * Prints the text of the pptx file named by the
+        *  first argument to System.out
+        */
+       public static void main(String[] args) throws Exception {
+               if(args.length < 1) {
+                       System.err.println("Use:");
+                       System.err.println("  HXFPowerPointExtractor <filename.pptx>");
+                       System.exit(1);
+               }
+               POIXMLTextExtractor extractor =
+                       new HXFPowerPointExtractor(HXFDocument.openPackage(
+                                       new File(args[0])
+                       ));
+               System.out.println(extractor.getText());
+       }
+
+       /**
+        * Should a call to getText() return slide text?
+        * Default is yes
+        */
+       public void setSlidesByDefault(boolean slidesByDefault) {
+               this.slidesByDefault = slidesByDefault;
+       }
+       /**
+        * Should a call to getText() return notes text?
+        * Default is no
+        */
+       public void setNotesByDefault(boolean notesByDefault) {
+               this.notesByDefault = notesByDefault;
+       }
+
+       /**
+        * Gets the text selected by the current defaults. Unless
+        *  changed through setSlidesByDefault / setNotesByDefault,
+        *  that is the slide text, but not the notes text
+        */
+       public String getText() {
+               return getText(slidesByDefault, notesByDefault);
+       }
+
+       /**
+        * Gets the requested text from the file. Slides are
+        *  processed in the order the presentation lists them,
+        *  and each paragraph ends with a new line.
+        * Only the parts whose text was asked for get loaded.
+        * @param slideText Should we retrieve text from slides?
+        * @param notesText Should we retrieve text from notes?
+        * @return the text found, empty if nothing was requested
+        * @throws RuntimeException wrapping any problem hit whilst loading a slide or its notes
+        */
+       public String getText(boolean slideText, boolean notesText) {
+               StringBuilder text = new StringBuilder();
+
+               // Same low level document for every slide, so fetch it once
+               HSLFXML xml = slideshow._getHSLFXML();
+               CTSlideIdListEntry[] slideRefs =
+                       xml.getSlideReferences().getSldIdArray();
+               for (int i = 0; i < slideRefs.length; i++) {
+                       try {
+                               if(slideText) {
+                                       CTSlide slide = xml.getSlide(slideRefs[i]);
+                                       extractText(slide.getCSld().getSpTree(), text);
+                               }
+                               if(notesText) {
+                                       // Not every slide has notes
+                                       CTNotesSlide notes = xml.getNotes(slideRefs[i]);
+                                       if(notes != null) {
+                                               extractText(notes.getCSld().getSpTree(), text);
+                                       }
+                               }
+                       } catch(Exception e) {
+                               throw new RuntimeException(e);
+                       }
+               }
+
+               return text.toString();
+       }
+
+       /**
+        * Appends the text of every text run in the direct child
+        *  shapes of the given group onto the supplied buffer
+        */
+       private void extractText(CTGroupShape gs, StringBuilder text) {
+               CTShape[] shapes = gs.getSpArray();
+               for (int i = 0; i < shapes.length; i++) {
+                       CTTextBody textBody =
+                               shapes[i].getTxBody();
+                       if(textBody == null) {
+                               // Shape without any text
+                               continue;
+                       }
+
+                       CTTextParagraph[] paras =
+                               textBody.getPArray();
+                       for (int j = 0; j < paras.length; j++) {
+                               CTRegularTextRun[] textRuns =
+                                       paras[j].getRArray();
+                               for (int k = 0; k < textRuns.length; k++) {
+                                       text.append( textRuns[k].getT() );
+                               }
+                               // End each paragraph with a new line
+                               text.append("\n");
+                       }
+               }
+       }
+}
diff --git a/src/ooxml/java/org/apache/poi/hslf/usermodel/HSLFXMLSlideShow.java b/src/ooxml/java/org/apache/poi/hslf/usermodel/HSLFXMLSlideShow.java
new file mode 100644 (file)
index 0000000..b8a5fcd
--- /dev/null
@@ -0,0 +1,39 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.usermodel;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.hslf.HSLFXML;
+
+/**
+ * High level representation of a ooxml slideshow.
+ * Most users will construct this object first, whether
+ *  they want to read or to write a slideshow, and it is
+ *  the top level object for creating new slides/etc.
+ */
+public class HSLFXMLSlideShow extends POIXMLDocument {
+       /** The low level document this slideshow wraps */
+       private HSLFXML lowLevel;
+
+       /**
+        * Wraps the given low level pptx document
+        */
+       public HSLFXMLSlideShow(HSLFXML xml) {
+               super(xml);
+               lowLevel = xml;
+       }
+
+       /**
+        * Gives access to the underlying low level document
+        */
+       public HSLFXML _getHSLFXML() {
+               return this.lowLevel;
+       }
+}
diff --git a/src/ooxml/java/org/apache/poi/hwpf/HWPFXML.java b/src/ooxml/java/org/apache/poi/hwpf/HWPFXML.java
new file mode 100644 (file)
index 0000000..66bba7e
--- /dev/null
@@ -0,0 +1,92 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import java.io.IOException;
+
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.InvalidFormatException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackagePart;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.StylesDocument;
+
+/**
+ * Experimental low level access to the contents
+ *  of docx files.
+ *
+ * Anyone working at this level will almost certainly
+ *  want the OOXML specifications to hand, see
+ *  http://www.ecma-international.org/publications/standards/Ecma-376.htm
+ *
+ * WARNING - APIs expected to change rapidly
+ */
+public class HWPFXML extends HXFDocument {
+       public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml";
+       public static final String FOOTER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml";
+       public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
+       public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
+       public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
+
+       /** The parsed main document part */
+       private DocumentDocument wordDoc;
+
+       /**
+        * Opens the word document held in the given package,
+        *  parsing its main part straight away
+        */
+       public HWPFXML(Package container) throws OpenXML4JException, IOException, XmlException {
+               super(container, MAIN_CONTENT_TYPE);
+
+               this.wordDoc = DocumentDocument.Factory.parse(
+                               basePart.getInputStream()
+               );
+       }
+
+       /**
+        * Gives the low level document base object
+        */
+       public CTDocument1 getDocument() {
+               return wordDoc.getDocument();
+       }
+
+       /**
+        * Gives the low level body of the document
+        */
+       public CTBody getDocumentBody() {
+               CTDocument1 doc = getDocument();
+               return doc.getBody();
+       }
+
+       /**
+        * Loads the styles object used by the document.
+        * Fails with an IllegalStateException unless there is
+        *  exactly one related styles part.
+        */
+       public CTStyles getStyle() throws XmlException, IOException {
+               PackagePart[] styleParts;
+               try {
+                       styleParts = getRelatedByType(STYLES_RELATION_TYPE);
+               } catch(InvalidFormatException e) {
+                       throw new IllegalStateException(e);
+               }
+
+               int found = styleParts.length;
+               if(found != 1) {
+                       throw new IllegalStateException("Expecting one Styles document part, but found " + found);
+               }
+
+               return StylesDocument.Factory.parse(styleParts[0].getInputStream()).getStyles();
+       }
+}
diff --git a/src/ooxml/java/org/apache/poi/hwpf/extractor/HXFWordExtractor.java b/src/ooxml/java/org/apache/poi/hwpf/extractor/HXFWordExtractor.java
new file mode 100644 (file)
index 0000000..a4427e4
--- /dev/null
@@ -0,0 +1,87 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLTextExtractor;
+import org.apache.poi.hwpf.HWPFXML;
+import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
+import org.apache.poi.hxf.HXFDocument;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
+
+/**
+ * Helper class to extract text from an OOXML Word file
+ */
+public class HXFWordExtractor extends POIXMLTextExtractor {
+       private HWPFXMLDocument document;
+       
+       public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
+               this(new HWPFXMLDocument(
+                               new HWPFXML(container)
+               ));
+       }
+       public HXFWordExtractor(HWPFXMLDocument document) {
+               super(document);
+               this.document = document;
+       }
+       
+       public static void main(String[] args) throws Exception {
+               if(args.length < 1) {
+                       System.err.println("Use:");
+                       System.err.println("  HXFWordExtractor <filename.docx>");
+                       System.exit(1); // no input file given
+               }
+               POIXMLTextExtractor extractor = 
+                       new HXFWordExtractor(HXFDocument.openPackage(
+                                       new File(args[0])
+                       ));
+               System.out.println(extractor.getText());
+       }
+
+       public String getText() {
+               CTBody body = document._getHWPFXML().getDocumentBody();
+               StringBuilder text = new StringBuilder(); // method-local, so no synchronisation needed
+               
+               // Loop over paragraphs
+               CTP[] ps = body.getPArray();
+               for (int i = 0; i < ps.length; i++) {
+                       // Loop over the runs of this paragraph
+                       CTR[] rs = ps[i].getRArray();
+                       for (int j = 0; j < rs.length; j++) {
+                               // Loop over the text elements of this run
+                               CTText[] texts = rs[j].getTArray();
+                               for (int k = 0; k < texts.length; k++) {
+                                       text.append(
+                                                       texts[k].getStringValue()
+                                       );
+                               }
+                       }
+                       // New line after each paragraph.
+                       text.append("\n");
+               }
+               
+               return text.toString();
+       }
+}
diff --git a/src/ooxml/java/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java b/src/ooxml/java/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java
new file mode 100644 (file)
index 0000000..64597e8
--- /dev/null
@@ -0,0 +1,36 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.POIXMLDocument;
+import org.apache.poi.hwpf.HWPFXML;
+
+/**
+ * High level representation of an OOXML text document.
+ */
+public class HWPFXMLDocument extends POIXMLDocument {
+       private HWPFXML hwpfXML;
+       
+       public HWPFXMLDocument(HWPFXML xml) {
+               super(xml);
+               this.hwpfXML = xml;
+       }
+       
+       public HWPFXML _getHWPFXML() {
+               return hwpfXML;
+       }
+}
diff --git a/src/ooxml/testcases/org/apache/poi/hslf/TestHSLFXML.java b/src/ooxml/testcases/org/apache/poi/hslf/TestHSLFXML.java
new file mode 100644 (file)
index 0000000..fd4653a
--- /dev/null
@@ -0,0 +1,127 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf;
+
+import java.io.File;
+
+import org.apache.poi.hxf.HXFDocument;
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackagePart;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
+import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
+
+import junit.framework.TestCase;
+
+public class TestHSLFXML extends TestCase {
+       private File sampleFile;
+
+       protected void setUp() throws Exception {
+               super.setUp();
+               
+               sampleFile = new File(
+                               System.getProperty("HSLF.testdata.path") +
+                               File.separator + "sample.pptx"
+               );
+       }
+
+       public void testContainsMainContentType() throws Exception {
+               Package pack = HXFDocument.openPackage(sampleFile);
+               
+               boolean found = false;
+               for(PackagePart part : pack.getParts()) {
+                       if(part.getContentType().equals(HSLFXML.MAIN_CONTENT_TYPE)) {
+                               found = true;
+                       }
+                       System.out.println(part);
+               }
+               assertTrue(found);
+       }
+
+       public void testOpen() throws Exception {
+               HXFDocument.openPackage(sampleFile);
+               
+               HSLFXML xml;
+               
+               // With the finalised uri, should be fine
+               xml = new HSLFXML(
+                               HXFDocument.openPackage(sampleFile)
+               );
+               
+               // Check the core
+               assertNotNull(xml.getPresentation());
+               
+               // Check it has some slides
+               assertTrue(
+                       xml.getSlideReferences().sizeOfSldIdArray() > 0
+               );
+               assertTrue(
+                               xml.getSlideMasterReferences().sizeOfSldMasterIdArray() > 0
+                       );
+       }
+       
+       public void testSlideBasics() throws Exception {
+               HSLFXML xml = new HSLFXML(
+                               HXFDocument.openPackage(sampleFile)
+               );
+               
+               // Should have 1 master
+               assertEquals(1, xml.getSlideMasterReferences().sizeOfSldMasterIdArray());
+               assertEquals(1, xml.getSlideMasterReferences().getSldMasterIdArray().length);
+               
+               // Should have two slides
+               assertEquals(2, xml.getSlideReferences().sizeOfSldIdArray());
+               assertEquals(2, xml.getSlideReferences().getSldIdArray().length);
+               
+               // Check they're as expected
+               CTSlideIdListEntry[] slides = xml.getSlideReferences().getSldIdArray();
+               assertEquals(256, slides[0].getId());
+               assertEquals(257, slides[1].getId());
+               assertEquals("rId2", slides[0].getId2());
+               assertEquals("rId3", slides[1].getId2());
+               
+               // Now get those objects
+               assertNotNull(xml.getSlide(slides[0]));
+               assertNotNull(xml.getSlide(slides[1]));
+               
+               // And check they have notes as expected
+               assertNotNull(xml.getNotes(slides[0]));
+               assertNotNull(xml.getNotes(slides[1]));
+               
+               // And again for the master
+               CTSlideMasterIdListEntry[] masters =
+                       xml.getSlideMasterReferences().getSldMasterIdArray();
+               assertEquals(2147483648L, masters[0].getId());
+               assertEquals("rId1", masters[0].getId2());
+               assertNotNull(xml.getSlideMaster(masters[0]));
+       }
+       
+       public void testMetadataBasics() throws Exception {
+               HSLFXML xml = new HSLFXML(
+                               HXFDocument.openPackage(sampleFile)
+               );
+               
+               assertNotNull(xml.getCoreProperties());
+               assertNotNull(xml.getExtendedProperties());
+               
+               assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
+               assertEquals(0, xml.getExtendedProperties().getCharacters());
+               assertEquals(0, xml.getExtendedProperties().getLines());
+               
+               assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
+               assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
+       }
+}
diff --git a/src/ooxml/testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java b/src/ooxml/testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java
new file mode 100644 (file)
index 0000000..6a006ab
--- /dev/null
@@ -0,0 +1,109 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hslf.extractor;
+
+import java.io.File;
+
+import org.apache.poi.hslf.HSLFXML;
+import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
+import org.apache.poi.hxf.HXFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for HXFPowerPointExtractor
+ */
+public class TestHXFPowerPointExtractor extends TestCase {
+       /**
+        * A simple file
+        */
+       private HSLFXML xmlA;
+
+       protected void setUp() throws Exception {
+               super.setUp();
+               
+               File fileA = new File(
+                               System.getProperty("HSLF.testdata.path") +
+                               File.separator + "sample.pptx"
+               );
+               
+               xmlA = new HSLFXML(HXFDocument.openPackage(fileA));
+       }
+
+       /**
+        * Get text out of the simple file
+        */
+       public void testGetSimpleText() throws Exception {
+               new HXFPowerPointExtractor(xmlA.getPackage());
+               new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA));
+               
+               HXFPowerPointExtractor extractor = 
+                       new HXFPowerPointExtractor(xmlA.getPackage());
+               extractor.getText();
+               
+               String text = extractor.getText();
+               assertTrue(text.length() > 0);
+               
+               // Check Basics
+               assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
+               assertTrue(text.endsWith("amet\n\n"));
+               
+               // Just slides, no notes
+               text = extractor.getText(true, false);
+               assertEquals(
+                               "Lorem ipsum dolor sit amet\n" +
+                               "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+                               "\n" +
+                               "Lorem ipsum dolor sit amet\n" +
+                               "Lorem\n" +
+                               "ipsum\n" +
+                               "dolor\n" +
+                               "sit\n" +
+                               "amet\n" +
+                               "\n", text
+               );
+               
+               // Just notes, no slides
+               text = extractor.getText(false, true);
+               assertEquals(
+                               "\n\n\n\n", text
+               );
+               
+               // Both
+               text = extractor.getText(true, true);
+               assertEquals(
+                               "Lorem ipsum dolor sit amet\n" +
+                               "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
+                               "\n\n\n" +
+                               "Lorem ipsum dolor sit amet\n" +
+                               "Lorem\n" +
+                               "ipsum\n" +
+                               "dolor\n" +
+                               "sit\n" +
+                               "amet\n" +
+                               "\n\n\n", text
+               );
+               
+               // Via set defaults
+               extractor.setSlidesByDefault(false);
+               extractor.setNotesByDefault(true);
+               text = extractor.getText();
+               assertEquals(
+                               "\n\n\n\n", text
+               );
+       }
+}
diff --git a/src/ooxml/testcases/org/apache/poi/hwpf/TestHWPFXML.java b/src/ooxml/testcases/org/apache/poi/hwpf/TestHWPFXML.java
new file mode 100644 (file)
index 0000000..0d8e196
--- /dev/null
@@ -0,0 +1,110 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf;
+
+import java.io.File;
+
+import org.apache.poi.hxf.HXFDocument;
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackagePart;
+
+import junit.framework.TestCase;
+
+public class TestHWPFXML extends TestCase {
+       private File sampleFile;
+       private File complexFile;
+
+       protected void setUp() throws Exception {
+               super.setUp();
+               
+               sampleFile = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "sample.docx"
+               );
+               complexFile = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "IllustrativeCases.docx"
+               );
+       }
+
+       public void testContainsMainContentType() throws Exception {
+               Package pack = HXFDocument.openPackage(sampleFile);
+               
+               boolean found = false;
+               for(PackagePart part : pack.getParts()) {
+                       if(part.getContentType().equals(HWPFXML.MAIN_CONTENT_TYPE)) {
+                               found = true;
+                       }
+                       System.out.println(part);
+               }
+               assertTrue(found);
+       }
+
+       public void testOpen() throws Exception {
+               HXFDocument.openPackage(sampleFile);
+               HXFDocument.openPackage(complexFile);
+               
+               HWPFXML xml;
+               
+               // Simple file
+               xml = new HWPFXML(
+                               HXFDocument.openPackage(sampleFile)
+               );
+               // Check it has key parts
+               assertNotNull(xml.getDocument());
+               assertNotNull(xml.getDocumentBody());
+               assertNotNull(xml.getStyle());
+               
+               // Complex file
+               xml = new HWPFXML(
+                               HXFDocument.openPackage(complexFile)
+               );
+               assertNotNull(xml.getDocument());
+               assertNotNull(xml.getDocumentBody());
+               assertNotNull(xml.getStyle());
+       }
+       
+       public void testMetadataBasics() throws Exception {
+               HWPFXML xml = new HWPFXML(
+                               HXFDocument.openPackage(sampleFile)
+               );
+               assertNotNull(xml.getCoreProperties());
+               assertNotNull(xml.getExtendedProperties());
+               
+               assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
+               assertEquals(1315, xml.getExtendedProperties().getCharacters());
+               assertEquals(10, xml.getExtendedProperties().getLines());
+               
+               assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
+               assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
+       }
+       
+       public void testMetadataComplex() throws Exception {
+               HWPFXML xml = new HWPFXML(
+                               HXFDocument.openPackage(complexFile)
+               );
+               assertNotNull(xml.getCoreProperties());
+               assertNotNull(xml.getExtendedProperties());
+               
+               assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
+               assertEquals(5184, xml.getExtendedProperties().getCharacters());
+               assertEquals(0, xml.getExtendedProperties().getLines());
+               
+               assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
+               assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
+       }
+}
diff --git a/src/ooxml/testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java b/src/ooxml/testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java
new file mode 100644 (file)
index 0000000..62695b3
--- /dev/null
@@ -0,0 +1,117 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hwpf.extractor;
+
+import java.io.File;
+
+import org.apache.poi.hwpf.HWPFXML;
+import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
+import org.apache.poi.hxf.HXFDocument;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for HXFWordExtractor
+ */
+public class TestHXFWordExtractor extends TestCase {
+       /**
+        * A very simple file
+        */
+       private HWPFXML xmlA;
+       /**
+        * A fairly complex file
+        */
+       private HWPFXML xmlB;
+
+       protected void setUp() throws Exception {
+               super.setUp();
+               
+               File fileA = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "sample.docx"
+               );
+               File fileB = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "IllustrativeCases.docx"
+               );
+               
+               xmlA = new HWPFXML(HXFDocument.openPackage(fileA));
+               xmlB = new HWPFXML(HXFDocument.openPackage(fileB));
+       }
+
+       /**
+        * Get text out of the simple file
+        */
+       public void testGetSimpleText() throws Exception {
+               new HXFWordExtractor(xmlA.getPackage());
+               new HXFWordExtractor(new HWPFXMLDocument(xmlA));
+               
+               HXFWordExtractor extractor = 
+                       new HXFWordExtractor(xmlA.getPackage());
+               extractor.getText();
+               
+               String text = extractor.getText();
+               assertTrue(text.length() > 0);
+               
+               // Check contents
+               assertTrue(text.startsWith(
+                               "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
+               ));
+               assertTrue(text.endsWith(
+                               "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
+               ));
+               
+               // Check number of paragraphs
+               int ps = 0;
+               char[] t = text.toCharArray();
+               for (int i = 0; i < t.length; i++) {
+                       if(t[i] == '\n') { ps++; }
+               }
+               assertEquals(3, ps);
+       }
+       
+       /**
+        * Tests getting the text out of a complex file
+        */
+       public void testGetComplexText() throws Exception {
+               HXFWordExtractor extractor = 
+                       new HXFWordExtractor(xmlB.getPackage());
+               extractor.getText();
+               
+               String text = extractor.getText();
+               assertTrue(text.length() > 0);
+               
+               char euro = '\u20ac';
+               System.err.println("'"+text.substring(text.length() - 20) + "'");
+               
+               // Check contents
+               assertTrue(text.startsWith(
+                               "  \n(V) ILLUSTRATIVE CASES\n\n"
+               ));
+               assertTrue(text.endsWith(
+                               "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
+               ));
+               
+               // Check number of paragraphs
+               int ps = 0;
+               char[] t = text.toCharArray();
+               for (int i = 0; i < t.length; i++) {
+                       if(t[i] == '\n') { ps++; }
+               }
+               assertEquals(79, ps);
+       }
+}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/POIXMLDocument.java b/src/scratchpad/ooxml-src/org/apache/poi/POIXMLDocument.java
deleted file mode 100644 (file)
index 86b0d55..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi;
-
-import org.apache.poi.hxf.HXFDocument;
-
-/** 
- * Parent class of all UserModel POI XML (ooxml) 
- *  implementations.
- * Provides a similar function to {@link POIDocument},
- *  for the XML based classes.
- */
-public abstract class POIXMLDocument {
-       private HXFDocument document;
-
-       /**
-        * Creates a new POI XML Document, wrapping up
-        *  the underlying raw HXFDocument
-        */
-       protected POIXMLDocument(HXFDocument document) {
-               this.document = document;
-       }
-
-       /**
-        * Returns the underlying HXFDocument, typically
-        *  used for unit testing
-        */
-       public HXFDocument _getHXFDocument() {
-               return document;
-       }
-}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/POIXMLTextExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/POIXMLTextExtractor.java
deleted file mode 100644 (file)
index c28eba4..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi;
-
-public abstract class POIXMLTextExtractor extends POITextExtractor {
-       /** The POIXMLDocument that's open */
-       protected POIXMLDocument document;
-
-       /**
-        * Creates a new text extractor for the given document
-        */
-       public POIXMLTextExtractor(POIXMLDocument document) {
-               super(null);
-               
-               this.document = document;
-       }
-}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hslf/HSLFXML.java b/src/scratchpad/ooxml-src/org/apache/poi/hslf/HSLFXML.java
deleted file mode 100644 (file)
index 568cb80..0000000
+++ /dev/null
@@ -1,148 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hslf;
-
-import java.io.IOException;
-
-import org.apache.poi.hxf.HXFDocument;
-import org.apache.xmlbeans.XmlException;
-import org.openxml4j.exceptions.InvalidFormatException;
-import org.openxml4j.exceptions.OpenXML4JException;
-import org.openxml4j.opc.Package;
-import org.openxml4j.opc.PackagePart;
-import org.openxml4j.opc.PackageRelationshipCollection;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTPresentation;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdList;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMaster;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdList;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
-import org.openxmlformats.schemas.presentationml.x2006.main.NotesDocument;
-import org.openxmlformats.schemas.presentationml.x2006.main.PresentationDocument;
-import org.openxmlformats.schemas.presentationml.x2006.main.SldDocument;
-import org.openxmlformats.schemas.presentationml.x2006.main.SldMasterDocument;
-
-/**
- * Experimental, low level access to the XML parts
- *  that make up a pptx file.
- *
- * Anyone working at this level will almost certainly
- *  need the OOXML specifications to hand, see
- *  http://www.ecma-international.org/publications/standards/Ecma-376.htm
- *
- * WARNING - APIs expected to change rapidly
- */
-public class HSLFXML extends HXFDocument {
-       public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.presentation.main+xml";
-       public static final String NOTES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.notesSlide+xml";
-       public static final String SLIDE_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.presentationml.slide+xml";
-       public static final String SLIDE_LAYOUT_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/slideLayout";
-       public static final String NOTES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/notesSlide";
-
-       /** The parsed base part of the package */
-       private PresentationDocument presentationDoc;
-
-       /**
-        * Opens the presentation held in the given package, and
-        *  parses its base part (the one with the main content type).
-        */
-       public HSLFXML(Package container) throws OpenXML4JException, IOException, XmlException {
-               super(container, MAIN_CONTENT_TYPE);
-               presentationDoc = PresentationDocument.Factory.parse(
-                               basePart.getInputStream()
-               );
-       }
-
-       /**
-        * Gives the low level presentation object, the root
-        *  of the parsed base part.
-        */
-       public CTPresentation getPresentation() {
-               return presentationDoc.getPresentation();
-       }
-
-       /**
-        * Gives the list of slide references held by the
-        *  presentation. Use these to work out the slide
-        *  ordering, and pass the entries to
-        *  {@link #getSlide(CTSlideIdListEntry)} to load
-        *  the slides themselves.
-        */
-       public CTSlideIdList getSlideReferences() {
-               return getPresentation().getSldIdLst();
-       }
-
-       /**
-        * Gives the list of slide master references held by
-        *  the presentation. Pass the entries to
-        *  {@link #getSlideMaster(CTSlideMasterIdListEntry)}
-        *  to load the slide masters themselves.
-        */
-       public CTSlideMasterIdList getSlideMasterReferences() {
-               return getPresentation().getSldMasterIdLst();
-       }
-
-       /**
-        * Loads and parses the slide master that the supplied
-        *  reference points to.
-        */
-       public CTSlideMaster getSlideMaster(CTSlideMasterIdListEntry master) throws IOException, XmlException {
-               PackagePart part = getRelatedPackagePart(master.getId2());
-               return SldMasterDocument.Factory.parse(part.getInputStream()).getSldMaster();
-       }
-
-       /**
-        * Loads and parses the slide that the supplied
-        *  reference points to.
-        */
-       public CTSlide getSlide(CTSlideIdListEntry slide) throws IOException, XmlException {
-               PackagePart part = getRelatedPackagePart(slide.getId2());
-               return SldDocument.Factory.parse(part.getInputStream()).getSld();
-       }
-
-       /**
-        * Loads and parses the notes attached to the slide
-        *  that the supplied reference points to.
-        *
-        * @return the notes, or null if the slide has none
-        * @throws IllegalStateException if the slide has more than
-        *  one notes relationship, or its relationships can't be read
-        */
-       public CTNotesSlide getNotes(CTSlideIdListEntry slide) throws IOException, XmlException {
-               PackagePart owner = getRelatedPackagePart(slide.getId2());
-
-               PackageRelationshipCollection noteRels;
-               try {
-                       noteRels = owner.getRelationshipsByType(NOTES_RELATION_TYPE);
-               } catch(InvalidFormatException e) {
-                       throw new IllegalStateException(e);
-               }
-
-               // A slide may have at most one set of notes
-               if(noteRels.size() > 1) {
-                       throw new IllegalStateException("Expecting 0 or 1 notes for a slide, but found " + noteRels.size());
-               }
-               if(noteRels.size() == 0) {
-                       // This slide has no notes at all
-                       return null;
-               }
-
-               PackagePart notesPart = getPackagePart(noteRels.getRelationship(0));
-               return NotesDocument.Factory.parse(notesPart.getInputStream()).getNotes();
-       }
-}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hslf/extractor/HXFPowerPointExtractor.java
deleted file mode 100644 (file)
index 1d4b1a2..0000000
+++ /dev/null
@@ -1,139 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hslf.extractor;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.poi.hslf.HSLFXML;
-import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
-import org.apache.poi.hxf.HXFDocument;
-import org.apache.xmlbeans.XmlException;
-import org.openxml4j.exceptions.OpenXML4JException;
-import org.openxml4j.opc.Package;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTRegularTextRun;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTTextBody;
-import org.openxmlformats.schemas.drawingml.x2006.main.CTTextParagraph;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTNotesSlide;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTShape;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlide;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
-
-/**
- * Helper class to extract text from an OOXML PowerPoint file.
- * Slide text and notes text can each be switched on or off.
- */
-public class HXFPowerPointExtractor extends POIXMLTextExtractor {
-       private HSLFXMLSlideShow slideshow;
-       private boolean slidesByDefault = true;
-       private boolean notesByDefault = false;
-
-       /**
-        * Creates an extractor for the presentation held
-        *  in the given package.
-        */
-       public HXFPowerPointExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
-               this(new HSLFXMLSlideShow(
-                               new HSLFXML(container)
-               ));
-       }
-       /**
-        * Creates an extractor for an already opened slideshow.
-        */
-       public HXFPowerPointExtractor(HSLFXMLSlideShow slideshow) {
-               super(slideshow);
-               this.slideshow = slideshow;
-       }
-
-       /**
-        * Command line entry point. Prints the default text of
-        *  the file named by the first argument to standard out.
-        */
-       public static void main(String[] args) throws Exception {
-               if(args.length < 1) {
-                       System.err.println("Use:");
-                       System.err.println("  HXFPowerPointExtractor <filename.pptx>");
-                       System.exit(1);
-               }
-               POIXMLTextExtractor extractor = 
-                       new HXFPowerPointExtractor(HXFDocument.openPackage(
-                                       new File(args[0])
-                       ));
-               System.out.println(extractor.getText());
-       }
-
-       /**
-        * Should a call to getText() return slide text?
-        * Default is yes
-        */
-       public void setSlidesByDefault(boolean slidesByDefault) {
-               this.slidesByDefault = slidesByDefault;
-       }
-       /**
-        * Should a call to getText() return notes text?
-        * Default is no
-        */
-       public void setNotesByDefault(boolean notesByDefault) {
-               this.notesByDefault = notesByDefault;
-       }
-
-       /**
-        * Gets the text selected by the current defaults. Unless
-        *  changed via {@link #setSlidesByDefault(boolean)} or
-        *  {@link #setNotesByDefault(boolean)}, that is the slide
-        *  text but not the notes text.
-        */
-       public String getText() {
-               return getText(slidesByDefault, notesByDefault);
-       }
-
-       /**
-        * Gets the requested text from the file, slide by slide in
-        *  the order of the presentation's slide references. For each
-        *  slide, the slide text (if requested) comes before the
-        *  notes text (if requested).
-        * Only the parts that were asked for are loaded and parsed.
-        *
-        * @param slideText Should we retrieve text from slides?
-        * @param notesText Should we retrieve text from notes?
-        * @throws RuntimeException wrapping any problem hit while
-        *  loading or parsing a slide or its notes
-        */
-       public String getText(boolean slideText, boolean notesText) {
-               StringBuffer text = new StringBuffer();
-               HSLFXML xml = slideshow._getHSLFXML();
-
-               CTSlideIdListEntry[] slideRefs =
-                       xml.getSlideReferences().getSldIdArray();
-               for (int i = 0; i < slideRefs.length; i++) {
-                       try {
-                               if(slideText) {
-                                       CTSlide slide = xml.getSlide(slideRefs[i]);
-                                       extractText(slide.getCSld().getSpTree(), text);
-                               }
-                               if(notesText) {
-                                       // Not every slide has notes, null means none
-                                       CTNotesSlide notes = xml.getNotes(slideRefs[i]);
-                                       if(notes != null) {
-                                               extractText(notes.getCSld().getSpTree(), text);
-                                       }
-                               }
-                       } catch(Exception e) {
-                               throw new RuntimeException(e);
-                       }
-               }
-
-               return text.toString();
-       }
-
-       /**
-        * Appends the text of every shape directly inside the given
-        *  group to the buffer, one line per paragraph. Shapes
-        *  without a text body are skipped.
-        */
-       private void extractText(CTGroupShape gs, StringBuffer text) {
-               CTShape[] shapes = gs.getSpArray();
-               for (int i = 0; i < shapes.length; i++) {
-                       CTTextBody textBody =
-                               shapes[i].getTxBody();
-                       if(textBody != null) {
-                               CTTextParagraph[] paras = 
-                                       textBody.getPArray();
-                               for (int j = 0; j < paras.length; j++) {
-                                       CTRegularTextRun[] textRuns =
-                                               paras[j].getRArray();
-                                       for (int k = 0; k < textRuns.length; k++) {
-                                               text.append( textRuns[k].getT() );
-                                       }
-                                       // End each paragraph with a new line
-                                       text.append("\n");
-                               }
-                       }
-               }
-       }
-}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hslf/usermodel/HSLFXMLSlideShow.java b/src/scratchpad/ooxml-src/org/apache/poi/hslf/usermodel/HSLFXMLSlideShow.java
deleted file mode 100644 (file)
index b8a5fcd..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hslf.usermodel;
-
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.hslf.HSLFXML;
-
-/**
- * High level view of an ooxml slideshow.
- * Most users will create one of these first, whether they
- *  want to read or to write a slideshow, and it is the top
- *  level object from which new slides/etc are created.
- */
-public class HSLFXMLSlideShow extends POIXMLDocument {
-       /** The low level document this slideshow wraps */
-       private HSLFXML lowLevel;
-
-       /**
-        * Wraps the given low level document.
-        */
-       public HSLFXMLSlideShow(HSLFXML xml) {
-               super(xml);
-               lowLevel = xml;
-       }
-
-       /**
-        * Gives back the low level document that was
-        *  passed to the constructor.
-        */
-       public HSLFXML _getHSLFXML() {
-               return lowLevel;
-       }
-}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/HWPFXML.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/HWPFXML.java
deleted file mode 100644 (file)
index 66bba7e..0000000
+++ /dev/null
@@ -1,92 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hwpf;
-
-import java.io.IOException;
-
-import org.apache.poi.hxf.HXFDocument;
-import org.apache.xmlbeans.XmlException;
-import org.openxml4j.exceptions.InvalidFormatException;
-import org.openxml4j.exceptions.OpenXML4JException;
-import org.openxml4j.opc.Package;
-import org.openxml4j.opc.PackagePart;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.DocumentDocument;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.StylesDocument;
-
-/**
- * Experimental, low level access to the XML parts
- *  that make up a docx file.
- *
- * Anyone working at this level will almost certainly
- *  need the OOXML specifications to hand, see
- *  http://www.ecma-international.org/publications/standards/Ecma-376.htm
- *
- * WARNING - APIs expected to change rapidly
- */
-public class HWPFXML extends HXFDocument {
-       public static final String MAIN_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml";
-       public static final String FOOTER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml";
-       public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
-       public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
-       public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
-
-       /** The parsed base part of the package */
-       private DocumentDocument wordDoc;
-
-       /**
-        * Opens the word document held in the given package, and
-        *  parses its base part (the one with the main content type).
-        */
-       public HWPFXML(Package container) throws OpenXML4JException, IOException, XmlException {
-               super(container, MAIN_CONTENT_TYPE);
-               wordDoc = DocumentDocument.Factory.parse(
-                               basePart.getInputStream()
-               );
-       }
-
-       /**
-        * Gives the low level document object, the root
-        *  of the parsed base part.
-        */
-       public CTDocument1 getDocument() {
-               return wordDoc.getDocument();
-       }
-
-       /**
-        * Gives the low level body of the document.
-        */
-       public CTBody getDocumentBody() {
-               return getDocument().getBody();
-       }
-
-       /**
-        * Loads and parses the styles part related to
-        *  this document.
-        *
-        * @throws IllegalStateException if there isn't exactly one
-        *  styles part, or the relationships can't be read
-        */
-       public CTStyles getStyle() throws XmlException, IOException {
-               PackagePart[] styleParts;
-               try {
-                       styleParts = getRelatedByType(STYLES_RELATION_TYPE);
-               } catch(InvalidFormatException e) {
-                       throw new IllegalStateException(e);
-               }
-
-               if(styleParts.length == 1) {
-                       return StylesDocument.Factory.parse(
-                                       styleParts[0].getInputStream()
-                       ).getStyles();
-               }
-               throw new IllegalStateException("Expecting one Styles document part, but found " + styleParts.length);
-       }
-}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/extractor/HXFWordExtractor.java
deleted file mode 100644 (file)
index a4427e4..0000000
+++ /dev/null
@@ -1,87 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hwpf.extractor;
-
-import java.io.File;
-import java.io.IOException;
-
-import org.apache.poi.POIXMLTextExtractor;
-import org.apache.poi.hwpf.HWPFXML;
-import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
-import org.apache.poi.hxf.HXFDocument;
-import org.apache.xmlbeans.XmlException;
-import org.openxml4j.exceptions.OpenXML4JException;
-import org.openxml4j.opc.Package;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
-import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
-
-/**
- * Helper class to extract text from an OOXML Word file
- */
-public class HXFWordExtractor extends POIXMLTextExtractor {
-       /**
-        * The word document being read. Note that this shadows the
-        *  more loosely typed protected field of the same name in
-        *  {@link POIXMLTextExtractor}; both are set by the constructor.
-        */
-       private HWPFXMLDocument document;
-
-       /**
-        * Creates an extractor for the word document held
-        *  in the given package.
-        */
-       public HXFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
-               this(new HWPFXMLDocument(
-                               new HWPFXML(container)
-               ));
-       }
-       /**
-        * Creates an extractor for an already opened document.
-        */
-       public HXFWordExtractor(HWPFXMLDocument document) {
-               super(document);
-               this.document = document;
-       }
-
-       /**
-        * Command line entry point. Prints the text of the file
-        *  named by the first argument to standard out.
-        */
-       public static void main(String[] args) throws Exception {
-               if(args.length < 1) {
-                       System.err.println("Use:");
-                       // Word files are .docx (the usage used to say .xlsx)
-                       System.err.println("  HXFWordExtractor <filename.docx>");
-                       System.exit(1);
-               }
-               POIXMLTextExtractor extractor = 
-                       new HXFWordExtractor(HXFDocument.openPackage(
-                                       new File(args[0])
-                       ));
-               System.out.println(extractor.getText());
-       }
-
-       /**
-        * Gets the text of the paragraphs directly inside the
-        *  document body, with a new line after each paragraph.
-        */
-       public String getText() {
-               CTBody body = document._getHWPFXML().getDocumentBody();
-               StringBuffer text = new StringBuffer();
-
-               // Loop over paragraphs
-               CTP[] ps = body.getPArray();
-               for (int i = 0; i < ps.length; i++) {
-                       // Loop over the runs of the paragraph
-                       CTR[] rs = ps[i].getRArray();
-                       for (int j = 0; j < rs.length; j++) {
-                               // Loop over the text elements of the run
-                               CTText[] texts = rs[j].getTArray();
-                               for (int k = 0; k < texts.length; k++) {
-                                       text.append(
-                                                       texts[k].getStringValue()
-                                       );
-                               }
-                       }
-                       // New line after each paragraph.
-                       text.append("\n");
-               }
-
-               return text.toString();
-       }
-}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java b/src/scratchpad/ooxml-src/org/apache/poi/hwpf/usermodel/HWPFXMLDocument.java
deleted file mode 100644 (file)
index 64597e8..0000000
+++ /dev/null
@@ -1,36 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hwpf.usermodel;
-
-import org.apache.poi.POIXMLDocument;
-import org.apache.poi.hwpf.HWPFXML;
-
-/**
- * High level view of an ooxml text document.
- */
-public class HWPFXMLDocument extends POIXMLDocument {
-       /** The low level document this object wraps */
-       private HWPFXML lowLevel;
-
-       /**
-        * Wraps the given low level document.
-        */
-       public HWPFXMLDocument(HWPFXML xml) {
-               super(xml);
-               lowLevel = xml;
-       }
-
-       /**
-        * Gives back the low level document that was
-        *  passed to the constructor.
-        */
-       public HWPFXML _getHWPFXML() {
-               return lowLevel;
-       }
-}
diff --git a/src/scratchpad/ooxml-src/org/apache/poi/hxf/dev/HXFLister.java b/src/scratchpad/ooxml-src/org/apache/poi/hxf/dev/HXFLister.java
deleted file mode 100644 (file)
index 032b74b..0000000
+++ /dev/null
@@ -1,133 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hxf.dev;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.PrintStream;
-import java.util.ArrayList;
-
-import org.openxml4j.opc.Package;
-import org.openxml4j.opc.PackageAccess;
-import org.openxml4j.opc.PackagePart;
-import org.openxml4j.opc.PackageRelationship;
-import org.openxml4j.opc.PackageRelationshipCollection;
-
-/**
- * Prints out the contents of a HXF (ooxml) container.
- * Useful for seeing what parts are defined, and how
- *  they're all related to each other.
- */
-public class HXFLister {
-       private Package container;
-       private PrintStream disp;
-
-       /**
-        * Creates a lister for the given container, which
-        *  will print to standard out.
-        */
-       public HXFLister(Package container) {
-               this(container, System.out);
-       }
-       /**
-        * Creates a lister for the given container, which
-        *  will print to the supplied stream.
-        */
-       public HXFLister(Package container, PrintStream disp) {
-               this.container = container;
-               this.disp = disp;
-       }
-
-       /**
-        * Figures out how big a given PackagePart is, by reading
-        *  its contents through to the end and counting the bytes.
-        * The stream opened on the part is always closed again.
-        */
-       public static long getSize(PackagePart part) throws IOException {
-               InputStream in = part.getInputStream();
-               try {
-                       byte[] b = new byte[8192];
-                       long size = 0;
-                       int read;
-
-                       // read() gives -1 once the end of the part is hit
-                       while((read = in.read(b)) > -1) {
-                               size += read;
-                       }
-
-                       return size;
-               } finally {
-                       // Don't leak a stream for every part we list
-                       in.close();
-               }
-       }
-
-       /**
-        * Displays information on all the different
-        *  parts of the OOXML file container.
-        */
-       public void displayParts() throws Exception {
-               ArrayList<PackagePart> parts = container.getParts();
-               for (PackagePart part : parts) {
-                       disp.println(part.getPartName());
-                       disp.println("\t" + part.getContentType());
-
-                       // NOTE(review): the size of the core properties part is
-                       //  deliberately not computed; the reason isn't evident
-                       //  from this class - confirm before changing
-                       if(! part.getPartName().toString().equals("/docProps/core.xml")) {
-                               disp.println("\t" + getSize(part) + " bytes");
-                       }
-
-                       if(! part.isRelationshipPart()) {
-                               // Fetch the relations once, and use for count and listing
-                               PackageRelationshipCollection rels = part.getRelationships();
-                               disp.println("\t" + rels.size() + " relations");
-                               for(PackageRelationship rel : rels) {
-                                       displayRelation(rel, "\t  ");
-                               }
-                       }
-               }
-       }
-       /**
-        * Displays information on all the different
-        *  relationships between different parts
-        *  of the OOXML file container.
-        */
-       public void displayRelations() throws Exception {
-               PackageRelationshipCollection rels = 
-                       container.getRelationships();
-               for (PackageRelationship rel : rels) {
-                       displayRelation(rel, "");
-               }
-       }
-       /**
-        * Prints the source, target, id, target mode and type of
-        *  a single relationship, each line prefixed by the indent.
-        */
-       private void displayRelation(PackageRelationship rel, String indent) {
-               disp.println(indent+"Relationship:");
-               disp.println(indent+"\tFrom: "+ rel.getSourceURI());
-               disp.println(indent+"\tTo:   " + rel.getTargetURI());
-               disp.println(indent+"\tID:   " + rel.getId());
-               disp.println(indent+"\tMode: " + rel.getTargetMode());
-               disp.println(indent+"\tType: " + rel.getRelationshipType());
-       }
-
-       /**
-        * Command line entry point. Lists the parts and then the
-        *  relationships of the file named by the first argument.
-        * Exits with status 1 if no file is given, and with
-        *  status 2 if the file doesn't exist.
-        */
-       public static void main(String[] args) throws Exception {
-               if(args.length == 0) {
-                       System.err.println("Use:");
-                       System.err.println("\tjava HXFLister <filename>");
-                       System.exit(1);
-               }
-
-               File f = new File(args[0]);
-               if(! f.exists()) {
-                       System.err.println("Error, file not found!");
-                       System.err.println("\t" + f.toString());
-                       System.exit(2);
-               }
-
-               HXFLister lister = new HXFLister(
-                               Package.open(f.toString(), PackageAccess.READ)
-               );
-
-               lister.disp.println(f.toString() + "\n");
-               lister.displayParts();
-               lister.disp.println();
-               lister.displayRelations();
-       }
-}
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/TestHSLFXML.java
deleted file mode 100644 (file)
index fd4653a..0000000
+++ /dev/null
@@ -1,127 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hslf;
-
-import java.io.File;
-
-import org.apache.poi.hxf.HXFDocument;
-import org.openxml4j.opc.Package;
-import org.openxml4j.opc.PackagePart;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideIdListEntry;
-import org.openxmlformats.schemas.presentationml.x2006.main.CTSlideMasterIdListEntry;
-
-import junit.framework.TestCase;
-
-public class TestHSLFXML extends TestCase {
-       private File sampleFile;
-
-       protected void setUp() throws Exception {
-               super.setUp();
-               
-               sampleFile = new File(
-                               System.getProperty("HSLF.testdata.path") +
-                               File.separator + "sample.pptx"
-               );
-       }
-
-       public void testContainsMainContentType() throws Exception {
-               Package pack = HXFDocument.openPackage(sampleFile);
-               
-               boolean found = false;
-               for(PackagePart part : pack.getParts()) {
-                       if(part.getContentType().equals(HSLFXML.MAIN_CONTENT_TYPE)) {
-                               found = true;
-                       }
-                       System.out.println(part);
-               }
-               assertTrue(found);
-       }
-
-       public void testOpen() throws Exception {
-               HXFDocument.openPackage(sampleFile);
-               
-               HSLFXML xml;
-               
-               // With the finalised uri, should be fine
-               xml = new HSLFXML(
-                               HXFDocument.openPackage(sampleFile)
-               );
-               
-               // Check the core
-               assertNotNull(xml.getPresentation());
-               
-               // Check it has some slides
-               assertTrue(
-                       xml.getSlideReferences().sizeOfSldIdArray() > 0
-               );
-               assertTrue(
-                               xml.getSlideMasterReferences().sizeOfSldMasterIdArray() > 0
-                       );
-       }
-       
-       public void testSlideBasics() throws Exception {
-               HSLFXML xml = new HSLFXML(
-                               HXFDocument.openPackage(sampleFile)
-               );
-               
-               // Should have 1 master
-               assertEquals(1, xml.getSlideMasterReferences().sizeOfSldMasterIdArray());
-               assertEquals(1, xml.getSlideMasterReferences().getSldMasterIdArray().length);
-               
-               // Should have three sheets
-               assertEquals(2, xml.getSlideReferences().sizeOfSldIdArray());
-               assertEquals(2, xml.getSlideReferences().getSldIdArray().length);
-               
-               // Check they're as expected
-               CTSlideIdListEntry[] slides = xml.getSlideReferences().getSldIdArray();
-               assertEquals(256, slides[0].getId());
-               assertEquals(257, slides[1].getId());
-               assertEquals("rId2", slides[0].getId2());
-               assertEquals("rId3", slides[1].getId2());
-               
-               // Now get those objects
-               assertNotNull(xml.getSlide(slides[0]));
-               assertNotNull(xml.getSlide(slides[1]));
-               
-               // And check they have notes as expected
-               assertNotNull(xml.getNotes(slides[0]));
-               assertNotNull(xml.getNotes(slides[1]));
-               
-               // And again for the master
-               CTSlideMasterIdListEntry[] masters =
-                       xml.getSlideMasterReferences().getSldMasterIdArray();
-               assertEquals(2147483648l, masters[0].getId());
-               assertEquals("rId1", masters[0].getId2());
-               assertNotNull(xml.getSlideMaster(masters[0]));
-       }
-       
-       public void testMetadataBasics() throws Exception {
-               HSLFXML xml = new HSLFXML(
-                               HXFDocument.openPackage(sampleFile)
-               );
-               
-               assertNotNull(xml.getCoreProperties());
-               assertNotNull(xml.getExtendedProperties());
-               
-               assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
-               assertEquals(0, xml.getExtendedProperties().getCharacters());
-               assertEquals(0, xml.getExtendedProperties().getLines());
-               
-               assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
-               assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
-       }
-}
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hslf/extractor/TestHXFPowerPointExtractor.java
deleted file mode 100644 (file)
index 6a006ab..0000000
+++ /dev/null
@@ -1,109 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hslf.extractor;
-
-import java.io.File;
-
-import org.apache.poi.hslf.HSLFXML;
-import org.apache.poi.hslf.usermodel.HSLFXMLSlideShow;
-import org.apache.poi.hxf.HXFDocument;
-
-import junit.framework.TestCase;
-
-/**
- * Tests for HXFPowerPointExtractor
- */
-public class TestHXFPowerPointExtractor extends TestCase {
-       /**
-        * A simple file
-        */
-       private HSLFXML xmlA;
-
-       protected void setUp() throws Exception {
-               super.setUp();
-               
-               File fileA = new File(
-                               System.getProperty("HSLF.testdata.path") +
-                               File.separator + "sample.pptx"
-               );
-               
-               xmlA = new HSLFXML(HXFDocument.openPackage(fileA));
-       }
-
-       /**
-        * Get text out of the simple file
-        */
-       public void testGetSimpleText() throws Exception {
-               new HXFPowerPointExtractor(xmlA.getPackage());
-               new HXFPowerPointExtractor(new HSLFXMLSlideShow(xmlA));
-               
-               HXFPowerPointExtractor extractor = 
-                       new HXFPowerPointExtractor(xmlA.getPackage());
-               extractor.getText();
-               
-               String text = extractor.getText();
-               assertTrue(text.length() > 0);
-               
-               // Check Basics
-               assertTrue(text.startsWith("Lorem ipsum dolor sit amet\n"));
-               assertTrue(text.endsWith("amet\n\n"));
-               
-               // Just slides, no notes
-               text = extractor.getText(true, false);
-               assertEquals(
-                               "Lorem ipsum dolor sit amet\n" +
-                               "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
-                               "\n" +
-                               "Lorem ipsum dolor sit amet\n" +
-                               "Lorem\n" +
-                               "ipsum\n" +
-                               "dolor\n" +
-                               "sit\n" +
-                               "amet\n" +
-                               "\n", text
-               );
-               
-               // Just notes, no slides
-               text = extractor.getText(false, true);
-               assertEquals(
-                               "\n\n\n\n", text
-               );
-               
-               // Both
-               text = extractor.getText(true, true);
-               assertEquals(
-                               "Lorem ipsum dolor sit amet\n" +
-                               "Nunc at risus vel erat tempus posuere. Aenean non ante.\n" +
-                               "\n\n\n" +
-                               "Lorem ipsum dolor sit amet\n" +
-                               "Lorem\n" +
-                               "ipsum\n" +
-                               "dolor\n" +
-                               "sit\n" +
-                               "amet\n" +
-                               "\n\n\n", text
-               );
-               
-               // Via set defaults
-               extractor.setSlidesByDefault(false);
-               extractor.setNotesByDefault(true);
-               text = extractor.getText();
-               assertEquals(
-                               "\n\n\n\n", text
-               );
-       }
-}
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/TestHWPFXML.java
deleted file mode 100644 (file)
index 0d8e196..0000000
+++ /dev/null
@@ -1,110 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hwpf;
-
-import java.io.File;
-
-import org.apache.poi.hxf.HXFDocument;
-import org.openxml4j.opc.Package;
-import org.openxml4j.opc.PackagePart;
-
-import junit.framework.TestCase;
-
-public class TestHWPFXML extends TestCase {
-       private File sampleFile;
-       private File complexFile;
-
-       protected void setUp() throws Exception {
-               super.setUp();
-               
-               sampleFile = new File(
-                               System.getProperty("HWPF.testdata.path") +
-                               File.separator + "sample.docx"
-               );
-               complexFile = new File(
-                               System.getProperty("HWPF.testdata.path") +
-                               File.separator + "IllustrativeCases.docx"
-               );
-       }
-
-       public void testContainsMainContentType() throws Exception {
-               Package pack = HXFDocument.openPackage(sampleFile);
-               
-               boolean found = false;
-               for(PackagePart part : pack.getParts()) {
-                       if(part.getContentType().equals(HWPFXML.MAIN_CONTENT_TYPE)) {
-                               found = true;
-                       }
-                       System.out.println(part);
-               }
-               assertTrue(found);
-       }
-
-       public void testOpen() throws Exception {
-               HXFDocument.openPackage(sampleFile);
-               HXFDocument.openPackage(complexFile);
-               
-               HWPFXML xml;
-               
-               // Simple file
-               xml = new HWPFXML(
-                               HXFDocument.openPackage(sampleFile)
-               );
-               // Check it has key parts
-               assertNotNull(xml.getDocument());
-               assertNotNull(xml.getDocumentBody());
-               assertNotNull(xml.getStyle());
-               
-               // Complex file
-               xml = new HWPFXML(
-                               HXFDocument.openPackage(complexFile)
-               );
-               assertNotNull(xml.getDocument());
-               assertNotNull(xml.getDocumentBody());
-               assertNotNull(xml.getStyle());
-       }
-       
-       public void testMetadataBasics() throws Exception {
-               HWPFXML xml = new HWPFXML(
-                               HXFDocument.openPackage(sampleFile)
-               );
-               assertNotNull(xml.getCoreProperties());
-               assertNotNull(xml.getExtendedProperties());
-               
-               assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
-               assertEquals(1315, xml.getExtendedProperties().getCharacters());
-               assertEquals(10, xml.getExtendedProperties().getLines());
-               
-               assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
-               assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
-       }
-       
-       public void testMetadataComplex() throws Exception {
-               HWPFXML xml = new HWPFXML(
-                               HXFDocument.openPackage(complexFile)
-               );
-               assertNotNull(xml.getCoreProperties());
-               assertNotNull(xml.getExtendedProperties());
-               
-               assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
-               assertEquals(5184, xml.getExtendedProperties().getCharacters());
-               assertEquals(0, xml.getExtendedProperties().getLines());
-               
-               assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
-               assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
-       }
-}
diff --git a/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java b/src/scratchpad/ooxml-testcases/org/apache/poi/hwpf/extractor/TestHXFWordExtractor.java
deleted file mode 100644 (file)
index 62695b3..0000000
+++ /dev/null
@@ -1,117 +0,0 @@
-/* ====================================================================
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
-==================================================================== */
-package org.apache.poi.hwpf.extractor;
-
-import java.io.File;
-
-import org.apache.poi.hwpf.HWPFXML;
-import org.apache.poi.hwpf.usermodel.HWPFXMLDocument;
-import org.apache.poi.hxf.HXFDocument;
-
-import junit.framework.TestCase;
-
-/**
- * Tests for HXFWordExtractor
- */
-public class TestHXFWordExtractor extends TestCase {
-       /**
-        * A very simple file
-        */
-       private HWPFXML xmlA;
-       /**
-        * A fairly complex file
-        */
-       private HWPFXML xmlB;
-
-       protected void setUp() throws Exception {
-               super.setUp();
-               
-               File fileA = new File(
-                               System.getProperty("HWPF.testdata.path") +
-                               File.separator + "sample.docx"
-               );
-               File fileB = new File(
-                               System.getProperty("HWPF.testdata.path") +
-                               File.separator + "IllustrativeCases.docx"
-               );
-               
-               xmlA = new HWPFXML(HXFDocument.openPackage(fileA));
-               xmlB = new HWPFXML(HXFDocument.openPackage(fileB));
-       }
-
-       /**
-        * Get text out of the simple file
-        */
-       public void testGetSimpleText() throws Exception {
-               new HXFWordExtractor(xmlA.getPackage());
-               new HXFWordExtractor(new HWPFXMLDocument(xmlA));
-               
-               HXFWordExtractor extractor = 
-                       new HXFWordExtractor(xmlA.getPackage());
-               extractor.getText();
-               
-               String text = extractor.getText();
-               assertTrue(text.length() > 0);
-               
-               // Check contents
-               assertTrue(text.startsWith(
-                               "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Nunc at risus vel erat tempus posuere. Aenean non ante. Suspendisse vehicula dolor sit amet odio."
-               ));
-               assertTrue(text.endsWith(
-                               "Phasellus ultricies mi nec leo. Sed tempus. In sit amet lorem at velit faucibus vestibulum.\n"
-               ));
-               
-               // Check number of paragraphs
-               int ps = 0;
-               char[] t = text.toCharArray();
-               for (int i = 0; i < t.length; i++) {
-                       if(t[i] == '\n') { ps++; }
-               }
-               assertEquals(3, ps);
-       }
-       
-       /**
-        * Tests getting the text out of a complex file
-        */
-       public void testGetComplexText() throws Exception {
-               HXFWordExtractor extractor = 
-                       new HXFWordExtractor(xmlB.getPackage());
-               extractor.getText();
-               
-               String text = extractor.getText();
-               assertTrue(text.length() > 0);
-               
-               char euro = '\u20ac';
-               System.err.println("'"+text.substring(text.length() - 20) + "'");
-               
-               // Check contents
-               assertTrue(text.startsWith(
-                               "  \n(V) ILLUSTRATIVE CASES\n\n"
-               ));
-               assertTrue(text.endsWith(
-                               "As well as gaining "+euro+"90 from child benefit increases, he will also receive the early childhood supplement of "+euro+"250 per quarter for Vincent for the full four quarters of the year.\n\n\n\n \n\n\n"
-               ));
-               
-               // Check number of paragraphs
-               int ps = 0;
-               char[] t = text.toCharArray();
-               for (int i = 0; i < t.length; i++) {
-                       if(t[i] == '\n') { ps++; }
-               }
-               assertEquals(79, ps);
-       }
-}